From bd7fb9fe0264400720bd7c8612900a41198cb508 Mon Sep 17 00:00:00 2001 From: Suraj Raut Date: Thu, 28 May 2026 07:16:45 -0700 Subject: [PATCH 1/7] [Cadence: Vision] ResNet18 & ResNet50: Optimized, DMA-enabled, functional - Add DMA-optimized operators: conv2d (1x1/3x3/7x7), maxpool, quantize/dequantize, relu, add, mean, softmax, linear - Add new operators: embedding, full, im2row, quantized_fully_connected, quantized_layer_norm, quantized_matmul, requantize, view_copy - Add vision/kernels library and quantized_ops.h header - Add config generator for DMA buffer sizing - Update functions_vision.yaml and CMakeLists.txt - Add third-party XAI libraries (libxai, libxai_common, libxa_nnlib) - FACTO submodule update --- .gitignore | 6 +- CMakeLists.txt | 18 +- backends/cadence/CMakeLists.txt | 4 + backends/cadence/aot/functions_vision.yaml | 18 +- backends/cadence/utils/FACTO | 2 +- backends/cadence/{ => utils}/runtime/BUCK | 0 .../cadence/{ => utils}/runtime/__init__.py | 0 .../cadence/{ => utils}/runtime/et_pal.cpp | 0 .../cadence/{ => utils}/runtime/etdump.py | 0 .../cadence/{ => utils}/runtime/executor.py | 0 .../{ => utils}/runtime/executor_main.sh | 0 .../cadence/{ => utils}/runtime/runtime.py | 0 .../cadence/{ => utils}/runtime/targets.bzl | 0 backends/cadence/{ => utils}/runtime/utils.py | 0 .../cadence/vision/config_generator/README.md | 107 + .../generate_combined_configs.py | 901 + .../config_generator/generate_idma_buffers.py | 1478 + .../generate_layer_configs.py | 1158 + .../config_generator/layer_configs_16k.h | 2403 ++ .../config_generator/layer_configs_24k.h | 2403 ++ .../config_generator/layer_configs_32k.h | 2403 ++ .../config_generator/layer_configs_4k.h | 2403 ++ .../config_generator/layer_configs_61k.h | 2403 ++ .../config_generator/layer_configs_8k.h | 2403 ++ .../config_generator/layer_configs_cache.h | 2403 ++ .../cadence/vision/kernels/CMakeLists.txt | 4 +- .../cadence/vision/operators/CMakeLists.txt | 68 +- 
backends/cadence/vision/operators/TARGETS | 5 + .../vision/operators/conv/conv_exec_1x1j1d1.c | 1023 + .../vision/operators/conv/conv_exec_1x1j2d1.c | 1132 + .../vision/operators/conv/conv_exec_3x3j1d1.c | 1030 + .../vision/operators/conv/conv_exec_3x3j2d1.c | 1028 + .../vision/operators/conv/conv_exec_7x7j2d1.c | 1088 + .../operators/conv/conv_kernel_dispatcher.c | 50 + .../vision/operators/conv/kernel_executors.h | 137 + .../cadence/vision/operators/layer_configs.h | 2403 ++ .../operators/maxpool/maxpool_exec_mxnj2.c | 352 + .../operators/maxpool/maxpool_executors.h | 61 + .../vision/operators/mean/mean_exec_dma.c | 149 + .../vision/operators/mean/mean_executors.h | 51 + backends/cadence/vision/operators/op_add.cpp | 338 +- .../operators/op_dequantize_per_tensor.cpp | 293 +- .../operators/op_max_pool2d_with_indices.cpp | 165 + backends/cadence/vision/operators/op_mean.cpp | 183 + .../operators/op_quantize_per_tensor.cpp | 396 +- .../operators/op_quantized_conv_out.cpp | 398 +- .../operators/op_quantized_linear_out.cpp | 266 +- .../operators/op_quantized_relu_out.cpp | 370 +- .../cadence/vision/operators/op_softmax.cpp | 261 +- .../cadence/vision/third-party/CMakeLists.txt | 101 + backends/cadence/vision/third-party/dummy.c | 17 - .../cadence/vision/third-party/include/api.h | 65 +- .../cadence/vision/third-party/include/dma.h | 42 + .../vision/third-party/include/dtypes.h | 43 +- .../vision/third-party/include/dump_tensor.h | 70 + .../cadence/vision/third-party/include/lib.h | 72 + .../third-party/include/memory_manager.h | 69 + .../vision/third-party/include/utils.h | 182 + .../third-party/include_private/common.h | 34 +- .../third-party/include_private/idma_init.h | 36 - .../third-party/library/api/dequantize.c | 81 + .../third-party/library/api/maxpool2df.c | 248 + .../vision/third-party/library/api/mean.c | 110 + .../third-party/library/api/quanitze_relu.c | 112 + .../third-party/library/api/quantizef.c | 79 + .../vision/third-party/library/api/vaddf.c | 124 
+ .../third-party/library/api/vdot_zeropt.c | 123 + .../third-party/library/api/vsoftmaxf.c | 58 +- .../cadence/vision/third-party/library/dma.c | 62 + .../third-party/library/memory_manager.c | 44 + .../third-party/library/tables/expf_tbl.c | 23 +- .../third-party/library/tables/inff_tbl.c | 2 +- .../third-party/library/tables/nanf_tbl.c | 2 +- .../vision/third-party/library/utils.c | 26 + .../third-party/libxai/cnn/src/cnn_conv.c | 1668 + .../third-party/libxai/cnn/src/cnn_conv_MOD.c | 510 + .../third-party/libxai/cnn/src/cnn_conv_MOW.c | 25 + .../third-party/libxai/cnn/src/cnn_conv_MOW.h | 738 + .../third-party/libxai/cnn/src/cnn_conv_SO.c | 27 + .../third-party/libxai/cnn/src/cnn_conv_SO.h | 110 + .../third-party/libxai/cnn/src/cnn_conv_VQ.c | 1371 + .../cnn/src/cnn_dataConversion3D_AsymQ_S8IX.h | 421 + .../cnn/src/cnn_dataConversion3D_I16I8.h | 367 + .../cnn/src/cnn_dataConversion3D_I8I32.h | 414 + .../cnn/src/cnn_dataConversion3D_S32IX.h | 307 + .../libxai/cnn/src/cnn_datatransform.c | 7835 +++++ .../libxai/cnn/src/cnn_dilated_conv_MOD.c | 24 + .../libxai/cnn/src/cnn_dilated_conv_MOD.h | 16078 +++++++++ .../libxai/cnn/src/cnn_dilated_conv_MOD_S16.c | 25 + .../libxai/cnn/src/cnn_dilated_conv_MOD_S16.h | 708 + .../libxai/cnn/src/cnn_dilated_conv_MOW.c | 30 + .../libxai/cnn/src/cnn_dilated_conv_MOW.h | 27240 ++++++++++++++++ .../libxai/cnn/src/cnn_dilated_conv_MOW_S16.c | 23 + .../libxai/cnn/src/cnn_dilated_conv_MOW_S16.h | 2948 ++ .../libxai/cnn/src/cnn_dilated_conv_SO.c | 27 + .../libxai/cnn/src/cnn_dilated_conv_SO.h | 1027 + .../libxai/cnn/src/cnn_dilated_conv_VQ_MOD.c | 25 + .../cnn/src/cnn_dilated_conv_VQ_MOD_S16.c | 25 + .../libxai/cnn/src/cnn_dilated_conv_VQ_MOW.c | 30 + .../cnn/src/cnn_dilated_conv_VQ_MOW_S16.c | 23 + .../libxai/cnn/src/cnn_dilated_conv_VQ_SO.c | 30 + .../cnn/src/cnn_dilated_conv_VQ_partial_MOD.c | 22 + .../src/cnn_dilated_conv_VQ_partial_MOD_S16.c | 22 + .../cnn/src/cnn_dilated_conv_partial_MOD.c | 22 + 
.../cnn/src/cnn_dilated_conv_partial_MOD.h | 7858 +++++ .../src/cnn_dilated_conv_partial_MOD_S16.c | 22 + .../src/cnn_dilated_conv_partial_MOD_S16.h | 878 + .../libxai/cnn/src/cnn_extend_edge.h | 1517 + .../libxai/cnn/src/cnn_fill_tile.h | 309 + .../third-party/libxai/cnn/src/cnn_helper.c | 2141 ++ .../third-party/libxai/include/xai_cnn.h | 267 + .../third-party/libxai/include/xai_cnn_api.h | 7041 ++++ .../third-party/libxai/include/xai_intrin.h | 1077 + .../include/xai_cnn_api_common.h | 457 + .../include/xai_cnn_api_params.h | 1886 ++ .../libxai_common/include/xai_cnn_common.h | 4329 +++ .../libxai_common/include/xai_cnn_version.h | 74 + .../libxai_common/include/xai_config_api.h | 127 + .../libxai_common/include/xai_core.h | 624 + .../libxai_common/include/xai_core_api.h | 272 + .../libxai_common/include/xai_tile_manager.h | 1246 + .../third-party/libxai_common/src/cnn_cast.c | 1622 + .../third-party/libxai_common/src/cnn_cast.h | 1890 ++ .../libxai_common/src/cnn_cast_scalar.h | 308 + .../libxai_common/src/cnn_eltwise_add.c | 111 + .../libxai_common/src/cnn_eltwise_add.h | 224 + .../libxai_common/src/cnn_eltwise_and.c | 86 + .../libxai_common/src/cnn_eltwise_and.h | 202 + .../libxai_common/src/cnn_eltwise_equal.c | 112 + .../libxai_common/src/cnn_eltwise_equal.h | 244 + .../src/cnn_eltwise_greaterthan.c | 113 + .../src/cnn_eltwise_greaterthan.h | 244 + .../libxai_common/src/cnn_eltwise_lessthan.c | 103 + .../libxai_common/src/cnn_eltwise_lessthan.h | 244 + .../libxai_common/src/cnn_eltwise_max.c | 113 + .../libxai_common/src/cnn_eltwise_max.h | 222 + .../libxai_common/src/cnn_eltwise_min.c | 113 + .../libxai_common/src/cnn_eltwise_min.h | 222 + .../libxai_common/src/cnn_eltwise_mul_S32.c | 570 + .../libxai_common/src/cnn_eltwise_or.c | 87 + .../libxai_common/src/cnn_eltwise_or.h | 202 + .../libxai_common/src/cnn_eltwise_sub.c | 112 + .../libxai_common/src/cnn_eltwise_sub.h | 224 + .../libxai_common/src/cnn_eltwise_xor.c | 87 + 
.../libxai_common/src/cnn_eltwise_xor.h | 202 + .../libxai_common/src/xai_buildinfo.c | 57 + .../libxai_common/src/xai_errstr.c | 55 + 147 files changed, 133082 insertions(+), 523 deletions(-) rename backends/cadence/{ => utils}/runtime/BUCK (100%) rename backends/cadence/{ => utils}/runtime/__init__.py (100%) rename backends/cadence/{ => utils}/runtime/et_pal.cpp (100%) rename backends/cadence/{ => utils}/runtime/etdump.py (100%) rename backends/cadence/{ => utils}/runtime/executor.py (100%) rename backends/cadence/{ => utils}/runtime/executor_main.sh (100%) rename backends/cadence/{ => utils}/runtime/runtime.py (100%) rename backends/cadence/{ => utils}/runtime/targets.bzl (100%) rename backends/cadence/{ => utils}/runtime/utils.py (100%) create mode 100644 backends/cadence/vision/config_generator/README.md create mode 100644 backends/cadence/vision/config_generator/generate_combined_configs.py create mode 100644 backends/cadence/vision/config_generator/generate_idma_buffers.py create mode 100644 backends/cadence/vision/config_generator/generate_layer_configs.py create mode 100644 backends/cadence/vision/config_generator/layer_configs_16k.h create mode 100644 backends/cadence/vision/config_generator/layer_configs_24k.h create mode 100644 backends/cadence/vision/config_generator/layer_configs_32k.h create mode 100644 backends/cadence/vision/config_generator/layer_configs_4k.h create mode 100644 backends/cadence/vision/config_generator/layer_configs_61k.h create mode 100644 backends/cadence/vision/config_generator/layer_configs_8k.h create mode 100644 backends/cadence/vision/config_generator/layer_configs_cache.h create mode 100644 backends/cadence/vision/operators/TARGETS create mode 100644 backends/cadence/vision/operators/conv/conv_exec_1x1j1d1.c create mode 100644 backends/cadence/vision/operators/conv/conv_exec_1x1j2d1.c create mode 100644 backends/cadence/vision/operators/conv/conv_exec_3x3j1d1.c create mode 100644 
backends/cadence/vision/operators/conv/conv_exec_3x3j2d1.c create mode 100644 backends/cadence/vision/operators/conv/conv_exec_7x7j2d1.c create mode 100644 backends/cadence/vision/operators/conv/conv_kernel_dispatcher.c create mode 100644 backends/cadence/vision/operators/conv/kernel_executors.h create mode 100644 backends/cadence/vision/operators/layer_configs.h create mode 100644 backends/cadence/vision/operators/maxpool/maxpool_exec_mxnj2.c create mode 100644 backends/cadence/vision/operators/maxpool/maxpool_executors.h create mode 100644 backends/cadence/vision/operators/mean/mean_exec_dma.c create mode 100644 backends/cadence/vision/operators/mean/mean_executors.h create mode 100644 backends/cadence/vision/operators/op_max_pool2d_with_indices.cpp create mode 100644 backends/cadence/vision/operators/op_mean.cpp create mode 100644 backends/cadence/vision/third-party/CMakeLists.txt delete mode 100644 backends/cadence/vision/third-party/dummy.c create mode 100644 backends/cadence/vision/third-party/include/dma.h create mode 100644 backends/cadence/vision/third-party/include/dump_tensor.h create mode 100644 backends/cadence/vision/third-party/include/lib.h create mode 100644 backends/cadence/vision/third-party/include/memory_manager.h create mode 100644 backends/cadence/vision/third-party/include/utils.h delete mode 100644 backends/cadence/vision/third-party/include_private/idma_init.h create mode 100644 backends/cadence/vision/third-party/library/api/dequantize.c create mode 100644 backends/cadence/vision/third-party/library/api/maxpool2df.c create mode 100644 backends/cadence/vision/third-party/library/api/mean.c create mode 100644 backends/cadence/vision/third-party/library/api/quanitze_relu.c create mode 100644 backends/cadence/vision/third-party/library/api/quantizef.c create mode 100644 backends/cadence/vision/third-party/library/api/vaddf.c create mode 100644 backends/cadence/vision/third-party/library/api/vdot_zeropt.c create mode 100644 
backends/cadence/vision/third-party/library/dma.c create mode 100644 backends/cadence/vision/third-party/library/memory_manager.c create mode 100644 backends/cadence/vision/third-party/library/utils.c create mode 100644 backends/cadence/vision/third-party/libxai/cnn/src/cnn_conv.c create mode 100644 backends/cadence/vision/third-party/libxai/cnn/src/cnn_conv_MOD.c create mode 100644 backends/cadence/vision/third-party/libxai/cnn/src/cnn_conv_MOW.c create mode 100644 backends/cadence/vision/third-party/libxai/cnn/src/cnn_conv_MOW.h create mode 100644 backends/cadence/vision/third-party/libxai/cnn/src/cnn_conv_SO.c create mode 100644 backends/cadence/vision/third-party/libxai/cnn/src/cnn_conv_SO.h create mode 100644 backends/cadence/vision/third-party/libxai/cnn/src/cnn_conv_VQ.c create mode 100644 backends/cadence/vision/third-party/libxai/cnn/src/cnn_dataConversion3D_AsymQ_S8IX.h create mode 100644 backends/cadence/vision/third-party/libxai/cnn/src/cnn_dataConversion3D_I16I8.h create mode 100644 backends/cadence/vision/third-party/libxai/cnn/src/cnn_dataConversion3D_I8I32.h create mode 100644 backends/cadence/vision/third-party/libxai/cnn/src/cnn_dataConversion3D_S32IX.h create mode 100644 backends/cadence/vision/third-party/libxai/cnn/src/cnn_datatransform.c create mode 100644 backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_MOD.c create mode 100644 backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_MOD.h create mode 100644 backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_MOD_S16.c create mode 100644 backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_MOD_S16.h create mode 100644 backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_MOW.c create mode 100644 backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_MOW.h create mode 100644 backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_MOW_S16.c create mode 100644 
backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_MOW_S16.h create mode 100644 backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_SO.c create mode 100644 backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_SO.h create mode 100644 backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_VQ_MOD.c create mode 100644 backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_VQ_MOD_S16.c create mode 100644 backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_VQ_MOW.c create mode 100644 backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_VQ_MOW_S16.c create mode 100644 backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_VQ_SO.c create mode 100644 backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_VQ_partial_MOD.c create mode 100644 backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_VQ_partial_MOD_S16.c create mode 100644 backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_partial_MOD.c create mode 100644 backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_partial_MOD.h create mode 100644 backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_partial_MOD_S16.c create mode 100644 backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_partial_MOD_S16.h create mode 100644 backends/cadence/vision/third-party/libxai/cnn/src/cnn_extend_edge.h create mode 100644 backends/cadence/vision/third-party/libxai/cnn/src/cnn_fill_tile.h create mode 100644 backends/cadence/vision/third-party/libxai/cnn/src/cnn_helper.c create mode 100644 backends/cadence/vision/third-party/libxai/include/xai_cnn.h create mode 100644 backends/cadence/vision/third-party/libxai/include/xai_cnn_api.h create mode 100644 backends/cadence/vision/third-party/libxai/include/xai_intrin.h create mode 100644 backends/cadence/vision/third-party/libxai_common/include/xai_cnn_api_common.h create mode 100644 
backends/cadence/vision/third-party/libxai_common/include/xai_cnn_api_params.h create mode 100644 backends/cadence/vision/third-party/libxai_common/include/xai_cnn_common.h create mode 100644 backends/cadence/vision/third-party/libxai_common/include/xai_cnn_version.h create mode 100644 backends/cadence/vision/third-party/libxai_common/include/xai_config_api.h create mode 100644 backends/cadence/vision/third-party/libxai_common/include/xai_core.h create mode 100644 backends/cadence/vision/third-party/libxai_common/include/xai_core_api.h create mode 100644 backends/cadence/vision/third-party/libxai_common/include/xai_tile_manager.h create mode 100644 backends/cadence/vision/third-party/libxai_common/src/cnn_cast.c create mode 100644 backends/cadence/vision/third-party/libxai_common/src/cnn_cast.h create mode 100644 backends/cadence/vision/third-party/libxai_common/src/cnn_cast_scalar.h create mode 100644 backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_add.c create mode 100644 backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_add.h create mode 100644 backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_and.c create mode 100644 backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_and.h create mode 100644 backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_equal.c create mode 100644 backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_equal.h create mode 100644 backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_greaterthan.c create mode 100644 backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_greaterthan.h create mode 100644 backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_lessthan.c create mode 100644 backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_lessthan.h create mode 100644 backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_max.c create mode 100644 
backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_max.h create mode 100644 backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_min.c create mode 100644 backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_min.h create mode 100644 backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_mul_S32.c create mode 100644 backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_or.c create mode 100644 backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_or.h create mode 100644 backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_sub.c create mode 100644 backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_sub.h create mode 100644 backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_xor.c create mode 100644 backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_xor.h create mode 100644 backends/cadence/vision/third-party/libxai_common/src/xai_buildinfo.c create mode 100644 backends/cadence/vision/third-party/libxai_common/src/xai_errstr.c diff --git a/.gitignore b/.gitignore index 02dcea02026..1f488009a29 100644 --- a/.gitignore +++ b/.gitignore @@ -32,7 +32,6 @@ build-profiling/ *.model *.etdump tokenizer.json -*.pte *.ptd !test_bpe_tokenizer.bin !test_tiktoken_tokenizer.model @@ -69,6 +68,11 @@ xcuserdata/ /src/executorch/share/ /src/executorch/version.py *_etdump +/runtime/core/portable_type/c10/CMakeFiles/ +/runtime/core/portable_type/c10/bin/ +/runtime/core/portable_type/c10/Makefile +/runtime/core/portable_type/c10/cmake_install.cmake +/runtime/core/portable_type/c10/*.a # Android *.aar diff --git a/CMakeLists.txt b/CMakeLists.txt index 6467e21706e..b7d038e131a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -241,13 +241,21 @@ if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU") set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -s") endif() -if(EXECUTORCH_OPTIMIZE_SIZE) - # -Os: Optimize for size. 
- set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -Os") +if(NOT EXECUTORCH_BUILD_CADENCE) + if(OPTIMIZE_SIZE) + # -Os: Optimize for size + set(CMAKE_CXX_FLAGS_RELEASE "-Os ${CMAKE_CXX_FLAGS_RELEASE}") + else() + # -O2: Moderate opt. + set(CMAKE_CXX_FLAGS_RELEASE "-O2 ${CMAKE_CXX_FLAGS_RELEASE}") + endif() else() - # -O2: Moderate opt. - set(CMAKE_CXX_FLAGS_RELEASE "-O2 ${CMAKE_CXX_FLAGS_RELEASE}") + set(CMAKE_CXX_FLAGS_RELEASE + "-O3 -mcoproc -mlongcalls -LNO:simd -ffunction-sections -fsigned-char -fno-exceptions -INLINE:requested -fno-zero-initialized-in-bss -mtext-section-literals -fmessage-length=0") + set(CMAKE_C_FLAGS_RELEASE + "-O3 -mcoproc -mlongcalls -LNO:simd -ffunction-sections -fsigned-char -fno-exceptions -INLINE:requested -fno-zero-initialized-in-bss -mtext-section-literals -fmessage-length=0") endif() +set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g") if(EXECUTORCH_BUILD_TESTS) include(CTest) diff --git a/backends/cadence/CMakeLists.txt b/backends/cadence/CMakeLists.txt index 271b4806614..4ae621cfe91 100644 --- a/backends/cadence/CMakeLists.txt +++ b/backends/cadence/CMakeLists.txt @@ -90,6 +90,10 @@ elseif(EXECUTORCH_FUSION_G3_OPT) ) elseif(EXECUTORCH_VISION_OPT) set(TARGET_DIR vision) + add_subdirectory( + ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET_DIR}/third-party + ${EXECUTORCH_ROOT}/runtime/core/portable_type/c10 + ) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/${TARGET_DIR}/kernels) else() set(TARGET_DIR generic) diff --git a/backends/cadence/aot/functions_vision.yaml b/backends/cadence/aot/functions_vision.yaml index cae1e0dc415..f2969a3e6d4 100644 --- a/backends/cadence/aot/functions_vision.yaml +++ b/backends/cadence/aot/functions_vision.yaml @@ -85,12 +85,12 @@ - op: max_pool2d_with_indices.out kernels: - arg_meta: null - kernel_name: torch::executor::max_pool2d_with_indices_out + kernel_name: impl::vision::max_pool2d_with_indices_out - op: mean.out kernels: - arg_meta: null - kernel_name: torch::executor::mean_dim_out + kernel_name: 
impl::vision::mean_dim_out - op: mul.out kernels: @@ -205,6 +205,16 @@ - arg_meta: null kernel_name: impl::vision::quantized_conv2d_nhwc_out +- func: cadence::quantized_conv2d_nchw.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) + kernels: + - arg_meta: null + kernel_name: impl::vision::quantized_conv2d_nchw_per_tensor_out + +- func: cadence::quantized_conv2d_nhwc.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) + kernels: + - arg_meta: null + kernel_name: impl::vision::quantized_conv2d_nhwc_per_tensor_out + - func: cadence::quantized_layer_norm.out(Tensor input, Tensor in_scale, Tensor in_zero_point, int[] normalized_shape, Tensor weight, Tensor bias, float eps, float output_scale, int output_zero_point, *, Tensor(a!) out) -> Tensor(a!) kernels: - arg_meta: null @@ -249,10 +259,6 @@ - arg_meta: null kernel_name: impl::vision::im2row_per_tensor_out -- func: cadence::quantized_conv.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, bool channel_last=False, *, Tensor(a!) out) -> Tensor(a!) - kernels: - - arg_meta: null - kernel_name: impl::vision::quantized_conv_per_tensor_out - func: cadence::quantized_fully_connected.out(Tensor src, Tensor weight, Tensor bias, int src_zero_point, Tensor weight_zero_point, Tensor out_multiplier, Tensor out_shift, int out_zero_point, Tensor? offset, *, Tensor(a!) 
out) -> Tensor(a!) kernels: diff --git a/backends/cadence/utils/FACTO b/backends/cadence/utils/FACTO index 3b8c778c997..1db37fc79d0 160000 --- a/backends/cadence/utils/FACTO +++ b/backends/cadence/utils/FACTO @@ -1 +1 @@ -Subproject commit 3b8c778c99766a8b4d0d04563ae0b16cbb276829 +Subproject commit 1db37fc79d0d59638cbb794fa49d878aafc24461 diff --git a/backends/cadence/runtime/BUCK b/backends/cadence/utils/runtime/BUCK similarity index 100% rename from backends/cadence/runtime/BUCK rename to backends/cadence/utils/runtime/BUCK diff --git a/backends/cadence/runtime/__init__.py b/backends/cadence/utils/runtime/__init__.py similarity index 100% rename from backends/cadence/runtime/__init__.py rename to backends/cadence/utils/runtime/__init__.py diff --git a/backends/cadence/runtime/et_pal.cpp b/backends/cadence/utils/runtime/et_pal.cpp similarity index 100% rename from backends/cadence/runtime/et_pal.cpp rename to backends/cadence/utils/runtime/et_pal.cpp diff --git a/backends/cadence/runtime/etdump.py b/backends/cadence/utils/runtime/etdump.py similarity index 100% rename from backends/cadence/runtime/etdump.py rename to backends/cadence/utils/runtime/etdump.py diff --git a/backends/cadence/runtime/executor.py b/backends/cadence/utils/runtime/executor.py similarity index 100% rename from backends/cadence/runtime/executor.py rename to backends/cadence/utils/runtime/executor.py diff --git a/backends/cadence/runtime/executor_main.sh b/backends/cadence/utils/runtime/executor_main.sh similarity index 100% rename from backends/cadence/runtime/executor_main.sh rename to backends/cadence/utils/runtime/executor_main.sh diff --git a/backends/cadence/runtime/runtime.py b/backends/cadence/utils/runtime/runtime.py similarity index 100% rename from backends/cadence/runtime/runtime.py rename to backends/cadence/utils/runtime/runtime.py diff --git a/backends/cadence/runtime/targets.bzl b/backends/cadence/utils/runtime/targets.bzl similarity index 100% rename from 
backends/cadence/runtime/targets.bzl rename to backends/cadence/utils/runtime/targets.bzl diff --git a/backends/cadence/runtime/utils.py b/backends/cadence/utils/runtime/utils.py similarity index 100% rename from backends/cadence/runtime/utils.py rename to backends/cadence/utils/runtime/utils.py diff --git a/backends/cadence/vision/config_generator/README.md b/backends/cadence/vision/config_generator/README.md new file mode 100644 index 00000000000..c55fadcb584 --- /dev/null +++ b/backends/cadence/vision/config_generator/README.md @@ -0,0 +1,107 @@ +# Config Generator Python + +Python tools for extracting convolution layer parameters from neural network models and generating optimized C header configurations for DMA-tiled execution on the Xtensa XRC Vision DSP (XAI CNN runtime). + +## Prerequisites + +The script requires the Python venv in the executorch tree. Run it from a **bash** terminal (not csh), or invoke the venv python by its full path: + +```bash +# The venv is at /path/to/executorch/.venv/ +# All paths below are relative to the executorch root. + +# Option 1: call the venv python directly (works from any shell) +.venv/bin/python3 backends/cadence/vision/config_generator/generate_layer_configs.py ... + +# Option 2: activate the venv in a bash shell +bash +source .venv/bin/activate +python3 backends/cadence/vision/config_generator/generate_layer_configs.py ... +``` + +> **Note:** The default terminal on this machine is `csh`. Inline python commands +> and `source ... && ...` chains will fail in csh. Always use `bash` or invoke +> the venv python by its full path. 
+ +## Quick Start + +```bash +# Run from the executorch root directory: cd /path/to/executorch + +# From a single ExecuTorch .pte binary +.venv/bin/python3 backends/cadence/vision/config_generator/generate_layer_configs.py \ + --pte operator_and_model_testing/resnet18/pte/resnet18_quantized.pte \ + --output backends/cadence/vision/config_generator/conv_layer_configs.h \ + --dram0 62976 --dram1 62976 + +# From multiple .pte files (layers are deduplicated automatically) +.venv/bin/python3 backends/cadence/vision/config_generator/generate_layer_configs.py \ + --pte operator_and_model_testing/resnet18/pte/resnet18_quantized.pte \ + operator_and_model_testing/resnet50/pte/resnet50_quantized.pte \ + --output backends/cadence/vision/config_generator/conv_layer_configs_combined.h \ + --dram0 62976 --dram1 62976 + +# From a torchvision model (requires torchvision installed in venv) +.venv/bin/python3 backends/cadence/vision/config_generator/generate_layer_configs.py \ + --model resnet18 --input-size 1,3,64,64 \ + --output backends/cadence/vision/config_generator/conv_layer_configs.h \ + --dram0 32768 --dram1 32768 +``` + +### Full working commands + +```bash +# cd to the executorch root first +cd /path/to/executorch + +# ResNet18 with 62976 bytes per DRAM bank +.venv/bin/python3 backends/cadence/vision/config_generator/generate_layer_configs.py \ + --pte operator_and_model_testing/resnet18/pte/resnet18_quantized.pte \ + --output backends/cadence/vision/config_generator/conv_layer_configs_62k_pte.h \ + --dram0 62976 --dram1 62976 + +# ResNet18 + ResNet50 combined +.venv/bin/python3 backends/cadence/vision/config_generator/generate_layer_configs.py \ + --pte operator_and_model_testing/resnet18/pte/resnet18_quantized.pte \ + operator_and_model_testing/resnet50/pte/resnet50_quantized.pte \ + --output backends/cadence/vision/config_generator/conv_layer_configs_62k_combined.h \ + --dram0 62976 --dram1 62976 +``` + +--- + +## `generate_layer_configs.py` — Arguments + +| Flag | Default | Description | 
+|------|---------|-------------| +| `--model`, `-m` | — | Comma or `+`-separated torchvision model names (e.g. `resnet18+resnet50`) | +| `--pte` | — | Path to an ExecuTorch `.pte` binary; bootstraps `exir._serialize` from the local source tree — no pip install needed | +| `--flatc` | cmake-out default | Path to `flatc` binary (auto-detected; only relevant with `--pte`) | +| `--input-size` | `1,3,64,64` | Input tensor shape `N,C,H,W` (only used with `--model`) | +| `--output`, `-o` | `conv_layer_configs.h` | Output C header file | +| `--dram0` | `32768` | DRAM0 size in bytes | +| `--dram1` | `32768` | DRAM1 size in bytes | +| `--cache-mode` | off | Append `_cache` to every kernel name | + +--- + +## Output + +The generated header contains: + +- `conv_layer_config_t` struct with ~60 fields (buffer sizes, tile dimensions, DRAM0/1 placement, kernel name, quantization params) +- `CONV_LAYER_CONFIGS[]` static array — one entry per unique layer +- `get_layer_config()`, `get_layer_config_by_params()`, `get_layer_config_by_key()` inline accessors + +--- + +## Directory Structure + +``` +config_generator/ +├── generate_layer_configs.py # Main entry point +├── generate_idma_buffers.py # Core tiling / buffer sizing engine +├── extract_layers_from_pte.py # .pte/.onnx → JSON (intermediate step) +├── config/ # Pre-generated headers +└── bin/ # Compare / test utilities +``` diff --git a/backends/cadence/vision/config_generator/generate_combined_configs.py b/backends/cadence/vision/config_generator/generate_combined_configs.py new file mode 100644 index 00000000000..4d5ca037a65 --- /dev/null +++ b/backends/cadence/vision/config_generator/generate_combined_configs.py @@ -0,0 +1,901 @@ +#!/usr/bin/env python3 +""" +Generate combined conv2d + maxpool DMA buffer configuration header from PTE files. + +Extracts both conv2d and maxpool layers from ExecuTorch .pte binaries and +generates a single C header with both configuration tables and accessors. 
+ +Usage: + # Single PTE + python generate_combined_configs.py \\ + --pte resnet18_quantized.pte \\ + --output layer_configs.h --dram0 62976 --dram1 62976 + + # Multiple PTE files (deduplicates automatically) + python generate_combined_configs.py \\ + --pte resnet18_quantized.pte resnet50_quantized.pte \\ + --output layer_configs.h --dram0 62976 --dram1 62976 + + # Force all conv kernels to no-DMA mode + python generate_combined_configs.py \\ + --pte resnet18_quantized.pte \\ + --output layer_configs.h --dram0 62976 --dram1 62976 --no-dma-mode +""" + +import os +import sys +import json +import argparse +from pathlib import Path + +# --------------------------------------------------------------------------- +# Resolve paths +# --------------------------------------------------------------------------- +_SCRIPT_DIR = Path(__file__).resolve().parent +_EXECUTORCH_ROOT = _SCRIPT_DIR.parents[3] # backends/cadence/vision/config_generator -> executorch/ +_EXECUTORCH_SRC = str(_EXECUTORCH_ROOT / 'src' / 'executorch') +_EXECUTORCH_PARENT = str(_EXECUTORCH_ROOT / 'src') + +# Try multiple known flatc locations +_FLATC_CANDIDATES = [ + _EXECUTORCH_ROOT / 'cmake-out' / 'third-party' / 'flatc_ep' / 'bin' / 'flatc', + _EXECUTORCH_ROOT / 'cmake-out-generic-all' / 'third-party' / 'flatc_ep' / 'bin' / 'flatc', + _EXECUTORCH_ROOT / 'pip-out' / 'lib.linux-x86_64-cpython-311' / 'executorch' / 'data' / 'bin' / 'flatc', + _EXECUTORCH_ROOT / 'pip-out' / 'temp.linux-x86_64-cpython-311' / 'cmake-out' / 'third-party' / 'flatc_ep' / 'bin' / 'flatc', +] +_FLATC_DEFAULT = str(next((p for p in _FLATC_CANDIDATES if p.exists()), _FLATC_CANDIDATES[0])) + +# Import conv buffer calculation +sys.path.insert(0, str(_SCRIPT_DIR)) +from generate_idma_buffers import ( + find_max_tile_config, + calculate_buffer_sizes_with_rows, + calculate_buffer_placement, + DRAM_SIZE_0, + DRAM_SIZE_1, +) + +ELEMENT_SIZE_F32 = 4 # float32 bytes + + +# ===================================================================== 
def _bootstrap_executorch_imports(flatc_path=None):
    """Make ``executorch.exir`` importable straight from the source checkout.

    Prepends the checkout directories to ``sys.path``, registers bare stub
    packages for ``executorch`` and ``executorch.exir`` in ``sys.modules``
    (unless they are already registered) and, when the resolved flatc path is
    an existing file, exports it as ``FLATC_EXECUTABLE`` without overriding a
    value that is already set in the environment.

    Args:
        flatc_path: Optional explicit path to the ``flatc`` binary; falls back
            to ``_FLATC_DEFAULT``.
    """
    import types

    # Inserted in this order so _EXECUTORCH_SRC ends up ahead of
    # _EXECUTORCH_PARENT on sys.path.
    for search_dir in (_EXECUTORCH_PARENT, _EXECUTORCH_SRC):
        if search_dir not in sys.path:
            sys.path.insert(0, search_dir)

    stub_packages = (
        ('executorch', _EXECUTORCH_SRC),
        ('executorch.exir', _EXECUTORCH_SRC + '/exir'),
    )
    for pkg, pkg_dir in stub_packages:
        if pkg in sys.modules:
            continue
        stub = types.ModuleType(pkg)
        stub.__path__ = [pkg_dir]
        stub.__package__ = pkg
        sys.modules[pkg] = stub

    resolved = flatc_path or _FLATC_DEFAULT
    if os.path.isfile(resolved):
        os.environ.setdefault('FLATC_EXECUTABLE', resolved)


# =====================================================================
# PTE extraction — conv2d and maxpool
# =====================================================================

def extract_layers_from_pte(pte_file, flatc_path=None):
    """
    Extract the unique conv2d and maxpool layers from a .pte binary.

    Only the first chain of the first execution plan is scanned. Layers are
    de-duplicated on their full geometry, and ops whose tensors are not 4-D
    (N, C, H, W) are skipped with a message instead of aborting the run.

    Args:
        pte_file: Path to the ExecuTorch ``.pte`` file.
        flatc_path: Optional path to ``flatc`` (see
            ``_bootstrap_executorch_imports``).

    Returns:
        (conv_layers, maxpool_layers)
        Each is a list of dicts in the internal format. Conv dicts carry
        ``layer_id``, ``name`` and W-first tuples: ``input``/``output`` as
        (w, h, c), ``kernel`` as (w, h, in_c, out_c) and
        ``stride``/``padding``/``dilation`` as (w, h). Maxpool dicts carry
        scalar ``src_*``, ``kernel_*``, ``stride_*`` and ``pad_*`` fields.
    """
    _bootstrap_executorch_imports(flatc_path)

    from executorch.exir._serialize._program import deserialize_pte_binary
    from executorch.exir.schema import KernelCall, Int, IntList, Tensor

    pte_path = Path(pte_file)
    print(f"Loading PTE: {pte_path} ...")

    with open(pte_path, 'rb') as f:
        pte_file_obj = deserialize_pte_binary(f.read())

    # The deserializer may return either the program itself or a wrapper that
    # exposes it as `.program`.
    program = getattr(pte_file_obj, 'program', pte_file_obj)

    plan = program.execution_plan[0]
    values = plan.values

    def _tensor(idx):
        v = values[idx].val
        return v if isinstance(v, Tensor) else None

    def _int_val(idx):
        v = values[idx].val
        return v.int_val if isinstance(v, Int) else None

    def _intlist_val(idx):
        v = values[idx].val
        if isinstance(v, IntList):
            return [_int_val(i) for i in v.items]
        return None

    def _pair(seq, default):
        """Normalise an int list to [h, w]; empty/None yields `default`."""
        if not seq:
            return list(default)
        if len(seq) == 1:
            # A single value applies to both spatial dimensions.
            return [seq[0], seq[0]]
        return list(seq[:2])

    CONV_OPS = {
        'cadence::quantized_conv2d_nchw',
        'aten::conv2d',
        'aten::convolution',
    }
    MAXPOOL_OPS = {
        'aten::max_pool2d_with_indices',
        'aten::max_pool2d',
    }

    conv_result = []
    conv_seen = set()
    maxpool_layers = []
    maxpool_seen = set()

    for instr in plan.chains[0].instructions:
        ia = instr.instr_args
        if not isinstance(ia, KernelCall):
            continue
        op_name = plan.operators[ia.op_index].name
        args = ia.args

        # --- Conv2d ---
        if op_name in CONV_OPS:
            input_t = _tensor(args[0])
            weight_t = _tensor(args[1])
            output_t = _tensor(args[-1])
            if input_t is None or weight_t is None or output_t is None:
                continue
            if not all(len(t.sizes) == 4 for t in (input_t, weight_t, output_t)):
                print(f" [skip] {op_name}: not a 4-D NCHW convolution")
                continue

            # Serialized int lists follow the operator's (H, W) argument order.
            stride_h, stride_w = _pair(_intlist_val(args[3]), (1, 1))
            pad_h, pad_w = _pair(_intlist_val(args[4]), (0, 0))
            dil_h, dil_w = _pair(_intlist_val(args[5]), (1, 1))

            _, in_c, in_h, in_w = input_t.sizes
            _, out_c, out_h, out_w = output_t.sizes
            _oc, _ic, k_h, k_w = weight_t.sizes

            # Everything is stored W-first so that consumers unpacking
            # `stride_w, stride_h = layer['stride']` see the right axes.
            info = {
                'input': (in_w, in_h, in_c),
                'output': (out_w, out_h, out_c),
                'kernel': (k_w, k_h, _ic, _oc),
                'stride': (stride_w, stride_h),
                'padding': (pad_w, pad_h),
                'dilation': (dil_w, dil_h),
            }
            key = (info['input'], info['output'], info['kernel'],
                   info['stride'], info['padding'], info['dilation'])
            if key in conv_seen:
                continue
            conv_seen.add(key)
            conv_result.append({
                'layer_id': len(conv_result),
                'name': f"conv_{k_h}x{k_w}_s{stride_h}_ic{in_c}_oc{out_c}",
                **info,
            })

        # --- MaxPool ---
        elif op_name in MAXPOOL_OPS:
            input_t = _tensor(args[0])
            # max_pool2d_with_indices: input, kernel_size, stride, padding, dilation, ceil_mode, output, indices
            # max_pool2d: input, kernel_size, stride, padding, dilation, ceil_mode, output
            if input_t is None:
                continue
            if len(input_t.sizes) != 4:
                print(f" [skip] {op_name}: not a 4-D NCHW input")
                continue

            kh, kw = _pair(_intlist_val(args[1]), (2, 2))
            # An empty stride list means "same as kernel_size".
            sh, sw = _pair(_intlist_val(args[2]), (kh, kw))
            ph, pw = _pair(_intlist_val(args[3]), (0, 0))

            _, C, H, W = input_t.sizes

            mp_key = (C, H, W, kh, kw, sh, sw, ph, pw)
            if mp_key in maxpool_seen:
                continue
            maxpool_seen.add(mp_key)
            maxpool_layers.append({
                'name': f"maxpool_{kh}x{kw}s{sh}_c{C}_{H}x{W}",
                'src_width': W,
                'src_height': H,
                'channels': C,
                'kernel_h': kh,
                'kernel_w': kw,
                'stride_h': sh,
                'stride_w': sw,
                'pad_h': ph,
                'pad_w': pw,
            })

    print(f" Extracted {len(conv_result)} conv layers, {len(maxpool_layers)} maxpool layers")
    return conv_result, maxpool_layers
def calculate_conv_config(layer, dram0_size, dram1_size):
    """Build the complete conv config dict for a single layer.

    Mirrors calculate_layer_config() in generate_layer_configs.py. The tiling
    search is delegated to ``find_max_tile_config``; when it reports no usable
    tiling, a whole-tensor ``*_no_dma`` config is returned, otherwise a
    ``*_dma`` config is assembled from the returned buffer-size dict.

    Args:
        layer: Conv layer dict with ``layer_id``, ``name`` and W-first
            ``input``, ``output``, ``kernel``, ``stride``, ``padding`` and
            ``dilation`` tuples.
        dram0_size: DRAM0 budget in bytes.
        dram1_size: DRAM1 budget in bytes.

    Returns:
        Dict holding every field of ``conv_layer_config_t``.
    """
    in_w, in_h, in_c = layer['input']
    out_w, out_h, out_c = layer['output']
    k_w, k_h, _, _ = layer['kernel']
    stride_w, stride_h = layer['stride']
    pad_w = pad_h = layer['padding'][0]
    # Read but not used below: conv_params and the no-DMA config pin dilation to 1.
    dil = layer['dilation'][0]  # noqa: F841

    padding = (pad_w, pad_w, pad_h, pad_h, 0, 0)
    conv_params = (stride_w, stride_h, 8, 4000, 11, 0, 1, k_h, k_w)

    # Kernel name: dedicated executors first, generic pattern otherwise.
    dedicated_kernels = {
        (7, 7, 2): "7x7j2d1",
        (3, 3, 1): "3x3j1d1",
        (3, 3, 2): "3x3j2d1",
        (1, 1, 2): "1x1j2d1",
        (1, 1, 1): "1x1j1d1",
    }
    kernel_name = dedicated_kernels.get((k_h, k_w, stride_h))
    if kernel_name is None:
        kernel_name = f"{k_w}x{k_h}j{stride_w}d1"

    n_tile_size, output_rows, buffer_sizes = find_max_tile_config(
        input_whd=(in_w, in_h, in_c),
        output_whd=(out_w, out_h, out_c),
        kernel_whdn=(k_w, k_h, in_c, out_c),
        padding=padding,
        stride_xy=(stride_w, stride_h),
        kernel_name=kernel_name,
        data_type="S8S8",
        dram0_size=dram0_size,
        dram1_size=dram1_size,
        conv_params=conv_params,
    )

    key_prefix = (f"{in_c}_{in_h}_{in_w}_{out_c}_{k_h}_{k_w}_"
                  f"{out_h}_{out_w}_{stride_h}_{stride_w}_{pad_w}")

    no_tiling = buffer_sizes is None or n_tile_size == 0 or output_rows == 0
    if no_tiling:
        # No-DMA fallback: describe the whole (padded) tensors as one tile.
        padded_row = in_w + 2 * pad_w
        padded_plane = padded_row * (in_h + 2 * pad_h)
        out_plane = out_w * out_h
        coeff_plane = k_w * k_h
        coeff_volume = coeff_plane * in_c

        fallback = {
            'layer_id': layer['layer_id'],
            'layer_name': layer['name'],
            'kernel_name': kernel_name + "_no_dma",
        }
        fallback.update({
            'src_dim1_size': in_w, 'src_dim2_size': in_h, 'src_dim3_size': in_c,
            'src_dim1_pitch': in_w, 'src_dim2_pitch': in_w * in_h,
            'dst_dim1_size': out_w, 'dst_dim2_size': out_h, 'dst_dim3_size': out_c,
            'dst_dim1_pitch': out_w, 'dst_dim2_pitch': out_w * out_h,
        })
        fallback.update({
            'in_dim1_size': in_w, 'in_dim1_pitch': padded_row,
            'in_dim2_size': in_h, 'in_dim2_pitch': padded_plane,
            'in_dim1_edge1': pad_w, 'in_dim1_edge2': pad_w,
            'in_dim2_edge1': pad_h, 'in_dim2_edge2': pad_h,
            'in_dim3_edge1': 0, 'in_dim3_edge2': 0,
            'in_data_offset': 0, 'in_rows_firstdma': in_h,
        })
        fallback.update({
            'out_dim1_size': out_w, 'out_dim1_pitch': out_w,
            'out_dim2_size': out_h, 'out_dim2_pitch': out_plane,
            'out_dim3_size': out_c,
            'coeff_dim1_size': k_w, 'coeff_dim2_size': k_h,
            'coeff_dim3_size': in_c, 'coeff_dim4_size': out_c,
            'coeff_dim1_pitch': k_w, 'coeff_dim2_pitch': coeff_plane,
            'coeff_dim3_pitch': coeff_volume,
            'bias_dim1_size': out_c, 'bias_dim2_size': 1,
            'outscale_dim1_size': out_c, 'outscale_dim2_size': 1,
        })
        fallback.update({
            'input_buffer_size': padded_plane * in_c,
            'coeff_buffer_size': coeff_volume * out_c,
            'output_buffer_size': out_plane * out_c,
            'bias_buffer_size': out_c * 4, 'outscale_buffer_size': out_c * 2,
            'input_ping_dram': 0, 'input_pong_dram': 0, 'coeff_dram': 0,
            'output_ping_dram': 0, 'output_pong_dram': 0,
            'bias_dram': 0, 'outscale_dram': 0,
        })
        fallback.update({
            'n_tile_size': out_c, 'n_tiles': 1, 'n_tile_size_last': out_c,
            'height_tiles': 1, 'output_rows': out_h, 'input_rows': in_h,
            'kernel_w': k_w, 'kernel_h': k_h,
            'stride_x': stride_w, 'stride_y': stride_h,
            'padding': pad_w, 'dilation': 1,
            'accum_shift': 8, 'relu_max': 4000, 'relu_min': 0,
            'output_shift': 11, 'output_scale': 0, 'flags': 0,
            'input_zero_point': 0,
            'config_key': f"{key_prefix}_1",
        })
        return fallback

    # DMA mode — use buffer_sizes dict from find_max_tile_config
    input_rows = k_h + (output_rows - 1) * stride_h
    placement = calculate_buffer_placement(buffer_sizes, dram0_size, dram1_size)
    dilation = buffer_sizes.get('DILATION', 1)

    config = {
        'layer_id': layer['layer_id'],
        'layer_name': layer['name'],
        'kernel_name': kernel_name + "_dma",
    }

    # Fields copied verbatim from the upper-cased key of the same name.
    mirrored_fields = (
        'src_dim1_size', 'src_dim2_size', 'src_dim3_size',
        'src_dim1_pitch', 'src_dim2_pitch',
        'dst_dim1_size', 'dst_dim2_size',
        'dst_dim1_pitch', 'dst_dim2_pitch',
        'in_dim1_size', 'in_dim1_pitch', 'in_dim2_size', 'in_dim2_pitch',
        'in_data_offset', 'in_rows_firstdma',
        'out_dim1_size', 'out_dim1_pitch',
        'out_dim2_size', 'out_dim2_pitch', 'out_dim3_size',
        'coeff_dim1_size', 'coeff_dim2_size',
        'coeff_dim3_size', 'coeff_dim4_size',
        'coeff_dim1_pitch', 'coeff_dim2_pitch', 'coeff_dim3_pitch',
        'bias_dim1_size', 'bias_dim2_size',
        'outscale_dim1_size', 'outscale_dim2_size',
        'n_tile_size', 'n_tiles', 'n_tile_size_last',
    )
    for field in mirrored_fields:
        config[field] = buffer_sizes[field.upper()]
    config['dst_dim3_size'] = out_c

    edge_fields = ('in_dim1_edge1', 'in_dim1_edge2', 'in_dim2_edge1',
                   'in_dim2_edge2', 'in_dim3_edge1', 'in_dim3_edge2')
    config.update(zip(edge_fields, padding))

    for field, source_key in (
        ('input_buffer_size', 'IN'),
        ('coeff_buffer_size', 'COEFF'),
        ('output_buffer_size', 'OUT'),
        ('bias_buffer_size', 'BIAS'),
        ('outscale_buffer_size', 'OUTSCALE'),
        ('height_tiles', 'HIGHT_TILES'),
    ):
        config[field] = buffer_sizes[source_key]

    # Bank assignment, with the defaults used when a placement key is absent.
    for field, placement_key, default_bank in (
        ('input_ping_dram', 'IN1_dram', 0),
        ('input_pong_dram', 'IN2_dram', 1),
        ('coeff_dram', 'COEFF_dram', 0),
        ('output_ping_dram', 'OUT1_dram', 1),
        ('output_pong_dram', 'OUT2_dram', 1),
        ('bias_dram', 'BIAS_dram', 1),
        ('outscale_dram', 'OUTSCALE_dram', 1),
    ):
        config[field] = placement.get(placement_key, default_bank)

    for field, source_key, default in (
        ('stride_x', 'STRIDEX', stride_w),
        ('stride_y', 'STRIDEY', stride_h),
        ('accum_shift', 'ACCUM_SHIFT', 8),
        ('relu_max', 'RELU_MAX', 4000),
        ('relu_min', 'RELU_MIN', 0),
        ('output_shift', 'OUTPUT_SHIFT', 11),
        ('output_scale', 'OUTPUT_SCALE', 0),
        ('flags', 'FLAGS', 0),
    ):
        config[field] = buffer_sizes.get(source_key, default)

    config.update({
        'output_rows': output_rows,
        'input_rows': input_rows,
        'dilation': dilation,
        'kernel_w': k_w, 'kernel_h': k_h,
        'padding': pad_w,
        'input_zero_point': 0,
        'config_key': f"{key_prefix}_{dilation}",
    })
    return config
def calculate_maxpool_buffers(layer, c_tile_size, output_rows):
    """Compute tile geometry and buffer sizes for one maxpool tiling choice.

    Args:
        layer: Maxpool layer dict (``src_width``, ``src_height``, ``channels``,
            ``kernel_h/w``, ``stride_h/w``, ``pad_h/w``).
        c_tile_size: Number of channels processed per tile (must be >= 1).
        output_rows: Number of output rows produced per tile (must be >= 1).

    Returns:
        Dict with output dimensions, input/output tile geometry, tile counts
        and the input/output buffer sizes in bytes (elements are
        ``ELEMENT_SIZE_F32`` bytes each).
    """
    W = layer['src_width']
    H = layer['src_height']
    kh = layer['kernel_h']
    kw = layer['kernel_w']
    sh = layer['stride_h']
    sw = layer['stride_w']
    ph = layer['pad_h']
    pw = layer['pad_w']

    dst_w = (W + 2 * pw - kw) // sw + 1
    dst_h = (H + 2 * ph - kh) // sh + 1

    # Rows of input needed to produce `output_rows` rows of output.
    input_rows = (output_rows - 1) * sh + kh
    in_tile_w = W + 2 * pw
    in_tile_rows = input_rows + 2 * ph
    in_tile_plane = in_tile_w * in_tile_rows
    # Offset of the first real (non-padding) element inside a tile plane.
    in_data_offset = ph * in_tile_w + pw

    out_tile_plane = dst_w * output_rows
    input_buf = c_tile_size * in_tile_plane * ELEMENT_SIZE_F32
    output_buf = c_tile_size * out_tile_plane * ELEMENT_SIZE_F32

    C = layer['channels']
    c_tiles = (C + c_tile_size - 1) // c_tile_size
    c_tile_last = C - c_tile_size * (c_tiles - 1)
    height_tiles = (dst_h + output_rows - 1) // output_rows

    return {
        'dst_width': dst_w, 'dst_height': dst_h,
        'input_rows': input_rows,
        'in_tile_w': in_tile_w, 'in_tile_rows': in_tile_rows,
        'in_tile_plane': in_tile_plane, 'in_data_offset': in_data_offset,
        'out_tile_w': dst_w, 'out_tile_rows': output_rows,
        'out_tile_plane': out_tile_plane,
        'c_tile_size': c_tile_size, 'c_tiles': c_tiles,
        'c_tile_size_last': c_tile_last,
        'height_tiles': height_tiles, 'output_rows': output_rows,
        'input_buffer_size': input_buf, 'output_buffer_size': output_buf,
    }


def find_maxpool_tiling(layer, dram0_size, dram1_size):
    """Find the largest maxpool tile that fits the DRAM banks.

    Channel count is maximised first, then output rows. A candidate fits when
    one input buffer plus one output buffer fit in the smaller of the two
    banks (``build_maxpool_config`` places one of each in every bank).

    Args:
        layer: Maxpool layer dict.
        dram0_size: DRAM0 budget in bytes.
        dram1_size: DRAM1 budget in bytes.

    Returns:
        ``(c_tile_size, output_rows, buffers)``, or ``(0, 0, None)`` when not
        even a 1-channel, 1-row tile fits.
    """
    channels = layer['channels']
    dst_h = ((layer['src_height'] + 2 * layer['pad_h'] - layer['kernel_h'])
             // layer['stride_h'] + 1)
    bank = min(dram0_size, dram1_size)

    # Both loops run from largest to smallest, so the first candidate that
    # fits is the optimum; return right away instead of re-evaluating every
    # smaller channel tile that can no longer win.
    for c_tile in range(channels, 0, -1):
        for rows in range(dst_h, 0, -1):
            buf = calculate_maxpool_buffers(layer, c_tile, rows)
            if buf['input_buffer_size'] + buf['output_buffer_size'] <= bank:
                return c_tile, rows, buf
    return 0, 0, None


def build_maxpool_config(layer_id, layer, dram0_size, dram1_size):
    """Assemble the ``maxpool_layer_config_t`` field dict for one layer.

    Args:
        layer_id: Index of the layer in the generated table.
        layer: Maxpool layer dict.
        dram0_size: DRAM0 budget in bytes.
        dram1_size: DRAM1 budget in bytes.

    Returns:
        Dict holding every field of ``maxpool_layer_config_t``.
    """
    c_tile, out_rows, buf = find_maxpool_tiling(layer, dram0_size, dram1_size)
    if buf is None:
        # Nothing fits: fall back to a single whole-layer tile, and say so,
        # because the resulting buffers exceed the requested DRAM budget.
        dst_h = ((layer['src_height'] + 2 * layer['pad_h'] - layer['kernel_h'])
                 // layer['stride_h'] + 1)
        c_tile = layer['channels']
        out_rows = dst_h
        buf = calculate_maxpool_buffers(layer, c_tile, out_rows)
        print(f"WARNING: {layer.get('name', f'maxpool_{layer_id}')}: no tiling fits "
              f"DRAM0={dram0_size}B DRAM1={dram1_size}B; emitting whole-layer config",
              file=sys.stderr)

    W = layer['src_width']
    H = layer['src_height']
    C = layer['channels']

    cfg = {
        'layer_id': layer_id,
        'layer_name': layer.get('name', f"maxpool_{layer_id}"),
        'config_key': f"{C}_{H}_{W}_{layer['kernel_h']}_{layer['kernel_w']}_"
                      f"{layer['stride_h']}_{layer['stride_w']}_"
                      f"{layer['pad_h']}_{layer['pad_w']}",
        'src_width': W, 'src_height': H, 'channels': C,
        'dst_width': buf['dst_width'], 'dst_height': buf['dst_height'],
        'src_row_pitch': W, 'src_plane_pitch': H * W,
        'dst_row_pitch': buf['dst_width'],
        'dst_plane_pitch': buf['dst_height'] * buf['dst_width'],
        'kernel_h': layer['kernel_h'], 'kernel_w': layer['kernel_w'],
        'stride_h': layer['stride_h'], 'stride_w': layer['stride_w'],
        'pad_h': layer['pad_h'], 'pad_w': layer['pad_w'],
        'in_tile_w': buf['in_tile_w'], 'in_tile_rows': buf['in_tile_rows'],
        'in_tile_plane': buf['in_tile_plane'],
        'in_data_offset': buf['in_data_offset'],
        'out_tile_w': buf['out_tile_w'], 'out_tile_rows': buf['out_tile_rows'],
        'out_tile_plane': buf['out_tile_plane'],
        'c_tile_size': buf['c_tile_size'], 'c_tiles': buf['c_tiles'],
        'c_tile_size_last': buf['c_tile_size_last'],
        'height_tiles': buf['height_tiles'],
        'output_rows': buf['output_rows'], 'input_rows': buf['input_rows'],
        'input_buffer_size': buf['input_buffer_size'],
        'output_buffer_size': buf['output_buffer_size'],
        # Each bank holds one input and one output buffer.
        'input_ping_dram': 0, 'input_pong_dram': 1,
        'output_ping_dram': 1, 'output_pong_dram': 0,
    }
    return cfg
def _write_config_table(f, struct_type, array_name, fields, str_fields, configs):
    """Write one ``static const`` array of designated-initializer structs."""
    f.write(f"static const {struct_type} {array_name}[] = {{\n")
    if not configs:
        # An empty initializer list / zero-length array is not portable C or
        # C++. The matching NUM_* macro is 0, so accessors never expose this.
        f.write("    {0}, /* placeholder: no layers of this kind were extracted */\n")
    for cfg in configs:
        f.write("    {\n")
        for fld in fields:
            val = cfg[fld]
            if fld in str_fields:
                f.write(f"        .{fld} = \"{val}\",\n")
            else:
                f.write(f"        .{fld} = {val},\n")
        f.write("    },\n")
    f.write("};\n\n")


def generate_combined_header(conv_configs, maxpool_configs, output_file,
                             dram0_size, dram1_size, no_dma_mode=False):
    """Write the combined conv2d + maxpool C header.

    The header contains the DRAM size macros, both config struct typedefs,
    both static tables and their inline accessors.

    Args:
        conv_configs: List of conv config dicts (one per table entry).
        maxpool_configs: List of maxpool config dicts.
        output_file: Path of the header to (over)write.
        dram0_size: DRAM0 size in bytes for ``IDMA_BUFFER_SIZE_DRAM0``.
        dram1_size: DRAM1 size in bytes for ``IDMA_BUFFER_SIZE_DRAM1``.
        no_dma_mode: When true both DRAM macros are emitted as 0.
    """
    _dram0 = 0 if no_dma_mode else dram0_size
    _dram1 = 0 if no_dma_mode else dram1_size

    conv_fields = [
        'layer_id', 'layer_name', 'kernel_name', 'config_key',
        'src_dim1_size', 'src_dim2_size', 'src_dim3_size', 'src_dim1_pitch', 'src_dim2_pitch',
        'dst_dim1_size', 'dst_dim2_size', 'dst_dim3_size', 'dst_dim1_pitch', 'dst_dim2_pitch',
        'in_dim1_size', 'in_dim1_pitch', 'in_dim2_size', 'in_dim2_pitch',
        'in_dim1_edge1', 'in_dim1_edge2', 'in_dim2_edge1', 'in_dim2_edge2',
        'in_dim3_edge1', 'in_dim3_edge2', 'in_data_offset', 'in_rows_firstdma',
        'out_dim1_size', 'out_dim1_pitch', 'out_dim2_size', 'out_dim2_pitch', 'out_dim3_size',
        'coeff_dim1_size', 'coeff_dim2_size', 'coeff_dim3_size', 'coeff_dim4_size',
        'coeff_dim1_pitch', 'coeff_dim2_pitch', 'coeff_dim3_pitch',
        'bias_dim1_size', 'bias_dim2_size', 'outscale_dim1_size', 'outscale_dim2_size',
        'input_buffer_size', 'coeff_buffer_size', 'output_buffer_size',
        'bias_buffer_size', 'outscale_buffer_size',
        'input_ping_dram', 'input_pong_dram', 'coeff_dram',
        'output_ping_dram', 'output_pong_dram', 'bias_dram', 'outscale_dram',
        'n_tile_size', 'n_tiles', 'n_tile_size_last',
        'height_tiles', 'output_rows', 'input_rows',
        'kernel_w', 'kernel_h', 'stride_x', 'stride_y', 'padding', 'dilation',
        'accum_shift', 'relu_max', 'relu_min', 'output_shift', 'output_scale', 'flags',
        'input_zero_point',
    ]
    str_fields = {'layer_name', 'kernel_name', 'config_key'}

    mp_fields = [
        'layer_id', 'layer_name', 'config_key',
        'src_width', 'src_height', 'channels',
        'dst_width', 'dst_height',
        'src_row_pitch', 'src_plane_pitch', 'dst_row_pitch', 'dst_plane_pitch',
        'kernel_h', 'kernel_w', 'stride_h', 'stride_w', 'pad_h', 'pad_w',
        'in_tile_w', 'in_tile_rows', 'in_tile_plane', 'in_data_offset',
        'out_tile_w', 'out_tile_rows', 'out_tile_plane',
        'c_tile_size', 'c_tiles', 'c_tile_size_last',
        'height_tiles', 'output_rows', 'input_rows',
        'input_buffer_size', 'output_buffer_size',
        'input_ping_dram', 'input_pong_dram', 'output_ping_dram', 'output_pong_dram',
    ]
    mp_str_fields = {'layer_name', 'config_key'}

    with open(output_file, 'w', encoding='utf-8') as f:
        f.write("""\
/*
 * layer_configs.h
 *
 * Auto-generated conv2d + maxpool layer configurations
 * Generated from PTE extraction by generate_combined_configs.py
 *
 * DO NOT EDIT MANUALLY
 */

#ifndef LAYER_CONFIGS_H
#define LAYER_CONFIGS_H

#include <stdint.h>
#include <stddef.h>  /* for NULL */

""")
        # ----------------------------------------------------------
        # DRAM macros
        # ----------------------------------------------------------
        f.write(f"#define IDMA_BUFFER_SIZE_DRAM0 ({_dram0}) /* {_dram0 // 1024} KB */\n")
        f.write(f"#define IDMA_BUFFER_SIZE_DRAM1 ({_dram1}) /* {_dram1 // 1024} KB */\n\n")

        # ===========================================================
        # CONV SECTION
        # ===========================================================
        f.write("/* " + "=" * 70 + " */\n")
        f.write("/* Conv2d configurations */\n")
        f.write("/* " + "=" * 70 + " */\n\n")

        f.write("""\
typedef struct {
    int layer_id;
    const char* layer_name;
    const char* kernel_name;
    const char* config_key;

    int src_dim1_size; int src_dim2_size; int src_dim3_size;
    int src_dim1_pitch; int src_dim2_pitch;

    int dst_dim1_size; int dst_dim2_size; int dst_dim3_size;
    int dst_dim1_pitch; int dst_dim2_pitch;

    int in_dim1_size; int in_dim1_pitch;
    int in_dim2_size; int in_dim2_pitch;
    int in_dim1_edge1; int in_dim1_edge2;
    int in_dim2_edge1; int in_dim2_edge2;
    int in_dim3_edge1; int in_dim3_edge2;
    int in_data_offset; int in_rows_firstdma;

    int out_dim1_size; int out_dim1_pitch;
    int out_dim2_size; int out_dim2_pitch;
    int out_dim3_size;

    int coeff_dim1_size; int coeff_dim2_size;
    int coeff_dim3_size; int coeff_dim4_size;
    int coeff_dim1_pitch; int coeff_dim2_pitch; int coeff_dim3_pitch;

    int bias_dim1_size; int bias_dim2_size;
    int outscale_dim1_size; int outscale_dim2_size;

    int input_buffer_size; int coeff_buffer_size; int output_buffer_size;
    int bias_buffer_size; int outscale_buffer_size;

    int input_ping_dram; int input_pong_dram; int coeff_dram;
    int output_ping_dram; int output_pong_dram;
    int bias_dram; int outscale_dram;

    int n_tile_size; int n_tiles; int n_tile_size_last;
    int height_tiles; int output_rows; int input_rows;

    int kernel_w; int kernel_h;
    int stride_x; int stride_y;
    int padding; int dilation;
    int accum_shift; int relu_max; int relu_min;
    int output_shift; int output_scale; int flags;
    int input_zero_point;
} conv_layer_config_t;

""")
        f.write(f"#define NUM_CONV_LAYERS {len(conv_configs)}\n\n")
        _write_config_table(f, "conv_layer_config_t", "CONV_LAYER_CONFIGS",
                            conv_fields, str_fields, conv_configs)

        # Conv accessors
        f.write("""\
static inline int get_num_conv_layers(void) { return NUM_CONV_LAYERS; }

static inline const conv_layer_config_t* get_conv_config(int layer_id) {
    if (layer_id < 0 || layer_id >= NUM_CONV_LAYERS) return NULL;
    return &CONV_LAYER_CONFIGS[layer_id];
}

static inline const conv_layer_config_t* get_layer_config_by_params(
    int ic, int ih, int iw,
    int oc, int kh, int kw,
    int oh, int ow,
    int sy, int sx,
    int pad, int dil)
{
    for (int i = 0; i < NUM_CONV_LAYERS; i++) {
        const conv_layer_config_t* cfg = &CONV_LAYER_CONFIGS[i];
        if (cfg->src_dim3_size == ic &&
            cfg->src_dim2_size == ih &&
            cfg->src_dim1_size == iw &&
            cfg->dst_dim3_size == oc &&
            cfg->coeff_dim2_size == kh &&
            cfg->coeff_dim1_size == kw &&
            cfg->dst_dim2_size == oh &&
            cfg->dst_dim1_size == ow &&
            cfg->stride_y == sy &&
            cfg->stride_x == sx &&
            cfg->padding == pad &&
            cfg->dilation == dil)
            return cfg;
    }
    return NULL;
}

static inline const conv_layer_config_t* get_layer_config_by_key(const char* config_key) {
    if (config_key == NULL) return NULL;
    for (int i = 0; i < NUM_CONV_LAYERS; i++) {
        const conv_layer_config_t* cfg = &CONV_LAYER_CONFIGS[i];
        if (cfg->config_key != NULL) {
            const char* a = config_key;
            const char* b = cfg->config_key;
            while (*a && *b && *a == *b) { a++; b++; }
            if (*a == '\\0' && *b == '\\0') return cfg;
        }
    }
    return NULL;
}

""")

        # ===========================================================
        # MAXPOOL SECTION
        # ===========================================================
        f.write("/* " + "=" * 70 + " */\n")
        f.write("/* MaxPool configurations */\n")
        f.write("/* " + "=" * 70 + " */\n\n")

        f.write("""\
typedef struct {
    int layer_id;
    const char* layer_name;
    const char* config_key;

    int src_width; int src_height; int channels;
    int dst_width; int dst_height;

    int src_row_pitch; int src_plane_pitch;
    int dst_row_pitch; int dst_plane_pitch;

    int kernel_h; int kernel_w;
    int stride_h; int stride_w;
    int pad_h; int pad_w;

    int in_tile_w; int in_tile_rows; int in_tile_plane;
    int in_data_offset;
    int out_tile_w; int out_tile_rows; int out_tile_plane;

    int c_tile_size; int c_tiles; int c_tile_size_last;
    int height_tiles; int output_rows; int input_rows;

    int input_buffer_size; int output_buffer_size;

    int input_ping_dram; int input_pong_dram;
    int output_ping_dram; int output_pong_dram;
} maxpool_layer_config_t;

""")
        f.write(f"#define NUM_MAXPOOL_LAYERS {len(maxpool_configs)}\n\n")
        _write_config_table(f, "maxpool_layer_config_t", "MAXPOOL_LAYER_CONFIGS",
                            mp_fields, mp_str_fields, maxpool_configs)

        # Maxpool accessors
        f.write("""\
static inline int get_num_maxpool_layers(void) { return NUM_MAXPOOL_LAYERS; }

static inline const maxpool_layer_config_t* get_maxpool_config(int layer_id) {
    if (layer_id < 0 || layer_id >= NUM_MAXPOOL_LAYERS) return NULL;
    return &MAXPOOL_LAYER_CONFIGS[layer_id];
}

static inline const maxpool_layer_config_t* get_maxpool_config_by_params(
    int channels, int src_height, int src_width,
    int kernel_h, int kernel_w,
    int stride_h, int stride_w,
    int pad_h, int pad_w)
{
    for (int i = 0; i < NUM_MAXPOOL_LAYERS; i++) {
        const maxpool_layer_config_t* c = &MAXPOOL_LAYER_CONFIGS[i];
        if (c->channels == channels &&
            c->src_height == src_height &&
            c->src_width == src_width &&
            c->kernel_h == kernel_h &&
            c->kernel_w == kernel_w &&
            c->stride_h == stride_h &&
            c->stride_w == stride_w &&
            c->pad_h == pad_h &&
            c->pad_w == pad_w)
            return c;
    }
    return NULL;
}

#endif /* LAYER_CONFIGS_H */
""")

    print(f"\nGenerated {output_file}")
    print(f" Conv layers: {len(conv_configs)}")
    print(f" Maxpool layers: {len(maxpool_configs)}")
def main():
    """Command-line entry point.

    Parses the arguments, extracts conv2d/maxpool layers from every given PTE
    file (de-duplicating across files), computes the per-layer configs and
    writes the combined header.

    Returns:
        0 on success, 1 when a PTE file does not exist.
    """
    parser = argparse.ArgumentParser(
        description='Generate combined conv2d + maxpool DMA config header from PTE files')
    parser.add_argument('--pte', nargs='+', required=True,
                        help='One or more ExecuTorch .pte files')
    parser.add_argument('--output', '-o', default='layer_configs.h',
                        help='Output C header file (default: layer_configs.h)')
    parser.add_argument('--dram0', type=int, default=62976,
                        help='DRAM0 size in bytes (default: 62976)')
    parser.add_argument('--dram1', type=int, default=62976,
                        help='DRAM1 size in bytes (default: 62976)')
    parser.add_argument('--flatc', default=None,
                        help='Path to flatc binary (auto-detected)')
    parser.add_argument('--no-dma-mode', action='store_true', default=False,
                        help='Force all conv kernels to no-DMA mode')

    args = parser.parse_args()

    conv_key_fields = ('input', 'output', 'kernel', 'stride', 'padding', 'dilation')
    mp_key_fields = ('channels', 'src_height', 'src_width',
                     'kernel_h', 'kernel_w', 'stride_h', 'stride_w',
                     'pad_h', 'pad_w')

    # Collect layers from all PTE files with deduplication
    all_conv, all_maxpool = [], []
    conv_seen, mp_seen = set(), set()

    for pte_arg in args.pte:
        pte_path = Path(pte_arg)
        if not pte_path.exists():
            print(f"ERROR: PTE file not found: {pte_path}")
            return 1

        print(f"\nExtracting from: {pte_path}")
        conv_layers, mp_layers = extract_layers_from_pte(pte_path, flatc_path=args.flatc)

        for conv in conv_layers:
            conv_key = tuple(conv[name] for name in conv_key_fields)
            if conv_key in conv_seen:
                print(f" [skip dup conv] {conv['name']}")
                continue
            conv_seen.add(conv_key)
            # Re-number so ids stay dense across several PTE files.
            conv['layer_id'] = len(all_conv)
            all_conv.append(conv)

        for pool in mp_layers:
            pool_key = tuple(pool[name] for name in mp_key_fields)
            if pool_key in mp_seen:
                print(f" [skip dup maxpool] {pool['name']}")
                continue
            mp_seen.add(pool_key)
            all_maxpool.append(pool)

    print(f"\nTotal unique: {len(all_conv)} conv, {len(all_maxpool)} maxpool")
    print(f"DRAM budget: DRAM0={args.dram0}B DRAM1={args.dram1}B")

    # Calculate conv configs
    print(f"\nCalculating conv configurations...")
    conv_configs = []
    for conv in all_conv:
        print(f" Conv {conv['layer_id']}: {conv['name']}...")
        conv_cfg = calculate_conv_config(conv, args.dram0, args.dram1)
        if not conv_cfg:
            print(f" [FAIL]")
            continue
        conv_configs.append(conv_cfg)
        print(f" [OK] n_tile={conv_cfg['n_tile_size']}, height_tiles={conv_cfg['height_tiles']}, "
              f"output_rows={conv_cfg['output_rows']}")

    # Apply no-DMA mode
    if args.no_dma_mode:
        for conv_cfg in conv_configs:
            kernel = conv_cfg['kernel_name']
            if kernel.endswith('_dma'):
                conv_cfg['kernel_name'] = kernel[:-4] + '_no_dma'
        print("No-DMA mode: all conv kernels set to _no_dma")

    # Calculate maxpool configs
    print(f"\nCalculating maxpool configurations...")
    mp_configs = []
    for mp_index, pool in enumerate(all_maxpool):
        print(f" Maxpool {mp_index}: {pool['name']}...")
        pool_cfg = build_maxpool_config(mp_index, pool, args.dram0, args.dram1)
        mp_configs.append(pool_cfg)
        print(f" [OK] c_tile={pool_cfg['c_tile_size']}, height_tiles={pool_cfg['height_tiles']}, "
              f"output_rows={pool_cfg['output_rows']}")

    # Generate combined header
    generate_combined_header(conv_configs, mp_configs, args.output,
                             args.dram0, args.dram1, args.no_dma_mode)
    return 0
+ args.dram0, args.dram1, args.no_dma_mode) + return 0 + + +if __name__ == '__main__': + sys.exit(main() or 0) diff --git a/backends/cadence/vision/config_generator/generate_idma_buffers.py b/backends/cadence/vision/config_generator/generate_idma_buffers.py new file mode 100644 index 00000000000..3dbcd9cfe17 --- /dev/null +++ b/backends/cadence/vision/config_generator/generate_idma_buffers.py @@ -0,0 +1,1478 @@ +#!/usr/bin/env python3 +""" +Generate IDMA buffer size definitions for convolution operations. + +This script calculates buffer sizes based on: +- Processing all width elements in one go +- Processing 2 output rows in one go +- Processing all output channels in one go +""" + +# DRAM Size Configuration (in bytes) +DRAM_SIZE_0 = 32 * 1024 # 32 KB for DRAM0 +DRAM_SIZE_1 = 32 * 1024 # 32 KB for DRAM1 +def find_max_tile_config(input_whd, output_whd, kernel_whdn, padding, stride_xy, kernel_name="7x7j2d1", data_type="S8S8", dram0_size=None, dram1_size=None, conv_params=None, conv_flags=0): + """ + Find maximum output channels and output rows that fit in available DRAM. + + Strategy: + 1. Start with n_tile_size=1, output_rows=1 + 2. Increase n_tile_size until all output channels are covered or memory is full + 3. 
Once all channels fit, increase output_rows + + Args: + input_whd: Tuple (width, height, depth) of input + output_whd: Tuple (width, height, depth) of output + kernel_whdn: Tuple (width, height, depth, num_filters) of kernel + padding: Tuple (dim1_edge1, dim1_edge2, dim2_edge1, dim2_edge2, dim3_edge1, dim3_edge2) + stride_xy: Tuple (stride_x, stride_y) + kernel_name: String identifier for kernel + data_type: Data type string + dram0_size: Size of DRAM0 in bytes + dram1_size: Size of DRAM1 in bytes + conv_params: Tuple of (strideX, strideY, accumShift, reluMax, outputShift, outputScale, dilation, kernelHeight, kernelWidth) + conv_flags: Integer flags value (e.g., CNN_CONV_FLAG_RELU) + + Returns: + Tuple (best_n_tile_size, best_output_rows, buffer_sizes_dict) + """ + if dram0_size is None: + dram0_size = DRAM_SIZE_0 + if dram1_size is None: + dram1_size = DRAM_SIZE_1 + + output_w, output_h, output_d = output_whd + kernel_w, kernel_h, kernel_d, kernel_n = kernel_whdn + stride_x, stride_y = stride_xy + + print(f"\n=== Finding Maximum Tile Configuration ===") + print(f"Kernel: {kernel_name}") + print(f"Total output channels: {kernel_n}") + print(f"DRAM0: {dram0_size} bytes, DRAM1: {dram1_size} bytes") + print() + + best_n_tile_size = 1 + best_output_rows = 2 # Minimum output rows should be 2 + best_buffer_sizes = None + all_channels_fit = False + best_tile_balance = float('inf') # Track tile size balance (lower is better) + + # Phase 1: Scan ALL n_tile_size values to find best balanced config + current_output_rows = 2 + last_fit_n_tile_size = 0 + + for n_tile_size in range(1, kernel_n + 1): + # Temporarily modify output_whd for this iteration + temp_output_whd = (output_w, output_h, output_d) + + # Calculate buffer sizes with current configuration + buffer_sizes = calculate_buffer_sizes_with_rows( + input_whd, temp_output_whd, kernel_whdn, padding, stride_xy, + kernel_name, data_type, n_tile_size, current_output_rows, + conv_params, conv_flags + ) + + # Check if it fits 
in DRAM + placement = calculate_buffer_placement(buffer_sizes, dram0_size, dram1_size) + + if placement['total_fits']: + last_fit_n_tile_size = n_tile_size + + # Calculate tile balance: difference between first tile and last tile + n_tiles = (kernel_n + n_tile_size - 1) // n_tile_size + last_tile_size = kernel_n - (n_tile_size * (n_tiles - 1)) + tile_balance = abs(n_tile_size - last_tile_size) + + # Update best if this is more balanced than current best + # OR if balance is same but tile size is larger (prefer fewer, larger tiles) + if tile_balance < best_tile_balance or \ + (tile_balance == best_tile_balance and n_tile_size > best_n_tile_size): + best_n_tile_size = n_tile_size + best_output_rows = current_output_rows + best_buffer_sizes = buffer_sizes + best_tile_balance = tile_balance + + if n_tile_size >= kernel_n: + all_channels_fit = True + print(f" All {kernel_n} output channels fit with {current_output_rows} output rows") + break + else: + # Stop scanning if we've found at least one config and current doesn't fit + if last_fit_n_tile_size > 0: + print(f" n_tile_size={n_tile_size}, output_rows={current_output_rows}: Does NOT fit") + break + + # Phase 2: If all channels fit, try increasing output_rows + if all_channels_fit: + print(f"\n Phase 2: Increasing output rows (all channels fit)...") + + for output_rows in range(3, output_h + 1): + buffer_sizes = calculate_buffer_sizes_with_rows( + input_whd, temp_output_whd, kernel_whdn, padding, stride_xy, + kernel_name, data_type, best_n_tile_size, output_rows, + conv_params, conv_flags + ) + + placement = calculate_buffer_placement(buffer_sizes, dram0_size, dram1_size) + + if placement['total_fits']: + best_output_rows = output_rows + best_buffer_sizes = buffer_sizes + print(f" output_rows={output_rows}: Fits!") + else: + print(f" output_rows={output_rows}: Does NOT fit") + break + + print(f"\n=== Best Configuration Found ===") + if best_buffer_sizes is None: + print(f"\033[91m ERROR: No configuration fits in 
def calculate_buffer_sizes_with_rows(input_whd, output_whd, kernel_whdn, padding, stride_xy, kernel_name="7x7j2d1", data_type="S8S8", n_tile_size=None, output_rows_per_iteration=2, conv_params=None, conv_flags=0):
    """
    Calculate IDMA buffer sizes with configurable output rows per iteration.

    The input is never tiled horizontally: every iteration covers the full
    input width and produces `output_rows_per_iteration` output rows for one
    tile of `n_tile_size` output channels.

    Args:
        input_whd: Tuple (width, height, depth) of input
        output_whd: Tuple (width, height, depth) of output
        kernel_whdn: Tuple (width, height, depth, num_filters) of kernel
        padding: Tuple (dim1_edge1, dim1_edge2, dim2_edge1, dim2_edge2, dim3_edge1, dim3_edge2)
        stride_xy: Tuple (stride_x, stride_y)
        kernel_name: String identifier for kernel (e.g., "7x7j2d1")
        data_type: Data type string (e.g., "S8S8")
        n_tile_size: Number of output channels per tile (None = all channels)
        output_rows_per_iteration: Number of output rows to process in one iteration
        conv_params: Tuple of (strideX, strideY, accumShift, reluMax, outputShift, outputScale, dilation, kernelHeight, kernelWidth)
        conv_flags: Integer flags value (e.g., CNN_CONV_FLAG_RELU)

    Returns:
        Dictionary with buffer sizes, tile dimensions/pitches and tiling
        counts. The convolution parameter entries (STRIDEX, ..., FLAGS) are
        present only when `conv_params` is given.
    """
    # Unpacking also checks the tuple lengths; components that do not
    # influence any result are bound to `_`.
    input_w, input_h, input_d = input_whd
    output_w, output_h, _ = output_whd
    kernel_w, kernel_h, kernel_d, kernel_n = kernel_whdn
    dim1_edge1, dim1_edge2, dim2_edge1, _, dim3_edge1, dim3_edge2 = padding
    _, stride_y = stride_xy

    # For N output rows we need enough input rows to cover them with the kernel.
    input_rows_needed = (output_rows_per_iteration - 1) * stride_y + kernel_h

    # Input buffer dimensions (WHD format): full width plus DIM1 padding,
    # only the rows needed, depth plus DIM3 padding.
    input_buff_w = input_w + dim1_edge1 + dim1_edge2
    input_buff_h = input_rows_needed
    input_buff_d = input_d + dim3_edge1 + dim3_edge2

    # Tiling of the output channels.
    if n_tile_size is None:
        n_tile_size_val = kernel_n
        n_tiles = 1
        n_tile_size_last = kernel_n
    else:
        n_tile_size_val = n_tile_size
        n_tiles = (kernel_n + n_tile_size - 1) // n_tile_size  # ceiling division
        n_tile_size_last = kernel_n - (n_tile_size * (n_tiles - 1))

    # Elements in a single filter; one tile of filters is resident at a time.
    filter_size = kernel_w * kernel_h * kernel_d

    # Output buffer dimensions (WHD format).
    output_buff_w = output_w
    output_buff_h = output_rows_per_iteration
    output_buff_d = n_tile_size_val

    in_dim1_pitch = input_buff_w  # DIM1_PITCH = row size + dim1 padding

    result = {
        'IN': input_buff_w * input_buff_h * input_buff_d,
        'COEFF': filter_size * n_tile_size_val,
        'COEFF_TILE_SIZE_LAST': filter_size * n_tile_size_last,
        'OUT': output_buff_w * output_buff_h * output_buff_d,
        # Bias and outscale cover all channels, not just one tile.
        'BIAS': kernel_n * 4,  # S32
        'OUTSCALE': kernel_n * 2,  # U16
        'padding': padding,
        'kernel_name': kernel_name,
        'data_type': data_type,
        # Source tile (original input dimensions)
        'SRC_DIM1_SIZE': input_w,
        'SRC_DIM1_PITCH': input_w,
        'SRC_DIM2_SIZE': input_h,
        'SRC_DIM2_PITCH': input_w * input_h,
        'SRC_DIM3_SIZE': input_d,
        # Destination tile (original output dimensions)
        'DST_DIM1_SIZE': output_w,
        'DST_DIM1_PITCH': output_w,
        'DST_DIM2_SIZE': output_h,
        'DST_DIM2_PITCH': output_w * output_h,
        # Input tile inside the local buffer
        'IN_DIM1_SIZE': input_w,
        'IN_DIM1_PITCH': in_dim1_pitch,
        'IN_DIM2_SIZE': input_rows_needed,
        'IN_DIM2_PITCH': input_rows_needed * in_dim1_pitch,  # DIM2_SIZE * DIM1_PITCH
        # Offset of the first data element behind the top/left padding
        'IN_DATA_OFFSET': (dim2_edge1 * in_dim1_pitch) + dim1_edge1,
        # Rows fetched by the first DMA (top padding rows are not fetched)
        'IN_ROWS_FIRSTDMA': input_rows_needed - dim2_edge1,
        # Output tile inside the local buffer
        'OUT_DIM1_SIZE': output_w,
        'OUT_DIM1_PITCH': output_w,
        'OUT_DIM2_SIZE': output_rows_per_iteration,
        'OUT_DIM2_PITCH': output_w * output_rows_per_iteration,
        'OUT_DIM3_SIZE': n_tile_size_val,
        # Coefficient tile (WHDN format)
        'COEFF_DIM1_SIZE': kernel_w,
        'COEFF_DIM2_SIZE': kernel_h,
        'COEFF_DIM3_SIZE': kernel_d,
        'COEFF_DIM4_SIZE': kernel_n,
        'COEFF_DIM1_PITCH': kernel_w,
        'COEFF_DIM2_PITCH': kernel_w * kernel_h,
        'COEFF_DIM3_PITCH': filter_size,
        'BIAS_DIM1_SIZE': kernel_n,
        'BIAS_DIM2_SIZE': 1,
        'OUTSCALE_DIM1_SIZE': kernel_n,
        'OUTSCALE_DIM2_SIZE': 1,
        'N_TILE_SIZE': n_tile_size_val,
        'N_TILES': n_tiles,
        'N_TILE_SIZE_LAST': n_tile_size_last,
        # Iterations over the output height (ceiling division)
        'HIGHT_TILES': (output_h + output_rows_per_iteration - 1) // output_rows_per_iteration,
        'details': {
            'input_buff_whd': (input_buff_w, input_buff_h, input_buff_d),
            'input_rows_needed': input_rows_needed,
            'output_buff_whd': (output_buff_w, output_buff_h, output_buff_d),
        }
    }

    # Add convolution parameters if provided
    if conv_params is not None:
        (conv_stride_x, conv_stride_y, accum_shift, relu_max, output_shift,
         output_scale, dilation, kernel_height, kernel_width) = conv_params
        result.update({
            'STRIDEX': conv_stride_x,
            'STRIDEY': conv_stride_y,
            'ACCUM_SHIFT': accum_shift,
            'RELU_MAX': relu_max,
            'RELU_MIN': 0,  # Default minimum
            'OUTPUT_SHIFT': output_shift,
            'OUTPUT_SCALE': output_scale,
            'DILATION': dilation,
            'KERNEL_HEIGHT': kernel_height,
            'KERNEL_WIDTH': kernel_width,
            'FLAGS': conv_flags,
        })

    return result
def calculate_buffer_sizes(input_whd, output_whd, kernel_whdn, padding, stride_xy, kernel_name="7x7j2d1", data_type="S8S8", n_tile_size=None):
    """
    Calculate IDMA buffer sizes for convolution (uses default 2 output rows).

    This is a wrapper around calculate_buffer_sizes_with_rows with
    output_rows_per_iteration=2 and no convolution parameters.

    Args:
        input_whd: Tuple (width, height, depth) of input
        output_whd: Tuple (width, height, depth) of output
        kernel_whdn: Tuple (width, height, depth, num_filters) of kernel
        padding: Tuple (dim1_edge1, dim1_edge2, dim2_edge1, dim2_edge2, dim3_edge1, dim3_edge2)
        stride_xy: Tuple (stride_x, stride_y)
        kernel_name: String identifier for kernel (e.g., "7x7j2d1")
        data_type: Data type string (e.g., "S8S8")
        n_tile_size: Number of output channels per tile (None = all channels)

    Returns:
        Dictionary with buffer sizes, exactly as returned by
        calculate_buffer_sizes_with_rows.
    """
    # The earlier stand-alone implementation that used to follow this return
    # was unreachable and has been removed.
    return calculate_buffer_sizes_with_rows(
        input_whd, output_whd, kernel_whdn, padding, stride_xy,
        kernel_name, data_type, n_tile_size, output_rows_per_iteration=2,
    )
def generate_header_content(buffer_sizes, header_guard="CONVIDMA_BUFFERS_H_"):
    """
    Generate C header file content with buffer size definitions.

    Args:
        buffer_sizes: Dictionary from calculate_buffer_sizes() /
            calculate_buffer_sizes_with_rows()
        header_guard: Header guard name

    Returns:
        String containing header file content
    """
    kernel_name = buffer_sizes['kernel_name']
    data_type = buffer_sizes['data_type']

    # Every macro starts with one of these two prefixes; naming them keeps the
    # template readable without changing any emitted macro name.
    conv = f"IDMA_CONV_{kernel_name}_{data_type}_MOW_WHD"
    buff = f"IDMA_BUFF_{kernel_name}_{data_type}_MOW_WHD"

    header = f"""/*
 * convIdma_buffers.h
 *
 * Auto-generated buffer size definitions
 */

#ifndef {header_guard}
#define {header_guard}

// ============================================================================
// IDMA Buffer Sizes and Tile Parameters for convVQ3D_{kernel_name}_{data_type}_MOW_WHD
// ============================================================================

// SRC tile parameters
#define {conv}_SRC_DIM1_SIZE {buffer_sizes['SRC_DIM1_SIZE']} // input width
#define {conv}_SRC_DIM1_PITCH {buffer_sizes['SRC_DIM1_PITCH']} //
#define {conv}_SRC_DIM2_SIZE {buffer_sizes['SRC_DIM2_SIZE']}
#define {conv}_SRC_DIM2_PITCH {buffer_sizes['SRC_DIM2_PITCH']} // {buffer_sizes['SRC_DIM1_SIZE']}*{buffer_sizes['SRC_DIM2_SIZE']}
#define {conv}_SRC_DIM3_SIZE {buffer_sizes['SRC_DIM3_SIZE']}

// DST tile parameters
#define {conv}_DST_DIM1_SIZE {buffer_sizes['DST_DIM1_SIZE']} // output width
#define {conv}_DST_DIM1_PITCH {buffer_sizes['DST_DIM1_PITCH']} //
#define {conv}_DST_DIM2_SIZE {buffer_sizes['DST_DIM2_SIZE']}
#define {conv}_DST_DIM2_PITCH {buffer_sizes['DST_DIM2_PITCH']} // {buffer_sizes['DST_DIM1_SIZE']}*{buffer_sizes['DST_DIM2_SIZE']}


// Input tile parameters
#define {conv}_IN_DIM1_SIZE {buffer_sizes['IN_DIM1_SIZE']}
#define {conv}_IN_DIM1_PITCH {buffer_sizes['IN_DIM1_PITCH']}
#define {conv}_IN_DIM2_SIZE {buffer_sizes['IN_DIM2_SIZE']} // {conv}_COEFF_DIM2_SIZE + (({conv}_OUT_DIM2_SIZE-1)* stride)
#define {conv}_IN_DIM2_PITCH {buffer_sizes['IN_DIM2_PITCH']}


#define {buff}_IN_DATA_OFFSET {buffer_sizes['IN_DATA_OFFSET']}
#define {buff}_IN_ROWS_FIRSTDMA {buffer_sizes['IN_ROWS_FIRSTDMA']} // {conv}_IN_DIM2_SIZE - padding rows
#define {buff}_IN_FRAME_PTR 0
#define {buff}_IN_STATUS_FLAGS 0
#define {buff}_IN_DIM1_COORD 0
#define {buff}_IN_DIM3_COORD 0

// Output tile parameters
#define {conv}_OUT_DIM1_SIZE {buffer_sizes['OUT_DIM1_SIZE']}
#define {conv}_OUT_DIM1_PITCH {buffer_sizes['OUT_DIM1_PITCH']}
#define {conv}_OUT_DIM2_SIZE {buffer_sizes['OUT_DIM2_SIZE']}
#define {conv}_OUT_DIM2_PITCH {buffer_sizes['OUT_DIM2_PITCH']}
#define {conv}_OUT_DIM3_SIZE {buffer_sizes['N_TILE_SIZE']} //{conv}_N_TILE_SIZE
#define {buff}_OUT_FRAME_PTR 0
#define {buff}_OUT_STATUS_FLAGS 0
#define {buff}_OUT_DIM1_COORD 0
#define {buff}_OUT_DIM1_EDGE1 0
#define {buff}_OUT_DIM1_EDGE2 0
#define {buff}_OUT_DIM2_EDGE1 0
#define {buff}_OUT_DIM2_EDGE2 0
#define {buff}_OUT_DIM3_COORD 0
#define {buff}_OUT_DIM3_EDGE1 0
#define {buff}_OUT_DIM3_EDGE2 0

//coefficient tile parameters
#define {conv}_COEFF_DIM1_SIZE {buffer_sizes['COEFF_DIM1_SIZE']}
#define {conv}_COEFF_DIM2_SIZE {buffer_sizes['COEFF_DIM2_SIZE']}
#define {conv}_COEFF_DIM3_SIZE {buffer_sizes['COEFF_DIM3_SIZE']}
#define {conv}_COEFF_DIM4_SIZE {buffer_sizes['COEFF_DIM4_SIZE']}
#define {conv}_COEFF_DIM1_PITCH {buffer_sizes['COEFF_DIM1_PITCH']}
#define {conv}_COEFF_DIM2_PITCH {buffer_sizes['COEFF_DIM2_PITCH']}
#define {conv}_COEFF_DIM3_PITCH {buffer_sizes['COEFF_DIM3_PITCH']}

#define {buff}_COEFF_FRAME_PTR 0
#define {buff}_COEFF_STATUS_FLAGS 0
#define {buff}_COEFF_DIM1_COORD 0
#define {buff}_COEFF_DIM1_EDGE1 0
#define {buff}_COEFF_DIM1_EDGE2 0
#define {buff}_COEFF_DIM2_COORD 0
#define {buff}_COEFF_DIM2_EDGE1 0
#define {buff}_COEFF_DIM2_EDGE2 0
#define {buff}_COEFF_DIM3_COORD 0
#define {buff}_COEFF_DIM3_EDGE1 0
#define {buff}_COEFF_DIM3_EDGE2 0
#define {buff}_COEFF_DIM4_COORD 0

//bias array parameters
#define {conv}_BIAS_DIM1_SIZE {buffer_sizes['BIAS_DIM1_SIZE']}
#define {conv}_BIAS_DIM2_SIZE {buffer_sizes['BIAS_DIM2_SIZE']}

//output scale array parameters
#define {conv}_OUTSCALE_DIM1_SIZE {buffer_sizes['OUTSCALE_DIM1_SIZE']}
#define {conv}_OUTSCALE_DIM2_SIZE {buffer_sizes['OUTSCALE_DIM2_SIZE']}

// Buffer sizes
#define {buff}_IN {buffer_sizes['IN']}
#define {buff}_COEFF {buffer_sizes['COEFF']}
#define {buff}_OUT {buffer_sizes['OUT']}
#define {buff}_BIAS {buffer_sizes['BIAS']}
#define {buff}_OUTSCALE {buffer_sizes['OUTSCALE']}

#define {conv}_N_TILES {buffer_sizes['N_TILES']} // ceil({conv}_COEFF_DIM4_SIZE / {conv}_N_TILE_SIZE)
#define {conv}_HIGHT_TILES {buffer_sizes['HIGHT_TILES']} // ceil({conv}_DST_DIM2_SIZE / {conv}_OUT_DIM2_SIZE)
#define {conv}_N_TILE_SIZE {buffer_sizes['N_TILE_SIZE']} // output channels per tile
#define {conv}_N_TILE_SIZE_LAST {buffer_sizes['N_TILE_SIZE_LAST']} // {conv}_COEFF_DIM4_SIZE - {conv}_N_TILE_SIZE * ({conv}_N_TILES - 1)

#endif /* {header_guard} */
"""
    return header
def align_to_64(size):
    """Round up size to next 64-byte boundary for alignment."""
    return ((size + 63) // 64) * 64


# Candidate layouts: strategy name -> (buffers in DRAM0, buffers in DRAM1).
# 'input_only_dram0' is the same layout as 'coeff_to_dram1'; it is kept so the
# order in which strategies are tried, and the names reported, stay unchanged.
_PLACEMENT_LAYOUTS = {
    'default': (
        ('input_ping', 'input_pong', 'coeff'),
        ('output_ping', 'output_pong', 'bias', 'outscale'),
    ),
    'coeff_to_dram1': (
        ('input_ping', 'input_pong'),
        ('coeff', 'output_ping', 'output_pong', 'bias', 'outscale'),
    ),
    'bias_outscale_to_dram0': (
        ('input_ping', 'input_pong', 'coeff', 'bias', 'outscale'),
        ('output_ping', 'output_pong'),
    ),
    'input_only_dram0': (
        ('input_ping', 'input_pong'),
        ('coeff', 'output_ping', 'output_pong', 'bias', 'outscale'),
    ),
    'split_input_ping_pong': (
        ('input_ping', 'coeff'),
        ('input_pong', 'output_ping', 'output_pong', 'bias', 'outscale'),
    ),
    'split_input_ping_pong_alt': (
        ('input_pong', 'coeff'),
        ('input_ping', 'output_ping', 'output_pong', 'bias', 'outscale'),
    ),
}

# Buffer name -> result key holding the DRAM bank (0 or 1) of that buffer.
_PLACEMENT_DRAM_KEYS = (
    ('input_ping', 'IN1_dram'),
    ('input_pong', 'IN2_dram'),
    ('coeff', 'COEFF_dram'),
    ('output_ping', 'OUT1_dram'),
    ('output_pong', 'OUT2_dram'),
    ('bias', 'BIAS_dram'),
    ('outscale', 'OUTSCALE_dram'),
)

# Buffers that must fit into a single bank on their own, in check order:
# (buffer name, failure strategy tag, label used in the error message).
_SINGLE_BUFFER_CHECKS = (
    ('coeff', 'FAIL_COEFF_TOO_LARGE', 'Coefficient'),
    ('input_ping', 'FAIL_INPUT_TOO_LARGE', 'Input'),
    ('output_ping', 'FAIL_OUTPUT_TOO_LARGE', 'Output'),
)


def _placement_result(strategy, dram0_allocation, dram1_allocation, dram0_size, dram1_size, error=None):
    """Build the placement dictionary; a non-None `error` marks a hard failure."""
    dram0_used = sum(size for _, size in dram0_allocation)
    dram1_used = sum(size for _, size in dram1_allocation)
    failed = error is not None
    dram0_fits = (not failed) and dram0_used <= dram0_size
    dram1_fits = (not failed) and dram1_used <= dram1_size

    result = {
        'strategy': strategy,
        'dram0_allocation': dram0_allocation,
        'dram1_allocation': dram1_allocation,
        'dram0_used': dram0_used,
        'dram1_used': dram1_used,
        'dram0_size': dram0_size,
        'dram1_size': dram1_size,
        'dram0_free': dram0_size - dram0_used,
        'dram1_free': dram1_size - dram1_used,
        'dram0_fits': dram0_fits,
        'dram1_fits': dram1_fits,
        'total_fits': dram0_fits and dram1_fits,
    }

    # Individual buffer DRAM placement for generate_layer_configs.py.
    # Default to DRAM1 if not in DRAM0.
    dram0_buffers = {name for name, _ in dram0_allocation}
    for buffer_name, key in _PLACEMENT_DRAM_KEYS:
        result[key] = 0 if buffer_name in dram0_buffers else 1

    if failed:
        result['error'] = error
    return result


def calculate_buffer_placement(buffer_sizes, dram0_size=32*1024, dram1_size=32*1024):
    """
    Calculate buffer placement in DRAM0 and DRAM1 for ping-pong architecture.

    Strategy (first layout that fits both banks wins):
    1. Default placement: input/coeff in DRAM0, output/bias/outscale in DRAM1
    2. If DRAM0 overflows, move coefficient to DRAM1 (if DRAM1 has room)
    3. Otherwise, if DRAM1 overflows, move bias/outscale to DRAM0 (if DRAM0 has room)
    4. If the layout still does not fit, try in order: input ping-pong alone
       in DRAM0, input ping + coeff in DRAM0, input pong + coeff in DRAM0
    5. If nothing fits, the last layout selected in steps 1-3 is reported
       with its overflow

    Note: All buffer sizes are aligned to 64 bytes to account for alignment overhead.

    Args:
        buffer_sizes: Dictionary from calculate_buffer_sizes(); the keys
            'IN', 'COEFF', 'OUT', 'BIAS' and 'OUTSCALE' are read
        dram0_size: Size of DRAM0 in bytes (default 32KB; None = global DRAM_SIZE_0)
        dram1_size: Size of DRAM1 in bytes (default 32KB; None = global DRAM_SIZE_1)

    Returns:
        Dictionary with buffer placement information. When a single buffer
        is larger than the biggest bank, 'strategy' is a FAIL_* tag, both
        allocation lists are empty, 'total_fits' is False and an 'error'
        message is included.
    """
    # Use global DRAM sizes if not specified
    if dram0_size is None:
        dram0_size = DRAM_SIZE_0
    if dram1_size is None:
        dram1_size = DRAM_SIZE_1

    # Ping-pong buffers require 2x allocation.
    # Align each buffer to 64 bytes to account for alignment overhead.
    sizes = {
        'input_ping': align_to_64(buffer_sizes['IN']),
        'input_pong': align_to_64(buffer_sizes['IN']),
        'coeff': align_to_64(buffer_sizes['COEFF']),
        'output_ping': align_to_64(buffer_sizes['OUT']),
        'output_pong': align_to_64(buffer_sizes['OUT']),
        'bias': align_to_64(buffer_sizes['BIAS']),
        'outscale': align_to_64(buffer_sizes['OUTSCALE']),
    }

    # CRITICAL: a single buffer cannot be split across banks, so each one
    # must fit into the larger bank on its own.
    max_bank_size = max(dram0_size, dram1_size)
    for buffer_name, failure_tag, label in _SINGLE_BUFFER_CHECKS:
        if sizes[buffer_name] > max_bank_size:
            return _placement_result(
                failure_tag, [], [], dram0_size, dram1_size,
                error=f'{label} buffer ({sizes[buffer_name]} bytes) exceeds max DRAM bank size ({max_bank_size} bytes)',
            )

    def allocate(layout_name):
        """Return the (DRAM0, DRAM1) allocation lists for a named layout."""
        dram0_names, dram1_names = _PLACEMENT_LAYOUTS[layout_name]
        return (
            [(name, sizes[name]) for name in dram0_names],
            [(name, sizes[name]) for name in dram1_names],
        )

    def fits(allocation):
        """Return True when both banks hold their share of the allocation."""
        dram0_allocation, dram1_allocation = allocation
        return (sum(size for _, size in dram0_allocation) <= dram0_size
                and sum(size for _, size in dram1_allocation) <= dram1_size)

    # Strategies 1-3: start from the default layout and relieve whichever
    # bank overflows, provided the other bank can absorb the moved buffers.
    strategy = 'default'
    default_dram0, default_dram1 = allocate(strategy)
    dram0_used = sum(size for _, size in default_dram0)
    dram1_used = sum(size for _, size in default_dram1)
    if dram0_used > dram0_size and dram1_used + sizes['coeff'] <= dram1_size:
        strategy = 'coeff_to_dram1'
    elif dram1_used > dram1_size and dram0_used + sizes['bias'] + sizes['outscale'] <= dram0_size:
        strategy = 'bias_outscale_to_dram0'
    chosen = allocate(strategy)

    # Remaining strategies are adopted only when they fit both banks.
    for fallback in ('input_only_dram0', 'split_input_ping_pong', 'split_input_ping_pong_alt'):
        if fits(chosen):
            break
        candidate = allocate(fallback)
        if fits(candidate):
            strategy, chosen = fallback, candidate

    return _placement_result(strategy, chosen[0], chosen[1], dram0_size, dram1_size)
def print_buffer_placement(placement):
    """Print a human-readable report of a two-bank DRAM buffer placement.

    Args:
        placement: Placement dictionary. The report reads ``strategy``,
            ``total_fits`` and, for each bank ``N`` in (0, 1),
            ``dramN_size``, ``dramN_allocation`` (iterable of
            ``(name, size)`` pairs), ``dramN_used``, ``dramN_free`` and
            ``dramN_fits``.
    """
    descriptions = {
        'default': 'Default: Input+Coeff->DRAM0, Output+Bias+Outscale->DRAM1',
        'coeff_to_dram1': 'Optimized: Coefficient moved to DRAM1',
        'bias_outscale_to_dram0': 'Optimized: Bias+Outscale moved to DRAM0',
        'input_only_dram0': 'Optimized: Only Input ping-pong in DRAM0',
        'split_input_ping_pong': 'Optimized: Input buffers split across DRAMs (ping in DRAM0, pong in DRAM1)',
        'split_input_ping_pong_alt': 'Optimized: Input buffers split across DRAMs (pong in DRAM0, ping in DRAM1)',
    }

    # Unknown strategy identifiers are shown verbatim instead of failing.
    strategy = placement['strategy']
    print("\n=== Buffer Placement ===")
    print(f"Strategy: {descriptions.get(strategy, strategy)}")

    banks = (0, 1)
    for bank in banks:
        capacity = placement[f'dram{bank}_size']
        print(f"DRAM{bank} Size: {capacity:6d} bytes ({capacity//1024}KB)")

    for bank in banks:
        print(f"\nDRAM{bank} Allocation:")
        for buffer_name, nbytes in placement[f'dram{bank}_allocation']:
            print(f" {buffer_name:20s} -> {nbytes:6d} bytes -> DRAM{bank}")
        used = placement[f'dram{bank}_used']
        free = placement[f'dram{bank}_free']
        print(f" {'Total Used':20s} {used:6d} bytes")
        print(f" {'Free':20s} {free:6d} bytes")
        if placement[f'dram{bank}_fits']:
            verdict = 'OK FITS'
        else:
            verdict = 'X OVERFLOW'
        print(f" Status: {verdict}")

    if placement['total_fits']:
        overall = 'OK ALL BUFFERS FIT'
    else:
        overall = 'X INSUFFICIENT MEMORY'
    print(f"\nOverall: {overall}")
def print_buffer_info(buffer_sizes):
    """Print the buffer sizes, tile parameters and details of one kernel.

    Args:
        buffer_sizes: Dictionary holding ``kernel_name``, ``data_type``, the
            byte counts ``IN``/``COEFF``/``OUT``/``BIAS``/``OUTSCALE``, the
            tile-parameter entries listed below, and a nested ``details``
            dictionary with ``input_buff_whd``, ``input_rows_needed`` and
            ``output_buff_whd``.
    """
    # (label shown in the report, key read from buffer_sizes)
    size_rows = (
        ('INPUT', 'IN'),
        ('COEFF', 'COEFF'),
        ('OUTPUT', 'OUT'),
        ('BIAS', 'BIAS'),
        ('OUTSCALE', 'OUTSCALE'),
    )
    tile_rows = (
        ('SRC_DIM1_SIZE', 'SRC_DIM1_SIZE'),
        ('SRC_DIM1_PITCH', 'SRC_DIM1_PITCH'),
        ('SRC_DIM2_PITCH', 'SRC_DIM2_PITCH'),
        ('DST_DIM1_SIZE', 'DST_DIM1_SIZE'),
        ('DST_DIM1_PITCH', 'DST_DIM1_PITCH'),
        ('DST_DIM2_PITCH', 'DST_DIM2_PITCH'),
        ('DIM1_SIZE (input width)', 'IN_DIM1_SIZE'),
        ('DIM1_PITCH (with padding)', 'IN_DIM1_PITCH'),
        ('DIM2_PITCH', 'IN_DIM2_PITCH'),
        ('IN_DATA_OFFSET', 'IN_DATA_OFFSET'),
        ('OUT_DIM1_SIZE', 'OUT_DIM1_SIZE'),
        ('OUT_DIM1_PITCH', 'OUT_DIM1_PITCH'),
        ('OUT_DIM2_SIZE', 'OUT_DIM2_SIZE'),
        ('OUT_DIM2_PITCH', 'OUT_DIM2_PITCH'),
        ('OUT_DIM3_SIZE', 'OUT_DIM3_SIZE'),
    )
    detail_rows = (
        ('Input buffer WHD (with padding)', 'input_buff_whd'),
        ('Input rows needed for 2 output rows', 'input_rows_needed'),
        ('Output buffer WHD', 'output_buff_whd'),
    )

    print("\n=== Buffer Size Calculations ===")
    print(f"Kernel: {buffer_sizes['kernel_name']}")
    print(f"Data Type: {buffer_sizes['data_type']}")

    print("\nBuffer Sizes:")
    for label, key in size_rows:
        print(f" {label}: {buffer_sizes[key]:6d} bytes")
    total_bytes = sum(buffer_sizes[key] for _, key in size_rows)
    print(f"\nTotal Memory: {total_bytes:6d} bytes")

    details = buffer_sizes['details']
    print("\nTile Parameters:")
    for label, key in tile_rows:
        print(f" {label}: {buffer_sizes[key]}")

    print("\nDetails:")
    for label, key in detail_rows:
        print(f" {label}: {details[key]}")
def calculate_conv_params(n, c, h, w, oc, wc, wh, ww, oh, ow,
                          stride_h, stride_w, padding_h, padding_w,
                          dilation_h, dilation_w, groups,
                          in_zero_point, weight_zero_point,
                          bias_scale, output_scale, output_zero_point):
    """
    Derive the convolution parameter dictionary, including the fixed-point
    ``outputShift`` / ``outputScale`` pair.

    The requantisation ratio ``bias_scale / output_scale`` (0 when
    ``output_scale`` is 0) is expressed as ``outputScale / 2**outputShift``
    with ``outputScale`` limited to the range [1, 65535].

    Only ``stride_h``, ``stride_w``, ``dilation_h``, ``dilation_w``, ``wh``,
    ``ww``, ``bias_scale`` and ``output_scale`` are read; the remaining
    arguments are accepted but not used by this function.

    Args:
        n, c, h, w: Input batch, channels, height, width
        oc, wc, wh, ww: Output channels, weight channels, weight height, weight width
        oh, ow: Output height, output width
        stride_h, stride_w: Stride values
        padding_h, padding_w: Padding values
        dilation_h, dilation_w: Dilation values
        groups: Number of groups for grouped convolution
        in_zero_point: Input zero point
        weight_zero_point: Weight zero point
        bias_scale: Bias scale value
        output_scale: Output scale value
        output_zero_point: Output zero point

    Returns:
        dict: Keys ``strideX``, ``strideY``, ``accumShift``, ``reluMax``,
        ``outputShift``, ``outputScale``, ``dilation``, ``kernelHeight`` and
        ``kernelWidth``.
    """
    scale_limit = 65535  # largest value the 16-bit scale field can hold

    if output_scale != 0:
        ratio = bias_scale / output_scale
    else:
        ratio = 0

    def fixed_point(shift_bits):
        # Truncating conversion of the ratio at the given shift.
        return int(ratio * (1 << shift_bits))

    shift = 15
    mantissa = fixed_point(shift)

    if mantissa > scale_limit:
        # Too large: drop shift bits until the value fits (or shift hits 0).
        while shift > 0 and mantissa > scale_limit:
            shift -= 1
            mantissa = fixed_point(shift)
    elif mantissa < 16384:
        # Too small: take the largest shift (at most 31) that still fits.
        for candidate_shift in range(shift + 1, 32):
            candidate = fixed_point(candidate_shift)
            if candidate > scale_limit:
                break
            shift, mantissa = candidate_shift, candidate

    # Clamp into [1, 65535].
    mantissa = min(max(mantissa, 1), scale_limit)

    params = {}
    params['strideX'] = stride_w
    params['strideY'] = stride_h
    params['accumShift'] = 0  # No pre-shift; keep full int32 accumulator precision
    params['reluMax'] = 127  # Max value for int8_t output
    params['outputShift'] = shift
    params['outputScale'] = mantissa
    params['dilation'] = max(dilation_h, dilation_w)
    params['kernelHeight'] = wh
    params['kernelWidth'] = ww
    return params
def main(output_file=None):
    """Run the example flow for five convolution shapes and write one header.

    For each example layer this computes the convolution parameters, searches
    for the tile configuration with ``find_max_tile_config``, prints the
    buffer sizes and the DRAM placement, and finally writes the combined
    header produced by ``generate_combined_header``.

    Args:
        output_file: Destination path of the generated header. Defaults to
            ``convIdma_buffers.h`` in the current working directory (the
            previous hard-coded path pointed into one developer's local
            Windows workspace and was not usable elsewhere).
    """
    if output_file is None:
        output_file = "convIdma_buffers.h"

    # (kernel name, input (w, h, c), output (w, h, oc), kernel (w, h),
    #  stride, padding) -- stride and padding are the same in both axes.
    example_layers = [
        ("7x7j2d1", (224, 224, 3), (112, 112, 64), (7, 7), 2, 3),
        ("3x3j1d1", (56, 56, 64), (56, 56, 64), (3, 3), 1, 1),
        ("3x3j2d1", (56, 56, 64), (28, 28, 128), (3, 3), 2, 1),
        ("1x1j2d1", (56, 56, 64), (28, 28, 128), (1, 1), 2, 0),
        ("1x1j1d1", (28, 28, 512), (28, 28, 256), (1, 1), 1, 0),
    ]
    # Order in which calculate_conv_params() results are packed into the
    # 'conv_params' tuple of a tile-search configuration.
    conv_param_order = (
        'strideX', 'strideY', 'accumShift', 'reluMax', 'outputShift',
        'outputScale', 'dilation', 'kernelHeight', 'kernelWidth',
    )

    all_buffer_sizes = []
    for position, layer in enumerate(example_layers):
        kernel_name, (in_w, in_h, in_c), (out_w, out_h, out_c), (k_w, k_h), stride, pad = layer

        conv_params = calculate_conv_params(
            n=1, c=in_c, h=in_h, w=in_w,
            oc=out_c, wc=in_c, wh=k_h, ww=k_w,
            oh=out_h, ow=out_w,
            stride_h=stride, stride_w=stride,
            padding_h=pad, padding_w=pad,
            dilation_h=1, dilation_w=1,
            groups=1,
            in_zero_point=0,
            weight_zero_point=0,
            bias_scale=1.0,
            output_scale=1.0,
            output_zero_point=0,
        )
        config = {
            'input_whd': (in_w, in_h, in_c),
            'output_whd': (out_w, out_h, out_c),
            'kernel_whdn': (k_w, k_h, in_c, out_c),
            'padding': (pad, pad, pad, pad, 0, 0),
            'stride_xy': (stride, stride),
            'kernel_name': kernel_name,
            'data_type': "S8S8",
            'conv_params': tuple(conv_params[key] for key in conv_param_order),
            'conv_flags': 0,
        }

        # Separator between the reports of consecutive kernels.
        if position:
            print("\n" + "="*60 + "\n")

        # Only the buffer-size dictionary of the search result is used here.
        _best_n_tile, _best_out_rows, buffer_sizes = find_max_tile_config(**config)
        print_buffer_info(buffer_sizes)

        placement = calculate_buffer_placement(
            buffer_sizes, dram0_size=DRAM_SIZE_0, dram1_size=DRAM_SIZE_1)
        print_buffer_placement(placement)

        all_buffer_sizes.append(buffer_sizes)

    # Generate combined header content
    header_content = generate_combined_header(all_buffer_sizes)
    print("\n=== Generated Header Content ===")
    print(header_content)

    # Write to file
    with open(output_file, 'w') as f:
        f.write(header_content)
    print(f"\nHeader file written to: {output_file}")
def generate_combined_header(buffer_sizes_list, header_guard="CONVIDMA_BUFFERS_H_", dram0_size=None, dram1_size=None):
    """
    Generate C header file content with buffer size definitions for multiple kernels.

    One section of ``#define`` lines is emitted per entry of
    ``buffer_sizes_list``. Macro names are prefixed with
    ``IDMA_CONV_<kernel>_<dtype>_MOW_WHD`` or
    ``IDMA_BUFF_<kernel>_<dtype>_MOW_WHD``. The DRAM bank of each buffer is
    taken from ``calculate_buffer_placement``. Convolution parameters are
    emitted only for entries that contain a ``STRIDEX`` key.

    Args:
        buffer_sizes_list: List of dictionaries from calculate_buffer_sizes()
        header_guard: Header guard name
        dram0_size: Size of DRAM0 in bytes (default: use global DRAM_SIZE_0)
        dram1_size: Size of DRAM1 in bytes (default: use global DRAM_SIZE_1)

    Returns:
        String containing header file content
    """
    # Use global DRAM sizes if not specified
    if dram0_size is None:
        dram0_size = DRAM_SIZE_0
    if dram1_size is None:
        dram1_size = DRAM_SIZE_1

    rule = "// ============================================================================"

    # Sections are collected in a list and joined once at the end.
    parts = [f"""/*
 * convIdma_buffers.h
 *
 * Auto-generated buffer size definitions
 */

#ifndef {header_guard}
#define {header_guard}


{rule}
// Available DRAM Sizes for IDMA Buffers
{rule}

#define IDMA_BUFFER_SIZE_DRAM0 ({dram0_size}) // {dram0_size // 1024} KB for DRAM0
#define IDMA_BUFFER_SIZE_DRAM1 ({dram1_size}) // {dram1_size // 1024} KB for DRAM1

"""]

    for bs in buffer_sizes_list:
        placement = calculate_buffer_placement(bs, dram0_size=dram0_size, dram1_size=dram1_size)

        kernel_name = bs['kernel_name']
        data_type = bs['data_type']
        conv = f"IDMA_CONV_{kernel_name}_{data_type}_MOW_WHD"
        buff = f"IDMA_BUFF_{kernel_name}_{data_type}_MOW_WHD"

        # Extract padding values
        dim1_edge1, dim1_edge2, dim2_edge1, dim2_edge2, dim3_edge1, dim3_edge2 = (
            bs.get('padding', (0, 0, 0, 0, 0, 0))
        )

        parts.append(f"""{rule}
// IDMA Buffer Sizes and Tile Parameters for convVQ3D_{kernel_name}_{data_type}_MOW_WHD
{rule}

// SRC tile parameters
#define {conv}_SRC_DIM1_SIZE {bs['SRC_DIM1_SIZE']} // input width
#define {conv}_SRC_DIM1_PITCH {bs['SRC_DIM1_PITCH']} //
#define {conv}_SRC_DIM2_SIZE {bs['SRC_DIM2_SIZE']}
#define {conv}_SRC_DIM2_PITCH {bs['SRC_DIM2_PITCH']} // {bs['SRC_DIM1_SIZE']}*{bs['SRC_DIM2_SIZE']}
#define {conv}_SRC_DIM3_SIZE {bs['SRC_DIM3_SIZE']}

// DST tile parameters
#define {conv}_DST_DIM1_SIZE {bs['DST_DIM1_SIZE']} // input width
#define {conv}_DST_DIM1_PITCH {bs['DST_DIM1_PITCH']} //
#define {conv}_DST_DIM2_SIZE {bs['DST_DIM2_SIZE']}
#define {conv}_DST_DIM2_PITCH {bs['DST_DIM2_PITCH']} // {bs['DST_DIM1_SIZE']}*{bs['DST_DIM2_SIZE']}


// Input tile parameters
#define {conv}_IN_DIM1_SIZE {bs['IN_DIM1_SIZE']}
#define {conv}_IN_DIM1_PITCH {bs['IN_DIM1_PITCH']}
#define {conv}_IN_DIM2_SIZE {bs['IN_DIM2_SIZE']} // {conv}_COEFF_DIM2_SIZE + (({conv}_OUT_DIM2_SIZE-1)* stride)
#define {conv}_IN_DIM2_PITCH {bs['IN_DIM2_PITCH']}

#define {conv}_IN_DIM1_EDGE1 {dim1_edge1}
#define {conv}_IN_DIM1_EDGE2 {dim1_edge2}
#define {conv}_IN_DIM2_EDGE1 {dim2_edge1}
#define {conv}_IN_DIM2_EDGE2 {dim2_edge2}
#define {conv}_IN_DIM3_EDGE1 {dim3_edge1}
#define {conv}_IN_DIM3_EDGE2 {dim3_edge2}


#define {buff}_IN_DATA_OFFSET {bs['IN_DATA_OFFSET']}
#define {buff}_IN_ROWS_FIRSTDMA {bs['IN_ROWS_FIRSTDMA']} // {conv}_IN_DIM2_SIZE - padding rows
#define {buff}_IN_FRAME_PTR 0
#define {buff}_IN_STATUS_FLAGS 0
#define {buff}_IN_DIM1_COORD 0
#define {buff}_IN_DIM3_COORD 0

// Output tile parameters
#define {conv}_OUT_DIM1_SIZE {bs['OUT_DIM1_SIZE']}
#define {conv}_OUT_DIM1_PITCH {bs['OUT_DIM1_PITCH']}
#define {conv}_OUT_DIM2_SIZE {bs['OUT_DIM2_SIZE']}
#define {conv}_OUT_DIM2_PITCH {bs['OUT_DIM2_PITCH']}
#define {conv}_OUT_DIM3_SIZE {bs['N_TILE_SIZE']} //{conv}_N_TILE_SIZE
#define {buff}_OUT_FRAME_PTR 0
#define {buff}_OUT_STATUS_FLAGS 0
#define {buff}_OUT_DIM1_COORD 0
#define {buff}_OUT_DIM1_EDGE1 0
#define {buff}_OUT_DIM1_EDGE2 0
#define {buff}_OUT_DIM2_EDGE1 0
#define {buff}_OUT_DIM2_EDGE2 0
#define {buff}_OUT_DIM3_COORD 0
#define {buff}_OUT_DIM3_EDGE1 0
#define {buff}_OUT_DIM3_EDGE2 0

//coefficient tile parameters
#define {conv}_COEFF_DIM1_SIZE {bs['COEFF_DIM1_SIZE']}
#define {conv}_COEFF_DIM2_SIZE {bs['COEFF_DIM2_SIZE']}
#define {conv}_COEFF_DIM3_SIZE {bs['COEFF_DIM3_SIZE']}
#define {conv}_COEFF_DIM4_SIZE {bs['COEFF_DIM4_SIZE']}
#define {conv}_COEFF_DIM1_PITCH {bs['COEFF_DIM1_PITCH']}
#define {conv}_COEFF_DIM2_PITCH {bs['COEFF_DIM2_PITCH']}
#define {conv}_COEFF_DIM3_PITCH {bs['COEFF_DIM3_PITCH']}
#define {buff}_COEFF_FRAME_PTR 0
#define {buff}_COEFF_STATUS_FLAGS 0
#define {buff}_COEFF_DIM1_COORD 0
#define {buff}_COEFF_DIM1_EDGE1 0
#define {buff}_COEFF_DIM1_EDGE2 0
#define {buff}_COEFF_DIM2_COORD 0
#define {buff}_COEFF_DIM2_EDGE1 0
#define {buff}_COEFF_DIM2_EDGE2 0
#define {buff}_COEFF_DIM3_COORD 0
#define {buff}_COEFF_DIM3_EDGE1 0
#define {buff}_COEFF_DIM3_EDGE2 0
#define {buff}_COEFF_DIM4_COORD 0

//bias array parameters
#define {conv}_BIAS_DIM1_SIZE {bs['BIAS_DIM1_SIZE']}
#define {conv}_BIAS_DIM2_SIZE {bs['BIAS_DIM2_SIZE']}

//output scale array parameters
#define {conv}_OUTSCALE_DIM1_SIZE {bs['OUTSCALE_DIM1_SIZE']}
#define {conv}_OUTSCALE_DIM2_SIZE {bs['OUTSCALE_DIM2_SIZE']}

// Buffer sizes
#define {buff}_IN1 {bs['IN']}
#define {buff}_IN2 {bs['IN']}
#define {buff}_COEFF {bs['COEFF']}
#define {buff}_OUT1 {bs['OUT']}
#define {buff}_OUT2 {bs['OUT']}
#define {buff}_BIAS {bs['BIAS']}
#define {buff}_OUTSCALE {bs['OUTSCALE']}

""")

        # Buffer name -> DRAM bank, from the placement's allocation lists.
        # DRAM1 entries are applied last, so they win on a duplicate name.
        dram_map = {
            name: bank
            for bank in (0, 1)
            for name, _ in placement[f'dram{bank}_allocation']
        }

        # Map buffer names to macro names. The .get() defaults apply only
        # when a buffer is missing from both allocation lists.
        parts.append(f"""#define {buff}_IN1_DRAM {dram_map.get('input_ping', 0)}
#define {buff}_IN2_DRAM {dram_map.get('input_pong', 0)}
#define {buff}_COEFF_DRAM {dram_map.get('coeff', 0)}
#define {buff}_OUT1_DRAM {dram_map.get('output_ping', 1)}
#define {buff}_OUT2_DRAM {dram_map.get('output_pong', 1)}
#define {buff}_BIAS_DRAM {dram_map.get('bias', 1)}
#define {buff}_OUTSCALE_DRAM {dram_map.get('outscale', 1)}

#define {conv}_N_TILES {bs['N_TILES']} // round_toward positive({conv}_SRC_DIM3_SIZE / {conv}_N_TILE_SIZE)
#define {conv}_HIGHT_TILES {bs['HIGHT_TILES']} //{conv}_DST_DIM2_SIZE / {conv}_OUT_DIM2_SIZE
#define {conv}_N_TILE_SIZE {bs['N_TILE_SIZE']} // take this as input as of now (constant 22 for 3x3 conv and constant 64 for 7x7 conv)
#define {conv}_N_TILE_SIZE_LAST {bs['N_TILE_SIZE_LAST']} //{conv}_COEFF_DIM4_SIZE - {conv}_N_TILE_SIZE
#define {conv}_COEFF_TILE_SIZE_LAST {bs['COEFF_TILE_SIZE_LAST']} // {conv}_COEFF_DIM1_SIZE * {conv}_COEFF_DIM2_SIZE * {conv}_COEFF_DIM3_SIZE * {conv}_N_TILE_SIZE_LAST
""")

        # Add convolution parameters if available
        if 'STRIDEX' in bs:
            parts.append(f"""
// Convolution parameters
#define {conv}_STRIDEX {bs['STRIDEX']}
#define {conv}_STRIDEY {bs['STRIDEY']}
#define {conv}_ACCUM_SHIFT {bs['ACCUM_SHIFT']}
#define {conv}_RELU_MAX {bs['RELU_MAX']}
#define {conv}_RELU_MIN {bs['RELU_MIN']}
#define {conv}_OUTPUT_SHIFT {bs['OUTPUT_SHIFT']}
#define {conv}_OUTPUT_SCALE {bs['OUTPUT_SCALE']}
#define {conv}_DILATION {bs['DILATION']}
#define {conv}_KERNEL_HEIGHT {bs['KERNEL_HEIGHT']}
#define {conv}_KERNEL_WIDTH {bs['KERNEL_WIDTH']}
#define {conv}_FLAGS {bs['FLAGS']}
""")

        # Blank lines separating this kernel's section from the next one.
        parts.append("\n\n\n")

    parts.append(f"#endif /* {header_guard} */\n")
    return "".join(parts)
+if __name__ == "__main__": + main() diff --git a/backends/cadence/vision/config_generator/generate_layer_configs.py b/backends/cadence/vision/config_generator/generate_layer_configs.py new file mode 100644 index 00000000000..65459653ec5 --- /dev/null +++ b/backends/cadence/vision/config_generator/generate_layer_configs.py @@ -0,0 +1,1158 @@ +#!/usr/bin/env python3 +""" +Generate buffer configuration lookup table from layer configurations + +This script extracts conv2d layers directly from PyTorch models (or reads +from .csv/.json) and: +1. Extracts unique conv2d layer parameters via forward hooks +2. Calculates optimal buffer sizes and tiling for each layer +3. Generates a C lookup table with all configurations +4. Outputs conv_layer_configs.h for runtime use + +Usage: + # Direct from model (no CSV needed): + python generate_layer_configs.py --model resnet18 --output conv_layer_configs.h --dram0 64000 --dram1 64000 + python generate_layer_configs.py --model resnet50 --output conv_layer_configs.h --dram0 64000 --dram1 64000 + python generate_layer_configs.py --model resnet18+resnet50 --output conv_layer_configs.h --dram0 64000 --dram1 64000 + + # From existing CSV + python generate_layer_configs.py resnet_conv_list.csv --output conv_layer_configs.h --dram0 64000 --dram1 64000 + + # From .pte extraction JSON + python generate_layer_configs.py layers_config.json --dram0 32768 --dram1 32768 + + # Generate all configs in no-DMA mode (changes _dma suffix to _no_dma for every kernel name) + python generate_layer_configs.py resnet_conv_list.csv --output conv_layer_configs_no_dma.h --dram0 64000 --dram1 64000 --no-dma-mode +""" + +import os +import sys +import json +import argparse +from pathlib import Path +from collections import OrderedDict + +# Import the existing buffer calculation logic +sys.path.insert(0, str(Path(__file__).parent)) +from generate_idma_buffers import ( + find_max_tile_config, + calculate_buffer_sizes_with_rows, + calculate_buffer_placement, + 
DRAM_SIZE_0, + DRAM_SIZE_1 +) + +# --------------------------------------------------------------------------- +# Direct model extraction (replaces extract_resnet_layers.py) +# --------------------------------------------------------------------------- + +SUPPORTED_MODELS = ['resnet18', 'resnet50'] + + +def _build_name_map_resnet18(): + """ResNet-18 (BasicBlock): 2 conv layers per block, 2 blocks per layer group.""" + m = OrderedDict() + m['conv1'] = 'conv1' + m['layer1.0.conv1'] = 'conv2.1' + m['layer1.0.conv2'] = 'conv2.2' + m['layer1.1.conv1'] = 'conv3.1' + m['layer1.1.conv2'] = 'conv3.2' + m['layer2.0.downsample.0'] = 'conv4a.1' + m['layer2.0.conv1'] = 'conv4b.1' + m['layer2.0.conv2'] = 'conv4b.2' + m['layer2.1.conv1'] = 'conv5.1' + m['layer2.1.conv2'] = 'conv5.2' + m['layer3.0.downsample.0'] = 'conv6a.1' + m['layer3.0.conv1'] = 'conv6b.1' + m['layer3.0.conv2'] = 'conv6b.2' + m['layer3.1.conv1'] = 'conv7.1' + m['layer3.1.conv2'] = 'conv7.2' + m['layer4.0.downsample.0'] = 'conv8a.1' + m['layer4.0.conv1'] = 'conv8b.1' + m['layer4.0.conv2'] = 'conv8b.2' + m['layer4.1.conv1'] = 'conv9.1' + m['layer4.1.conv2'] = 'conv9.2' + return m + + +def _build_name_map_resnet50(): + """ResNet-50 (Bottleneck): 3 conv layers per block, variable blocks per layer group.""" + m = OrderedDict() + m['conv1'] = 'conv1' + layer_blocks = {1: 3, 2: 4, 3: 6, 4: 3} + conv_counter = 2 + for layer_idx in range(1, 5): + n_blocks = layer_blocks[layer_idx] + for blk in range(n_blocks): + prefix = f'layer{layer_idx}.{blk}' + has_ds = (blk == 0) + if has_ds: + m[f'{prefix}.downsample.0'] = f'conv{conv_counter}a.1' + m[f'{prefix}.conv1'] = f'conv{conv_counter}b.1' + m[f'{prefix}.conv2'] = f'conv{conv_counter}b.2' + m[f'{prefix}.conv3'] = f'conv{conv_counter}b.3' + else: + m[f'{prefix}.conv1'] = f'conv{conv_counter}.1' + m[f'{prefix}.conv2'] = f'conv{conv_counter}.2' + m[f'{prefix}.conv3'] = f'conv{conv_counter}.3' + conv_counter += 1 + return m + + +def _get_conv_layers_via_hooks(model, name_map, 
input_size=(1, 3, 64, 64)): + """ + Run forward hooks on every Conv2d layer to capture input/output shapes + and convolution parameters. Returns OrderedDict keyed by friendly name. + """ + import torch + import torch.nn as nn + + layers_info = OrderedDict() + hooks = [] + + def make_hook(friendly_name): + def hook_fn(module, inp, out): + layers_info[friendly_name] = { + 'input': list(inp[0].shape), + 'kernel': list(module.weight.shape), + 'stride': list(module.stride), + 'padding': list(module.padding), + 'dilation': list(module.dilation), + 'transposed': isinstance(module, nn.ConvTranspose2d), + 'output_padding': (list(module.output_padding) + if hasattr(module, 'output_padding') else [0, 0]), + 'groups': module.groups, + 'output': list(out.shape), + } + return hook_fn + + for mod_name, module in model.named_modules(): + if mod_name in name_map and isinstance(module, (nn.Conv2d, nn.ConvTranspose2d)): + hooks.append(module.register_forward_hook(make_hook(name_map[mod_name]))) + + x = torch.randn(*input_size) + with torch.no_grad(): + model.eval() + model(x) + + for h in hooks: + h.remove() + return layers_info + + +def _make_unique_key(info): + """Hashable key for deduplication across models.""" + return ( + tuple(info['input']), + tuple(info['kernel']), + tuple(info['stride']), + tuple(info['padding']), + tuple(info['dilation']), + info['transposed'], + tuple(info['output_padding']), + info['groups'], + tuple(info['output']), + ) + + +def load_layers_from_model(model_names, input_size=(1, 3, 64, 64)): + """ + Extract unique conv2d layers directly from one or more torchvision models. + + Args: + model_names: list of model name strings, e.g. ['resnet18', 'resnet50'] + input_size: tuple for the dummy forward pass, e.g. 
def load_layers_from_model(model_names, input_size=(1, 3, 64, 64)):
    """
    Extract unique conv2d layers directly from one or more torchvision models.

    All model names are validated before any model is built. Models are
    constructed without pretrained weights: only layer shapes and
    convolution hyper-parameters are recorded, so no weight download is
    needed.

    Args:
        model_names: list of model name strings, e.g. ['resnet18', 'resnet50']
        input_size: tuple for the dummy forward pass, e.g. (1, 3, 64, 64)

    Returns:
        list of layer dicts in the internal format expected by calculate_layer_config()

    Raises:
        ValueError: If a requested model name is not supported.
    """
    supported = ('resnet18', 'resnet50')
    requested = [model_name.strip().lower() for model_name in model_names]
    # Fail fast: reject bad names before spending time building any model.
    for model_name in requested:
        if model_name not in supported:
            raise ValueError(f"Unsupported model '{model_name}'. Supported: {list(supported)}")

    import importlib
    import torch  # noqa: deferred import so torch is only needed when --model is used

    # model name -> builder of its module-name -> friendly-name map
    name_map_builders = {
        'resnet18': _build_name_map_resnet18,
        'resnet50': _build_name_map_resnet50,
    }

    seen_keys = set()
    unique_layers = []  # (friendly_name, info_dict, source_model)

    for model_name in requested:
        print(f"Loading {model_name}...")
        tv = importlib.import_module('torchvision.models')
        # The torchvision constructor has the same name as the model.
        model = getattr(tv, model_name)(weights=None)
        model.eval()

        name_map = name_map_builders[model_name]()
        layers_info = _get_conv_layers_via_hooks(model, name_map, input_size)

        # Keep only the first layer seen for each distinct shape/parameter set.
        for name, info in layers_info.items():
            key = _make_unique_key(info)
            if key not in seen_keys:
                seen_keys.add(key)
                unique_layers.append((name, info, model_name))

    print(f"Extracted {len(unique_layers)} unique conv layers from {', '.join(model_names)}")

    # Convert to the internal layer-dict format used by calculate_layer_config()
    layers = []
    for layer_id, (name, info, _source) in enumerate(unique_layers):
        _, in_c, in_h, in_w = info['input']
        _, out_c, out_h, out_w = info['output']
        _oc, in_channels, k_h, k_w = info['kernel']
        layers.append({
            'layer_id': layer_id,
            'name': name,
            'input': (in_w, in_h, in_c),
            'output': (out_w, out_h, out_c),
            'kernel': (k_w, k_h, in_channels, _oc),
            'stride': tuple(info['stride']),
            'padding': tuple(info['padding']),
            'dilation': tuple(info['dilation']),
        })
    return layers
# ---------------------------------------------------------------------------
# PTE-based loader (ExecuTorch .pte binary via exir source tree)
# ---------------------------------------------------------------------------

# Default paths relative to this script's location
# backends/cadence/vision/config_generator/ -> .parent x5 -> ext_test/executorch
# resolve() first: with a relative __file__ the chained .parent calls would
# collapse to '.' instead of walking up the tree.
_EXECUTORCH_SRC = str(Path(__file__).resolve().parent.parent.parent.parent.parent)  # ext_test/executorch
_EXECUTORCH_PARENT = str(Path(_EXECUTORCH_SRC).parent)  # ext_test
_FLATC_DEFAULT = str(Path(_EXECUTORCH_SRC) / "cmake-out/third-party/flatc_ep/bin/flatc")


def _bootstrap_executorch_imports(flatc_path=None):
    """
    Bootstrap executorch.exir from the local source tree without a pip install.

    Bypasses exir/__init__.py (which pulls in many optional deps) by pre-populating
    sys.modules with lightweight stub packages for 'executorch' and 'executorch.exir'.
    Only the _serialize sub-package is actually loaded.

    Also sets FLATC_EXECUTABLE so _flatbuffer.py can find the flatc binary.

    Args:
        flatc_path: Optional explicit path to the flatc binary.  When given and
                    it exists, it replaces any FLATC_EXECUTABLE already in the
                    environment.  When omitted, the cmake-out default is used,
                    but only if FLATC_EXECUTABLE is not already set.
    """
    import types

    # Add ext_test/ so `import executorch...` works, and ext_test/executorch/ so
    # internal sub-imports like `from executorch.exir._serialize...` resolve correctly.
    # Inserted in this order so the source dir ends up ahead of its parent.
    for entry in (_EXECUTORCH_PARENT, _EXECUTORCH_SRC):
        if entry not in sys.path:
            sys.path.insert(0, entry)

    # Stub 'executorch' and 'executorch.exir' so Python never runs their
    # __init__.py files (which have heavy, optional dependencies).
    stub_packages = (
        ('executorch', _EXECUTORCH_SRC),
        ('executorch.exir', os.path.join(_EXECUTORCH_SRC, 'exir')),
    )
    for pkg, pkg_dir in stub_packages:
        if pkg not in sys.modules:
            stub = types.ModuleType(pkg)
            stub.__path__ = [pkg_dir]
            stub.__package__ = pkg
            sys.modules[pkg] = stub

    # Tell _flatbuffer.py where to find the flatc binary.
    if flatc_path:
        # An explicit request must win over whatever the environment holds.
        if os.path.isfile(flatc_path):
            os.environ['FLATC_EXECUTABLE'] = str(flatc_path)
        else:
            print(f"WARNING: flatc binary not found at {flatc_path}; "
                  "FLATC_EXECUTABLE left unchanged")
    elif os.path.isfile(_FLATC_DEFAULT):
        os.environ.setdefault('FLATC_EXECUTABLE', _FLATC_DEFAULT)
def load_layers_from_pte(pte_file, flatc_path=None):
    """
    Extract unique conv2d layers directly from an ExecuTorch .pte binary.

    Counterpart of load_layers_from_model() that reads the serialised
    execution plan instead of running a live forward pass.  The _serialize
    sub-package is loaded from the local source tree, so a full executorch
    pip install is not needed.

    Only the first execution plan and its first chain are inspected.  A
    kernel call is kept when its operator name is one of the known conv ops
    and its first, second and last arguments are tensors.

    Args:
        pte_file: Path to the .pte file (str or Path).
        flatc_path: Optional path to the flatc binary. Defaults to the
                    cmake-out copy built alongside the source tree.

    Returns:
        list of layer dicts in the internal format expected by
        calculate_layer_config(), same as load_layers_from_model().
    """
    _bootstrap_executorch_imports(flatc_path)

    from executorch.exir._serialize._program import deserialize_pte_binary
    from executorch.exir.schema import KernelCall, Int, IntList, Tensor

    pte_path = Path(pte_file)
    print(f"Loading PTE: {pte_path} ...")

    with open(pte_path, 'rb') as stream:
        loaded = deserialize_pte_binary(stream.read())

    # Newer APIs return a PTEFile wrapper holding .program; older ones
    # returned the Program itself.
    program = getattr(loaded, 'program', loaded)

    plan = program.execution_plan[0]
    values = plan.values

    # --- helpers that dereference EValue indices in the values table -----
    def evalue_as(idx, expected_type):
        """Payload of values[idx] if it has the expected type, else None."""
        payload = values[idx].val
        return payload if isinstance(payload, expected_type) else None

    def int_at(idx):
        boxed = evalue_as(idx, Int)
        return None if boxed is None else boxed.int_val

    def int_list_at(idx):
        """IntList.items are EValue indices pointing to Int EValues."""
        boxed = evalue_as(idx, IntList)
        if boxed is None:
            return None
        return [int_at(item_idx) for item_idx in boxed.items]

    # cadence::quantized_conv2d_nchw arg order (from quantized_conv2d_nchw_out.cpp):
    #   [0] input   [1] weight   [2] bias
    #   [3] stride  [4] padding  [5] dilation
    #   [6] groups  [7] in_zero_point ... [-2/-1] out
    conv_op_names = {
        'cadence::quantized_conv2d_nchw',
        'aten::conv2d',
        'aten::convolution',
    }

    # signature -> info dict; dict order is discovery order, first one wins.
    first_seen = {}

    for instruction in plan.chains[0].instructions:
        call = instruction.instr_args
        if not isinstance(call, KernelCall):
            continue
        if plan.operators[call.op_index].name not in conv_op_names:
            continue

        call_args = call.args
        input_t = evalue_as(call_args[0], Tensor)
        weight_t = evalue_as(call_args[1], Tensor)
        output_t = evalue_as(call_args[-1], Tensor)  # last arg is always the output tensor
        if None in (input_t, weight_t, output_t):
            continue

        stride = int_list_at(call_args[3]) or [1, 1]
        padding = int_list_at(call_args[4]) or [0, 0]
        dilation = int_list_at(call_args[5]) or [1, 1]

        # shapes are NCHW
        _, in_c, in_h, in_w = input_t.sizes
        _, out_c, out_h, out_w = output_t.sizes
        k_out, k_in, k_h, k_w = weight_t.sizes

        info = {
            'input': (in_w, in_h, in_c),
            'output': (out_w, out_h, out_c),
            'kernel': (k_w, k_h, k_in, k_out),
            'stride': tuple(stride),
            'padding': tuple(padding),
            'dilation': tuple(dilation),
        }
        signature = tuple(
            info[field]
            for field in ('input', 'output', 'kernel', 'stride', 'padding', 'dilation')
        )
        first_seen.setdefault(signature, info)

    print(f"Extracted {len(first_seen)} unique conv layers from PTE")

    # Convert to the internal layer-dict format (same as load_layers_from_model)
    layers = []
    for layer_id, info in enumerate(first_seen.values()):
        _in_w, _in_h, in_c = info['input']
        _out_w, _out_h, out_c = info['output']
        k_w, k_h, _k_in, _k_out = info['kernel']
        # Derive a friendly name from the kernel shape
        friendly = f"conv_{k_h}x{k_w}_s{info['stride'][0]}_ic{in_c}_oc{out_c}"
        layers.append({'layer_id': layer_id, 'name': friendly, **info})
    return layers
# ---------------------------------------------------------------------------
# File-based loaders (CSV / JSON)
# ---------------------------------------------------------------------------

def load_layers_from_json(json_file):
    """Load layer configurations from a JSON file and return the parsed object."""
    with open(json_file, 'r') as f:
        return json.load(f)


def _parse_int_tuple(text):
    """Parse a comma-separated cell such as "1,3,64,64" or "2, 2" into a tuple of ints."""
    return tuple(int(part) for part in text.strip().split(','))


def _is_header_row(row):
    """
    Return True for a header line of the conv-list CSV.

    A row counts as a header only if one of its first two cells mentions
    'input' AND its second cell is not a list of integers.  The second test
    keeps data rows whose layer name merely contains "input".
    """
    if not any('input' in cell.lower() for cell in row[:2]):
        return False
    try:
        _parse_int_tuple(row[1])
    except (IndexError, ValueError):
        return True
    return False


def load_layers_from_csv(csv_file):
    """
    Load layer configurations from a ResNet CSV file (tab-delimited).

    Expected columns:
        layer_name, input, kernel, stride, padding, dilation, transposed,
        output_padding, groups, output
    Shape cells are comma-separated integers: input/output are N,C,H,W and
    kernel is out_channels,in_channels,kH,kW.  Empty rows and the header row
    are skipped.

    stride, padding and dilation are stored in the order they appear in the
    file.  dilation falls back to (1, 1) when the column is absent or empty.

    Args:
        csv_file: path of the tab-delimited file.

    Returns:
        list of layer dicts in the internal format expected by
        calculate_layer_config(); 'layer_id' counts the kept rows from 0.

    Raises:
        ValueError: if a shape cell is not a comma-separated integer list.
        IndexError: if a data row has fewer than 10 columns.
    """
    import csv

    layers = []
    # newline='' is what the csv module asks for when handing it a file object.
    with open(csv_file, 'r', newline='') as f:
        for row in csv.reader(f, delimiter='\t'):
            # Skip header or empty rows
            if not row or not row[0].strip() or _is_header_row(row):
                continue

            layer_name = row[0].strip()

            # Parse shapes from CSV
            input_shape = _parse_int_tuple(row[1])    # e.g., "1,3,64,64"
            kernel_shape = _parse_int_tuple(row[2])   # e.g., "64,3,7,7"
            stride = _parse_int_tuple(row[3])         # e.g., "2, 2"
            padding = _parse_int_tuple(row[4]) if len(row) > 4 else (0, 0)
            if len(row) > 5 and row[5].strip():
                dilation = _parse_int_tuple(row[5])   # e.g., "1, 1"
            else:
                dilation = (1, 1)
            output_shape = _parse_int_tuple(row[9])   # e.g., "1,64,32,32"

            # Convert to internal format
            _, in_c, in_h, in_w = input_shape
            _, out_c, out_h, out_w = output_shape
            out_channels, in_channels, k_h, k_w = kernel_shape

            layers.append({
                'layer_id': len(layers),
                'name': layer_name,
                'input': (in_w, in_h, in_c),
                'output': (out_w, out_h, out_c),
                'kernel': (k_w, k_h, in_channels, out_channels),
                'stride': stride,
                'padding': padding,
                'dilation': dilation,
            })

    return layers
def calculate_layer_config(layer, dram0_size, dram1_size):
    """
    Calculate the complete buffer configuration for a single layer.

    find_max_tile_config() is asked for a tiling that fits the two DRAM
    banks.  If it yields no buffer sizes (or a zero tile size / row count),
    a "cache mode" config describing the whole layer as one tile is returned
    and the kernel name gets the "_no_dma" suffix.  Otherwise the config is
    assembled from the returned buffer sizes plus calculate_buffer_placement()
    and the kernel name gets the "_dma" suffix.

    Args:
        layer: internal layer dict with 'layer_id', 'name', 'input' (w, h, c),
               'output' (w, h, c), 'kernel' (w, h, in_c, out_c), 'stride'
               and 'padding'.
        dram0_size: size handed to the tiling search for DRAM0.
        dram1_size: size handed to the tiling search for DRAM1.

    Returns: Dictionary with all runtime parameters
    """
    # Unpack layer parameters
    input_w, input_h, input_c = layer['input']
    output_w, output_h, output_c = layer['output']
    kernel_w, kernel_h, in_c, out_c = layer['kernel']
    # NOTE(review): the loaders in this file store stride/padding exactly as
    # read from torch / the PTE / the CSV, which looks like (H, W) order, while
    # they are unpacked here as (W, H).  Identical for symmetric values;
    # confirm before relying on asymmetric stride or padding.
    stride_w, stride_h = layer['stride']
    pad_w, pad_h = layer['padding']

    # Padding edges: (dim1_e1, dim1_e2, dim2_e1, dim2_e2, dim3_e1, dim3_e2)
    padding = (pad_w, pad_w, pad_h, pad_h, 0, 0)

    # Dummy conv_params (will be set per-model)
    conv_params = (stride_w, stride_h, 8, 4000, 11, 0, 1, kernel_h, kernel_w)

    # Kernel name from (kernel_h, kernel_w, stride_h); anything not in the
    # table falls back to a name built from kernel_w/kernel_h/stride_w.
    named_kernels = {
        (7, 7, 2): "7x7j2d1",
        (3, 3, 1): "3x3j1d1",
        (3, 3, 2): "3x3j2d1",
        (1, 1, 2): "1x1j2d1",
        (1, 1, 1): "1x1j1d1",
    }
    kernel_name = named_kernels.get(
        (kernel_h, kernel_w, stride_h),
        f"{kernel_w}x{kernel_h}j{stride_w}d1",
    )

    # Find optimal tiling configuration
    n_tile_size, output_rows, buffer_sizes = find_max_tile_config(
        input_whd=(input_w, input_h, input_c),
        output_whd=(output_w, output_h, output_c),
        kernel_whdn=(kernel_w, kernel_h, in_c, out_c),
        padding=padding,
        stride_xy=(stride_w, stride_h),
        kernel_name=kernel_name,
        data_type="S8S8",
        dram0_size=dram0_size,
        dram1_size=dram1_size,
        conv_params=conv_params
    )

    tiling_found = not (buffer_sizes is None or n_tile_size == 0 or output_rows == 0)

    if not tiling_found:
        print(f"WARNING: Could not find valid DMA configuration for layer {layer['layer_id']} - using cache mode (single tile)")

        # Pitches for cache mode: the input tile pitch includes the padding,
        # the source (DRAM) pitch does not.
        padded_row = input_w + 2 * pad_w
        padded_plane = padded_row * (input_h + 2 * pad_h)
        out_plane = output_w * output_h
        coeff_plane = kernel_w * kernel_h
        coeff_volume = coeff_plane * in_c

        # Cache-mode config: a single tile covering the entire layer
        return {
            'layer_id': layer['layer_id'],
            'layer_name': layer['name'],
            'kernel_name': kernel_name + "_no_dma",
            'src_dim1_size': input_w,
            'src_dim2_size': input_h,
            'src_dim3_size': input_c,
            'src_dim1_pitch': input_w,
            'src_dim2_pitch': input_w * input_h,
            'dst_dim1_size': output_w,
            'dst_dim2_size': output_h,
            'dst_dim3_size': output_c,
            'dst_dim1_pitch': output_w,
            'dst_dim2_pitch': output_w * output_h,
            'in_dim1_size': input_w,
            'in_dim1_pitch': padded_row,
            'in_dim2_size': input_h,
            'in_dim2_pitch': padded_plane,
            'in_dim1_edge1': pad_w,
            'in_dim1_edge2': pad_w,
            'in_dim2_edge1': pad_h,
            'in_dim2_edge2': pad_h,
            'in_dim3_edge1': 0,
            'in_dim3_edge2': 0,
            # Data offset is 0 for cache mode (no pre-allocated padding in buffer)
            'in_data_offset': 0,
            'in_rows_firstdma': input_h,
            'out_dim1_size': output_w,
            'out_dim1_pitch': output_w,
            'out_dim2_size': output_h,
            'out_dim2_pitch': out_plane,
            'out_dim3_size': output_c,
            'coeff_dim1_size': kernel_w,
            'coeff_dim2_size': kernel_h,
            'coeff_dim3_size': in_c,
            'coeff_dim4_size': output_c,
            'coeff_dim1_pitch': kernel_w,
            'coeff_dim2_pitch': coeff_plane,
            'coeff_dim3_pitch': coeff_volume,
            'bias_dim1_size': output_c,
            'bias_dim2_size': 1,
            'outscale_dim1_size': output_c,
            'outscale_dim2_size': 1,
            # Buffer sizes for the full, untiled layer
            'input_buffer_size': padded_plane * input_c,
            'coeff_buffer_size': coeff_volume * output_c,
            'output_buffer_size': out_plane * output_c,
            'bias_buffer_size': output_c * 4,      # S32
            'outscale_buffer_size': output_c * 2,  # U16
            'input_ping_dram': 0,
            'input_pong_dram': 0,
            'coeff_dram': 0,
            'output_ping_dram': 0,
            'output_pong_dram': 0,
            'bias_dram': 0,
            'outscale_dram': 0,
            'n_tile_size': output_c,
            'n_tiles': 1,
            'n_tile_size_last': output_c,
            'height_tiles': 1,
            'output_rows': output_h,
            'input_rows': input_h,
            'stride_x': stride_w,
            'stride_y': stride_h,
            'accum_shift': 8,
            'relu_max': 4000,
            'relu_min': 0,
            'output_shift': 11,
            'output_scale': 0,
            'dilation': 1,
            'kernel_w': kernel_w,
            'kernel_h': kernel_h,
            'padding': pad_w,
            'flags': 0,
            'input_zero_point': 0,
            # Unique config key: ic_ih_iw_oc_kh_kw_oh_ow_sy_sx_pad_dil
            'config_key': f"{input_c}_{input_h}_{input_w}_{output_c}_{kernel_h}_{kernel_w}_{output_h}_{output_w}_{stride_h}_{stride_w}_{pad_w}_1",
        }

    # Input rows needed to produce one H-tile of output rows
    input_rows = kernel_h + (output_rows - 1) * stride_h

    # Get buffer placement
    placement = calculate_buffer_placement(buffer_sizes, dram0_size, dram1_size)

    def from_buffers(*fields):
        """Config entries whose buffer_sizes key is the upper-cased field name."""
        return {field: buffer_sizes[field.upper()] for field in fields}

    # Build the complete config, group by group, in schema order
    # (convIdma_buffers.h).
    config = {
        'layer_id': layer['layer_id'],
        'layer_name': layer['name'],
        'kernel_name': kernel_name + "_dma",
    }

    # Source dimensions
    config.update(from_buffers(
        'src_dim1_size', 'src_dim2_size', 'src_dim3_size',
        'src_dim1_pitch', 'src_dim2_pitch'))

    # Destination dimensions
    config.update(from_buffers(
        'dst_dim1_size', 'dst_dim2_size', 'dst_dim1_pitch', 'dst_dim2_pitch'))
    config['dst_dim3_size'] = output_c

    # Input tile dimensions
    config.update(from_buffers(
        'in_dim1_size', 'in_dim1_pitch', 'in_dim2_size', 'in_dim2_pitch'))
    edge_fields = ('in_dim1_edge1', 'in_dim1_edge2', 'in_dim2_edge1',
                   'in_dim2_edge2', 'in_dim3_edge1', 'in_dim3_edge2')
    config.update(zip(edge_fields, padding))
    config.update(from_buffers('in_data_offset', 'in_rows_firstdma'))

    # Output tile dimensions
    config.update(from_buffers(
        'out_dim1_size', 'out_dim1_pitch', 'out_dim2_size', 'out_dim2_pitch',
        'out_dim3_size'))

    # Coefficient tile dimensions
    config.update(from_buffers(
        'coeff_dim1_size', 'coeff_dim2_size', 'coeff_dim3_size', 'coeff_dim4_size',
        'coeff_dim1_pitch', 'coeff_dim2_pitch', 'coeff_dim3_pitch'))

    # Bias and output scale dimensions
    config.update(from_buffers(
        'bias_dim1_size', 'bias_dim2_size',
        'outscale_dim1_size', 'outscale_dim2_size'))

    # Buffer sizes
    for field, source_key in (
        ('input_buffer_size', 'IN'),
        ('coeff_buffer_size', 'COEFF'),
        ('output_buffer_size', 'OUT'),
        ('bias_buffer_size', 'BIAS'),
        ('outscale_buffer_size', 'OUTSCALE'),
    ):
        config[field] = buffer_sizes[source_key]

    # Buffer DRAM placement (0 or 1), with the fallback used when absent
    for field, source_key, fallback in (
        ('input_ping_dram', 'IN1_dram', 0),
        ('input_pong_dram', 'IN2_dram', 1),
        ('coeff_dram', 'COEFF_dram', 0),
        ('output_ping_dram', 'OUT1_dram', 1),
        ('output_pong_dram', 'OUT2_dram', 1),
        ('bias_dram', 'BIAS_dram', 1),
        ('outscale_dram', 'OUTSCALE_dram', 1),
    ):
        config[field] = placement.get(source_key, fallback)

    # Tiling parameters ('HIGHT_TILES' is the key as spelled in buffer_sizes)
    config.update(from_buffers('n_tile_size', 'n_tiles', 'n_tile_size_last'))
    config['height_tiles'] = buffer_sizes['HIGHT_TILES']
    config['output_rows'] = output_rows
    config['input_rows'] = input_rows

    # Convolution parameters
    config['stride_x'] = buffer_sizes.get('STRIDEX', stride_w)
    config['stride_y'] = buffer_sizes.get('STRIDEY', stride_h)
    config['accum_shift'] = buffer_sizes.get('ACCUM_SHIFT', 8)
    config['relu_max'] = buffer_sizes.get('RELU_MAX', 4000)
    config['relu_min'] = buffer_sizes.get('RELU_MIN', 0)
    config['output_shift'] = buffer_sizes.get('OUTPUT_SHIFT', 11)
    config['output_scale'] = buffer_sizes.get('OUTPUT_SCALE', 0)
    config['dilation'] = buffer_sizes.get('DILATION', 1)
    config['kernel_w'] = kernel_w
    config['kernel_h'] = kernel_h
    config['padding'] = pad_w  # Symmetric padding
    config['flags'] = buffer_sizes.get('FLAGS', 0)
    config['input_zero_point'] = 0

    # Unique config key based on layer parameters
    # Format: ic_ih_iw_oc_kh_kw_oh_ow_sy_sx_pad_dil
    dilation = buffer_sizes.get('DILATION', 1)
    config['config_key'] = f"{in_c}_{input_h}_{input_w}_{out_c}_{kernel_h}_{kernel_w}_{output_h}_{output_w}_{stride_h}_{stride_w}_{pad_w}_{dilation}"

    return config
# Fixed text emitted before the lookup table.
_HEADER_PROLOGUE = """/*
 * conv_layer_configs.h
 *
 * Auto-generated convolution layer configurations
 * Generated from model layer extraction
 *
 * DO NOT EDIT MANUALLY - Regenerate with generate_layer_configs.py
 */

#ifndef CONV_LAYER_CONFIGS_H
#define CONV_LAYER_CONFIGS_H

#include <stdint.h>
#include <stddef.h>  // for NULL

/**
 * Runtime configuration for a single convolution layer
 * Contains all parameters needed to execute the layer
 * Matches convIdma_buffers.h schema
 */
typedef struct {
    // Layer identification
    int layer_id;
    const char* layer_name;
    const char* kernel_name;
    const char* config_key;  // Unique key: ic_ih_iw_oc_kh_kw_oh_ow_sy_sx_pad_dil

    // Source (DRAM) dimensions
    int src_dim1_size;       // Input width in DRAM
    int src_dim2_size;       // Input height in DRAM
    int src_dim3_size;       // Input channels in DRAM
    int src_dim1_pitch;      // DRAM row pitch
    int src_dim2_pitch;      // DRAM plane pitch

    // Destination (DRAM) dimensions
    int dst_dim1_size;       // Output width in DRAM
    int dst_dim2_size;       // Output height in DRAM
    int dst_dim3_size;       // Output channels in DRAM
    int dst_dim1_pitch;      // DRAM row pitch
    int dst_dim2_pitch;      // DRAM plane pitch

    // Input tile (local memory) dimensions
    int in_dim1_size;        // Tile width (with padding)
    int in_dim1_pitch;       // Tile row pitch
    int in_dim2_size;        // Tile height (rows per iteration)
    int in_dim2_pitch;       // Tile plane pitch
    int in_dim1_edge1;       // Left padding
    int in_dim1_edge2;       // Right padding
    int in_dim2_edge1;       // Top padding
    int in_dim2_edge2;       // Bottom padding
    int in_dim3_edge1;       // Channel padding (usually 0)
    int in_dim3_edge2;       // Channel padding (usually 0)
    int in_data_offset;      // Offset to actual data in buffer
    int in_rows_firstdma;    // Rows to transfer in first DMA

    // Output tile (local memory) dimensions
    int out_dim1_size;       // Output width
    int out_dim1_pitch;      // Output row pitch
    int out_dim2_size;       // Output rows per iteration
    int out_dim2_pitch;      // Output plane pitch
    int out_dim3_size;       // Output channels per N-tile

    // Coefficient tile dimensions
    int coeff_dim1_size;     // Kernel width
    int coeff_dim2_size;     // Kernel height
    int coeff_dim3_size;     // Input channels
    int coeff_dim4_size;     // Output channels (total)
    int coeff_dim1_pitch;    // Kernel row pitch
    int coeff_dim2_pitch;    // Kernel plane pitch (W*H)
    int coeff_dim3_pitch;    // Kernel 3D pitch (W*H*D)

    // Bias array dimensions
    int bias_dim1_size;      // Number of bias values
    int bias_dim2_size;      // Always 1

    // Output scale array dimensions
    int outscale_dim1_size;  // Number of scale values
    int outscale_dim2_size;  // Always 1

    // Buffer sizes (bytes)
    int input_buffer_size;
    int coeff_buffer_size;
    int output_buffer_size;
    int bias_buffer_size;
    int outscale_buffer_size;

    // Buffer DRAM placement (0 = DRAM0, 1 = DRAM1)
    int input_ping_dram;
    int input_pong_dram;
    int coeff_dram;
    int output_ping_dram;
    int output_pong_dram;
    int bias_dram;
    int outscale_dram;

    // Tiling parameters
    int n_tile_size;         // Output channels per N-tile
    int n_tiles;             // Total number of N-tiles
    int n_tile_size_last;    // Channels in last N-tile
    int height_tiles;        // Total number of H-tiles
    int output_rows;         // Output rows per H-tile
    int input_rows;          // Input rows needed per H-tile

    // Convolution parameters
    int kernel_w;
    int kernel_h;
    int stride_x;
    int stride_y;
    int padding;             // Symmetric padding
    int dilation;
    int accum_shift;         // Accumulator shift
    int relu_max;            // ReLU clamp maximum
    int relu_min;            // ReLU clamp minimum
    int output_shift;        // Output quantization shift
    int output_scale;        // Output scale factor
    int flags;               // Convolution flags
    int input_zero_point;    // Input zero point for padding fill

} conv_layer_config_t;

"""

# Fixed text emitted after the lookup table (accessor functions + guard end).
_HEADER_EPILOGUE = """
/**
 * Get total number of convolution layers
 */
static inline int get_num_conv_layers(void) {
    return NUM_CONV_LAYERS;
}

/**
 * Get configuration for a specific layer by layer_id
 *
 * @param layer_id Layer index (0 to NUM_CONV_LAYERS-1)
 * @return Pointer to configuration, or NULL if invalid layer_id
 */
static inline const conv_layer_config_t* get_layer_config(int layer_id) {
    if (layer_id < 0 || layer_id >= NUM_CONV_LAYERS) {
        return NULL;
    }
    return &CONV_LAYER_CONFIGS[layer_id];
}

/**
 * Get configuration for a layer by its parameters
 * Searches for a layer matching the given convolution parameters
 *
 * @param ic Input channels
 * @param ih Input height
 * @param iw Input width
 * @param oc Output channels
 * @param kh Kernel height
 * @param kw Kernel width
 * @param oh Output height
 * @param ow Output width
 * @param sy Stride Y
 * @param sx Stride X
 * @param pad Padding (symmetric)
 * @param dil Dilation
 * @return Pointer to configuration, or NULL if not found
 */
static inline const conv_layer_config_t* get_layer_config_by_params(
    int ic, int ih, int iw,
    int oc, int kh, int kw,
    int oh, int ow,
    int sy, int sx,
    int pad, int dil) {

    for (int i = 0; i < NUM_CONV_LAYERS; i++) {
        const conv_layer_config_t* cfg = &CONV_LAYER_CONFIGS[i];
        if (cfg->src_dim3_size == ic &&
            cfg->src_dim2_size == ih &&
            cfg->src_dim1_size == iw &&
            cfg->dst_dim3_size == oc &&
            cfg->coeff_dim2_size == kh &&
            cfg->coeff_dim1_size == kw &&
            cfg->dst_dim2_size == oh &&
            cfg->dst_dim1_size == ow &&
            cfg->stride_y == sy &&
            cfg->stride_x == sx &&
            cfg->padding == pad &&
            cfg->dilation == dil) {
            return cfg;
        }
    }
    return NULL;
}

/**
 * Get configuration for a layer by config key string
 * Key format: "ic_ih_iw_oc_kh_kw_oh_ow_sy_sx_pad_dil"
 *
 * @param config_key The unique configuration key string
 * @return Pointer to configuration, or NULL if not found
 */
static inline const conv_layer_config_t* get_layer_config_by_key(const char* config_key) {
    if (config_key == NULL) return NULL;

    for (int i = 0; i < NUM_CONV_LAYERS; i++) {
        const conv_layer_config_t* cfg = &CONV_LAYER_CONFIGS[i];
        // Simple string comparison
        const char* a = cfg->config_key;
        const char* b = config_key;
        int match = 1;
        while (*a && *b) {
            if (*a++ != *b++) { match = 0; break; }
        }
        if (match && *a == *b) return cfg;
    }
    return NULL;
}

#endif // CONV_LAYER_CONFIGS_H
"""

# Fields whose values are emitted as C string literals rather than numbers.
_STRING_FIELDS = frozenset({'layer_name', 'kernel_name', 'config_key'})

# One (comment template, field names) pair per group of initializers.  The
# groups and the fields inside them follow the declaration order of
# conv_layer_config_t.  Comment templates are filled from the config itself.
_CONFIG_FIELD_GROUPS = (
    (None,
     ('layer_id', 'layer_name', 'kernel_name', 'config_key')),
    ("Source (DRAM): {src_dim1_size}×{src_dim2_size}×{src_dim3_size}",
     ('src_dim1_size', 'src_dim2_size', 'src_dim3_size',
      'src_dim1_pitch', 'src_dim2_pitch')),
    ("Destination (DRAM): {dst_dim1_size}×{dst_dim2_size}×{dst_dim3_size}",
     ('dst_dim1_size', 'dst_dim2_size', 'dst_dim3_size',
      'dst_dim1_pitch', 'dst_dim2_pitch')),
    ("Input tile: {in_dim1_size}×{in_dim2_size} (edges: "
     "{in_dim1_edge1},{in_dim1_edge2},{in_dim2_edge1},{in_dim2_edge2})",
     ('in_dim1_size', 'in_dim1_pitch', 'in_dim2_size', 'in_dim2_pitch',
      'in_dim1_edge1', 'in_dim1_edge2', 'in_dim2_edge1', 'in_dim2_edge2',
      'in_dim3_edge1', 'in_dim3_edge2', 'in_data_offset', 'in_rows_firstdma')),
    ("Output tile: {out_dim1_size}×{out_dim2_size}×{out_dim3_size}",
     ('out_dim1_size', 'out_dim1_pitch', 'out_dim2_size', 'out_dim2_pitch',
      'out_dim3_size')),
    ("Coefficients: {coeff_dim1_size}×{coeff_dim2_size}×{coeff_dim3_size}×{coeff_dim4_size}",
     ('coeff_dim1_size', 'coeff_dim2_size', 'coeff_dim3_size', 'coeff_dim4_size',
      'coeff_dim1_pitch', 'coeff_dim2_pitch', 'coeff_dim3_pitch')),
    ("Bias/Outscale: {bias_dim1_size}",
     ('bias_dim1_size', 'bias_dim2_size',
      'outscale_dim1_size', 'outscale_dim2_size')),
    ("Buffer sizes (bytes)",
     ('input_buffer_size', 'coeff_buffer_size', 'output_buffer_size',
      'bias_buffer_size', 'outscale_buffer_size')),
    ("DRAM placement",
     ('input_ping_dram', 'input_pong_dram', 'coeff_dram', 'output_ping_dram',
      'output_pong_dram', 'bias_dram', 'outscale_dram')),
    ("Tiling: {n_tile_size} ch/tile × {n_tiles} tiles, "
     "{output_rows} rows/tile × {height_tiles} tiles",
     ('n_tile_size', 'n_tiles', 'n_tile_size_last', 'height_tiles',
      'output_rows', 'input_rows')),
    ("Conv params: {kernel_w}×{kernel_h}, stride {stride_x}×{stride_y}, pad {padding}",
     ('kernel_w', 'kernel_h', 'stride_x', 'stride_y', 'padding', 'dilation',
      'accum_shift', 'relu_max', 'relu_min', 'output_shift', 'output_scale',
      'flags', 'input_zero_point')),
)


def _c_string_literal(value):
    """Quote *value* as a C string literal, escaping backslashes and quotes."""
    escaped = str(value).replace('\\', '\\\\').replace('"', '\\"')
    return f'"{escaped}"'


def _render_config_entry(config):
    """Render one config dict as a designated-initializer entry of the table."""
    lines = ["    {\n"]
    last_group = len(_CONFIG_FIELD_GROUPS) - 1
    for index, (comment, fields) in enumerate(_CONFIG_FIELD_GROUPS):
        if comment:
            # format_map looks keys up through config[...], like the fields below.
            lines.append(f"        // {comment.format_map(config)}\n")
        for name in fields:
            value = config[name]
            if name in _STRING_FIELDS:
                value = _c_string_literal(value)
            lines.append(f"        .{name} = {value},\n")
        if index != last_group:
            lines.append("\n")
    lines.append("    },\n")
    return "".join(lines)


def generate_c_header(configs, output_file, dram0_size=32768, dram1_size=32768, no_dma_mode=False):
    """
    Generate a C header file with the layer-configuration lookup table.

    Output: conv_layer_configs.h with:
    - typedef struct conv_layer_config_t
    - #define NUM_CONV_LAYERS, IDMA_BUFFER_SIZE_DRAM0, IDMA_BUFFER_SIZE_DRAM1
    - const conv_layer_config_t CONV_LAYER_CONFIGS[] = {...};
    - int get_num_conv_layers();
    - const conv_layer_config_t* get_layer_config(int layer_id);
    - get_layer_config_by_params() / get_layer_config_by_key()

    The whole header is rendered in memory before the file is opened, so a
    config with a missing field does not leave a truncated header behind.

    Args:
        configs: sequence of config dicts as produced by calculate_layer_config().
        output_file: path of the header to write (UTF-8; the generated
                     comments contain the '×' character).
        dram0_size: value of IDMA_BUFFER_SIZE_DRAM0.
        dram1_size: value of IDMA_BUFFER_SIZE_DRAM1.
        no_dma_mode: when True both IDMA buffer-size macros are emitted as 0.

    Raises:
        KeyError: if a config lacks one of the struct fields.
    """
    dram0_macro = 0 if no_dma_mode else dram0_size
    dram1_macro = 0 if no_dma_mode else dram1_size

    chunks = [
        _HEADER_PROLOGUE,
        "// Total number of convolution layers\n",
        f"#define NUM_CONV_LAYERS {len(configs)}\n\n",
        # IDMA buffer size macros
        f"#define IDMA_BUFFER_SIZE_DRAM0 ({dram0_macro})  // {dram0_macro // 1024} KB for DRAM0\n",
        f"#define IDMA_BUFFER_SIZE_DRAM1 ({dram1_macro})  // {dram1_macro // 1024} KB for DRAM1\n\n",
        "// Layer configuration lookup table\n",
        "static const conv_layer_config_t CONV_LAYER_CONFIGS[] = {\n",
    ]
    chunks.extend(_render_config_entry(config) for config in configs)
    chunks.append("};\n\n")
    chunks.append(_HEADER_EPILOGUE)

    with open(output_file, 'w', encoding='utf-8') as f:
        f.write("".join(chunks))

    print(f"Generated {output_file}")
def main():
    """
    Command-line entry point: load layers, size their buffers, write the header.

    Layers come from exactly one source, checked in this order: --pte
    (one or more .pte files, de-duplicated across files), --model
    (torchvision model names), or a positional .json/.csv file.

    Returns:
        0 on success, 1 on a missing input, an unsupported file type, or
        when no configuration could be generated.
    """
    parser = argparse.ArgumentParser(
        description='Generate convolution layer configuration lookup table',
        epilog='One of --model, --pte, or a positional input_file (csv/json) is required.'
    )
    add = parser.add_argument
    add('input_file', nargs='?', default=None,
        help='Input file (layers_config.json or resnet_conv_list.csv). '
             'Not needed when using --model or --pte.')
    add('--model', '-m', default=None,
        help='Extract layers directly from PyTorch model(s). '
             'Comma or + separated list. '
             f'Supported: {", ".join(SUPPORTED_MODELS)}. '
             'Example: --model resnet18+resnet50')
    add('--pte', nargs='+', default=None,
        help='Extract layers from one or more ExecuTorch .pte binaries. '
             'Example: --pte resnet18.pte resnet50.pte')
    add('--flatc', default=None,
        help='Path to flatc binary (default: cmake-out/third-party/flatc_ep/bin/flatc)')
    add('--input-size', default='1,3,64,64',
        help='Model input tensor shape as N,C,H,W (default: 1,3,64,64)')
    add('--output', '-o', default='conv_layer_configs.h',
        help='Output C header file (default: conv_layer_configs.h)')
    add('--dram0', type=int, default=DRAM_SIZE_0,
        help=f'DRAM0 size in bytes (default: {DRAM_SIZE_0})')
    add('--dram1', type=int, default=DRAM_SIZE_1,
        help=f'DRAM1 size in bytes (default: {DRAM_SIZE_1})')
    add('--no-dma-mode', action='store_true', default=False,
        help='Force all configs to no-DMA mode: changes _dma suffix to _no_dma for every kernel name')

    args = parser.parse_args()

    # ---- Load layers: --model, --pte, or input_file ----
    if args.pte:
        layers = []
        seen_signatures = set()
        for pte_arg in args.pte:
            pte_path = Path(pte_arg)
            if not pte_path.exists():
                print(f"ERROR: PTE file not found: {pte_path}")
                return 1
            print(f"Extracting layers from PTE: {pte_path}")
            for candidate in load_layers_from_pte(pte_path, flatc_path=args.flatc):
                signature = (candidate['input'], candidate['output'], candidate['kernel'],
                             candidate['stride'], candidate['padding'], candidate['dilation'])
                if signature in seen_signatures:
                    print(f" [skip duplicate] {candidate['name']}")
                    continue
                seen_signatures.add(signature)
                # Renumber so ids stay contiguous across several PTE files.
                candidate['layer_id'] = len(layers)
                layers.append(candidate)
        print(f"Total unique layers from {len(args.pte)} PTE file(s): {len(layers)}")
    elif args.model:
        # Parse model names (accept comma or + as separator)
        pieces = args.model.replace('+', ',').split(',')
        model_names = [piece.strip() for piece in pieces if piece.strip()]
        input_size = tuple(int(dim) for dim in args.input_size.split(','))
        print(f"Extracting layers from model(s): {', '.join(model_names)} input_size={input_size}")
        layers = load_layers_from_model(model_names, input_size)
    elif args.input_file:
        input_path = Path(args.input_file)
        if not input_path.exists():
            print(f"ERROR: Input file not found: {input_path}")
            return 1
        print(f"Loading layers from {input_path}...")
        loaders_by_suffix = {
            '.json': load_layers_from_json,
            '.csv': load_layers_from_csv,
        }
        loader = loaders_by_suffix.get(input_path.suffix)
        if loader is None:
            print(f"ERROR: Unsupported file type: {input_path.suffix}")
            print("Supported: .json, .csv")
            return 1
        layers = loader(input_path)
    else:
        parser.error('One of --model, --pte, or a positional input_file is required.')
        return 1

    print(f"Loaded {len(layers)} layers")

    # Calculate configurations for all layers
    print(f"\nCalculating buffer configurations (DRAM0={args.dram0}, DRAM1={args.dram1})...")
    configs = []
    for layer in layers:
        print(f" Processing layer {layer['layer_id']}: {layer['name']}...")
        config = calculate_layer_config(layer, args.dram0, args.dram1)
        if not config:
            print(f" ✗ Failed to calculate configuration")
            continue
        configs.append(config)
        print(f" [OK] n_tile={config['n_tile_size']}, n_tiles={config['n_tiles']}, "
              f"output_rows={config['output_rows']}, height_tiles={config['height_tiles']}")

    if not configs:
        print("ERROR: No valid configurations generated")
        return 1

    print(f"\nGenerated {len(configs)} valid configurations")

    # Apply no-DMA mode: change _dma suffix to _no_dma for every kernel name
    if args.no_dma_mode:
        for config in configs:
            kernel = config['kernel_name']
            if kernel.endswith('_dma'):
                config['kernel_name'] = kernel[:-4] + '_no_dma'
        print(f"No-DMA mode enabled: all kernel names suffixed with _no_dma")

    # Generate C header
    generate_c_header(configs, args.output, args.dram0, args.dram1, args.no_dma_mode)

    print(f"\nSuccess! Generated {args.output}")
    print(f"Use in C code:")
    print(f" #include \"{args.output}\"")
    print(f" const conv_layer_config_t* config = get_layer_config(0);")
    print(f" conv_execute_layer(0, input, output, weights, bias, outscale);")

    return 0


if __name__ == '__main__':
    sys.exit(main())
bias_dim1_size; int bias_dim2_size; + int outscale_dim1_size; int outscale_dim2_size; + + int input_buffer_size; int coeff_buffer_size; int output_buffer_size; + int bias_buffer_size; int outscale_buffer_size; + + int input_ping_dram; int input_pong_dram; int coeff_dram; + int output_ping_dram; int output_pong_dram; + int bias_dram; int outscale_dram; + + int n_tile_size; int n_tiles; int n_tile_size_last; + int height_tiles; int output_rows; int input_rows; + + int kernel_w; int kernel_h; + int stride_x; int stride_y; + int padding; int dilation; + int accum_shift; int relu_max; int relu_min; + int output_shift; int output_scale; int flags; + int input_zero_point; +} conv_layer_config_t; + +#define NUM_CONV_LAYERS 29 + +static const conv_layer_config_t CONV_LAYER_CONFIGS[] = { + { + .layer_id = 0, + .layer_name = "conv_7x7_s2_ic3_oc64", + .kernel_name = "7x7j2d1_dma", + .config_key = "3_64_64_64_7_7_32_32_2_2_3_1", + .src_dim1_size = 64, + .src_dim2_size = 64, + .src_dim3_size = 3, + .src_dim1_pitch = 64, + .src_dim2_pitch = 4096, + .dst_dim1_size = 32, + .dst_dim2_size = 32, + .dst_dim3_size = 64, + .dst_dim1_pitch = 32, + .dst_dim2_pitch = 1024, + .in_dim1_size = 64, + .in_dim1_pitch = 70, + .in_dim2_size = 13, + .in_dim2_pitch = 910, + .in_dim1_edge1 = 3, + .in_dim1_edge2 = 3, + .in_dim2_edge1 = 3, + .in_dim2_edge2 = 3, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 213, + .in_rows_firstdma = 10, + .out_dim1_size = 32, + .out_dim1_pitch = 32, + .out_dim2_size = 4, + .out_dim2_pitch = 128, + .out_dim3_size = 64, + .coeff_dim1_size = 7, + .coeff_dim2_size = 7, + .coeff_dim3_size = 3, + .coeff_dim4_size = 64, + .coeff_dim1_pitch = 7, + .coeff_dim2_pitch = 49, + .coeff_dim3_pitch = 147, + .bias_dim1_size = 64, + .bias_dim2_size = 1, + .outscale_dim1_size = 64, + .outscale_dim2_size = 1, + .input_buffer_size = 2730, + .coeff_buffer_size = 9408, + .output_buffer_size = 8192, + .bias_buffer_size = 256, + .outscale_buffer_size = 128, + .input_ping_dram 
= 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 0, + .outscale_dram = 0, + .n_tile_size = 64, + .n_tiles = 1, + .n_tile_size_last = 64, + .height_tiles = 8, + .output_rows = 4, + .input_rows = 13, + .kernel_w = 7, + .kernel_h = 7, + .stride_x = 2, + .stride_y = 2, + .padding = 3, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 1, + .layer_name = "conv_3x3_s1_ic64_oc64", + .kernel_name = "3x3j1d1_dma", + .config_key = "64_16_16_64_3_3_16_16_1_1_1_1", + .src_dim1_size = 16, + .src_dim2_size = 16, + .src_dim3_size = 64, + .src_dim1_pitch = 16, + .src_dim2_pitch = 256, + .dst_dim1_size = 16, + .dst_dim2_size = 16, + .dst_dim3_size = 64, + .dst_dim1_pitch = 16, + .dst_dim2_pitch = 256, + .in_dim1_size = 16, + .in_dim1_pitch = 18, + .in_dim2_size = 4, + .in_dim2_pitch = 72, + .in_dim1_edge1 = 1, + .in_dim1_edge2 = 1, + .in_dim2_edge1 = 1, + .in_dim2_edge2 = 1, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 19, + .in_rows_firstdma = 3, + .out_dim1_size = 16, + .out_dim1_pitch = 16, + .out_dim2_size = 2, + .out_dim2_pitch = 32, + .out_dim3_size = 16, + .coeff_dim1_size = 3, + .coeff_dim2_size = 3, + .coeff_dim3_size = 64, + .coeff_dim4_size = 64, + .coeff_dim1_pitch = 3, + .coeff_dim2_pitch = 9, + .coeff_dim3_pitch = 576, + .bias_dim1_size = 64, + .bias_dim2_size = 1, + .outscale_dim1_size = 64, + .outscale_dim2_size = 1, + .input_buffer_size = 4608, + .coeff_buffer_size = 9216, + .output_buffer_size = 512, + .bias_buffer_size = 256, + .outscale_buffer_size = 128, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 1, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 16, + .n_tiles = 4, + .n_tile_size_last = 16, + .height_tiles = 8, + .output_rows = 2, + .input_rows = 4, + .kernel_w = 3, + 
.kernel_h = 3, + .stride_x = 1, + .stride_y = 1, + .padding = 1, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 2, + .layer_name = "conv_1x1_s2_ic64_oc128", + .kernel_name = "1x1j2d1_dma", + .config_key = "64_16_16_128_1_1_8_8_2_2_0_1", + .src_dim1_size = 16, + .src_dim2_size = 16, + .src_dim3_size = 64, + .src_dim1_pitch = 16, + .src_dim2_pitch = 256, + .dst_dim1_size = 8, + .dst_dim2_size = 8, + .dst_dim3_size = 128, + .dst_dim1_pitch = 8, + .dst_dim2_pitch = 64, + .in_dim1_size = 16, + .in_dim1_pitch = 16, + .in_dim2_size = 7, + .in_dim2_pitch = 112, + .in_dim1_edge1 = 0, + .in_dim1_edge2 = 0, + .in_dim2_edge1 = 0, + .in_dim2_edge2 = 0, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 7, + .out_dim1_size = 8, + .out_dim1_pitch = 8, + .out_dim2_size = 4, + .out_dim2_pitch = 32, + .out_dim3_size = 128, + .coeff_dim1_size = 1, + .coeff_dim2_size = 1, + .coeff_dim3_size = 64, + .coeff_dim4_size = 128, + .coeff_dim1_pitch = 1, + .coeff_dim2_pitch = 1, + .coeff_dim3_pitch = 64, + .bias_dim1_size = 128, + .bias_dim2_size = 1, + .outscale_dim1_size = 128, + .outscale_dim2_size = 1, + .input_buffer_size = 7168, + .coeff_buffer_size = 8192, + .output_buffer_size = 4096, + .bias_buffer_size = 512, + .outscale_buffer_size = 256, + .input_ping_dram = 0, + .input_pong_dram = 1, + .coeff_dram = 0, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 128, + .n_tiles = 1, + .n_tile_size_last = 128, + .height_tiles = 2, + .output_rows = 4, + .input_rows = 7, + .kernel_w = 1, + .kernel_h = 1, + .stride_x = 2, + .stride_y = 2, + .padding = 0, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 3, + .layer_name = 
"conv_3x3_s2_ic64_oc128", + .kernel_name = "3x3j2d1_dma", + .config_key = "64_16_16_128_3_3_8_8_2_2_1_1", + .src_dim1_size = 16, + .src_dim2_size = 16, + .src_dim3_size = 64, + .src_dim1_pitch = 16, + .src_dim2_pitch = 256, + .dst_dim1_size = 8, + .dst_dim2_size = 8, + .dst_dim3_size = 128, + .dst_dim1_pitch = 8, + .dst_dim2_pitch = 64, + .in_dim1_size = 16, + .in_dim1_pitch = 18, + .in_dim2_size = 5, + .in_dim2_pitch = 90, + .in_dim1_edge1 = 1, + .in_dim1_edge2 = 1, + .in_dim2_edge1 = 1, + .in_dim2_edge2 = 1, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 19, + .in_rows_firstdma = 4, + .out_dim1_size = 8, + .out_dim1_pitch = 8, + .out_dim2_size = 2, + .out_dim2_pitch = 16, + .out_dim3_size = 16, + .coeff_dim1_size = 3, + .coeff_dim2_size = 3, + .coeff_dim3_size = 64, + .coeff_dim4_size = 128, + .coeff_dim1_pitch = 3, + .coeff_dim2_pitch = 9, + .coeff_dim3_pitch = 576, + .bias_dim1_size = 128, + .bias_dim2_size = 1, + .outscale_dim1_size = 128, + .outscale_dim2_size = 1, + .input_buffer_size = 5760, + .coeff_buffer_size = 9216, + .output_buffer_size = 256, + .bias_buffer_size = 512, + .outscale_buffer_size = 256, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 1, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 16, + .n_tiles = 8, + .n_tile_size_last = 16, + .height_tiles = 4, + .output_rows = 2, + .input_rows = 5, + .kernel_w = 3, + .kernel_h = 3, + .stride_x = 2, + .stride_y = 2, + .padding = 1, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 4, + .layer_name = "conv_3x3_s1_ic128_oc128", + .kernel_name = "3x3j1d1_dma", + .config_key = "128_8_8_128_3_3_8_8_1_1_1_1", + .src_dim1_size = 8, + .src_dim2_size = 8, + .src_dim3_size = 128, + .src_dim1_pitch = 8, + .src_dim2_pitch = 64, + .dst_dim1_size = 8, + .dst_dim2_size = 8, + .dst_dim3_size = 128, 
+ .dst_dim1_pitch = 8, + .dst_dim2_pitch = 64, + .in_dim1_size = 8, + .in_dim1_pitch = 10, + .in_dim2_size = 4, + .in_dim2_pitch = 40, + .in_dim1_edge1 = 1, + .in_dim1_edge2 = 1, + .in_dim2_edge1 = 1, + .in_dim2_edge2 = 1, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 11, + .in_rows_firstdma = 3, + .out_dim1_size = 8, + .out_dim1_pitch = 8, + .out_dim2_size = 2, + .out_dim2_pitch = 16, + .out_dim3_size = 8, + .coeff_dim1_size = 3, + .coeff_dim2_size = 3, + .coeff_dim3_size = 128, + .coeff_dim4_size = 128, + .coeff_dim1_pitch = 3, + .coeff_dim2_pitch = 9, + .coeff_dim3_pitch = 1152, + .bias_dim1_size = 128, + .bias_dim2_size = 1, + .outscale_dim1_size = 128, + .outscale_dim2_size = 1, + .input_buffer_size = 5120, + .coeff_buffer_size = 9216, + .output_buffer_size = 128, + .bias_buffer_size = 512, + .outscale_buffer_size = 256, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 1, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 8, + .n_tiles = 16, + .n_tile_size_last = 8, + .height_tiles = 4, + .output_rows = 2, + .input_rows = 4, + .kernel_w = 3, + .kernel_h = 3, + .stride_x = 1, + .stride_y = 1, + .padding = 1, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 5, + .layer_name = "conv_1x1_s2_ic128_oc256", + .kernel_name = "1x1j2d1_dma", + .config_key = "128_8_8_256_1_1_4_4_2_2_0_1", + .src_dim1_size = 8, + .src_dim2_size = 8, + .src_dim3_size = 128, + .src_dim1_pitch = 8, + .src_dim2_pitch = 64, + .dst_dim1_size = 4, + .dst_dim2_size = 4, + .dst_dim3_size = 256, + .dst_dim1_pitch = 4, + .dst_dim2_pitch = 16, + .in_dim1_size = 8, + .in_dim1_pitch = 8, + .in_dim2_size = 3, + .in_dim2_pitch = 24, + .in_dim1_edge1 = 0, + .in_dim1_edge2 = 0, + .in_dim2_edge1 = 0, + .in_dim2_edge2 = 0, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + 
.in_rows_firstdma = 3, + .out_dim1_size = 4, + .out_dim1_pitch = 4, + .out_dim2_size = 2, + .out_dim2_pitch = 8, + .out_dim3_size = 64, + .coeff_dim1_size = 1, + .coeff_dim2_size = 1, + .coeff_dim3_size = 128, + .coeff_dim4_size = 256, + .coeff_dim1_pitch = 1, + .coeff_dim2_pitch = 1, + .coeff_dim3_pitch = 128, + .bias_dim1_size = 256, + .bias_dim2_size = 1, + .outscale_dim1_size = 256, + .outscale_dim2_size = 1, + .input_buffer_size = 3072, + .coeff_buffer_size = 8192, + .output_buffer_size = 512, + .bias_buffer_size = 1024, + .outscale_buffer_size = 512, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 64, + .n_tiles = 4, + .n_tile_size_last = 64, + .height_tiles = 2, + .output_rows = 2, + .input_rows = 3, + .kernel_w = 1, + .kernel_h = 1, + .stride_x = 2, + .stride_y = 2, + .padding = 0, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 6, + .layer_name = "conv_3x3_s2_ic128_oc256", + .kernel_name = "3x3j2d1_dma", + .config_key = "128_8_8_256_3_3_4_4_2_2_1_1", + .src_dim1_size = 8, + .src_dim2_size = 8, + .src_dim3_size = 128, + .src_dim1_pitch = 8, + .src_dim2_pitch = 64, + .dst_dim1_size = 4, + .dst_dim2_size = 4, + .dst_dim3_size = 256, + .dst_dim1_pitch = 4, + .dst_dim2_pitch = 16, + .in_dim1_size = 8, + .in_dim1_pitch = 10, + .in_dim2_size = 5, + .in_dim2_pitch = 50, + .in_dim1_edge1 = 1, + .in_dim1_edge2 = 1, + .in_dim2_edge1 = 1, + .in_dim2_edge2 = 1, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 11, + .in_rows_firstdma = 4, + .out_dim1_size = 4, + .out_dim1_pitch = 4, + .out_dim2_size = 2, + .out_dim2_pitch = 8, + .out_dim3_size = 8, + .coeff_dim1_size = 3, + .coeff_dim2_size = 3, + .coeff_dim3_size = 128, + .coeff_dim4_size = 256, + .coeff_dim1_pitch = 3, + .coeff_dim2_pitch = 9, + 
.coeff_dim3_pitch = 1152, + .bias_dim1_size = 256, + .bias_dim2_size = 1, + .outscale_dim1_size = 256, + .outscale_dim2_size = 1, + .input_buffer_size = 6400, + .coeff_buffer_size = 9216, + .output_buffer_size = 64, + .bias_buffer_size = 1024, + .outscale_buffer_size = 512, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 1, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 8, + .n_tiles = 32, + .n_tile_size_last = 8, + .height_tiles = 2, + .output_rows = 2, + .input_rows = 5, + .kernel_w = 3, + .kernel_h = 3, + .stride_x = 2, + .stride_y = 2, + .padding = 1, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 7, + .layer_name = "conv_3x3_s1_ic256_oc256", + .kernel_name = "3x3j1d1_dma", + .config_key = "256_4_4_256_3_3_4_4_1_1_1_1", + .src_dim1_size = 4, + .src_dim2_size = 4, + .src_dim3_size = 256, + .src_dim1_pitch = 4, + .src_dim2_pitch = 16, + .dst_dim1_size = 4, + .dst_dim2_size = 4, + .dst_dim3_size = 256, + .dst_dim1_pitch = 4, + .dst_dim2_pitch = 16, + .in_dim1_size = 4, + .in_dim1_pitch = 6, + .in_dim2_size = 4, + .in_dim2_pitch = 24, + .in_dim1_edge1 = 1, + .in_dim1_edge2 = 1, + .in_dim2_edge1 = 1, + .in_dim2_edge2 = 1, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 7, + .in_rows_firstdma = 3, + .out_dim1_size = 4, + .out_dim1_pitch = 4, + .out_dim2_size = 2, + .out_dim2_pitch = 8, + .out_dim3_size = 4, + .coeff_dim1_size = 3, + .coeff_dim2_size = 3, + .coeff_dim3_size = 256, + .coeff_dim4_size = 256, + .coeff_dim1_pitch = 3, + .coeff_dim2_pitch = 9, + .coeff_dim3_pitch = 2304, + .bias_dim1_size = 256, + .bias_dim2_size = 1, + .outscale_dim1_size = 256, + .outscale_dim2_size = 1, + .input_buffer_size = 6144, + .coeff_buffer_size = 9216, + .output_buffer_size = 32, + .bias_buffer_size = 1024, + .outscale_buffer_size = 512, + .input_ping_dram = 
0, + .input_pong_dram = 0, + .coeff_dram = 1, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 4, + .n_tiles = 64, + .n_tile_size_last = 4, + .height_tiles = 2, + .output_rows = 2, + .input_rows = 4, + .kernel_w = 3, + .kernel_h = 3, + .stride_x = 1, + .stride_y = 1, + .padding = 1, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 8, + .layer_name = "conv_1x1_s2_ic256_oc512", + .kernel_name = "1x1j2d1_dma", + .config_key = "256_4_4_512_1_1_2_2_2_2_0_1", + .src_dim1_size = 4, + .src_dim2_size = 4, + .src_dim3_size = 256, + .src_dim1_pitch = 4, + .src_dim2_pitch = 16, + .dst_dim1_size = 2, + .dst_dim2_size = 2, + .dst_dim3_size = 512, + .dst_dim1_pitch = 2, + .dst_dim2_pitch = 4, + .in_dim1_size = 4, + .in_dim1_pitch = 4, + .in_dim2_size = 3, + .in_dim2_pitch = 12, + .in_dim1_edge1 = 0, + .in_dim1_edge2 = 0, + .in_dim2_edge1 = 0, + .in_dim2_edge2 = 0, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 3, + .out_dim1_size = 2, + .out_dim1_pitch = 2, + .out_dim2_size = 2, + .out_dim2_pitch = 4, + .out_dim3_size = 32, + .coeff_dim1_size = 1, + .coeff_dim2_size = 1, + .coeff_dim3_size = 256, + .coeff_dim4_size = 512, + .coeff_dim1_pitch = 1, + .coeff_dim2_pitch = 1, + .coeff_dim3_pitch = 256, + .bias_dim1_size = 512, + .bias_dim2_size = 1, + .outscale_dim1_size = 512, + .outscale_dim2_size = 1, + .input_buffer_size = 3072, + .coeff_buffer_size = 8192, + .output_buffer_size = 128, + .bias_buffer_size = 2048, + .outscale_buffer_size = 1024, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 32, + .n_tiles = 16, + .n_tile_size_last = 32, + .height_tiles = 1, + .output_rows = 2, + .input_rows = 3, + .kernel_w = 1, + .kernel_h = 1, + 
.stride_x = 2, + .stride_y = 2, + .padding = 0, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 9, + .layer_name = "conv_3x3_s2_ic256_oc512", + .kernel_name = "3x3j2d1_dma", + .config_key = "256_4_4_512_3_3_2_2_2_2_1_1", + .src_dim1_size = 4, + .src_dim2_size = 4, + .src_dim3_size = 256, + .src_dim1_pitch = 4, + .src_dim2_pitch = 16, + .dst_dim1_size = 2, + .dst_dim2_size = 2, + .dst_dim3_size = 512, + .dst_dim1_pitch = 2, + .dst_dim2_pitch = 4, + .in_dim1_size = 4, + .in_dim1_pitch = 6, + .in_dim2_size = 5, + .in_dim2_pitch = 30, + .in_dim1_edge1 = 1, + .in_dim1_edge2 = 1, + .in_dim2_edge1 = 1, + .in_dim2_edge2 = 1, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 7, + .in_rows_firstdma = 4, + .out_dim1_size = 2, + .out_dim1_pitch = 2, + .out_dim2_size = 2, + .out_dim2_pitch = 4, + .out_dim3_size = 4, + .coeff_dim1_size = 3, + .coeff_dim2_size = 3, + .coeff_dim3_size = 256, + .coeff_dim4_size = 512, + .coeff_dim1_pitch = 3, + .coeff_dim2_pitch = 9, + .coeff_dim3_pitch = 2304, + .bias_dim1_size = 512, + .bias_dim2_size = 1, + .outscale_dim1_size = 512, + .outscale_dim2_size = 1, + .input_buffer_size = 7680, + .coeff_buffer_size = 9216, + .output_buffer_size = 16, + .bias_buffer_size = 2048, + .outscale_buffer_size = 1024, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 1, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 4, + .n_tiles = 128, + .n_tile_size_last = 4, + .height_tiles = 1, + .output_rows = 2, + .input_rows = 5, + .kernel_w = 3, + .kernel_h = 3, + .stride_x = 2, + .stride_y = 2, + .padding = 1, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 10, + .layer_name = "conv_3x3_s1_ic512_oc512", + .kernel_name = 
"3x3j1d1_dma", + .config_key = "512_2_2_512_3_3_2_2_1_1_1_1", + .src_dim1_size = 2, + .src_dim2_size = 2, + .src_dim3_size = 512, + .src_dim1_pitch = 2, + .src_dim2_pitch = 4, + .dst_dim1_size = 2, + .dst_dim2_size = 2, + .dst_dim3_size = 512, + .dst_dim1_pitch = 2, + .dst_dim2_pitch = 4, + .in_dim1_size = 2, + .in_dim1_pitch = 4, + .in_dim2_size = 4, + .in_dim2_pitch = 16, + .in_dim1_edge1 = 1, + .in_dim1_edge2 = 1, + .in_dim2_edge1 = 1, + .in_dim2_edge2 = 1, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 5, + .in_rows_firstdma = 3, + .out_dim1_size = 2, + .out_dim1_pitch = 2, + .out_dim2_size = 2, + .out_dim2_pitch = 4, + .out_dim3_size = 2, + .coeff_dim1_size = 3, + .coeff_dim2_size = 3, + .coeff_dim3_size = 512, + .coeff_dim4_size = 512, + .coeff_dim1_pitch = 3, + .coeff_dim2_pitch = 9, + .coeff_dim3_pitch = 4608, + .bias_dim1_size = 512, + .bias_dim2_size = 1, + .outscale_dim1_size = 512, + .outscale_dim2_size = 1, + .input_buffer_size = 8192, + .coeff_buffer_size = 9216, + .output_buffer_size = 8, + .bias_buffer_size = 2048, + .outscale_buffer_size = 1024, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 1, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 2, + .n_tiles = 256, + .n_tile_size_last = 2, + .height_tiles = 1, + .output_rows = 2, + .input_rows = 4, + .kernel_w = 3, + .kernel_h = 3, + .stride_x = 1, + .stride_y = 1, + .padding = 1, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 11, + .layer_name = "conv_1x1_s1_ic64_oc256", + .kernel_name = "1x1j1d1_dma", + .config_key = "64_16_16_256_1_1_16_16_1_1_0_1", + .src_dim1_size = 16, + .src_dim2_size = 16, + .src_dim3_size = 64, + .src_dim1_pitch = 16, + .src_dim2_pitch = 256, + .dst_dim1_size = 16, + .dst_dim2_size = 16, + .dst_dim3_size = 256, + .dst_dim1_pitch = 16, + .dst_dim2_pitch = 
256, + .in_dim1_size = 16, + .in_dim1_pitch = 16, + .in_dim2_size = 2, + .in_dim2_pitch = 32, + .in_dim1_edge1 = 0, + .in_dim1_edge2 = 0, + .in_dim2_edge1 = 0, + .in_dim2_edge2 = 0, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 2, + .out_dim1_size = 16, + .out_dim1_pitch = 16, + .out_dim2_size = 2, + .out_dim2_pitch = 32, + .out_dim3_size = 128, + .coeff_dim1_size = 1, + .coeff_dim2_size = 1, + .coeff_dim3_size = 64, + .coeff_dim4_size = 256, + .coeff_dim1_pitch = 1, + .coeff_dim2_pitch = 1, + .coeff_dim3_pitch = 64, + .bias_dim1_size = 256, + .bias_dim2_size = 1, + .outscale_dim1_size = 256, + .outscale_dim2_size = 1, + .input_buffer_size = 2048, + .coeff_buffer_size = 8192, + .output_buffer_size = 4096, + .bias_buffer_size = 1024, + .outscale_buffer_size = 512, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 128, + .n_tiles = 2, + .n_tile_size_last = 128, + .height_tiles = 8, + .output_rows = 2, + .input_rows = 2, + .kernel_w = 1, + .kernel_h = 1, + .stride_x = 1, + .stride_y = 1, + .padding = 0, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 12, + .layer_name = "conv_1x1_s1_ic64_oc64", + .kernel_name = "1x1j1d1_dma", + .config_key = "64_16_16_64_1_1_16_16_1_1_0_1", + .src_dim1_size = 16, + .src_dim2_size = 16, + .src_dim3_size = 64, + .src_dim1_pitch = 16, + .src_dim2_pitch = 256, + .dst_dim1_size = 16, + .dst_dim2_size = 16, + .dst_dim3_size = 64, + .dst_dim1_pitch = 16, + .dst_dim2_pitch = 256, + .in_dim1_size = 16, + .in_dim1_pitch = 16, + .in_dim2_size = 6, + .in_dim2_pitch = 96, + .in_dim1_edge1 = 0, + .in_dim1_edge2 = 0, + .in_dim2_edge1 = 0, + .in_dim2_edge2 = 0, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 6, + 
.out_dim1_size = 16, + .out_dim1_pitch = 16, + .out_dim2_size = 6, + .out_dim2_pitch = 96, + .out_dim3_size = 64, + .coeff_dim1_size = 1, + .coeff_dim2_size = 1, + .coeff_dim3_size = 64, + .coeff_dim4_size = 64, + .coeff_dim1_pitch = 1, + .coeff_dim2_pitch = 1, + .coeff_dim3_pitch = 64, + .bias_dim1_size = 64, + .bias_dim2_size = 1, + .outscale_dim1_size = 64, + .outscale_dim2_size = 1, + .input_buffer_size = 6144, + .coeff_buffer_size = 4096, + .output_buffer_size = 6144, + .bias_buffer_size = 256, + .outscale_buffer_size = 128, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 64, + .n_tiles = 1, + .n_tile_size_last = 64, + .height_tiles = 3, + .output_rows = 6, + .input_rows = 6, + .kernel_w = 1, + .kernel_h = 1, + .stride_x = 1, + .stride_y = 1, + .padding = 0, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 13, + .layer_name = "conv_1x1_s1_ic256_oc64", + .kernel_name = "1x1j1d1_dma", + .config_key = "256_16_16_64_1_1_16_16_1_1_0_1", + .src_dim1_size = 16, + .src_dim2_size = 16, + .src_dim3_size = 256, + .src_dim1_pitch = 16, + .src_dim2_pitch = 256, + .dst_dim1_size = 16, + .dst_dim2_size = 16, + .dst_dim3_size = 64, + .dst_dim1_pitch = 16, + .dst_dim2_pitch = 256, + .in_dim1_size = 16, + .in_dim1_pitch = 16, + .in_dim2_size = 2, + .in_dim2_pitch = 32, + .in_dim1_edge1 = 0, + .in_dim1_edge2 = 0, + .in_dim2_edge1 = 0, + .in_dim2_edge2 = 0, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 2, + .out_dim1_size = 16, + .out_dim1_pitch = 16, + .out_dim2_size = 2, + .out_dim2_pitch = 32, + .out_dim3_size = 32, + .coeff_dim1_size = 1, + .coeff_dim2_size = 1, + .coeff_dim3_size = 256, + .coeff_dim4_size = 64, + .coeff_dim1_pitch = 1, + .coeff_dim2_pitch = 1, + .coeff_dim3_pitch = 
256, + .bias_dim1_size = 64, + .bias_dim2_size = 1, + .outscale_dim1_size = 64, + .outscale_dim2_size = 1, + .input_buffer_size = 8192, + .coeff_buffer_size = 8192, + .output_buffer_size = 1024, + .bias_buffer_size = 256, + .outscale_buffer_size = 128, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 1, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 32, + .n_tiles = 2, + .n_tile_size_last = 32, + .height_tiles = 8, + .output_rows = 2, + .input_rows = 2, + .kernel_w = 1, + .kernel_h = 1, + .stride_x = 1, + .stride_y = 1, + .padding = 0, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 14, + .layer_name = "conv_1x1_s2_ic256_oc512", + .kernel_name = "1x1j2d1_dma", + .config_key = "256_16_16_512_1_1_8_8_2_2_0_1", + .src_dim1_size = 16, + .src_dim2_size = 16, + .src_dim3_size = 256, + .src_dim1_pitch = 16, + .src_dim2_pitch = 256, + .dst_dim1_size = 8, + .dst_dim2_size = 8, + .dst_dim3_size = 512, + .dst_dim1_pitch = 8, + .dst_dim2_pitch = 64, + .in_dim1_size = 16, + .in_dim1_pitch = 16, + .in_dim2_size = 3, + .in_dim2_pitch = 48, + .in_dim1_edge1 = 0, + .in_dim1_edge2 = 0, + .in_dim2_edge1 = 0, + .in_dim2_edge2 = 0, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 3, + .out_dim1_size = 8, + .out_dim1_pitch = 8, + .out_dim2_size = 2, + .out_dim2_pitch = 16, + .out_dim3_size = 16, + .coeff_dim1_size = 1, + .coeff_dim2_size = 1, + .coeff_dim3_size = 256, + .coeff_dim4_size = 512, + .coeff_dim1_pitch = 1, + .coeff_dim2_pitch = 1, + .coeff_dim3_pitch = 256, + .bias_dim1_size = 512, + .bias_dim2_size = 1, + .outscale_dim1_size = 512, + .outscale_dim2_size = 1, + .input_buffer_size = 12288, + .coeff_buffer_size = 4096, + .output_buffer_size = 256, + .bias_buffer_size = 2048, + .outscale_buffer_size = 1024, + .input_ping_dram = 0, + 
.input_pong_dram = 1, + .coeff_dram = 0, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 16, + .n_tiles = 32, + .n_tile_size_last = 16, + .height_tiles = 4, + .output_rows = 2, + .input_rows = 3, + .kernel_w = 1, + .kernel_h = 1, + .stride_x = 2, + .stride_y = 2, + .padding = 0, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 15, + .layer_name = "conv_1x1_s1_ic256_oc128", + .kernel_name = "1x1j1d1_dma", + .config_key = "256_16_16_128_1_1_16_16_1_1_0_1", + .src_dim1_size = 16, + .src_dim2_size = 16, + .src_dim3_size = 256, + .src_dim1_pitch = 16, + .src_dim2_pitch = 256, + .dst_dim1_size = 16, + .dst_dim2_size = 16, + .dst_dim3_size = 128, + .dst_dim1_pitch = 16, + .dst_dim2_pitch = 256, + .in_dim1_size = 16, + .in_dim1_pitch = 16, + .in_dim2_size = 2, + .in_dim2_pitch = 32, + .in_dim1_edge1 = 0, + .in_dim1_edge2 = 0, + .in_dim2_edge1 = 0, + .in_dim2_edge2 = 0, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 2, + .out_dim1_size = 16, + .out_dim1_pitch = 16, + .out_dim2_size = 2, + .out_dim2_pitch = 32, + .out_dim3_size = 32, + .coeff_dim1_size = 1, + .coeff_dim2_size = 1, + .coeff_dim3_size = 256, + .coeff_dim4_size = 128, + .coeff_dim1_pitch = 1, + .coeff_dim2_pitch = 1, + .coeff_dim3_pitch = 256, + .bias_dim1_size = 128, + .bias_dim2_size = 1, + .outscale_dim1_size = 128, + .outscale_dim2_size = 1, + .input_buffer_size = 8192, + .coeff_buffer_size = 8192, + .output_buffer_size = 1024, + .bias_buffer_size = 512, + .outscale_buffer_size = 256, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 1, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 32, + .n_tiles = 4, + .n_tile_size_last = 32, + .height_tiles = 8, + .output_rows = 2, + .input_rows = 2, + .kernel_w = 1, + 
.kernel_h = 1, + .stride_x = 1, + .stride_y = 1, + .padding = 0, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 16, + .layer_name = "conv_3x3_s2_ic128_oc128", + .kernel_name = "3x3j2d1_dma", + .config_key = "128_16_16_128_3_3_8_8_2_2_1_1", + .src_dim1_size = 16, + .src_dim2_size = 16, + .src_dim3_size = 128, + .src_dim1_pitch = 16, + .src_dim2_pitch = 256, + .dst_dim1_size = 8, + .dst_dim2_size = 8, + .dst_dim3_size = 128, + .dst_dim1_pitch = 8, + .dst_dim2_pitch = 64, + .in_dim1_size = 16, + .in_dim1_pitch = 18, + .in_dim2_size = 5, + .in_dim2_pitch = 90, + .in_dim1_edge1 = 1, + .in_dim1_edge2 = 1, + .in_dim2_edge1 = 1, + .in_dim2_edge2 = 1, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 19, + .in_rows_firstdma = 4, + .out_dim1_size = 8, + .out_dim1_pitch = 8, + .out_dim2_size = 2, + .out_dim2_pitch = 16, + .out_dim3_size = 4, + .coeff_dim1_size = 3, + .coeff_dim2_size = 3, + .coeff_dim3_size = 128, + .coeff_dim4_size = 128, + .coeff_dim1_pitch = 3, + .coeff_dim2_pitch = 9, + .coeff_dim3_pitch = 1152, + .bias_dim1_size = 128, + .bias_dim2_size = 1, + .outscale_dim1_size = 128, + .outscale_dim2_size = 1, + .input_buffer_size = 11520, + .coeff_buffer_size = 4608, + .output_buffer_size = 64, + .bias_buffer_size = 512, + .outscale_buffer_size = 256, + .input_ping_dram = 0, + .input_pong_dram = 1, + .coeff_dram = 0, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 4, + .n_tiles = 32, + .n_tile_size_last = 4, + .height_tiles = 4, + .output_rows = 2, + .input_rows = 5, + .kernel_w = 3, + .kernel_h = 3, + .stride_x = 2, + .stride_y = 2, + .padding = 1, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 17, + .layer_name = 
"conv_1x1_s1_ic128_oc512", + .kernel_name = "1x1j1d1_dma", + .config_key = "128_8_8_512_1_1_8_8_1_1_0_1", + .src_dim1_size = 8, + .src_dim2_size = 8, + .src_dim3_size = 128, + .src_dim1_pitch = 8, + .src_dim2_pitch = 64, + .dst_dim1_size = 8, + .dst_dim2_size = 8, + .dst_dim3_size = 512, + .dst_dim1_pitch = 8, + .dst_dim2_pitch = 64, + .in_dim1_size = 8, + .in_dim1_pitch = 8, + .in_dim2_size = 2, + .in_dim2_pitch = 16, + .in_dim1_edge1 = 0, + .in_dim1_edge2 = 0, + .in_dim2_edge1 = 0, + .in_dim2_edge2 = 0, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 2, + .out_dim1_size = 8, + .out_dim1_pitch = 8, + .out_dim2_size = 2, + .out_dim2_pitch = 16, + .out_dim3_size = 64, + .coeff_dim1_size = 1, + .coeff_dim2_size = 1, + .coeff_dim3_size = 128, + .coeff_dim4_size = 512, + .coeff_dim1_pitch = 1, + .coeff_dim2_pitch = 1, + .coeff_dim3_pitch = 128, + .bias_dim1_size = 512, + .bias_dim2_size = 1, + .outscale_dim1_size = 512, + .outscale_dim2_size = 1, + .input_buffer_size = 2048, + .coeff_buffer_size = 8192, + .output_buffer_size = 1024, + .bias_buffer_size = 2048, + .outscale_buffer_size = 1024, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 64, + .n_tiles = 8, + .n_tile_size_last = 64, + .height_tiles = 4, + .output_rows = 2, + .input_rows = 2, + .kernel_w = 1, + .kernel_h = 1, + .stride_x = 1, + .stride_y = 1, + .padding = 0, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 18, + .layer_name = "conv_1x1_s1_ic512_oc128", + .kernel_name = "1x1j1d1_dma", + .config_key = "512_8_8_128_1_1_8_8_1_1_0_1", + .src_dim1_size = 8, + .src_dim2_size = 8, + .src_dim3_size = 512, + .src_dim1_pitch = 8, + .src_dim2_pitch = 64, + .dst_dim1_size = 8, + .dst_dim2_size = 8, + .dst_dim3_size = 128, + 
.dst_dim1_pitch = 8, + .dst_dim2_pitch = 64, + .in_dim1_size = 8, + .in_dim1_pitch = 8, + .in_dim2_size = 2, + .in_dim2_pitch = 16, + .in_dim1_edge1 = 0, + .in_dim1_edge2 = 0, + .in_dim2_edge1 = 0, + .in_dim2_edge2 = 0, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 2, + .out_dim1_size = 8, + .out_dim1_pitch = 8, + .out_dim2_size = 2, + .out_dim2_pitch = 16, + .out_dim3_size = 16, + .coeff_dim1_size = 1, + .coeff_dim2_size = 1, + .coeff_dim3_size = 512, + .coeff_dim4_size = 128, + .coeff_dim1_pitch = 1, + .coeff_dim2_pitch = 1, + .coeff_dim3_pitch = 512, + .bias_dim1_size = 128, + .bias_dim2_size = 1, + .outscale_dim1_size = 128, + .outscale_dim2_size = 1, + .input_buffer_size = 8192, + .coeff_buffer_size = 8192, + .output_buffer_size = 256, + .bias_buffer_size = 512, + .outscale_buffer_size = 256, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 1, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 16, + .n_tiles = 8, + .n_tile_size_last = 16, + .height_tiles = 4, + .output_rows = 2, + .input_rows = 2, + .kernel_w = 1, + .kernel_h = 1, + .stride_x = 1, + .stride_y = 1, + .padding = 0, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 19, + .layer_name = "conv_1x1_s2_ic512_oc1024", + .kernel_name = "1x1j2d1_no_dma", + .config_key = "512_8_8_1024_1_1_4_4_2_2_0_1", + .src_dim1_size = 8, + .src_dim2_size = 8, + .src_dim3_size = 512, + .src_dim1_pitch = 8, + .src_dim2_pitch = 64, + .dst_dim1_size = 4, + .dst_dim2_size = 4, + .dst_dim3_size = 1024, + .dst_dim1_pitch = 4, + .dst_dim2_pitch = 16, + .in_dim1_size = 8, + .in_dim1_pitch = 8, + .in_dim2_size = 8, + .in_dim2_pitch = 64, + .in_dim1_edge1 = 0, + .in_dim1_edge2 = 0, + .in_dim2_edge1 = 0, + .in_dim2_edge2 = 0, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, 
+ .in_rows_firstdma = 8, + .out_dim1_size = 4, + .out_dim1_pitch = 4, + .out_dim2_size = 4, + .out_dim2_pitch = 16, + .out_dim3_size = 1024, + .coeff_dim1_size = 1, + .coeff_dim2_size = 1, + .coeff_dim3_size = 512, + .coeff_dim4_size = 1024, + .coeff_dim1_pitch = 1, + .coeff_dim2_pitch = 1, + .coeff_dim3_pitch = 512, + .bias_dim1_size = 1024, + .bias_dim2_size = 1, + .outscale_dim1_size = 1024, + .outscale_dim2_size = 1, + .input_buffer_size = 32768, + .coeff_buffer_size = 524288, + .output_buffer_size = 16384, + .bias_buffer_size = 4096, + .outscale_buffer_size = 2048, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 0, + .output_pong_dram = 0, + .bias_dram = 0, + .outscale_dram = 0, + .n_tile_size = 1024, + .n_tiles = 1, + .n_tile_size_last = 1024, + .height_tiles = 1, + .output_rows = 4, + .input_rows = 8, + .kernel_w = 1, + .kernel_h = 1, + .stride_x = 2, + .stride_y = 2, + .padding = 0, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 20, + .layer_name = "conv_1x1_s1_ic512_oc256", + .kernel_name = "1x1j1d1_dma", + .config_key = "512_8_8_256_1_1_8_8_1_1_0_1", + .src_dim1_size = 8, + .src_dim2_size = 8, + .src_dim3_size = 512, + .src_dim1_pitch = 8, + .src_dim2_pitch = 64, + .dst_dim1_size = 8, + .dst_dim2_size = 8, + .dst_dim3_size = 256, + .dst_dim1_pitch = 8, + .dst_dim2_pitch = 64, + .in_dim1_size = 8, + .in_dim1_pitch = 8, + .in_dim2_size = 2, + .in_dim2_pitch = 16, + .in_dim1_edge1 = 0, + .in_dim1_edge2 = 0, + .in_dim2_edge1 = 0, + .in_dim2_edge2 = 0, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 2, + .out_dim1_size = 8, + .out_dim1_pitch = 8, + .out_dim2_size = 2, + .out_dim2_pitch = 16, + .out_dim3_size = 16, + .coeff_dim1_size = 1, + .coeff_dim2_size = 1, + .coeff_dim3_size = 512, + .coeff_dim4_size = 256, + .coeff_dim1_pitch = 1, + 
.coeff_dim2_pitch = 1, + .coeff_dim3_pitch = 512, + .bias_dim1_size = 256, + .bias_dim2_size = 1, + .outscale_dim1_size = 256, + .outscale_dim2_size = 1, + .input_buffer_size = 8192, + .coeff_buffer_size = 8192, + .output_buffer_size = 256, + .bias_buffer_size = 1024, + .outscale_buffer_size = 512, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 1, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 16, + .n_tiles = 16, + .n_tile_size_last = 16, + .height_tiles = 4, + .output_rows = 2, + .input_rows = 2, + .kernel_w = 1, + .kernel_h = 1, + .stride_x = 1, + .stride_y = 1, + .padding = 0, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 21, + .layer_name = "conv_3x3_s2_ic256_oc256", + .kernel_name = "3x3j2d1_dma", + .config_key = "256_8_8_256_3_3_4_4_2_2_1_1", + .src_dim1_size = 8, + .src_dim2_size = 8, + .src_dim3_size = 256, + .src_dim1_pitch = 8, + .src_dim2_pitch = 64, + .dst_dim1_size = 4, + .dst_dim2_size = 4, + .dst_dim3_size = 256, + .dst_dim1_pitch = 4, + .dst_dim2_pitch = 16, + .in_dim1_size = 8, + .in_dim1_pitch = 10, + .in_dim2_size = 5, + .in_dim2_pitch = 50, + .in_dim1_edge1 = 1, + .in_dim1_edge2 = 1, + .in_dim2_edge1 = 1, + .in_dim2_edge2 = 1, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 11, + .in_rows_firstdma = 4, + .out_dim1_size = 4, + .out_dim1_pitch = 4, + .out_dim2_size = 2, + .out_dim2_pitch = 8, + .out_dim3_size = 1, + .coeff_dim1_size = 3, + .coeff_dim2_size = 3, + .coeff_dim3_size = 256, + .coeff_dim4_size = 256, + .coeff_dim1_pitch = 3, + .coeff_dim2_pitch = 9, + .coeff_dim3_pitch = 2304, + .bias_dim1_size = 256, + .bias_dim2_size = 1, + .outscale_dim1_size = 256, + .outscale_dim2_size = 1, + .input_buffer_size = 12800, + .coeff_buffer_size = 2304, + .output_buffer_size = 8, + .bias_buffer_size = 1024, + 
.outscale_buffer_size = 512, + .input_ping_dram = 0, + .input_pong_dram = 1, + .coeff_dram = 0, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 1, + .n_tiles = 256, + .n_tile_size_last = 1, + .height_tiles = 2, + .output_rows = 2, + .input_rows = 5, + .kernel_w = 3, + .kernel_h = 3, + .stride_x = 2, + .stride_y = 2, + .padding = 1, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 22, + .layer_name = "conv_1x1_s1_ic256_oc1024", + .kernel_name = "1x1j1d1_dma", + .config_key = "256_4_4_1024_1_1_4_4_1_1_0_1", + .src_dim1_size = 4, + .src_dim2_size = 4, + .src_dim3_size = 256, + .src_dim1_pitch = 4, + .src_dim2_pitch = 16, + .dst_dim1_size = 4, + .dst_dim2_size = 4, + .dst_dim3_size = 1024, + .dst_dim1_pitch = 4, + .dst_dim2_pitch = 16, + .in_dim1_size = 4, + .in_dim1_pitch = 4, + .in_dim2_size = 2, + .in_dim2_pitch = 8, + .in_dim1_edge1 = 0, + .in_dim1_edge2 = 0, + .in_dim2_edge1 = 0, + .in_dim2_edge2 = 0, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 2, + .out_dim1_size = 4, + .out_dim1_pitch = 4, + .out_dim2_size = 2, + .out_dim2_pitch = 8, + .out_dim3_size = 32, + .coeff_dim1_size = 1, + .coeff_dim2_size = 1, + .coeff_dim3_size = 256, + .coeff_dim4_size = 1024, + .coeff_dim1_pitch = 1, + .coeff_dim2_pitch = 1, + .coeff_dim3_pitch = 256, + .bias_dim1_size = 1024, + .bias_dim2_size = 1, + .outscale_dim1_size = 1024, + .outscale_dim2_size = 1, + .input_buffer_size = 2048, + .coeff_buffer_size = 8192, + .output_buffer_size = 256, + .bias_buffer_size = 4096, + .outscale_buffer_size = 2048, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 32, + .n_tiles = 32, + .n_tile_size_last = 32, + .height_tiles = 2, + .output_rows = 
2, + .input_rows = 2, + .kernel_w = 1, + .kernel_h = 1, + .stride_x = 1, + .stride_y = 1, + .padding = 0, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 23, + .layer_name = "conv_1x1_s1_ic1024_oc256", + .kernel_name = "1x1j1d1_dma", + .config_key = "1024_4_4_256_1_1_4_4_1_1_0_1", + .src_dim1_size = 4, + .src_dim2_size = 4, + .src_dim3_size = 1024, + .src_dim1_pitch = 4, + .src_dim2_pitch = 16, + .dst_dim1_size = 4, + .dst_dim2_size = 4, + .dst_dim3_size = 256, + .dst_dim1_pitch = 4, + .dst_dim2_pitch = 16, + .in_dim1_size = 4, + .in_dim1_pitch = 4, + .in_dim2_size = 2, + .in_dim2_pitch = 8, + .in_dim1_edge1 = 0, + .in_dim1_edge2 = 0, + .in_dim2_edge1 = 0, + .in_dim2_edge2 = 0, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 2, + .out_dim1_size = 4, + .out_dim1_pitch = 4, + .out_dim2_size = 2, + .out_dim2_pitch = 8, + .out_dim3_size = 8, + .coeff_dim1_size = 1, + .coeff_dim2_size = 1, + .coeff_dim3_size = 1024, + .coeff_dim4_size = 256, + .coeff_dim1_pitch = 1, + .coeff_dim2_pitch = 1, + .coeff_dim3_pitch = 1024, + .bias_dim1_size = 256, + .bias_dim2_size = 1, + .outscale_dim1_size = 256, + .outscale_dim2_size = 1, + .input_buffer_size = 8192, + .coeff_buffer_size = 8192, + .output_buffer_size = 64, + .bias_buffer_size = 1024, + .outscale_buffer_size = 512, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 1, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 8, + .n_tiles = 32, + .n_tile_size_last = 8, + .height_tiles = 2, + .output_rows = 2, + .input_rows = 2, + .kernel_w = 1, + .kernel_h = 1, + .stride_x = 1, + .stride_y = 1, + .padding = 0, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 24, + 
.layer_name = "conv_1x1_s2_ic1024_oc2048", + .kernel_name = "1x1j2d1_no_dma", + .config_key = "1024_4_4_2048_1_1_2_2_2_2_0_1", + .src_dim1_size = 4, + .src_dim2_size = 4, + .src_dim3_size = 1024, + .src_dim1_pitch = 4, + .src_dim2_pitch = 16, + .dst_dim1_size = 2, + .dst_dim2_size = 2, + .dst_dim3_size = 2048, + .dst_dim1_pitch = 2, + .dst_dim2_pitch = 4, + .in_dim1_size = 4, + .in_dim1_pitch = 4, + .in_dim2_size = 4, + .in_dim2_pitch = 16, + .in_dim1_edge1 = 0, + .in_dim1_edge2 = 0, + .in_dim2_edge1 = 0, + .in_dim2_edge2 = 0, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 4, + .out_dim1_size = 2, + .out_dim1_pitch = 2, + .out_dim2_size = 2, + .out_dim2_pitch = 4, + .out_dim3_size = 2048, + .coeff_dim1_size = 1, + .coeff_dim2_size = 1, + .coeff_dim3_size = 1024, + .coeff_dim4_size = 2048, + .coeff_dim1_pitch = 1, + .coeff_dim2_pitch = 1, + .coeff_dim3_pitch = 1024, + .bias_dim1_size = 2048, + .bias_dim2_size = 1, + .outscale_dim1_size = 2048, + .outscale_dim2_size = 1, + .input_buffer_size = 16384, + .coeff_buffer_size = 2097152, + .output_buffer_size = 8192, + .bias_buffer_size = 8192, + .outscale_buffer_size = 4096, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 0, + .output_pong_dram = 0, + .bias_dram = 0, + .outscale_dram = 0, + .n_tile_size = 2048, + .n_tiles = 1, + .n_tile_size_last = 2048, + .height_tiles = 1, + .output_rows = 2, + .input_rows = 4, + .kernel_w = 1, + .kernel_h = 1, + .stride_x = 2, + .stride_y = 2, + .padding = 0, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 25, + .layer_name = "conv_1x1_s1_ic1024_oc512", + .kernel_name = "1x1j1d1_dma", + .config_key = "1024_4_4_512_1_1_4_4_1_1_0_1", + .src_dim1_size = 4, + .src_dim2_size = 4, + .src_dim3_size = 1024, + .src_dim1_pitch = 4, + .src_dim2_pitch = 16, + .dst_dim1_size = 4, + 
.dst_dim2_size = 4, + .dst_dim3_size = 512, + .dst_dim1_pitch = 4, + .dst_dim2_pitch = 16, + .in_dim1_size = 4, + .in_dim1_pitch = 4, + .in_dim2_size = 2, + .in_dim2_pitch = 8, + .in_dim1_edge1 = 0, + .in_dim1_edge2 = 0, + .in_dim2_edge1 = 0, + .in_dim2_edge2 = 0, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 2, + .out_dim1_size = 4, + .out_dim1_pitch = 4, + .out_dim2_size = 2, + .out_dim2_pitch = 8, + .out_dim3_size = 8, + .coeff_dim1_size = 1, + .coeff_dim2_size = 1, + .coeff_dim3_size = 1024, + .coeff_dim4_size = 512, + .coeff_dim1_pitch = 1, + .coeff_dim2_pitch = 1, + .coeff_dim3_pitch = 1024, + .bias_dim1_size = 512, + .bias_dim2_size = 1, + .outscale_dim1_size = 512, + .outscale_dim2_size = 1, + .input_buffer_size = 8192, + .coeff_buffer_size = 8192, + .output_buffer_size = 64, + .bias_buffer_size = 2048, + .outscale_buffer_size = 1024, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 1, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 8, + .n_tiles = 64, + .n_tile_size_last = 8, + .height_tiles = 2, + .output_rows = 2, + .input_rows = 2, + .kernel_w = 1, + .kernel_h = 1, + .stride_x = 1, + .stride_y = 1, + .padding = 0, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 26, + .layer_name = "conv_3x3_s2_ic512_oc512", + .kernel_name = "3x3j2d1_no_dma", + .config_key = "512_4_4_512_3_3_2_2_2_2_1_1", + .src_dim1_size = 4, + .src_dim2_size = 4, + .src_dim3_size = 512, + .src_dim1_pitch = 4, + .src_dim2_pitch = 16, + .dst_dim1_size = 2, + .dst_dim2_size = 2, + .dst_dim3_size = 512, + .dst_dim1_pitch = 2, + .dst_dim2_pitch = 4, + .in_dim1_size = 4, + .in_dim1_pitch = 6, + .in_dim2_size = 4, + .in_dim2_pitch = 36, + .in_dim1_edge1 = 1, + .in_dim1_edge2 = 1, + .in_dim2_edge1 = 1, + .in_dim2_edge2 = 1, + .in_dim3_edge1 = 0, + 
.in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 4, + .out_dim1_size = 2, + .out_dim1_pitch = 2, + .out_dim2_size = 2, + .out_dim2_pitch = 4, + .out_dim3_size = 512, + .coeff_dim1_size = 3, + .coeff_dim2_size = 3, + .coeff_dim3_size = 512, + .coeff_dim4_size = 512, + .coeff_dim1_pitch = 3, + .coeff_dim2_pitch = 9, + .coeff_dim3_pitch = 4608, + .bias_dim1_size = 512, + .bias_dim2_size = 1, + .outscale_dim1_size = 512, + .outscale_dim2_size = 1, + .input_buffer_size = 18432, + .coeff_buffer_size = 2359296, + .output_buffer_size = 2048, + .bias_buffer_size = 2048, + .outscale_buffer_size = 1024, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 0, + .output_pong_dram = 0, + .bias_dram = 0, + .outscale_dram = 0, + .n_tile_size = 512, + .n_tiles = 1, + .n_tile_size_last = 512, + .height_tiles = 1, + .output_rows = 2, + .input_rows = 4, + .kernel_w = 3, + .kernel_h = 3, + .stride_x = 2, + .stride_y = 2, + .padding = 1, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 27, + .layer_name = "conv_1x1_s1_ic512_oc2048", + .kernel_name = "1x1j1d1_dma", + .config_key = "512_2_2_2048_1_1_2_2_1_1_0_1", + .src_dim1_size = 2, + .src_dim2_size = 2, + .src_dim3_size = 512, + .src_dim1_pitch = 2, + .src_dim2_pitch = 4, + .dst_dim1_size = 2, + .dst_dim2_size = 2, + .dst_dim3_size = 2048, + .dst_dim1_pitch = 2, + .dst_dim2_pitch = 4, + .in_dim1_size = 2, + .in_dim1_pitch = 2, + .in_dim2_size = 2, + .in_dim2_pitch = 4, + .in_dim1_edge1 = 0, + .in_dim1_edge2 = 0, + .in_dim2_edge1 = 0, + .in_dim2_edge2 = 0, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 2, + .out_dim1_size = 2, + .out_dim1_pitch = 2, + .out_dim2_size = 2, + .out_dim2_pitch = 4, + .out_dim3_size = 16, + .coeff_dim1_size = 1, + .coeff_dim2_size = 1, + .coeff_dim3_size = 512, + .coeff_dim4_size = 2048, + 
.coeff_dim1_pitch = 1, + .coeff_dim2_pitch = 1, + .coeff_dim3_pitch = 512, + .bias_dim1_size = 2048, + .bias_dim2_size = 1, + .outscale_dim1_size = 2048, + .outscale_dim2_size = 1, + .input_buffer_size = 2048, + .coeff_buffer_size = 8192, + .output_buffer_size = 64, + .bias_buffer_size = 8192, + .outscale_buffer_size = 4096, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 16, + .n_tiles = 128, + .n_tile_size_last = 16, + .height_tiles = 1, + .output_rows = 2, + .input_rows = 2, + .kernel_w = 1, + .kernel_h = 1, + .stride_x = 1, + .stride_y = 1, + .padding = 0, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 28, + .layer_name = "conv_1x1_s1_ic2048_oc512", + .kernel_name = "1x1j1d1_dma", + .config_key = "2048_2_2_512_1_1_2_2_1_1_0_1", + .src_dim1_size = 2, + .src_dim2_size = 2, + .src_dim3_size = 2048, + .src_dim1_pitch = 2, + .src_dim2_pitch = 4, + .dst_dim1_size = 2, + .dst_dim2_size = 2, + .dst_dim3_size = 512, + .dst_dim1_pitch = 2, + .dst_dim2_pitch = 4, + .in_dim1_size = 2, + .in_dim1_pitch = 2, + .in_dim2_size = 2, + .in_dim2_pitch = 4, + .in_dim1_edge1 = 0, + .in_dim1_edge2 = 0, + .in_dim2_edge1 = 0, + .in_dim2_edge2 = 0, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 2, + .out_dim1_size = 2, + .out_dim1_pitch = 2, + .out_dim2_size = 2, + .out_dim2_pitch = 4, + .out_dim3_size = 4, + .coeff_dim1_size = 1, + .coeff_dim2_size = 1, + .coeff_dim3_size = 2048, + .coeff_dim4_size = 512, + .coeff_dim1_pitch = 1, + .coeff_dim2_pitch = 1, + .coeff_dim3_pitch = 2048, + .bias_dim1_size = 512, + .bias_dim2_size = 1, + .outscale_dim1_size = 512, + .outscale_dim2_size = 1, + .input_buffer_size = 8192, + .coeff_buffer_size = 8192, + .output_buffer_size = 16, + .bias_buffer_size = 
2048, + .outscale_buffer_size = 1024, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 1, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 4, + .n_tiles = 128, + .n_tile_size_last = 4, + .height_tiles = 1, + .output_rows = 2, + .input_rows = 2, + .kernel_w = 1, + .kernel_h = 1, + .stride_x = 1, + .stride_y = 1, + .padding = 0, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, +}; + +static inline int get_num_conv_layers(void) { return NUM_CONV_LAYERS; } + +static inline const conv_layer_config_t* get_conv_config(int layer_id) { + if (layer_id < 0 || layer_id >= NUM_CONV_LAYERS) return NULL; + return &CONV_LAYER_CONFIGS[layer_id]; +} + +static inline const conv_layer_config_t* get_layer_config_by_params( + int ic, int ih, int iw, + int oc, int kh, int kw, + int oh, int ow, + int sy, int sx, + int pad, int dil) +{ + for (int i = 0; i < NUM_CONV_LAYERS; i++) { + const conv_layer_config_t* cfg = &CONV_LAYER_CONFIGS[i]; + if (cfg->src_dim3_size == ic && + cfg->src_dim2_size == ih && + cfg->src_dim1_size == iw && + cfg->dst_dim3_size == oc && + cfg->coeff_dim2_size == kh && + cfg->coeff_dim1_size == kw && + cfg->dst_dim2_size == oh && + cfg->dst_dim1_size == ow && + cfg->stride_y == sy && + cfg->stride_x == sx && + cfg->padding == pad && + cfg->dilation == dil) + return cfg; + } + return NULL; +} + +static inline const conv_layer_config_t* get_layer_config_by_key(const char* config_key) { + if (config_key == NULL) return NULL; + for (int i = 0; i < NUM_CONV_LAYERS; i++) { + const conv_layer_config_t* cfg = &CONV_LAYER_CONFIGS[i]; + if (cfg->config_key != NULL) { + const char* a = config_key; + const char* b = cfg->config_key; + while (*a && *b && *a == *b) { a++; b++; } + if (*a == '\0' && *b == '\0') return cfg; + } + } + return NULL; +} + +/* 
====================================================================== */ +/* MaxPool configurations */ +/* ====================================================================== */ + +typedef struct { + int layer_id; + const char* layer_name; + const char* config_key; + + int src_width; int src_height; int channels; + int dst_width; int dst_height; + + int src_row_pitch; int src_plane_pitch; + int dst_row_pitch; int dst_plane_pitch; + + int kernel_h; int kernel_w; + int stride_h; int stride_w; + int pad_h; int pad_w; + + int in_tile_w; int in_tile_rows; int in_tile_plane; + int in_data_offset; + int out_tile_w; int out_tile_rows; int out_tile_plane; + + int c_tile_size; int c_tiles; int c_tile_size_last; + int height_tiles; int output_rows; int input_rows; + + int input_buffer_size; int output_buffer_size; + + int input_ping_dram; int input_pong_dram; + int output_ping_dram; int output_pong_dram; +} maxpool_layer_config_t; + +#define NUM_MAXPOOL_LAYERS 1 + +static const maxpool_layer_config_t MAXPOOL_LAYER_CONFIGS[] = { + { + .layer_id = 0, + .layer_name = "maxpool_3x3s2_c64_32x32", + .config_key = "64_32_32_3_3_2_2_1_1", + .src_width = 32, + .src_height = 32, + .channels = 64, + .dst_width = 16, + .dst_height = 16, + .src_row_pitch = 32, + .src_plane_pitch = 1024, + .dst_row_pitch = 16, + .dst_plane_pitch = 256, + .kernel_h = 3, + .kernel_w = 3, + .stride_h = 2, + .stride_w = 2, + .pad_h = 1, + .pad_w = 1, + .in_tile_w = 34, + .in_tile_rows = 5, + .in_tile_plane = 170, + .in_data_offset = 35, + .out_tile_w = 16, + .out_tile_rows = 1, + .out_tile_plane = 16, + .c_tile_size = 22, + .c_tiles = 3, + .c_tile_size_last = 20, + .height_tiles = 16, + .output_rows = 1, + .input_rows = 3, + .input_buffer_size = 14960, + .output_buffer_size = 1408, + .input_ping_dram = 0, + .input_pong_dram = 1, + .output_ping_dram = 1, + .output_pong_dram = 0, + }, +}; + +static inline int get_num_maxpool_layers(void) { return NUM_MAXPOOL_LAYERS; } + +static inline const 
maxpool_layer_config_t* get_maxpool_config(int layer_id) { + if (layer_id < 0 || layer_id >= NUM_MAXPOOL_LAYERS) return NULL; + return &MAXPOOL_LAYER_CONFIGS[layer_id]; +} + +static inline const maxpool_layer_config_t* get_maxpool_config_by_params( + int channels, int src_height, int src_width, + int kernel_h, int kernel_w, + int stride_h, int stride_w, + int pad_h, int pad_w) +{ + for (int i = 0; i < NUM_MAXPOOL_LAYERS; i++) { + const maxpool_layer_config_t* c = &MAXPOOL_LAYER_CONFIGS[i]; + if (c->channels == channels && + c->src_height == src_height && + c->src_width == src_width && + c->kernel_h == kernel_h && + c->kernel_w == kernel_w && + c->stride_h == stride_h && + c->stride_w == stride_w && + c->pad_h == pad_h && + c->pad_w == pad_w) + return c; + } + return NULL; +} + +#endif /* LAYER_CONFIGS_H */ diff --git a/backends/cadence/vision/config_generator/layer_configs_24k.h b/backends/cadence/vision/config_generator/layer_configs_24k.h new file mode 100644 index 00000000000..0a78e6165f5 --- /dev/null +++ b/backends/cadence/vision/config_generator/layer_configs_24k.h @@ -0,0 +1,2403 @@ +/* + * layer_configs.h + * + * Auto-generated conv2d + maxpool layer configurations + * Generated from PTE extraction by generate_combined_configs.py + * + * DO NOT EDIT MANUALLY + */ + +#ifndef LAYER_CONFIGS_H +#define LAYER_CONFIGS_H + +#include +#include /* for NULL */ + +#define IDMA_BUFFER_SIZE_DRAM0 (24576) /* 24 KB */ +#define IDMA_BUFFER_SIZE_DRAM1 (24576) /* 24 KB */ + +/* ====================================================================== */ +/* Conv2d configurations */ +/* ====================================================================== */ + +typedef struct { + int layer_id; + const char* layer_name; + const char* kernel_name; + const char* config_key; + + int src_dim1_size; int src_dim2_size; int src_dim3_size; + int src_dim1_pitch; int src_dim2_pitch; + + int dst_dim1_size; int dst_dim2_size; int dst_dim3_size; + int dst_dim1_pitch; int dst_dim2_pitch; + + 
int in_dim1_size; int in_dim1_pitch; + int in_dim2_size; int in_dim2_pitch; + int in_dim1_edge1; int in_dim1_edge2; + int in_dim2_edge1; int in_dim2_edge2; + int in_dim3_edge1; int in_dim3_edge2; + int in_data_offset; int in_rows_firstdma; + + int out_dim1_size; int out_dim1_pitch; + int out_dim2_size; int out_dim2_pitch; + int out_dim3_size; + + int coeff_dim1_size; int coeff_dim2_size; + int coeff_dim3_size; int coeff_dim4_size; + int coeff_dim1_pitch; int coeff_dim2_pitch; int coeff_dim3_pitch; + + int bias_dim1_size; int bias_dim2_size; + int outscale_dim1_size; int outscale_dim2_size; + + int input_buffer_size; int coeff_buffer_size; int output_buffer_size; + int bias_buffer_size; int outscale_buffer_size; + + int input_ping_dram; int input_pong_dram; int coeff_dram; + int output_ping_dram; int output_pong_dram; + int bias_dram; int outscale_dram; + + int n_tile_size; int n_tiles; int n_tile_size_last; + int height_tiles; int output_rows; int input_rows; + + int kernel_w; int kernel_h; + int stride_x; int stride_y; + int padding; int dilation; + int accum_shift; int relu_max; int relu_min; + int output_shift; int output_scale; int flags; + int input_zero_point; +} conv_layer_config_t; + +#define NUM_CONV_LAYERS 29 + +static const conv_layer_config_t CONV_LAYER_CONFIGS[] = { + { + .layer_id = 0, + .layer_name = "conv_7x7_s2_ic3_oc64", + .kernel_name = "7x7j2d1_dma", + .config_key = "3_64_64_64_7_7_32_32_2_2_3_1", + .src_dim1_size = 64, + .src_dim2_size = 64, + .src_dim3_size = 3, + .src_dim1_pitch = 64, + .src_dim2_pitch = 4096, + .dst_dim1_size = 32, + .dst_dim2_size = 32, + .dst_dim3_size = 64, + .dst_dim1_pitch = 32, + .dst_dim2_pitch = 1024, + .in_dim1_size = 64, + .in_dim1_pitch = 70, + .in_dim2_size = 17, + .in_dim2_pitch = 1190, + .in_dim1_edge1 = 3, + .in_dim1_edge2 = 3, + .in_dim2_edge1 = 3, + .in_dim2_edge2 = 3, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 213, + .in_rows_firstdma = 14, + .out_dim1_size = 32, + .out_dim1_pitch = 32, 
+ .out_dim2_size = 6, + .out_dim2_pitch = 192, + .out_dim3_size = 64, + .coeff_dim1_size = 7, + .coeff_dim2_size = 7, + .coeff_dim3_size = 3, + .coeff_dim4_size = 64, + .coeff_dim1_pitch = 7, + .coeff_dim2_pitch = 49, + .coeff_dim3_pitch = 147, + .bias_dim1_size = 64, + .bias_dim2_size = 1, + .outscale_dim1_size = 64, + .outscale_dim2_size = 1, + .input_buffer_size = 3570, + .coeff_buffer_size = 9408, + .output_buffer_size = 12288, + .bias_buffer_size = 256, + .outscale_buffer_size = 128, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 0, + .outscale_dram = 0, + .n_tile_size = 64, + .n_tiles = 1, + .n_tile_size_last = 64, + .height_tiles = 6, + .output_rows = 6, + .input_rows = 17, + .kernel_w = 7, + .kernel_h = 7, + .stride_x = 2, + .stride_y = 2, + .padding = 3, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 1, + .layer_name = "conv_3x3_s1_ic64_oc64", + .kernel_name = "3x3j1d1_dma", + .config_key = "64_16_16_64_3_3_16_16_1_1_1_1", + .src_dim1_size = 16, + .src_dim2_size = 16, + .src_dim3_size = 64, + .src_dim1_pitch = 16, + .src_dim2_pitch = 256, + .dst_dim1_size = 16, + .dst_dim2_size = 16, + .dst_dim3_size = 64, + .dst_dim1_pitch = 16, + .dst_dim2_pitch = 256, + .in_dim1_size = 16, + .in_dim1_pitch = 18, + .in_dim2_size = 4, + .in_dim2_pitch = 72, + .in_dim1_edge1 = 1, + .in_dim1_edge2 = 1, + .in_dim2_edge1 = 1, + .in_dim2_edge2 = 1, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 19, + .in_rows_firstdma = 3, + .out_dim1_size = 16, + .out_dim1_pitch = 16, + .out_dim2_size = 2, + .out_dim2_pitch = 32, + .out_dim3_size = 32, + .coeff_dim1_size = 3, + .coeff_dim2_size = 3, + .coeff_dim3_size = 64, + .coeff_dim4_size = 64, + .coeff_dim1_pitch = 3, + .coeff_dim2_pitch = 9, + .coeff_dim3_pitch = 576, + .bias_dim1_size = 64, + 
.bias_dim2_size = 1, + .outscale_dim1_size = 64, + .outscale_dim2_size = 1, + .input_buffer_size = 4608, + .coeff_buffer_size = 18432, + .output_buffer_size = 1024, + .bias_buffer_size = 256, + .outscale_buffer_size = 128, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 1, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 32, + .n_tiles = 2, + .n_tile_size_last = 32, + .height_tiles = 8, + .output_rows = 2, + .input_rows = 4, + .kernel_w = 3, + .kernel_h = 3, + .stride_x = 1, + .stride_y = 1, + .padding = 1, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 2, + .layer_name = "conv_1x1_s2_ic64_oc128", + .kernel_name = "1x1j2d1_dma", + .config_key = "64_16_16_128_1_1_8_8_2_2_0_1", + .src_dim1_size = 16, + .src_dim2_size = 16, + .src_dim3_size = 64, + .src_dim1_pitch = 16, + .src_dim2_pitch = 256, + .dst_dim1_size = 8, + .dst_dim2_size = 8, + .dst_dim3_size = 128, + .dst_dim1_pitch = 8, + .dst_dim2_pitch = 64, + .in_dim1_size = 16, + .in_dim1_pitch = 16, + .in_dim2_size = 11, + .in_dim2_pitch = 176, + .in_dim1_edge1 = 0, + .in_dim1_edge2 = 0, + .in_dim2_edge1 = 0, + .in_dim2_edge2 = 0, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 11, + .out_dim1_size = 8, + .out_dim1_pitch = 8, + .out_dim2_size = 6, + .out_dim2_pitch = 48, + .out_dim3_size = 128, + .coeff_dim1_size = 1, + .coeff_dim2_size = 1, + .coeff_dim3_size = 64, + .coeff_dim4_size = 128, + .coeff_dim1_pitch = 1, + .coeff_dim2_pitch = 1, + .coeff_dim3_pitch = 64, + .bias_dim1_size = 128, + .bias_dim2_size = 1, + .outscale_dim1_size = 128, + .outscale_dim2_size = 1, + .input_buffer_size = 11264, + .coeff_buffer_size = 8192, + .output_buffer_size = 6144, + .bias_buffer_size = 512, + .outscale_buffer_size = 256, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram 
= 1, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 128, + .n_tiles = 1, + .n_tile_size_last = 128, + .height_tiles = 2, + .output_rows = 6, + .input_rows = 11, + .kernel_w = 1, + .kernel_h = 1, + .stride_x = 2, + .stride_y = 2, + .padding = 0, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 3, + .layer_name = "conv_3x3_s2_ic64_oc128", + .kernel_name = "3x3j2d1_dma", + .config_key = "64_16_16_128_3_3_8_8_2_2_1_1", + .src_dim1_size = 16, + .src_dim2_size = 16, + .src_dim3_size = 64, + .src_dim1_pitch = 16, + .src_dim2_pitch = 256, + .dst_dim1_size = 8, + .dst_dim2_size = 8, + .dst_dim3_size = 128, + .dst_dim1_pitch = 8, + .dst_dim2_pitch = 64, + .in_dim1_size = 16, + .in_dim1_pitch = 18, + .in_dim2_size = 5, + .in_dim2_pitch = 90, + .in_dim1_edge1 = 1, + .in_dim1_edge2 = 1, + .in_dim2_edge1 = 1, + .in_dim2_edge2 = 1, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 19, + .in_rows_firstdma = 4, + .out_dim1_size = 8, + .out_dim1_pitch = 8, + .out_dim2_size = 2, + .out_dim2_pitch = 16, + .out_dim3_size = 32, + .coeff_dim1_size = 3, + .coeff_dim2_size = 3, + .coeff_dim3_size = 64, + .coeff_dim4_size = 128, + .coeff_dim1_pitch = 3, + .coeff_dim2_pitch = 9, + .coeff_dim3_pitch = 576, + .bias_dim1_size = 128, + .bias_dim2_size = 1, + .outscale_dim1_size = 128, + .outscale_dim2_size = 1, + .input_buffer_size = 5760, + .coeff_buffer_size = 18432, + .output_buffer_size = 512, + .bias_buffer_size = 512, + .outscale_buffer_size = 256, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 1, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 32, + .n_tiles = 4, + .n_tile_size_last = 32, + .height_tiles = 4, + .output_rows = 2, + .input_rows = 5, + .kernel_w = 3, + .kernel_h = 3, + .stride_x = 2, + .stride_y = 2, 
+ .padding = 1, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 4, + .layer_name = "conv_3x3_s1_ic128_oc128", + .kernel_name = "3x3j1d1_dma", + .config_key = "128_8_8_128_3_3_8_8_1_1_1_1", + .src_dim1_size = 8, + .src_dim2_size = 8, + .src_dim3_size = 128, + .src_dim1_pitch = 8, + .src_dim2_pitch = 64, + .dst_dim1_size = 8, + .dst_dim2_size = 8, + .dst_dim3_size = 128, + .dst_dim1_pitch = 8, + .dst_dim2_pitch = 64, + .in_dim1_size = 8, + .in_dim1_pitch = 10, + .in_dim2_size = 4, + .in_dim2_pitch = 40, + .in_dim1_edge1 = 1, + .in_dim1_edge2 = 1, + .in_dim2_edge1 = 1, + .in_dim2_edge2 = 1, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 11, + .in_rows_firstdma = 3, + .out_dim1_size = 8, + .out_dim1_pitch = 8, + .out_dim2_size = 2, + .out_dim2_pitch = 16, + .out_dim3_size = 16, + .coeff_dim1_size = 3, + .coeff_dim2_size = 3, + .coeff_dim3_size = 128, + .coeff_dim4_size = 128, + .coeff_dim1_pitch = 3, + .coeff_dim2_pitch = 9, + .coeff_dim3_pitch = 1152, + .bias_dim1_size = 128, + .bias_dim2_size = 1, + .outscale_dim1_size = 128, + .outscale_dim2_size = 1, + .input_buffer_size = 5120, + .coeff_buffer_size = 18432, + .output_buffer_size = 256, + .bias_buffer_size = 512, + .outscale_buffer_size = 256, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 1, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 16, + .n_tiles = 8, + .n_tile_size_last = 16, + .height_tiles = 4, + .output_rows = 2, + .input_rows = 4, + .kernel_w = 3, + .kernel_h = 3, + .stride_x = 1, + .stride_y = 1, + .padding = 1, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 5, + .layer_name = "conv_1x1_s2_ic128_oc256", + .kernel_name = "1x1j2d1_dma", + .config_key = 
"128_8_8_256_1_1_4_4_2_2_0_1", + .src_dim1_size = 8, + .src_dim2_size = 8, + .src_dim3_size = 128, + .src_dim1_pitch = 8, + .src_dim2_pitch = 64, + .dst_dim1_size = 4, + .dst_dim2_size = 4, + .dst_dim3_size = 256, + .dst_dim1_pitch = 4, + .dst_dim2_pitch = 16, + .in_dim1_size = 8, + .in_dim1_pitch = 8, + .in_dim2_size = 3, + .in_dim2_pitch = 24, + .in_dim1_edge1 = 0, + .in_dim1_edge2 = 0, + .in_dim2_edge1 = 0, + .in_dim2_edge2 = 0, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 3, + .out_dim1_size = 4, + .out_dim1_pitch = 4, + .out_dim2_size = 2, + .out_dim2_pitch = 8, + .out_dim3_size = 128, + .coeff_dim1_size = 1, + .coeff_dim2_size = 1, + .coeff_dim3_size = 128, + .coeff_dim4_size = 256, + .coeff_dim1_pitch = 1, + .coeff_dim2_pitch = 1, + .coeff_dim3_pitch = 128, + .bias_dim1_size = 256, + .bias_dim2_size = 1, + .outscale_dim1_size = 256, + .outscale_dim2_size = 1, + .input_buffer_size = 3072, + .coeff_buffer_size = 16384, + .output_buffer_size = 1024, + .bias_buffer_size = 1024, + .outscale_buffer_size = 512, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 128, + .n_tiles = 2, + .n_tile_size_last = 128, + .height_tiles = 2, + .output_rows = 2, + .input_rows = 3, + .kernel_w = 1, + .kernel_h = 1, + .stride_x = 2, + .stride_y = 2, + .padding = 0, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 6, + .layer_name = "conv_3x3_s2_ic128_oc256", + .kernel_name = "3x3j2d1_dma", + .config_key = "128_8_8_256_3_3_4_4_2_2_1_1", + .src_dim1_size = 8, + .src_dim2_size = 8, + .src_dim3_size = 128, + .src_dim1_pitch = 8, + .src_dim2_pitch = 64, + .dst_dim1_size = 4, + .dst_dim2_size = 4, + .dst_dim3_size = 256, + .dst_dim1_pitch = 4, + .dst_dim2_pitch = 16, + .in_dim1_size = 8, + 
.in_dim1_pitch = 10, + .in_dim2_size = 5, + .in_dim2_pitch = 50, + .in_dim1_edge1 = 1, + .in_dim1_edge2 = 1, + .in_dim2_edge1 = 1, + .in_dim2_edge2 = 1, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 11, + .in_rows_firstdma = 4, + .out_dim1_size = 4, + .out_dim1_pitch = 4, + .out_dim2_size = 2, + .out_dim2_pitch = 8, + .out_dim3_size = 16, + .coeff_dim1_size = 3, + .coeff_dim2_size = 3, + .coeff_dim3_size = 128, + .coeff_dim4_size = 256, + .coeff_dim1_pitch = 3, + .coeff_dim2_pitch = 9, + .coeff_dim3_pitch = 1152, + .bias_dim1_size = 256, + .bias_dim2_size = 1, + .outscale_dim1_size = 256, + .outscale_dim2_size = 1, + .input_buffer_size = 6400, + .coeff_buffer_size = 18432, + .output_buffer_size = 128, + .bias_buffer_size = 1024, + .outscale_buffer_size = 512, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 1, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 16, + .n_tiles = 16, + .n_tile_size_last = 16, + .height_tiles = 2, + .output_rows = 2, + .input_rows = 5, + .kernel_w = 3, + .kernel_h = 3, + .stride_x = 2, + .stride_y = 2, + .padding = 1, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 7, + .layer_name = "conv_3x3_s1_ic256_oc256", + .kernel_name = "3x3j1d1_dma", + .config_key = "256_4_4_256_3_3_4_4_1_1_1_1", + .src_dim1_size = 4, + .src_dim2_size = 4, + .src_dim3_size = 256, + .src_dim1_pitch = 4, + .src_dim2_pitch = 16, + .dst_dim1_size = 4, + .dst_dim2_size = 4, + .dst_dim3_size = 256, + .dst_dim1_pitch = 4, + .dst_dim2_pitch = 16, + .in_dim1_size = 4, + .in_dim1_pitch = 6, + .in_dim2_size = 4, + .in_dim2_pitch = 24, + .in_dim1_edge1 = 1, + .in_dim1_edge2 = 1, + .in_dim2_edge1 = 1, + .in_dim2_edge2 = 1, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 7, + .in_rows_firstdma = 3, + .out_dim1_size = 4, + .out_dim1_pitch = 4, 
+ .out_dim2_size = 2, + .out_dim2_pitch = 8, + .out_dim3_size = 8, + .coeff_dim1_size = 3, + .coeff_dim2_size = 3, + .coeff_dim3_size = 256, + .coeff_dim4_size = 256, + .coeff_dim1_pitch = 3, + .coeff_dim2_pitch = 9, + .coeff_dim3_pitch = 2304, + .bias_dim1_size = 256, + .bias_dim2_size = 1, + .outscale_dim1_size = 256, + .outscale_dim2_size = 1, + .input_buffer_size = 6144, + .coeff_buffer_size = 18432, + .output_buffer_size = 64, + .bias_buffer_size = 1024, + .outscale_buffer_size = 512, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 1, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 8, + .n_tiles = 32, + .n_tile_size_last = 8, + .height_tiles = 2, + .output_rows = 2, + .input_rows = 4, + .kernel_w = 3, + .kernel_h = 3, + .stride_x = 1, + .stride_y = 1, + .padding = 1, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 8, + .layer_name = "conv_1x1_s2_ic256_oc512", + .kernel_name = "1x1j2d1_dma", + .config_key = "256_4_4_512_1_1_2_2_2_2_0_1", + .src_dim1_size = 4, + .src_dim2_size = 4, + .src_dim3_size = 256, + .src_dim1_pitch = 4, + .src_dim2_pitch = 16, + .dst_dim1_size = 2, + .dst_dim2_size = 2, + .dst_dim3_size = 512, + .dst_dim1_pitch = 2, + .dst_dim2_pitch = 4, + .in_dim1_size = 4, + .in_dim1_pitch = 4, + .in_dim2_size = 3, + .in_dim2_pitch = 12, + .in_dim1_edge1 = 0, + .in_dim1_edge2 = 0, + .in_dim2_edge1 = 0, + .in_dim2_edge2 = 0, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 3, + .out_dim1_size = 2, + .out_dim1_pitch = 2, + .out_dim2_size = 2, + .out_dim2_pitch = 4, + .out_dim3_size = 64, + .coeff_dim1_size = 1, + .coeff_dim2_size = 1, + .coeff_dim3_size = 256, + .coeff_dim4_size = 512, + .coeff_dim1_pitch = 1, + .coeff_dim2_pitch = 1, + .coeff_dim3_pitch = 256, + .bias_dim1_size = 512, + .bias_dim2_size = 1, + 
.outscale_dim1_size = 512, + .outscale_dim2_size = 1, + .input_buffer_size = 3072, + .coeff_buffer_size = 16384, + .output_buffer_size = 256, + .bias_buffer_size = 2048, + .outscale_buffer_size = 1024, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 64, + .n_tiles = 8, + .n_tile_size_last = 64, + .height_tiles = 1, + .output_rows = 2, + .input_rows = 3, + .kernel_w = 1, + .kernel_h = 1, + .stride_x = 2, + .stride_y = 2, + .padding = 0, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 9, + .layer_name = "conv_3x3_s2_ic256_oc512", + .kernel_name = "3x3j2d1_dma", + .config_key = "256_4_4_512_3_3_2_2_2_2_1_1", + .src_dim1_size = 4, + .src_dim2_size = 4, + .src_dim3_size = 256, + .src_dim1_pitch = 4, + .src_dim2_pitch = 16, + .dst_dim1_size = 2, + .dst_dim2_size = 2, + .dst_dim3_size = 512, + .dst_dim1_pitch = 2, + .dst_dim2_pitch = 4, + .in_dim1_size = 4, + .in_dim1_pitch = 6, + .in_dim2_size = 5, + .in_dim2_pitch = 30, + .in_dim1_edge1 = 1, + .in_dim1_edge2 = 1, + .in_dim2_edge1 = 1, + .in_dim2_edge2 = 1, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 7, + .in_rows_firstdma = 4, + .out_dim1_size = 2, + .out_dim1_pitch = 2, + .out_dim2_size = 2, + .out_dim2_pitch = 4, + .out_dim3_size = 8, + .coeff_dim1_size = 3, + .coeff_dim2_size = 3, + .coeff_dim3_size = 256, + .coeff_dim4_size = 512, + .coeff_dim1_pitch = 3, + .coeff_dim2_pitch = 9, + .coeff_dim3_pitch = 2304, + .bias_dim1_size = 512, + .bias_dim2_size = 1, + .outscale_dim1_size = 512, + .outscale_dim2_size = 1, + .input_buffer_size = 7680, + .coeff_buffer_size = 18432, + .output_buffer_size = 32, + .bias_buffer_size = 2048, + .outscale_buffer_size = 1024, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 1, + .output_ping_dram = 1, 
+ .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 8, + .n_tiles = 64, + .n_tile_size_last = 8, + .height_tiles = 1, + .output_rows = 2, + .input_rows = 5, + .kernel_w = 3, + .kernel_h = 3, + .stride_x = 2, + .stride_y = 2, + .padding = 1, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 10, + .layer_name = "conv_3x3_s1_ic512_oc512", + .kernel_name = "3x3j1d1_dma", + .config_key = "512_2_2_512_3_3_2_2_1_1_1_1", + .src_dim1_size = 2, + .src_dim2_size = 2, + .src_dim3_size = 512, + .src_dim1_pitch = 2, + .src_dim2_pitch = 4, + .dst_dim1_size = 2, + .dst_dim2_size = 2, + .dst_dim3_size = 512, + .dst_dim1_pitch = 2, + .dst_dim2_pitch = 4, + .in_dim1_size = 2, + .in_dim1_pitch = 4, + .in_dim2_size = 4, + .in_dim2_pitch = 16, + .in_dim1_edge1 = 1, + .in_dim1_edge2 = 1, + .in_dim2_edge1 = 1, + .in_dim2_edge2 = 1, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 5, + .in_rows_firstdma = 3, + .out_dim1_size = 2, + .out_dim1_pitch = 2, + .out_dim2_size = 2, + .out_dim2_pitch = 4, + .out_dim3_size = 4, + .coeff_dim1_size = 3, + .coeff_dim2_size = 3, + .coeff_dim3_size = 512, + .coeff_dim4_size = 512, + .coeff_dim1_pitch = 3, + .coeff_dim2_pitch = 9, + .coeff_dim3_pitch = 4608, + .bias_dim1_size = 512, + .bias_dim2_size = 1, + .outscale_dim1_size = 512, + .outscale_dim2_size = 1, + .input_buffer_size = 8192, + .coeff_buffer_size = 18432, + .output_buffer_size = 16, + .bias_buffer_size = 2048, + .outscale_buffer_size = 1024, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 1, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 4, + .n_tiles = 128, + .n_tile_size_last = 4, + .height_tiles = 1, + .output_rows = 2, + .input_rows = 4, + .kernel_w = 3, + .kernel_h = 3, + .stride_x = 1, + .stride_y = 1, + .padding = 1, + .dilation = 1, + 
.accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 11, + .layer_name = "conv_1x1_s1_ic64_oc256", + .kernel_name = "1x1j1d1_dma", + .config_key = "64_16_16_256_1_1_16_16_1_1_0_1", + .src_dim1_size = 16, + .src_dim2_size = 16, + .src_dim3_size = 64, + .src_dim1_pitch = 16, + .src_dim2_pitch = 256, + .dst_dim1_size = 16, + .dst_dim2_size = 16, + .dst_dim3_size = 256, + .dst_dim1_pitch = 16, + .dst_dim2_pitch = 256, + .in_dim1_size = 16, + .in_dim1_pitch = 16, + .in_dim2_size = 3, + .in_dim2_pitch = 48, + .in_dim1_edge1 = 0, + .in_dim1_edge2 = 0, + .in_dim2_edge1 = 0, + .in_dim2_edge2 = 0, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 3, + .out_dim1_size = 16, + .out_dim1_pitch = 16, + .out_dim2_size = 3, + .out_dim2_pitch = 48, + .out_dim3_size = 256, + .coeff_dim1_size = 1, + .coeff_dim2_size = 1, + .coeff_dim3_size = 64, + .coeff_dim4_size = 256, + .coeff_dim1_pitch = 1, + .coeff_dim2_pitch = 1, + .coeff_dim3_pitch = 64, + .bias_dim1_size = 256, + .bias_dim2_size = 1, + .outscale_dim1_size = 256, + .outscale_dim2_size = 1, + .input_buffer_size = 3072, + .coeff_buffer_size = 16384, + .output_buffer_size = 12288, + .bias_buffer_size = 1024, + .outscale_buffer_size = 512, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 0, + .outscale_dram = 0, + .n_tile_size = 256, + .n_tiles = 1, + .n_tile_size_last = 256, + .height_tiles = 6, + .output_rows = 3, + .input_rows = 3, + .kernel_w = 1, + .kernel_h = 1, + .stride_x = 1, + .stride_y = 1, + .padding = 0, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 12, + .layer_name = "conv_1x1_s1_ic64_oc64", + .kernel_name = "1x1j1d1_dma", + .config_key = 
"64_16_16_64_1_1_16_16_1_1_0_1", + .src_dim1_size = 16, + .src_dim2_size = 16, + .src_dim3_size = 64, + .src_dim1_pitch = 16, + .src_dim2_pitch = 256, + .dst_dim1_size = 16, + .dst_dim2_size = 16, + .dst_dim3_size = 64, + .dst_dim1_pitch = 16, + .dst_dim2_pitch = 256, + .in_dim1_size = 16, + .in_dim1_pitch = 16, + .in_dim2_size = 10, + .in_dim2_pitch = 160, + .in_dim1_edge1 = 0, + .in_dim1_edge2 = 0, + .in_dim2_edge1 = 0, + .in_dim2_edge2 = 0, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 10, + .out_dim1_size = 16, + .out_dim1_pitch = 16, + .out_dim2_size = 10, + .out_dim2_pitch = 160, + .out_dim3_size = 64, + .coeff_dim1_size = 1, + .coeff_dim2_size = 1, + .coeff_dim3_size = 64, + .coeff_dim4_size = 64, + .coeff_dim1_pitch = 1, + .coeff_dim2_pitch = 1, + .coeff_dim3_pitch = 64, + .bias_dim1_size = 64, + .bias_dim2_size = 1, + .outscale_dim1_size = 64, + .outscale_dim2_size = 1, + .input_buffer_size = 10240, + .coeff_buffer_size = 4096, + .output_buffer_size = 10240, + .bias_buffer_size = 256, + .outscale_buffer_size = 128, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 64, + .n_tiles = 1, + .n_tile_size_last = 64, + .height_tiles = 2, + .output_rows = 10, + .input_rows = 10, + .kernel_w = 1, + .kernel_h = 1, + .stride_x = 1, + .stride_y = 1, + .padding = 0, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 13, + .layer_name = "conv_1x1_s1_ic256_oc64", + .kernel_name = "1x1j1d1_dma", + .config_key = "256_16_16_64_1_1_16_16_1_1_0_1", + .src_dim1_size = 16, + .src_dim2_size = 16, + .src_dim3_size = 256, + .src_dim1_pitch = 16, + .src_dim2_pitch = 256, + .dst_dim1_size = 16, + .dst_dim2_size = 16, + .dst_dim3_size = 64, + .dst_dim1_pitch = 16, + .dst_dim2_pitch = 256, + 
.in_dim1_size = 16, + .in_dim1_pitch = 16, + .in_dim2_size = 3, + .in_dim2_pitch = 48, + .in_dim1_edge1 = 0, + .in_dim1_edge2 = 0, + .in_dim2_edge1 = 0, + .in_dim2_edge2 = 0, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 3, + .out_dim1_size = 16, + .out_dim1_pitch = 16, + .out_dim2_size = 3, + .out_dim2_pitch = 48, + .out_dim3_size = 64, + .coeff_dim1_size = 1, + .coeff_dim2_size = 1, + .coeff_dim3_size = 256, + .coeff_dim4_size = 64, + .coeff_dim1_pitch = 1, + .coeff_dim2_pitch = 1, + .coeff_dim3_pitch = 256, + .bias_dim1_size = 64, + .bias_dim2_size = 1, + .outscale_dim1_size = 64, + .outscale_dim2_size = 1, + .input_buffer_size = 12288, + .coeff_buffer_size = 16384, + .output_buffer_size = 3072, + .bias_buffer_size = 256, + .outscale_buffer_size = 128, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 1, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 64, + .n_tiles = 1, + .n_tile_size_last = 64, + .height_tiles = 6, + .output_rows = 3, + .input_rows = 3, + .kernel_w = 1, + .kernel_h = 1, + .stride_x = 1, + .stride_y = 1, + .padding = 0, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 14, + .layer_name = "conv_1x1_s2_ic256_oc512", + .kernel_name = "1x1j2d1_dma", + .config_key = "256_16_16_512_1_1_8_8_2_2_0_1", + .src_dim1_size = 16, + .src_dim2_size = 16, + .src_dim3_size = 256, + .src_dim1_pitch = 16, + .src_dim2_pitch = 256, + .dst_dim1_size = 8, + .dst_dim2_size = 8, + .dst_dim3_size = 512, + .dst_dim1_pitch = 8, + .dst_dim2_pitch = 64, + .in_dim1_size = 16, + .in_dim1_pitch = 16, + .in_dim2_size = 3, + .in_dim2_pitch = 48, + .in_dim1_edge1 = 0, + .in_dim1_edge2 = 0, + .in_dim2_edge1 = 0, + .in_dim2_edge2 = 0, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 3, + .out_dim1_size 
= 8, + .out_dim1_pitch = 8, + .out_dim2_size = 2, + .out_dim2_pitch = 16, + .out_dim3_size = 64, + .coeff_dim1_size = 1, + .coeff_dim2_size = 1, + .coeff_dim3_size = 256, + .coeff_dim4_size = 512, + .coeff_dim1_pitch = 1, + .coeff_dim2_pitch = 1, + .coeff_dim3_pitch = 256, + .bias_dim1_size = 512, + .bias_dim2_size = 1, + .outscale_dim1_size = 512, + .outscale_dim2_size = 1, + .input_buffer_size = 12288, + .coeff_buffer_size = 16384, + .output_buffer_size = 1024, + .bias_buffer_size = 2048, + .outscale_buffer_size = 1024, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 1, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 64, + .n_tiles = 8, + .n_tile_size_last = 64, + .height_tiles = 4, + .output_rows = 2, + .input_rows = 3, + .kernel_w = 1, + .kernel_h = 1, + .stride_x = 2, + .stride_y = 2, + .padding = 0, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 15, + .layer_name = "conv_1x1_s1_ic256_oc128", + .kernel_name = "1x1j1d1_dma", + .config_key = "256_16_16_128_1_1_16_16_1_1_0_1", + .src_dim1_size = 16, + .src_dim2_size = 16, + .src_dim3_size = 256, + .src_dim1_pitch = 16, + .src_dim2_pitch = 256, + .dst_dim1_size = 16, + .dst_dim2_size = 16, + .dst_dim3_size = 128, + .dst_dim1_pitch = 16, + .dst_dim2_pitch = 256, + .in_dim1_size = 16, + .in_dim1_pitch = 16, + .in_dim2_size = 2, + .in_dim2_pitch = 32, + .in_dim1_edge1 = 0, + .in_dim1_edge2 = 0, + .in_dim2_edge1 = 0, + .in_dim2_edge2 = 0, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 2, + .out_dim1_size = 16, + .out_dim1_pitch = 16, + .out_dim2_size = 2, + .out_dim2_pitch = 32, + .out_dim3_size = 64, + .coeff_dim1_size = 1, + .coeff_dim2_size = 1, + .coeff_dim3_size = 256, + .coeff_dim4_size = 128, + .coeff_dim1_pitch = 1, + .coeff_dim2_pitch = 1, + .coeff_dim3_pitch = 
256, + .bias_dim1_size = 128, + .bias_dim2_size = 1, + .outscale_dim1_size = 128, + .outscale_dim2_size = 1, + .input_buffer_size = 8192, + .coeff_buffer_size = 16384, + .output_buffer_size = 2048, + .bias_buffer_size = 512, + .outscale_buffer_size = 256, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 1, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 64, + .n_tiles = 2, + .n_tile_size_last = 64, + .height_tiles = 8, + .output_rows = 2, + .input_rows = 2, + .kernel_w = 1, + .kernel_h = 1, + .stride_x = 1, + .stride_y = 1, + .padding = 0, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 16, + .layer_name = "conv_3x3_s2_ic128_oc128", + .kernel_name = "3x3j2d1_dma", + .config_key = "128_16_16_128_3_3_8_8_2_2_1_1", + .src_dim1_size = 16, + .src_dim2_size = 16, + .src_dim3_size = 128, + .src_dim1_pitch = 16, + .src_dim2_pitch = 256, + .dst_dim1_size = 8, + .dst_dim2_size = 8, + .dst_dim3_size = 128, + .dst_dim1_pitch = 8, + .dst_dim2_pitch = 64, + .in_dim1_size = 16, + .in_dim1_pitch = 18, + .in_dim2_size = 5, + .in_dim2_pitch = 90, + .in_dim1_edge1 = 1, + .in_dim1_edge2 = 1, + .in_dim2_edge1 = 1, + .in_dim2_edge2 = 1, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 19, + .in_rows_firstdma = 4, + .out_dim1_size = 8, + .out_dim1_pitch = 8, + .out_dim2_size = 2, + .out_dim2_pitch = 16, + .out_dim3_size = 16, + .coeff_dim1_size = 3, + .coeff_dim2_size = 3, + .coeff_dim3_size = 128, + .coeff_dim4_size = 128, + .coeff_dim1_pitch = 3, + .coeff_dim2_pitch = 9, + .coeff_dim3_pitch = 1152, + .bias_dim1_size = 128, + .bias_dim2_size = 1, + .outscale_dim1_size = 128, + .outscale_dim2_size = 1, + .input_buffer_size = 11520, + .coeff_buffer_size = 18432, + .output_buffer_size = 256, + .bias_buffer_size = 512, + .outscale_buffer_size = 256, + .input_ping_dram = 0, + 
.input_pong_dram = 0, + .coeff_dram = 1, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 16, + .n_tiles = 8, + .n_tile_size_last = 16, + .height_tiles = 4, + .output_rows = 2, + .input_rows = 5, + .kernel_w = 3, + .kernel_h = 3, + .stride_x = 2, + .stride_y = 2, + .padding = 1, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 17, + .layer_name = "conv_1x1_s1_ic128_oc512", + .kernel_name = "1x1j1d1_dma", + .config_key = "128_8_8_512_1_1_8_8_1_1_0_1", + .src_dim1_size = 8, + .src_dim2_size = 8, + .src_dim3_size = 128, + .src_dim1_pitch = 8, + .src_dim2_pitch = 64, + .dst_dim1_size = 8, + .dst_dim2_size = 8, + .dst_dim3_size = 512, + .dst_dim1_pitch = 8, + .dst_dim2_pitch = 64, + .in_dim1_size = 8, + .in_dim1_pitch = 8, + .in_dim2_size = 2, + .in_dim2_pitch = 16, + .in_dim1_edge1 = 0, + .in_dim1_edge2 = 0, + .in_dim2_edge1 = 0, + .in_dim2_edge2 = 0, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 2, + .out_dim1_size = 8, + .out_dim1_pitch = 8, + .out_dim2_size = 2, + .out_dim2_pitch = 16, + .out_dim3_size = 128, + .coeff_dim1_size = 1, + .coeff_dim2_size = 1, + .coeff_dim3_size = 128, + .coeff_dim4_size = 512, + .coeff_dim1_pitch = 1, + .coeff_dim2_pitch = 1, + .coeff_dim3_pitch = 128, + .bias_dim1_size = 512, + .bias_dim2_size = 1, + .outscale_dim1_size = 512, + .outscale_dim2_size = 1, + .input_buffer_size = 2048, + .coeff_buffer_size = 16384, + .output_buffer_size = 2048, + .bias_buffer_size = 2048, + .outscale_buffer_size = 1024, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 128, + .n_tiles = 4, + .n_tile_size_last = 128, + .height_tiles = 4, + .output_rows = 2, + .input_rows = 2, + .kernel_w = 1, + .kernel_h = 
1, + .stride_x = 1, + .stride_y = 1, + .padding = 0, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 18, + .layer_name = "conv_1x1_s1_ic512_oc128", + .kernel_name = "1x1j1d1_dma", + .config_key = "512_8_8_128_1_1_8_8_1_1_0_1", + .src_dim1_size = 8, + .src_dim2_size = 8, + .src_dim3_size = 512, + .src_dim1_pitch = 8, + .src_dim2_pitch = 64, + .dst_dim1_size = 8, + .dst_dim2_size = 8, + .dst_dim3_size = 128, + .dst_dim1_pitch = 8, + .dst_dim2_pitch = 64, + .in_dim1_size = 8, + .in_dim1_pitch = 8, + .in_dim2_size = 2, + .in_dim2_pitch = 16, + .in_dim1_edge1 = 0, + .in_dim1_edge2 = 0, + .in_dim2_edge1 = 0, + .in_dim2_edge2 = 0, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 2, + .out_dim1_size = 8, + .out_dim1_pitch = 8, + .out_dim2_size = 2, + .out_dim2_pitch = 16, + .out_dim3_size = 32, + .coeff_dim1_size = 1, + .coeff_dim2_size = 1, + .coeff_dim3_size = 512, + .coeff_dim4_size = 128, + .coeff_dim1_pitch = 1, + .coeff_dim2_pitch = 1, + .coeff_dim3_pitch = 512, + .bias_dim1_size = 128, + .bias_dim2_size = 1, + .outscale_dim1_size = 128, + .outscale_dim2_size = 1, + .input_buffer_size = 8192, + .coeff_buffer_size = 16384, + .output_buffer_size = 512, + .bias_buffer_size = 512, + .outscale_buffer_size = 256, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 1, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 32, + .n_tiles = 4, + .n_tile_size_last = 32, + .height_tiles = 4, + .output_rows = 2, + .input_rows = 2, + .kernel_w = 1, + .kernel_h = 1, + .stride_x = 1, + .stride_y = 1, + .padding = 0, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 19, + .layer_name = "conv_1x1_s2_ic512_oc1024", + 
.kernel_name = "1x1j2d1_dma", + .config_key = "512_8_8_1024_1_1_4_4_2_2_0_1", + .src_dim1_size = 8, + .src_dim2_size = 8, + .src_dim3_size = 512, + .src_dim1_pitch = 8, + .src_dim2_pitch = 64, + .dst_dim1_size = 4, + .dst_dim2_size = 4, + .dst_dim3_size = 1024, + .dst_dim1_pitch = 4, + .dst_dim2_pitch = 16, + .in_dim1_size = 8, + .in_dim1_pitch = 8, + .in_dim2_size = 3, + .in_dim2_pitch = 24, + .in_dim1_edge1 = 0, + .in_dim1_edge2 = 0, + .in_dim2_edge1 = 0, + .in_dim2_edge2 = 0, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 3, + .out_dim1_size = 4, + .out_dim1_pitch = 4, + .out_dim2_size = 2, + .out_dim2_pitch = 8, + .out_dim3_size = 32, + .coeff_dim1_size = 1, + .coeff_dim2_size = 1, + .coeff_dim3_size = 512, + .coeff_dim4_size = 1024, + .coeff_dim1_pitch = 1, + .coeff_dim2_pitch = 1, + .coeff_dim3_pitch = 512, + .bias_dim1_size = 1024, + .bias_dim2_size = 1, + .outscale_dim1_size = 1024, + .outscale_dim2_size = 1, + .input_buffer_size = 12288, + .coeff_buffer_size = 16384, + .output_buffer_size = 256, + .bias_buffer_size = 4096, + .outscale_buffer_size = 2048, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 1, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 32, + .n_tiles = 32, + .n_tile_size_last = 32, + .height_tiles = 2, + .output_rows = 2, + .input_rows = 3, + .kernel_w = 1, + .kernel_h = 1, + .stride_x = 2, + .stride_y = 2, + .padding = 0, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 20, + .layer_name = "conv_1x1_s1_ic512_oc256", + .kernel_name = "1x1j1d1_dma", + .config_key = "512_8_8_256_1_1_8_8_1_1_0_1", + .src_dim1_size = 8, + .src_dim2_size = 8, + .src_dim3_size = 512, + .src_dim1_pitch = 8, + .src_dim2_pitch = 64, + .dst_dim1_size = 8, + .dst_dim2_size = 8, + .dst_dim3_size = 256, + .dst_dim1_pitch = 8, + 
.dst_dim2_pitch = 64, + .in_dim1_size = 8, + .in_dim1_pitch = 8, + .in_dim2_size = 2, + .in_dim2_pitch = 16, + .in_dim1_edge1 = 0, + .in_dim1_edge2 = 0, + .in_dim2_edge1 = 0, + .in_dim2_edge2 = 0, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 2, + .out_dim1_size = 8, + .out_dim1_pitch = 8, + .out_dim2_size = 2, + .out_dim2_pitch = 16, + .out_dim3_size = 32, + .coeff_dim1_size = 1, + .coeff_dim2_size = 1, + .coeff_dim3_size = 512, + .coeff_dim4_size = 256, + .coeff_dim1_pitch = 1, + .coeff_dim2_pitch = 1, + .coeff_dim3_pitch = 512, + .bias_dim1_size = 256, + .bias_dim2_size = 1, + .outscale_dim1_size = 256, + .outscale_dim2_size = 1, + .input_buffer_size = 8192, + .coeff_buffer_size = 16384, + .output_buffer_size = 512, + .bias_buffer_size = 1024, + .outscale_buffer_size = 512, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 1, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 32, + .n_tiles = 8, + .n_tile_size_last = 32, + .height_tiles = 4, + .output_rows = 2, + .input_rows = 2, + .kernel_w = 1, + .kernel_h = 1, + .stride_x = 1, + .stride_y = 1, + .padding = 0, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 21, + .layer_name = "conv_3x3_s2_ic256_oc256", + .kernel_name = "3x3j2d1_dma", + .config_key = "256_8_8_256_3_3_4_4_2_2_1_1", + .src_dim1_size = 8, + .src_dim2_size = 8, + .src_dim3_size = 256, + .src_dim1_pitch = 8, + .src_dim2_pitch = 64, + .dst_dim1_size = 4, + .dst_dim2_size = 4, + .dst_dim3_size = 256, + .dst_dim1_pitch = 4, + .dst_dim2_pitch = 16, + .in_dim1_size = 8, + .in_dim1_pitch = 10, + .in_dim2_size = 5, + .in_dim2_pitch = 50, + .in_dim1_edge1 = 1, + .in_dim1_edge2 = 1, + .in_dim2_edge1 = 1, + .in_dim2_edge2 = 1, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 11, + .in_rows_firstdma = 4, 
+ .out_dim1_size = 4, + .out_dim1_pitch = 4, + .out_dim2_size = 2, + .out_dim2_pitch = 8, + .out_dim3_size = 4, + .coeff_dim1_size = 3, + .coeff_dim2_size = 3, + .coeff_dim3_size = 256, + .coeff_dim4_size = 256, + .coeff_dim1_pitch = 3, + .coeff_dim2_pitch = 9, + .coeff_dim3_pitch = 2304, + .bias_dim1_size = 256, + .bias_dim2_size = 1, + .outscale_dim1_size = 256, + .outscale_dim2_size = 1, + .input_buffer_size = 12800, + .coeff_buffer_size = 9216, + .output_buffer_size = 32, + .bias_buffer_size = 1024, + .outscale_buffer_size = 512, + .input_ping_dram = 0, + .input_pong_dram = 1, + .coeff_dram = 0, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 4, + .n_tiles = 64, + .n_tile_size_last = 4, + .height_tiles = 2, + .output_rows = 2, + .input_rows = 5, + .kernel_w = 3, + .kernel_h = 3, + .stride_x = 2, + .stride_y = 2, + .padding = 1, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 22, + .layer_name = "conv_1x1_s1_ic256_oc1024", + .kernel_name = "1x1j1d1_dma", + .config_key = "256_4_4_1024_1_1_4_4_1_1_0_1", + .src_dim1_size = 4, + .src_dim2_size = 4, + .src_dim3_size = 256, + .src_dim1_pitch = 4, + .src_dim2_pitch = 16, + .dst_dim1_size = 4, + .dst_dim2_size = 4, + .dst_dim3_size = 1024, + .dst_dim1_pitch = 4, + .dst_dim2_pitch = 16, + .in_dim1_size = 4, + .in_dim1_pitch = 4, + .in_dim2_size = 2, + .in_dim2_pitch = 8, + .in_dim1_edge1 = 0, + .in_dim1_edge2 = 0, + .in_dim2_edge1 = 0, + .in_dim2_edge2 = 0, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 2, + .out_dim1_size = 4, + .out_dim1_pitch = 4, + .out_dim2_size = 2, + .out_dim2_pitch = 8, + .out_dim3_size = 64, + .coeff_dim1_size = 1, + .coeff_dim2_size = 1, + .coeff_dim3_size = 256, + .coeff_dim4_size = 1024, + .coeff_dim1_pitch = 1, + .coeff_dim2_pitch = 1, + .coeff_dim3_pitch = 256, + 
.bias_dim1_size = 1024, + .bias_dim2_size = 1, + .outscale_dim1_size = 1024, + .outscale_dim2_size = 1, + .input_buffer_size = 2048, + .coeff_buffer_size = 16384, + .output_buffer_size = 512, + .bias_buffer_size = 4096, + .outscale_buffer_size = 2048, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 64, + .n_tiles = 16, + .n_tile_size_last = 64, + .height_tiles = 2, + .output_rows = 2, + .input_rows = 2, + .kernel_w = 1, + .kernel_h = 1, + .stride_x = 1, + .stride_y = 1, + .padding = 0, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 23, + .layer_name = "conv_1x1_s1_ic1024_oc256", + .kernel_name = "1x1j1d1_dma", + .config_key = "1024_4_4_256_1_1_4_4_1_1_0_1", + .src_dim1_size = 4, + .src_dim2_size = 4, + .src_dim3_size = 1024, + .src_dim1_pitch = 4, + .src_dim2_pitch = 16, + .dst_dim1_size = 4, + .dst_dim2_size = 4, + .dst_dim3_size = 256, + .dst_dim1_pitch = 4, + .dst_dim2_pitch = 16, + .in_dim1_size = 4, + .in_dim1_pitch = 4, + .in_dim2_size = 2, + .in_dim2_pitch = 8, + .in_dim1_edge1 = 0, + .in_dim1_edge2 = 0, + .in_dim2_edge1 = 0, + .in_dim2_edge2 = 0, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 2, + .out_dim1_size = 4, + .out_dim1_pitch = 4, + .out_dim2_size = 2, + .out_dim2_pitch = 8, + .out_dim3_size = 16, + .coeff_dim1_size = 1, + .coeff_dim2_size = 1, + .coeff_dim3_size = 1024, + .coeff_dim4_size = 256, + .coeff_dim1_pitch = 1, + .coeff_dim2_pitch = 1, + .coeff_dim3_pitch = 1024, + .bias_dim1_size = 256, + .bias_dim2_size = 1, + .outscale_dim1_size = 256, + .outscale_dim2_size = 1, + .input_buffer_size = 8192, + .coeff_buffer_size = 16384, + .output_buffer_size = 128, + .bias_buffer_size = 1024, + .outscale_buffer_size = 512, + .input_ping_dram = 0, + 
.input_pong_dram = 0, + .coeff_dram = 1, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 16, + .n_tiles = 16, + .n_tile_size_last = 16, + .height_tiles = 2, + .output_rows = 2, + .input_rows = 2, + .kernel_w = 1, + .kernel_h = 1, + .stride_x = 1, + .stride_y = 1, + .padding = 0, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 24, + .layer_name = "conv_1x1_s2_ic1024_oc2048", + .kernel_name = "1x1j2d1_dma", + .config_key = "1024_4_4_2048_1_1_2_2_2_2_0_1", + .src_dim1_size = 4, + .src_dim2_size = 4, + .src_dim3_size = 1024, + .src_dim1_pitch = 4, + .src_dim2_pitch = 16, + .dst_dim1_size = 2, + .dst_dim2_size = 2, + .dst_dim3_size = 2048, + .dst_dim1_pitch = 2, + .dst_dim2_pitch = 4, + .in_dim1_size = 4, + .in_dim1_pitch = 4, + .in_dim2_size = 3, + .in_dim2_pitch = 12, + .in_dim1_edge1 = 0, + .in_dim1_edge2 = 0, + .in_dim2_edge1 = 0, + .in_dim2_edge2 = 0, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 3, + .out_dim1_size = 2, + .out_dim1_pitch = 2, + .out_dim2_size = 2, + .out_dim2_pitch = 4, + .out_dim3_size = 8, + .coeff_dim1_size = 1, + .coeff_dim2_size = 1, + .coeff_dim3_size = 1024, + .coeff_dim4_size = 2048, + .coeff_dim1_pitch = 1, + .coeff_dim2_pitch = 1, + .coeff_dim3_pitch = 1024, + .bias_dim1_size = 2048, + .bias_dim2_size = 1, + .outscale_dim1_size = 2048, + .outscale_dim2_size = 1, + .input_buffer_size = 12288, + .coeff_buffer_size = 8192, + .output_buffer_size = 32, + .bias_buffer_size = 8192, + .outscale_buffer_size = 4096, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 1, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 8, + .n_tiles = 256, + .n_tile_size_last = 8, + .height_tiles = 1, + .output_rows = 2, + .input_rows = 3, + .kernel_w = 1, + .kernel_h 
= 1, + .stride_x = 2, + .stride_y = 2, + .padding = 0, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 25, + .layer_name = "conv_1x1_s1_ic1024_oc512", + .kernel_name = "1x1j1d1_dma", + .config_key = "1024_4_4_512_1_1_4_4_1_1_0_1", + .src_dim1_size = 4, + .src_dim2_size = 4, + .src_dim3_size = 1024, + .src_dim1_pitch = 4, + .src_dim2_pitch = 16, + .dst_dim1_size = 4, + .dst_dim2_size = 4, + .dst_dim3_size = 512, + .dst_dim1_pitch = 4, + .dst_dim2_pitch = 16, + .in_dim1_size = 4, + .in_dim1_pitch = 4, + .in_dim2_size = 2, + .in_dim2_pitch = 8, + .in_dim1_edge1 = 0, + .in_dim1_edge2 = 0, + .in_dim2_edge1 = 0, + .in_dim2_edge2 = 0, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 2, + .out_dim1_size = 4, + .out_dim1_pitch = 4, + .out_dim2_size = 2, + .out_dim2_pitch = 8, + .out_dim3_size = 16, + .coeff_dim1_size = 1, + .coeff_dim2_size = 1, + .coeff_dim3_size = 1024, + .coeff_dim4_size = 512, + .coeff_dim1_pitch = 1, + .coeff_dim2_pitch = 1, + .coeff_dim3_pitch = 1024, + .bias_dim1_size = 512, + .bias_dim2_size = 1, + .outscale_dim1_size = 512, + .outscale_dim2_size = 1, + .input_buffer_size = 8192, + .coeff_buffer_size = 16384, + .output_buffer_size = 128, + .bias_buffer_size = 2048, + .outscale_buffer_size = 1024, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 1, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 16, + .n_tiles = 32, + .n_tile_size_last = 16, + .height_tiles = 2, + .output_rows = 2, + .input_rows = 2, + .kernel_w = 1, + .kernel_h = 1, + .stride_x = 1, + .stride_y = 1, + .padding = 0, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 26, + .layer_name = "conv_3x3_s2_ic512_oc512", + 
.kernel_name = "3x3j2d1_dma", + .config_key = "512_4_4_512_3_3_2_2_2_2_1_1", + .src_dim1_size = 4, + .src_dim2_size = 4, + .src_dim3_size = 512, + .src_dim1_pitch = 4, + .src_dim2_pitch = 16, + .dst_dim1_size = 2, + .dst_dim2_size = 2, + .dst_dim3_size = 512, + .dst_dim1_pitch = 2, + .dst_dim2_pitch = 4, + .in_dim1_size = 4, + .in_dim1_pitch = 6, + .in_dim2_size = 5, + .in_dim2_pitch = 30, + .in_dim1_edge1 = 1, + .in_dim1_edge2 = 1, + .in_dim2_edge1 = 1, + .in_dim2_edge2 = 1, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 7, + .in_rows_firstdma = 4, + .out_dim1_size = 2, + .out_dim1_pitch = 2, + .out_dim2_size = 2, + .out_dim2_pitch = 4, + .out_dim3_size = 2, + .coeff_dim1_size = 3, + .coeff_dim2_size = 3, + .coeff_dim3_size = 512, + .coeff_dim4_size = 512, + .coeff_dim1_pitch = 3, + .coeff_dim2_pitch = 9, + .coeff_dim3_pitch = 4608, + .bias_dim1_size = 512, + .bias_dim2_size = 1, + .outscale_dim1_size = 512, + .outscale_dim2_size = 1, + .input_buffer_size = 15360, + .coeff_buffer_size = 9216, + .output_buffer_size = 8, + .bias_buffer_size = 2048, + .outscale_buffer_size = 1024, + .input_ping_dram = 0, + .input_pong_dram = 1, + .coeff_dram = 0, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 2, + .n_tiles = 256, + .n_tile_size_last = 2, + .height_tiles = 1, + .output_rows = 2, + .input_rows = 5, + .kernel_w = 3, + .kernel_h = 3, + .stride_x = 2, + .stride_y = 2, + .padding = 1, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 27, + .layer_name = "conv_1x1_s1_ic512_oc2048", + .kernel_name = "1x1j1d1_dma", + .config_key = "512_2_2_2048_1_1_2_2_1_1_0_1", + .src_dim1_size = 2, + .src_dim2_size = 2, + .src_dim3_size = 512, + .src_dim1_pitch = 2, + .src_dim2_pitch = 4, + .dst_dim1_size = 2, + .dst_dim2_size = 2, + .dst_dim3_size = 2048, + .dst_dim1_pitch = 2, + 
.dst_dim2_pitch = 4, + .in_dim1_size = 2, + .in_dim1_pitch = 2, + .in_dim2_size = 2, + .in_dim2_pitch = 4, + .in_dim1_edge1 = 0, + .in_dim1_edge2 = 0, + .in_dim2_edge1 = 0, + .in_dim2_edge2 = 0, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 2, + .out_dim1_size = 2, + .out_dim1_pitch = 2, + .out_dim2_size = 2, + .out_dim2_pitch = 4, + .out_dim3_size = 32, + .coeff_dim1_size = 1, + .coeff_dim2_size = 1, + .coeff_dim3_size = 512, + .coeff_dim4_size = 2048, + .coeff_dim1_pitch = 1, + .coeff_dim2_pitch = 1, + .coeff_dim3_pitch = 512, + .bias_dim1_size = 2048, + .bias_dim2_size = 1, + .outscale_dim1_size = 2048, + .outscale_dim2_size = 1, + .input_buffer_size = 2048, + .coeff_buffer_size = 16384, + .output_buffer_size = 128, + .bias_buffer_size = 8192, + .outscale_buffer_size = 4096, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 32, + .n_tiles = 64, + .n_tile_size_last = 32, + .height_tiles = 1, + .output_rows = 2, + .input_rows = 2, + .kernel_w = 1, + .kernel_h = 1, + .stride_x = 1, + .stride_y = 1, + .padding = 0, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 28, + .layer_name = "conv_1x1_s1_ic2048_oc512", + .kernel_name = "1x1j1d1_dma", + .config_key = "2048_2_2_512_1_1_2_2_1_1_0_1", + .src_dim1_size = 2, + .src_dim2_size = 2, + .src_dim3_size = 2048, + .src_dim1_pitch = 2, + .src_dim2_pitch = 4, + .dst_dim1_size = 2, + .dst_dim2_size = 2, + .dst_dim3_size = 512, + .dst_dim1_pitch = 2, + .dst_dim2_pitch = 4, + .in_dim1_size = 2, + .in_dim1_pitch = 2, + .in_dim2_size = 2, + .in_dim2_pitch = 4, + .in_dim1_edge1 = 0, + .in_dim1_edge2 = 0, + .in_dim2_edge1 = 0, + .in_dim2_edge2 = 0, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 2, 
+ .out_dim1_size = 2, + .out_dim1_pitch = 2, + .out_dim2_size = 2, + .out_dim2_pitch = 4, + .out_dim3_size = 8, + .coeff_dim1_size = 1, + .coeff_dim2_size = 1, + .coeff_dim3_size = 2048, + .coeff_dim4_size = 512, + .coeff_dim1_pitch = 1, + .coeff_dim2_pitch = 1, + .coeff_dim3_pitch = 2048, + .bias_dim1_size = 512, + .bias_dim2_size = 1, + .outscale_dim1_size = 512, + .outscale_dim2_size = 1, + .input_buffer_size = 8192, + .coeff_buffer_size = 16384, + .output_buffer_size = 32, + .bias_buffer_size = 2048, + .outscale_buffer_size = 1024, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 1, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 8, + .n_tiles = 64, + .n_tile_size_last = 8, + .height_tiles = 1, + .output_rows = 2, + .input_rows = 2, + .kernel_w = 1, + .kernel_h = 1, + .stride_x = 1, + .stride_y = 1, + .padding = 0, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, +}; + +static inline int get_num_conv_layers(void) { return NUM_CONV_LAYERS; } + +static inline const conv_layer_config_t* get_conv_config(int layer_id) { + if (layer_id < 0 || layer_id >= NUM_CONV_LAYERS) return NULL; + return &CONV_LAYER_CONFIGS[layer_id]; +} + +static inline const conv_layer_config_t* get_layer_config_by_params( + int ic, int ih, int iw, + int oc, int kh, int kw, + int oh, int ow, + int sy, int sx, + int pad, int dil) +{ + for (int i = 0; i < NUM_CONV_LAYERS; i++) { + const conv_layer_config_t* cfg = &CONV_LAYER_CONFIGS[i]; + if (cfg->src_dim3_size == ic && + cfg->src_dim2_size == ih && + cfg->src_dim1_size == iw && + cfg->dst_dim3_size == oc && + cfg->coeff_dim2_size == kh && + cfg->coeff_dim1_size == kw && + cfg->dst_dim2_size == oh && + cfg->dst_dim1_size == ow && + cfg->stride_y == sy && + cfg->stride_x == sx && + cfg->padding == pad && + cfg->dilation == dil) + return cfg; + } + return NULL; +} 
+ +static inline const conv_layer_config_t* get_layer_config_by_key(const char* config_key) { + if (config_key == NULL) return NULL; + for (int i = 0; i < NUM_CONV_LAYERS; i++) { + const conv_layer_config_t* cfg = &CONV_LAYER_CONFIGS[i]; + if (cfg->config_key != NULL) { + const char* a = config_key; + const char* b = cfg->config_key; + while (*a && *b && *a == *b) { a++; b++; } + if (*a == '\0' && *b == '\0') return cfg; + } + } + return NULL; +} + +/* ====================================================================== */ +/* MaxPool configurations */ +/* ====================================================================== */ + +typedef struct { + int layer_id; + const char* layer_name; + const char* config_key; + + int src_width; int src_height; int channels; + int dst_width; int dst_height; + + int src_row_pitch; int src_plane_pitch; + int dst_row_pitch; int dst_plane_pitch; + + int kernel_h; int kernel_w; + int stride_h; int stride_w; + int pad_h; int pad_w; + + int in_tile_w; int in_tile_rows; int in_tile_plane; + int in_data_offset; + int out_tile_w; int out_tile_rows; int out_tile_plane; + + int c_tile_size; int c_tiles; int c_tile_size_last; + int height_tiles; int output_rows; int input_rows; + + int input_buffer_size; int output_buffer_size; + + int input_ping_dram; int input_pong_dram; + int output_ping_dram; int output_pong_dram; +} maxpool_layer_config_t; + +#define NUM_MAXPOOL_LAYERS 1 + +static const maxpool_layer_config_t MAXPOOL_LAYER_CONFIGS[] = { + { + .layer_id = 0, + .layer_name = "maxpool_3x3s2_c64_32x32", + .config_key = "64_32_32_3_3_2_2_1_1", + .src_width = 32, + .src_height = 32, + .channels = 64, + .dst_width = 16, + .dst_height = 16, + .src_row_pitch = 32, + .src_plane_pitch = 1024, + .dst_row_pitch = 16, + .dst_plane_pitch = 256, + .kernel_h = 3, + .kernel_w = 3, + .stride_h = 2, + .stride_w = 2, + .pad_h = 1, + .pad_w = 1, + .in_tile_w = 34, + .in_tile_rows = 5, + .in_tile_plane = 170, + .in_data_offset = 35, + .out_tile_w = 16, + 
.out_tile_rows = 1, + .out_tile_plane = 16, + .c_tile_size = 33, + .c_tiles = 2, + .c_tile_size_last = 31, + .height_tiles = 16, + .output_rows = 1, + .input_rows = 3, + .input_buffer_size = 22440, + .output_buffer_size = 2112, + .input_ping_dram = 0, + .input_pong_dram = 1, + .output_ping_dram = 1, + .output_pong_dram = 0, + }, +}; + +static inline int get_num_maxpool_layers(void) { return NUM_MAXPOOL_LAYERS; } + +static inline const maxpool_layer_config_t* get_maxpool_config(int layer_id) { + if (layer_id < 0 || layer_id >= NUM_MAXPOOL_LAYERS) return NULL; + return &MAXPOOL_LAYER_CONFIGS[layer_id]; +} + +static inline const maxpool_layer_config_t* get_maxpool_config_by_params( + int channels, int src_height, int src_width, + int kernel_h, int kernel_w, + int stride_h, int stride_w, + int pad_h, int pad_w) +{ + for (int i = 0; i < NUM_MAXPOOL_LAYERS; i++) { + const maxpool_layer_config_t* c = &MAXPOOL_LAYER_CONFIGS[i]; + if (c->channels == channels && + c->src_height == src_height && + c->src_width == src_width && + c->kernel_h == kernel_h && + c->kernel_w == kernel_w && + c->stride_h == stride_h && + c->stride_w == stride_w && + c->pad_h == pad_h && + c->pad_w == pad_w) + return c; + } + return NULL; +} + +#endif /* LAYER_CONFIGS_H */ diff --git a/backends/cadence/vision/config_generator/layer_configs_32k.h b/backends/cadence/vision/config_generator/layer_configs_32k.h new file mode 100644 index 00000000000..efa2c7a64ee --- /dev/null +++ b/backends/cadence/vision/config_generator/layer_configs_32k.h @@ -0,0 +1,2403 @@ +/* + * layer_configs.h + * + * Auto-generated conv2d + maxpool layer configurations + * Generated from PTE extraction by generate_combined_configs.py + * + * DO NOT EDIT MANUALLY + */ + +#ifndef LAYER_CONFIGS_H +#define LAYER_CONFIGS_H + +#include +#include /* for NULL */ + +#define IDMA_BUFFER_SIZE_DRAM0 (32768) /* 32 KB */ +#define IDMA_BUFFER_SIZE_DRAM1 (32768) /* 32 KB */ + +/* 
====================================================================== */ +/* Conv2d configurations */ +/* ====================================================================== */ + +typedef struct { + int layer_id; + const char* layer_name; + const char* kernel_name; + const char* config_key; + + int src_dim1_size; int src_dim2_size; int src_dim3_size; + int src_dim1_pitch; int src_dim2_pitch; + + int dst_dim1_size; int dst_dim2_size; int dst_dim3_size; + int dst_dim1_pitch; int dst_dim2_pitch; + + int in_dim1_size; int in_dim1_pitch; + int in_dim2_size; int in_dim2_pitch; + int in_dim1_edge1; int in_dim1_edge2; + int in_dim2_edge1; int in_dim2_edge2; + int in_dim3_edge1; int in_dim3_edge2; + int in_data_offset; int in_rows_firstdma; + + int out_dim1_size; int out_dim1_pitch; + int out_dim2_size; int out_dim2_pitch; + int out_dim3_size; + + int coeff_dim1_size; int coeff_dim2_size; + int coeff_dim3_size; int coeff_dim4_size; + int coeff_dim1_pitch; int coeff_dim2_pitch; int coeff_dim3_pitch; + + int bias_dim1_size; int bias_dim2_size; + int outscale_dim1_size; int outscale_dim2_size; + + int input_buffer_size; int coeff_buffer_size; int output_buffer_size; + int bias_buffer_size; int outscale_buffer_size; + + int input_ping_dram; int input_pong_dram; int coeff_dram; + int output_ping_dram; int output_pong_dram; + int bias_dram; int outscale_dram; + + int n_tile_size; int n_tiles; int n_tile_size_last; + int height_tiles; int output_rows; int input_rows; + + int kernel_w; int kernel_h; + int stride_x; int stride_y; + int padding; int dilation; + int accum_shift; int relu_max; int relu_min; + int output_shift; int output_scale; int flags; + int input_zero_point; +} conv_layer_config_t; + +#define NUM_CONV_LAYERS 29 + +static const conv_layer_config_t CONV_LAYER_CONFIGS[] = { + { + .layer_id = 0, + .layer_name = "conv_7x7_s2_ic3_oc64", + .kernel_name = "7x7j2d1_dma", + .config_key = "3_64_64_64_7_7_32_32_2_2_3_1", + .src_dim1_size = 64, + .src_dim2_size = 64, + 
.src_dim3_size = 3, + .src_dim1_pitch = 64, + .src_dim2_pitch = 4096, + .dst_dim1_size = 32, + .dst_dim2_size = 32, + .dst_dim3_size = 64, + .dst_dim1_pitch = 32, + .dst_dim2_pitch = 1024, + .in_dim1_size = 64, + .in_dim1_pitch = 70, + .in_dim2_size = 21, + .in_dim2_pitch = 1470, + .in_dim1_edge1 = 3, + .in_dim1_edge2 = 3, + .in_dim2_edge1 = 3, + .in_dim2_edge2 = 3, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 213, + .in_rows_firstdma = 18, + .out_dim1_size = 32, + .out_dim1_pitch = 32, + .out_dim2_size = 8, + .out_dim2_pitch = 256, + .out_dim3_size = 64, + .coeff_dim1_size = 7, + .coeff_dim2_size = 7, + .coeff_dim3_size = 3, + .coeff_dim4_size = 64, + .coeff_dim1_pitch = 7, + .coeff_dim2_pitch = 49, + .coeff_dim3_pitch = 147, + .bias_dim1_size = 64, + .bias_dim2_size = 1, + .outscale_dim1_size = 64, + .outscale_dim2_size = 1, + .input_buffer_size = 4410, + .coeff_buffer_size = 9408, + .output_buffer_size = 16384, + .bias_buffer_size = 256, + .outscale_buffer_size = 128, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 0, + .outscale_dram = 0, + .n_tile_size = 64, + .n_tiles = 1, + .n_tile_size_last = 64, + .height_tiles = 4, + .output_rows = 8, + .input_rows = 21, + .kernel_w = 7, + .kernel_h = 7, + .stride_x = 2, + .stride_y = 2, + .padding = 3, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 1, + .layer_name = "conv_3x3_s1_ic64_oc64", + .kernel_name = "3x3j1d1_dma", + .config_key = "64_16_16_64_3_3_16_16_1_1_1_1", + .src_dim1_size = 16, + .src_dim2_size = 16, + .src_dim3_size = 64, + .src_dim1_pitch = 16, + .src_dim2_pitch = 256, + .dst_dim1_size = 16, + .dst_dim2_size = 16, + .dst_dim3_size = 64, + .dst_dim1_pitch = 16, + .dst_dim2_pitch = 256, + .in_dim1_size = 16, + .in_dim1_pitch = 18, + .in_dim2_size = 4, + .in_dim2_pitch = 72, 
+ .in_dim1_edge1 = 1, + .in_dim1_edge2 = 1, + .in_dim2_edge1 = 1, + .in_dim2_edge2 = 1, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 19, + .in_rows_firstdma = 3, + .out_dim1_size = 16, + .out_dim1_pitch = 16, + .out_dim2_size = 2, + .out_dim2_pitch = 32, + .out_dim3_size = 32, + .coeff_dim1_size = 3, + .coeff_dim2_size = 3, + .coeff_dim3_size = 64, + .coeff_dim4_size = 64, + .coeff_dim1_pitch = 3, + .coeff_dim2_pitch = 9, + .coeff_dim3_pitch = 576, + .bias_dim1_size = 64, + .bias_dim2_size = 1, + .outscale_dim1_size = 64, + .outscale_dim2_size = 1, + .input_buffer_size = 4608, + .coeff_buffer_size = 18432, + .output_buffer_size = 1024, + .bias_buffer_size = 256, + .outscale_buffer_size = 128, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 32, + .n_tiles = 2, + .n_tile_size_last = 32, + .height_tiles = 8, + .output_rows = 2, + .input_rows = 4, + .kernel_w = 3, + .kernel_h = 3, + .stride_x = 1, + .stride_y = 1, + .padding = 1, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 2, + .layer_name = "conv_1x1_s2_ic64_oc128", + .kernel_name = "1x1j2d1_dma", + .config_key = "64_16_16_128_1_1_8_8_2_2_0_1", + .src_dim1_size = 16, + .src_dim2_size = 16, + .src_dim3_size = 64, + .src_dim1_pitch = 16, + .src_dim2_pitch = 256, + .dst_dim1_size = 8, + .dst_dim2_size = 8, + .dst_dim3_size = 128, + .dst_dim1_pitch = 8, + .dst_dim2_pitch = 64, + .in_dim1_size = 16, + .in_dim1_pitch = 16, + .in_dim2_size = 15, + .in_dim2_pitch = 240, + .in_dim1_edge1 = 0, + .in_dim1_edge2 = 0, + .in_dim2_edge1 = 0, + .in_dim2_edge2 = 0, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 15, + .out_dim1_size = 8, + .out_dim1_pitch = 8, + .out_dim2_size = 8, + .out_dim2_pitch = 64, + 
.out_dim3_size = 128, + .coeff_dim1_size = 1, + .coeff_dim2_size = 1, + .coeff_dim3_size = 64, + .coeff_dim4_size = 128, + .coeff_dim1_pitch = 1, + .coeff_dim2_pitch = 1, + .coeff_dim3_pitch = 64, + .bias_dim1_size = 128, + .bias_dim2_size = 1, + .outscale_dim1_size = 128, + .outscale_dim2_size = 1, + .input_buffer_size = 15360, + .coeff_buffer_size = 8192, + .output_buffer_size = 8192, + .bias_buffer_size = 512, + .outscale_buffer_size = 256, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 1, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 128, + .n_tiles = 1, + .n_tile_size_last = 128, + .height_tiles = 1, + .output_rows = 8, + .input_rows = 15, + .kernel_w = 1, + .kernel_h = 1, + .stride_x = 2, + .stride_y = 2, + .padding = 0, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 3, + .layer_name = "conv_3x3_s2_ic64_oc128", + .kernel_name = "3x3j2d1_dma", + .config_key = "64_16_16_128_3_3_8_8_2_2_1_1", + .src_dim1_size = 16, + .src_dim2_size = 16, + .src_dim3_size = 64, + .src_dim1_pitch = 16, + .src_dim2_pitch = 256, + .dst_dim1_size = 8, + .dst_dim2_size = 8, + .dst_dim3_size = 128, + .dst_dim1_pitch = 8, + .dst_dim2_pitch = 64, + .in_dim1_size = 16, + .in_dim1_pitch = 18, + .in_dim2_size = 5, + .in_dim2_pitch = 90, + .in_dim1_edge1 = 1, + .in_dim1_edge2 = 1, + .in_dim2_edge1 = 1, + .in_dim2_edge2 = 1, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 19, + .in_rows_firstdma = 4, + .out_dim1_size = 8, + .out_dim1_pitch = 8, + .out_dim2_size = 2, + .out_dim2_pitch = 16, + .out_dim3_size = 32, + .coeff_dim1_size = 3, + .coeff_dim2_size = 3, + .coeff_dim3_size = 64, + .coeff_dim4_size = 128, + .coeff_dim1_pitch = 3, + .coeff_dim2_pitch = 9, + .coeff_dim3_pitch = 576, + .bias_dim1_size = 128, + .bias_dim2_size = 1, + .outscale_dim1_size = 128, + 
.outscale_dim2_size = 1, + .input_buffer_size = 5760, + .coeff_buffer_size = 18432, + .output_buffer_size = 512, + .bias_buffer_size = 512, + .outscale_buffer_size = 256, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 32, + .n_tiles = 4, + .n_tile_size_last = 32, + .height_tiles = 4, + .output_rows = 2, + .input_rows = 5, + .kernel_w = 3, + .kernel_h = 3, + .stride_x = 2, + .stride_y = 2, + .padding = 1, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 4, + .layer_name = "conv_3x3_s1_ic128_oc128", + .kernel_name = "3x3j1d1_dma", + .config_key = "128_8_8_128_3_3_8_8_1_1_1_1", + .src_dim1_size = 8, + .src_dim2_size = 8, + .src_dim3_size = 128, + .src_dim1_pitch = 8, + .src_dim2_pitch = 64, + .dst_dim1_size = 8, + .dst_dim2_size = 8, + .dst_dim3_size = 128, + .dst_dim1_pitch = 8, + .dst_dim2_pitch = 64, + .in_dim1_size = 8, + .in_dim1_pitch = 10, + .in_dim2_size = 4, + .in_dim2_pitch = 40, + .in_dim1_edge1 = 1, + .in_dim1_edge2 = 1, + .in_dim2_edge1 = 1, + .in_dim2_edge2 = 1, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 11, + .in_rows_firstdma = 3, + .out_dim1_size = 8, + .out_dim1_pitch = 8, + .out_dim2_size = 2, + .out_dim2_pitch = 16, + .out_dim3_size = 16, + .coeff_dim1_size = 3, + .coeff_dim2_size = 3, + .coeff_dim3_size = 128, + .coeff_dim4_size = 128, + .coeff_dim1_pitch = 3, + .coeff_dim2_pitch = 9, + .coeff_dim3_pitch = 1152, + .bias_dim1_size = 128, + .bias_dim2_size = 1, + .outscale_dim1_size = 128, + .outscale_dim2_size = 1, + .input_buffer_size = 5120, + .coeff_buffer_size = 18432, + .output_buffer_size = 256, + .bias_buffer_size = 512, + .outscale_buffer_size = 256, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 1, + .output_pong_dram = 1, + 
.bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 16, + .n_tiles = 8, + .n_tile_size_last = 16, + .height_tiles = 4, + .output_rows = 2, + .input_rows = 4, + .kernel_w = 3, + .kernel_h = 3, + .stride_x = 1, + .stride_y = 1, + .padding = 1, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 5, + .layer_name = "conv_1x1_s2_ic128_oc256", + .kernel_name = "1x1j2d1_dma", + .config_key = "128_8_8_256_1_1_4_4_2_2_0_1", + .src_dim1_size = 8, + .src_dim2_size = 8, + .src_dim3_size = 128, + .src_dim1_pitch = 8, + .src_dim2_pitch = 64, + .dst_dim1_size = 4, + .dst_dim2_size = 4, + .dst_dim3_size = 256, + .dst_dim1_pitch = 4, + .dst_dim2_pitch = 16, + .in_dim1_size = 8, + .in_dim1_pitch = 8, + .in_dim2_size = 3, + .in_dim2_pitch = 24, + .in_dim1_edge1 = 0, + .in_dim1_edge2 = 0, + .in_dim2_edge1 = 0, + .in_dim2_edge2 = 0, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 3, + .out_dim1_size = 4, + .out_dim1_pitch = 4, + .out_dim2_size = 2, + .out_dim2_pitch = 8, + .out_dim3_size = 128, + .coeff_dim1_size = 1, + .coeff_dim2_size = 1, + .coeff_dim3_size = 128, + .coeff_dim4_size = 256, + .coeff_dim1_pitch = 1, + .coeff_dim2_pitch = 1, + .coeff_dim3_pitch = 128, + .bias_dim1_size = 256, + .bias_dim2_size = 1, + .outscale_dim1_size = 256, + .outscale_dim2_size = 1, + .input_buffer_size = 3072, + .coeff_buffer_size = 16384, + .output_buffer_size = 1024, + .bias_buffer_size = 1024, + .outscale_buffer_size = 512, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 128, + .n_tiles = 2, + .n_tile_size_last = 128, + .height_tiles = 2, + .output_rows = 2, + .input_rows = 3, + .kernel_w = 1, + .kernel_h = 1, + .stride_x = 2, + .stride_y = 2, + .padding = 0, + .dilation = 1, + .accum_shift = 8, + 
.relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 6, + .layer_name = "conv_3x3_s2_ic128_oc256", + .kernel_name = "3x3j2d1_dma", + .config_key = "128_8_8_256_3_3_4_4_2_2_1_1", + .src_dim1_size = 8, + .src_dim2_size = 8, + .src_dim3_size = 128, + .src_dim1_pitch = 8, + .src_dim2_pitch = 64, + .dst_dim1_size = 4, + .dst_dim2_size = 4, + .dst_dim3_size = 256, + .dst_dim1_pitch = 4, + .dst_dim2_pitch = 16, + .in_dim1_size = 8, + .in_dim1_pitch = 10, + .in_dim2_size = 5, + .in_dim2_pitch = 50, + .in_dim1_edge1 = 1, + .in_dim1_edge2 = 1, + .in_dim2_edge1 = 1, + .in_dim2_edge2 = 1, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 11, + .in_rows_firstdma = 4, + .out_dim1_size = 4, + .out_dim1_pitch = 4, + .out_dim2_size = 2, + .out_dim2_pitch = 8, + .out_dim3_size = 16, + .coeff_dim1_size = 3, + .coeff_dim2_size = 3, + .coeff_dim3_size = 128, + .coeff_dim4_size = 256, + .coeff_dim1_pitch = 3, + .coeff_dim2_pitch = 9, + .coeff_dim3_pitch = 1152, + .bias_dim1_size = 256, + .bias_dim2_size = 1, + .outscale_dim1_size = 256, + .outscale_dim2_size = 1, + .input_buffer_size = 6400, + .coeff_buffer_size = 18432, + .output_buffer_size = 128, + .bias_buffer_size = 1024, + .outscale_buffer_size = 512, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 16, + .n_tiles = 16, + .n_tile_size_last = 16, + .height_tiles = 2, + .output_rows = 2, + .input_rows = 5, + .kernel_w = 3, + .kernel_h = 3, + .stride_x = 2, + .stride_y = 2, + .padding = 1, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 7, + .layer_name = "conv_3x3_s1_ic256_oc256", + .kernel_name = "3x3j1d1_dma", + .config_key = "256_4_4_256_3_3_4_4_1_1_1_1", + .src_dim1_size = 4, + 
.src_dim2_size = 4, + .src_dim3_size = 256, + .src_dim1_pitch = 4, + .src_dim2_pitch = 16, + .dst_dim1_size = 4, + .dst_dim2_size = 4, + .dst_dim3_size = 256, + .dst_dim1_pitch = 4, + .dst_dim2_pitch = 16, + .in_dim1_size = 4, + .in_dim1_pitch = 6, + .in_dim2_size = 4, + .in_dim2_pitch = 24, + .in_dim1_edge1 = 1, + .in_dim1_edge2 = 1, + .in_dim2_edge1 = 1, + .in_dim2_edge2 = 1, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 7, + .in_rows_firstdma = 3, + .out_dim1_size = 4, + .out_dim1_pitch = 4, + .out_dim2_size = 2, + .out_dim2_pitch = 8, + .out_dim3_size = 8, + .coeff_dim1_size = 3, + .coeff_dim2_size = 3, + .coeff_dim3_size = 256, + .coeff_dim4_size = 256, + .coeff_dim1_pitch = 3, + .coeff_dim2_pitch = 9, + .coeff_dim3_pitch = 2304, + .bias_dim1_size = 256, + .bias_dim2_size = 1, + .outscale_dim1_size = 256, + .outscale_dim2_size = 1, + .input_buffer_size = 6144, + .coeff_buffer_size = 18432, + .output_buffer_size = 64, + .bias_buffer_size = 1024, + .outscale_buffer_size = 512, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 8, + .n_tiles = 32, + .n_tile_size_last = 8, + .height_tiles = 2, + .output_rows = 2, + .input_rows = 4, + .kernel_w = 3, + .kernel_h = 3, + .stride_x = 1, + .stride_y = 1, + .padding = 1, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 8, + .layer_name = "conv_1x1_s2_ic256_oc512", + .kernel_name = "1x1j2d1_dma", + .config_key = "256_4_4_512_1_1_2_2_2_2_0_1", + .src_dim1_size = 4, + .src_dim2_size = 4, + .src_dim3_size = 256, + .src_dim1_pitch = 4, + .src_dim2_pitch = 16, + .dst_dim1_size = 2, + .dst_dim2_size = 2, + .dst_dim3_size = 512, + .dst_dim1_pitch = 2, + .dst_dim2_pitch = 4, + .in_dim1_size = 4, + .in_dim1_pitch = 4, + .in_dim2_size = 3, + .in_dim2_pitch = 12, + 
.in_dim1_edge1 = 0, + .in_dim1_edge2 = 0, + .in_dim2_edge1 = 0, + .in_dim2_edge2 = 0, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 3, + .out_dim1_size = 2, + .out_dim1_pitch = 2, + .out_dim2_size = 2, + .out_dim2_pitch = 4, + .out_dim3_size = 64, + .coeff_dim1_size = 1, + .coeff_dim2_size = 1, + .coeff_dim3_size = 256, + .coeff_dim4_size = 512, + .coeff_dim1_pitch = 1, + .coeff_dim2_pitch = 1, + .coeff_dim3_pitch = 256, + .bias_dim1_size = 512, + .bias_dim2_size = 1, + .outscale_dim1_size = 512, + .outscale_dim2_size = 1, + .input_buffer_size = 3072, + .coeff_buffer_size = 16384, + .output_buffer_size = 256, + .bias_buffer_size = 2048, + .outscale_buffer_size = 1024, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 64, + .n_tiles = 8, + .n_tile_size_last = 64, + .height_tiles = 1, + .output_rows = 2, + .input_rows = 3, + .kernel_w = 1, + .kernel_h = 1, + .stride_x = 2, + .stride_y = 2, + .padding = 0, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 9, + .layer_name = "conv_3x3_s2_ic256_oc512", + .kernel_name = "3x3j2d1_dma", + .config_key = "256_4_4_512_3_3_2_2_2_2_1_1", + .src_dim1_size = 4, + .src_dim2_size = 4, + .src_dim3_size = 256, + .src_dim1_pitch = 4, + .src_dim2_pitch = 16, + .dst_dim1_size = 2, + .dst_dim2_size = 2, + .dst_dim3_size = 512, + .dst_dim1_pitch = 2, + .dst_dim2_pitch = 4, + .in_dim1_size = 4, + .in_dim1_pitch = 6, + .in_dim2_size = 5, + .in_dim2_pitch = 30, + .in_dim1_edge1 = 1, + .in_dim1_edge2 = 1, + .in_dim2_edge1 = 1, + .in_dim2_edge2 = 1, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 7, + .in_rows_firstdma = 4, + .out_dim1_size = 2, + .out_dim1_pitch = 2, + .out_dim2_size = 2, + .out_dim2_pitch = 4, + .out_dim3_size = 8, + 
.coeff_dim1_size = 3, + .coeff_dim2_size = 3, + .coeff_dim3_size = 256, + .coeff_dim4_size = 512, + .coeff_dim1_pitch = 3, + .coeff_dim2_pitch = 9, + .coeff_dim3_pitch = 2304, + .bias_dim1_size = 512, + .bias_dim2_size = 1, + .outscale_dim1_size = 512, + .outscale_dim2_size = 1, + .input_buffer_size = 7680, + .coeff_buffer_size = 18432, + .output_buffer_size = 32, + .bias_buffer_size = 2048, + .outscale_buffer_size = 1024, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 1, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 8, + .n_tiles = 64, + .n_tile_size_last = 8, + .height_tiles = 1, + .output_rows = 2, + .input_rows = 5, + .kernel_w = 3, + .kernel_h = 3, + .stride_x = 2, + .stride_y = 2, + .padding = 1, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 10, + .layer_name = "conv_3x3_s1_ic512_oc512", + .kernel_name = "3x3j1d1_dma", + .config_key = "512_2_2_512_3_3_2_2_1_1_1_1", + .src_dim1_size = 2, + .src_dim2_size = 2, + .src_dim3_size = 512, + .src_dim1_pitch = 2, + .src_dim2_pitch = 4, + .dst_dim1_size = 2, + .dst_dim2_size = 2, + .dst_dim3_size = 512, + .dst_dim1_pitch = 2, + .dst_dim2_pitch = 4, + .in_dim1_size = 2, + .in_dim1_pitch = 4, + .in_dim2_size = 4, + .in_dim2_pitch = 16, + .in_dim1_edge1 = 1, + .in_dim1_edge2 = 1, + .in_dim2_edge1 = 1, + .in_dim2_edge2 = 1, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 5, + .in_rows_firstdma = 3, + .out_dim1_size = 2, + .out_dim1_pitch = 2, + .out_dim2_size = 2, + .out_dim2_pitch = 4, + .out_dim3_size = 4, + .coeff_dim1_size = 3, + .coeff_dim2_size = 3, + .coeff_dim3_size = 512, + .coeff_dim4_size = 512, + .coeff_dim1_pitch = 3, + .coeff_dim2_pitch = 9, + .coeff_dim3_pitch = 4608, + .bias_dim1_size = 512, + .bias_dim2_size = 1, + .outscale_dim1_size = 512, + .outscale_dim2_size = 1, + 
.input_buffer_size = 8192, + .coeff_buffer_size = 18432, + .output_buffer_size = 16, + .bias_buffer_size = 2048, + .outscale_buffer_size = 1024, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 1, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 4, + .n_tiles = 128, + .n_tile_size_last = 4, + .height_tiles = 1, + .output_rows = 2, + .input_rows = 4, + .kernel_w = 3, + .kernel_h = 3, + .stride_x = 1, + .stride_y = 1, + .padding = 1, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 11, + .layer_name = "conv_1x1_s1_ic64_oc256", + .kernel_name = "1x1j1d1_dma", + .config_key = "64_16_16_256_1_1_16_16_1_1_0_1", + .src_dim1_size = 16, + .src_dim2_size = 16, + .src_dim3_size = 64, + .src_dim1_pitch = 16, + .src_dim2_pitch = 256, + .dst_dim1_size = 16, + .dst_dim2_size = 16, + .dst_dim3_size = 256, + .dst_dim1_pitch = 16, + .dst_dim2_pitch = 256, + .in_dim1_size = 16, + .in_dim1_pitch = 16, + .in_dim2_size = 4, + .in_dim2_pitch = 64, + .in_dim1_edge1 = 0, + .in_dim1_edge2 = 0, + .in_dim2_edge1 = 0, + .in_dim2_edge2 = 0, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 4, + .out_dim1_size = 16, + .out_dim1_pitch = 16, + .out_dim2_size = 4, + .out_dim2_pitch = 64, + .out_dim3_size = 256, + .coeff_dim1_size = 1, + .coeff_dim2_size = 1, + .coeff_dim3_size = 64, + .coeff_dim4_size = 256, + .coeff_dim1_pitch = 1, + .coeff_dim2_pitch = 1, + .coeff_dim3_pitch = 64, + .bias_dim1_size = 256, + .bias_dim2_size = 1, + .outscale_dim1_size = 256, + .outscale_dim2_size = 1, + .input_buffer_size = 4096, + .coeff_buffer_size = 16384, + .output_buffer_size = 16384, + .bias_buffer_size = 1024, + .outscale_buffer_size = 512, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 
0, + .outscale_dram = 0, + .n_tile_size = 256, + .n_tiles = 1, + .n_tile_size_last = 256, + .height_tiles = 4, + .output_rows = 4, + .input_rows = 4, + .kernel_w = 1, + .kernel_h = 1, + .stride_x = 1, + .stride_y = 1, + .padding = 0, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 12, + .layer_name = "conv_1x1_s1_ic64_oc64", + .kernel_name = "1x1j1d1_dma", + .config_key = "64_16_16_64_1_1_16_16_1_1_0_1", + .src_dim1_size = 16, + .src_dim2_size = 16, + .src_dim3_size = 64, + .src_dim1_pitch = 16, + .src_dim2_pitch = 256, + .dst_dim1_size = 16, + .dst_dim2_size = 16, + .dst_dim3_size = 64, + .dst_dim1_pitch = 16, + .dst_dim2_pitch = 256, + .in_dim1_size = 16, + .in_dim1_pitch = 16, + .in_dim2_size = 14, + .in_dim2_pitch = 224, + .in_dim1_edge1 = 0, + .in_dim1_edge2 = 0, + .in_dim2_edge1 = 0, + .in_dim2_edge2 = 0, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 14, + .out_dim1_size = 16, + .out_dim1_pitch = 16, + .out_dim2_size = 14, + .out_dim2_pitch = 224, + .out_dim3_size = 64, + .coeff_dim1_size = 1, + .coeff_dim2_size = 1, + .coeff_dim3_size = 64, + .coeff_dim4_size = 64, + .coeff_dim1_pitch = 1, + .coeff_dim2_pitch = 1, + .coeff_dim3_pitch = 64, + .bias_dim1_size = 64, + .bias_dim2_size = 1, + .outscale_dim1_size = 64, + .outscale_dim2_size = 1, + .input_buffer_size = 14336, + .coeff_buffer_size = 4096, + .output_buffer_size = 14336, + .bias_buffer_size = 256, + .outscale_buffer_size = 128, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 64, + .n_tiles = 1, + .n_tile_size_last = 64, + .height_tiles = 2, + .output_rows = 14, + .input_rows = 14, + .kernel_w = 1, + .kernel_h = 1, + .stride_x = 1, + .stride_y = 1, + .padding = 0, + .dilation = 1, + .accum_shift = 8, + 
.relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 13, + .layer_name = "conv_1x1_s1_ic256_oc64", + .kernel_name = "1x1j1d1_dma", + .config_key = "256_16_16_64_1_1_16_16_1_1_0_1", + .src_dim1_size = 16, + .src_dim2_size = 16, + .src_dim3_size = 256, + .src_dim1_pitch = 16, + .src_dim2_pitch = 256, + .dst_dim1_size = 16, + .dst_dim2_size = 16, + .dst_dim3_size = 64, + .dst_dim1_pitch = 16, + .dst_dim2_pitch = 256, + .in_dim1_size = 16, + .in_dim1_pitch = 16, + .in_dim2_size = 4, + .in_dim2_pitch = 64, + .in_dim1_edge1 = 0, + .in_dim1_edge2 = 0, + .in_dim2_edge1 = 0, + .in_dim2_edge2 = 0, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 4, + .out_dim1_size = 16, + .out_dim1_pitch = 16, + .out_dim2_size = 4, + .out_dim2_pitch = 64, + .out_dim3_size = 64, + .coeff_dim1_size = 1, + .coeff_dim2_size = 1, + .coeff_dim3_size = 256, + .coeff_dim4_size = 64, + .coeff_dim1_pitch = 1, + .coeff_dim2_pitch = 1, + .coeff_dim3_pitch = 256, + .bias_dim1_size = 64, + .bias_dim2_size = 1, + .outscale_dim1_size = 64, + .outscale_dim2_size = 1, + .input_buffer_size = 16384, + .coeff_buffer_size = 16384, + .output_buffer_size = 4096, + .bias_buffer_size = 256, + .outscale_buffer_size = 128, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 1, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 64, + .n_tiles = 1, + .n_tile_size_last = 64, + .height_tiles = 4, + .output_rows = 4, + .input_rows = 4, + .kernel_w = 1, + .kernel_h = 1, + .stride_x = 1, + .stride_y = 1, + .padding = 0, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 14, + .layer_name = "conv_1x1_s2_ic256_oc512", + .kernel_name = "1x1j2d1_dma", + .config_key = "256_16_16_512_1_1_8_8_2_2_0_1", + 
.src_dim1_size = 16, + .src_dim2_size = 16, + .src_dim3_size = 256, + .src_dim1_pitch = 16, + .src_dim2_pitch = 256, + .dst_dim1_size = 8, + .dst_dim2_size = 8, + .dst_dim3_size = 512, + .dst_dim1_pitch = 8, + .dst_dim2_pitch = 64, + .in_dim1_size = 16, + .in_dim1_pitch = 16, + .in_dim2_size = 3, + .in_dim2_pitch = 48, + .in_dim1_edge1 = 0, + .in_dim1_edge2 = 0, + .in_dim2_edge1 = 0, + .in_dim2_edge2 = 0, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 3, + .out_dim1_size = 8, + .out_dim1_pitch = 8, + .out_dim2_size = 2, + .out_dim2_pitch = 16, + .out_dim3_size = 64, + .coeff_dim1_size = 1, + .coeff_dim2_size = 1, + .coeff_dim3_size = 256, + .coeff_dim4_size = 512, + .coeff_dim1_pitch = 1, + .coeff_dim2_pitch = 1, + .coeff_dim3_pitch = 256, + .bias_dim1_size = 512, + .bias_dim2_size = 1, + .outscale_dim1_size = 512, + .outscale_dim2_size = 1, + .input_buffer_size = 12288, + .coeff_buffer_size = 16384, + .output_buffer_size = 1024, + .bias_buffer_size = 2048, + .outscale_buffer_size = 1024, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 1, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 64, + .n_tiles = 8, + .n_tile_size_last = 64, + .height_tiles = 4, + .output_rows = 2, + .input_rows = 3, + .kernel_w = 1, + .kernel_h = 1, + .stride_x = 2, + .stride_y = 2, + .padding = 0, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 15, + .layer_name = "conv_1x1_s1_ic256_oc128", + .kernel_name = "1x1j1d1_dma", + .config_key = "256_16_16_128_1_1_16_16_1_1_0_1", + .src_dim1_size = 16, + .src_dim2_size = 16, + .src_dim3_size = 256, + .src_dim1_pitch = 16, + .src_dim2_pitch = 256, + .dst_dim1_size = 16, + .dst_dim2_size = 16, + .dst_dim3_size = 128, + .dst_dim1_pitch = 16, + .dst_dim2_pitch = 256, + .in_dim1_size = 16, + .in_dim1_pitch = 
16, + .in_dim2_size = 2, + .in_dim2_pitch = 32, + .in_dim1_edge1 = 0, + .in_dim1_edge2 = 0, + .in_dim2_edge1 = 0, + .in_dim2_edge2 = 0, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 2, + .out_dim1_size = 16, + .out_dim1_pitch = 16, + .out_dim2_size = 2, + .out_dim2_pitch = 32, + .out_dim3_size = 64, + .coeff_dim1_size = 1, + .coeff_dim2_size = 1, + .coeff_dim3_size = 256, + .coeff_dim4_size = 128, + .coeff_dim1_pitch = 1, + .coeff_dim2_pitch = 1, + .coeff_dim3_pitch = 256, + .bias_dim1_size = 128, + .bias_dim2_size = 1, + .outscale_dim1_size = 128, + .outscale_dim2_size = 1, + .input_buffer_size = 8192, + .coeff_buffer_size = 16384, + .output_buffer_size = 2048, + .bias_buffer_size = 512, + .outscale_buffer_size = 256, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 64, + .n_tiles = 2, + .n_tile_size_last = 64, + .height_tiles = 8, + .output_rows = 2, + .input_rows = 2, + .kernel_w = 1, + .kernel_h = 1, + .stride_x = 1, + .stride_y = 1, + .padding = 0, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 16, + .layer_name = "conv_3x3_s2_ic128_oc128", + .kernel_name = "3x3j2d1_dma", + .config_key = "128_16_16_128_3_3_8_8_2_2_1_1", + .src_dim1_size = 16, + .src_dim2_size = 16, + .src_dim3_size = 128, + .src_dim1_pitch = 16, + .src_dim2_pitch = 256, + .dst_dim1_size = 8, + .dst_dim2_size = 8, + .dst_dim3_size = 128, + .dst_dim1_pitch = 8, + .dst_dim2_pitch = 64, + .in_dim1_size = 16, + .in_dim1_pitch = 18, + .in_dim2_size = 5, + .in_dim2_pitch = 90, + .in_dim1_edge1 = 1, + .in_dim1_edge2 = 1, + .in_dim2_edge1 = 1, + .in_dim2_edge2 = 1, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 19, + .in_rows_firstdma = 4, + .out_dim1_size = 8, + .out_dim1_pitch = 8, + 
.out_dim2_size = 2, + .out_dim2_pitch = 16, + .out_dim3_size = 16, + .coeff_dim1_size = 3, + .coeff_dim2_size = 3, + .coeff_dim3_size = 128, + .coeff_dim4_size = 128, + .coeff_dim1_pitch = 3, + .coeff_dim2_pitch = 9, + .coeff_dim3_pitch = 1152, + .bias_dim1_size = 128, + .bias_dim2_size = 1, + .outscale_dim1_size = 128, + .outscale_dim2_size = 1, + .input_buffer_size = 11520, + .coeff_buffer_size = 18432, + .output_buffer_size = 256, + .bias_buffer_size = 512, + .outscale_buffer_size = 256, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 1, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 16, + .n_tiles = 8, + .n_tile_size_last = 16, + .height_tiles = 4, + .output_rows = 2, + .input_rows = 5, + .kernel_w = 3, + .kernel_h = 3, + .stride_x = 2, + .stride_y = 2, + .padding = 1, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 17, + .layer_name = "conv_1x1_s1_ic128_oc512", + .kernel_name = "1x1j1d1_dma", + .config_key = "128_8_8_512_1_1_8_8_1_1_0_1", + .src_dim1_size = 8, + .src_dim2_size = 8, + .src_dim3_size = 128, + .src_dim1_pitch = 8, + .src_dim2_pitch = 64, + .dst_dim1_size = 8, + .dst_dim2_size = 8, + .dst_dim3_size = 512, + .dst_dim1_pitch = 8, + .dst_dim2_pitch = 64, + .in_dim1_size = 8, + .in_dim1_pitch = 8, + .in_dim2_size = 2, + .in_dim2_pitch = 16, + .in_dim1_edge1 = 0, + .in_dim1_edge2 = 0, + .in_dim2_edge1 = 0, + .in_dim2_edge2 = 0, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 2, + .out_dim1_size = 8, + .out_dim1_pitch = 8, + .out_dim2_size = 2, + .out_dim2_pitch = 16, + .out_dim3_size = 128, + .coeff_dim1_size = 1, + .coeff_dim2_size = 1, + .coeff_dim3_size = 128, + .coeff_dim4_size = 512, + .coeff_dim1_pitch = 1, + .coeff_dim2_pitch = 1, + .coeff_dim3_pitch = 128, + .bias_dim1_size = 512, + .bias_dim2_size = 
1, + .outscale_dim1_size = 512, + .outscale_dim2_size = 1, + .input_buffer_size = 2048, + .coeff_buffer_size = 16384, + .output_buffer_size = 2048, + .bias_buffer_size = 2048, + .outscale_buffer_size = 1024, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 128, + .n_tiles = 4, + .n_tile_size_last = 128, + .height_tiles = 4, + .output_rows = 2, + .input_rows = 2, + .kernel_w = 1, + .kernel_h = 1, + .stride_x = 1, + .stride_y = 1, + .padding = 0, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 18, + .layer_name = "conv_1x1_s1_ic512_oc128", + .kernel_name = "1x1j1d1_dma", + .config_key = "512_8_8_128_1_1_8_8_1_1_0_1", + .src_dim1_size = 8, + .src_dim2_size = 8, + .src_dim3_size = 512, + .src_dim1_pitch = 8, + .src_dim2_pitch = 64, + .dst_dim1_size = 8, + .dst_dim2_size = 8, + .dst_dim3_size = 128, + .dst_dim1_pitch = 8, + .dst_dim2_pitch = 64, + .in_dim1_size = 8, + .in_dim1_pitch = 8, + .in_dim2_size = 2, + .in_dim2_pitch = 16, + .in_dim1_edge1 = 0, + .in_dim1_edge2 = 0, + .in_dim2_edge1 = 0, + .in_dim2_edge2 = 0, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 2, + .out_dim1_size = 8, + .out_dim1_pitch = 8, + .out_dim2_size = 2, + .out_dim2_pitch = 16, + .out_dim3_size = 32, + .coeff_dim1_size = 1, + .coeff_dim2_size = 1, + .coeff_dim3_size = 512, + .coeff_dim4_size = 128, + .coeff_dim1_pitch = 1, + .coeff_dim2_pitch = 1, + .coeff_dim3_pitch = 512, + .bias_dim1_size = 128, + .bias_dim2_size = 1, + .outscale_dim1_size = 128, + .outscale_dim2_size = 1, + .input_buffer_size = 8192, + .coeff_buffer_size = 16384, + .output_buffer_size = 512, + .bias_buffer_size = 512, + .outscale_buffer_size = 256, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + 
.output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 32, + .n_tiles = 4, + .n_tile_size_last = 32, + .height_tiles = 4, + .output_rows = 2, + .input_rows = 2, + .kernel_w = 1, + .kernel_h = 1, + .stride_x = 1, + .stride_y = 1, + .padding = 0, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 19, + .layer_name = "conv_1x1_s2_ic512_oc1024", + .kernel_name = "1x1j2d1_dma", + .config_key = "512_8_8_1024_1_1_4_4_2_2_0_1", + .src_dim1_size = 8, + .src_dim2_size = 8, + .src_dim3_size = 512, + .src_dim1_pitch = 8, + .src_dim2_pitch = 64, + .dst_dim1_size = 4, + .dst_dim2_size = 4, + .dst_dim3_size = 1024, + .dst_dim1_pitch = 4, + .dst_dim2_pitch = 16, + .in_dim1_size = 8, + .in_dim1_pitch = 8, + .in_dim2_size = 3, + .in_dim2_pitch = 24, + .in_dim1_edge1 = 0, + .in_dim1_edge2 = 0, + .in_dim2_edge1 = 0, + .in_dim2_edge2 = 0, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 3, + .out_dim1_size = 4, + .out_dim1_pitch = 4, + .out_dim2_size = 2, + .out_dim2_pitch = 8, + .out_dim3_size = 32, + .coeff_dim1_size = 1, + .coeff_dim2_size = 1, + .coeff_dim3_size = 512, + .coeff_dim4_size = 1024, + .coeff_dim1_pitch = 1, + .coeff_dim2_pitch = 1, + .coeff_dim3_pitch = 512, + .bias_dim1_size = 1024, + .bias_dim2_size = 1, + .outscale_dim1_size = 1024, + .outscale_dim2_size = 1, + .input_buffer_size = 12288, + .coeff_buffer_size = 16384, + .output_buffer_size = 256, + .bias_buffer_size = 4096, + .outscale_buffer_size = 2048, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 1, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 32, + .n_tiles = 32, + .n_tile_size_last = 32, + .height_tiles = 2, + .output_rows = 2, + .input_rows = 3, + .kernel_w = 1, + .kernel_h = 1, + .stride_x = 2, + .stride_y = 2, + 
.padding = 0, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 20, + .layer_name = "conv_1x1_s1_ic512_oc256", + .kernel_name = "1x1j1d1_dma", + .config_key = "512_8_8_256_1_1_8_8_1_1_0_1", + .src_dim1_size = 8, + .src_dim2_size = 8, + .src_dim3_size = 512, + .src_dim1_pitch = 8, + .src_dim2_pitch = 64, + .dst_dim1_size = 8, + .dst_dim2_size = 8, + .dst_dim3_size = 256, + .dst_dim1_pitch = 8, + .dst_dim2_pitch = 64, + .in_dim1_size = 8, + .in_dim1_pitch = 8, + .in_dim2_size = 2, + .in_dim2_pitch = 16, + .in_dim1_edge1 = 0, + .in_dim1_edge2 = 0, + .in_dim2_edge1 = 0, + .in_dim2_edge2 = 0, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 2, + .out_dim1_size = 8, + .out_dim1_pitch = 8, + .out_dim2_size = 2, + .out_dim2_pitch = 16, + .out_dim3_size = 32, + .coeff_dim1_size = 1, + .coeff_dim2_size = 1, + .coeff_dim3_size = 512, + .coeff_dim4_size = 256, + .coeff_dim1_pitch = 1, + .coeff_dim2_pitch = 1, + .coeff_dim3_pitch = 512, + .bias_dim1_size = 256, + .bias_dim2_size = 1, + .outscale_dim1_size = 256, + .outscale_dim2_size = 1, + .input_buffer_size = 8192, + .coeff_buffer_size = 16384, + .output_buffer_size = 512, + .bias_buffer_size = 1024, + .outscale_buffer_size = 512, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 32, + .n_tiles = 8, + .n_tile_size_last = 32, + .height_tiles = 4, + .output_rows = 2, + .input_rows = 2, + .kernel_w = 1, + .kernel_h = 1, + .stride_x = 1, + .stride_y = 1, + .padding = 0, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 21, + .layer_name = "conv_3x3_s2_ic256_oc256", + .kernel_name = "3x3j2d1_dma", + .config_key = 
"256_8_8_256_3_3_4_4_2_2_1_1", + .src_dim1_size = 8, + .src_dim2_size = 8, + .src_dim3_size = 256, + .src_dim1_pitch = 8, + .src_dim2_pitch = 64, + .dst_dim1_size = 4, + .dst_dim2_size = 4, + .dst_dim3_size = 256, + .dst_dim1_pitch = 4, + .dst_dim2_pitch = 16, + .in_dim1_size = 8, + .in_dim1_pitch = 10, + .in_dim2_size = 5, + .in_dim2_pitch = 50, + .in_dim1_edge1 = 1, + .in_dim1_edge2 = 1, + .in_dim2_edge1 = 1, + .in_dim2_edge2 = 1, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 11, + .in_rows_firstdma = 4, + .out_dim1_size = 4, + .out_dim1_pitch = 4, + .out_dim2_size = 2, + .out_dim2_pitch = 8, + .out_dim3_size = 8, + .coeff_dim1_size = 3, + .coeff_dim2_size = 3, + .coeff_dim3_size = 256, + .coeff_dim4_size = 256, + .coeff_dim1_pitch = 3, + .coeff_dim2_pitch = 9, + .coeff_dim3_pitch = 2304, + .bias_dim1_size = 256, + .bias_dim2_size = 1, + .outscale_dim1_size = 256, + .outscale_dim2_size = 1, + .input_buffer_size = 12800, + .coeff_buffer_size = 18432, + .output_buffer_size = 64, + .bias_buffer_size = 1024, + .outscale_buffer_size = 512, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 1, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 8, + .n_tiles = 32, + .n_tile_size_last = 8, + .height_tiles = 2, + .output_rows = 2, + .input_rows = 5, + .kernel_w = 3, + .kernel_h = 3, + .stride_x = 2, + .stride_y = 2, + .padding = 1, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 22, + .layer_name = "conv_1x1_s1_ic256_oc1024", + .kernel_name = "1x1j1d1_dma", + .config_key = "256_4_4_1024_1_1_4_4_1_1_0_1", + .src_dim1_size = 4, + .src_dim2_size = 4, + .src_dim3_size = 256, + .src_dim1_pitch = 4, + .src_dim2_pitch = 16, + .dst_dim1_size = 4, + .dst_dim2_size = 4, + .dst_dim3_size = 1024, + .dst_dim1_pitch = 4, + .dst_dim2_pitch = 16, + .in_dim1_size = 4, + 
.in_dim1_pitch = 4, + .in_dim2_size = 2, + .in_dim2_pitch = 8, + .in_dim1_edge1 = 0, + .in_dim1_edge2 = 0, + .in_dim2_edge1 = 0, + .in_dim2_edge2 = 0, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 2, + .out_dim1_size = 4, + .out_dim1_pitch = 4, + .out_dim2_size = 2, + .out_dim2_pitch = 8, + .out_dim3_size = 64, + .coeff_dim1_size = 1, + .coeff_dim2_size = 1, + .coeff_dim3_size = 256, + .coeff_dim4_size = 1024, + .coeff_dim1_pitch = 1, + .coeff_dim2_pitch = 1, + .coeff_dim3_pitch = 256, + .bias_dim1_size = 1024, + .bias_dim2_size = 1, + .outscale_dim1_size = 1024, + .outscale_dim2_size = 1, + .input_buffer_size = 2048, + .coeff_buffer_size = 16384, + .output_buffer_size = 512, + .bias_buffer_size = 4096, + .outscale_buffer_size = 2048, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 64, + .n_tiles = 16, + .n_tile_size_last = 64, + .height_tiles = 2, + .output_rows = 2, + .input_rows = 2, + .kernel_w = 1, + .kernel_h = 1, + .stride_x = 1, + .stride_y = 1, + .padding = 0, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 23, + .layer_name = "conv_1x1_s1_ic1024_oc256", + .kernel_name = "1x1j1d1_dma", + .config_key = "1024_4_4_256_1_1_4_4_1_1_0_1", + .src_dim1_size = 4, + .src_dim2_size = 4, + .src_dim3_size = 1024, + .src_dim1_pitch = 4, + .src_dim2_pitch = 16, + .dst_dim1_size = 4, + .dst_dim2_size = 4, + .dst_dim3_size = 256, + .dst_dim1_pitch = 4, + .dst_dim2_pitch = 16, + .in_dim1_size = 4, + .in_dim1_pitch = 4, + .in_dim2_size = 2, + .in_dim2_pitch = 8, + .in_dim1_edge1 = 0, + .in_dim1_edge2 = 0, + .in_dim2_edge1 = 0, + .in_dim2_edge2 = 0, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 2, + .out_dim1_size = 4, + .out_dim1_pitch = 
4, + .out_dim2_size = 2, + .out_dim2_pitch = 8, + .out_dim3_size = 16, + .coeff_dim1_size = 1, + .coeff_dim2_size = 1, + .coeff_dim3_size = 1024, + .coeff_dim4_size = 256, + .coeff_dim1_pitch = 1, + .coeff_dim2_pitch = 1, + .coeff_dim3_pitch = 1024, + .bias_dim1_size = 256, + .bias_dim2_size = 1, + .outscale_dim1_size = 256, + .outscale_dim2_size = 1, + .input_buffer_size = 8192, + .coeff_buffer_size = 16384, + .output_buffer_size = 128, + .bias_buffer_size = 1024, + .outscale_buffer_size = 512, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 16, + .n_tiles = 16, + .n_tile_size_last = 16, + .height_tiles = 2, + .output_rows = 2, + .input_rows = 2, + .kernel_w = 1, + .kernel_h = 1, + .stride_x = 1, + .stride_y = 1, + .padding = 0, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 24, + .layer_name = "conv_1x1_s2_ic1024_oc2048", + .kernel_name = "1x1j2d1_dma", + .config_key = "1024_4_4_2048_1_1_2_2_2_2_0_1", + .src_dim1_size = 4, + .src_dim2_size = 4, + .src_dim3_size = 1024, + .src_dim1_pitch = 4, + .src_dim2_pitch = 16, + .dst_dim1_size = 2, + .dst_dim2_size = 2, + .dst_dim3_size = 2048, + .dst_dim1_pitch = 2, + .dst_dim2_pitch = 4, + .in_dim1_size = 4, + .in_dim1_pitch = 4, + .in_dim2_size = 3, + .in_dim2_pitch = 12, + .in_dim1_edge1 = 0, + .in_dim1_edge2 = 0, + .in_dim2_edge1 = 0, + .in_dim2_edge2 = 0, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 3, + .out_dim1_size = 2, + .out_dim1_pitch = 2, + .out_dim2_size = 2, + .out_dim2_pitch = 4, + .out_dim3_size = 16, + .coeff_dim1_size = 1, + .coeff_dim2_size = 1, + .coeff_dim3_size = 1024, + .coeff_dim4_size = 2048, + .coeff_dim1_pitch = 1, + .coeff_dim2_pitch = 1, + .coeff_dim3_pitch = 1024, + .bias_dim1_size = 2048, + 
.bias_dim2_size = 1, + .outscale_dim1_size = 2048, + .outscale_dim2_size = 1, + .input_buffer_size = 12288, + .coeff_buffer_size = 16384, + .output_buffer_size = 64, + .bias_buffer_size = 8192, + .outscale_buffer_size = 4096, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 1, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 16, + .n_tiles = 128, + .n_tile_size_last = 16, + .height_tiles = 1, + .output_rows = 2, + .input_rows = 3, + .kernel_w = 1, + .kernel_h = 1, + .stride_x = 2, + .stride_y = 2, + .padding = 0, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 25, + .layer_name = "conv_1x1_s1_ic1024_oc512", + .kernel_name = "1x1j1d1_dma", + .config_key = "1024_4_4_512_1_1_4_4_1_1_0_1", + .src_dim1_size = 4, + .src_dim2_size = 4, + .src_dim3_size = 1024, + .src_dim1_pitch = 4, + .src_dim2_pitch = 16, + .dst_dim1_size = 4, + .dst_dim2_size = 4, + .dst_dim3_size = 512, + .dst_dim1_pitch = 4, + .dst_dim2_pitch = 16, + .in_dim1_size = 4, + .in_dim1_pitch = 4, + .in_dim2_size = 2, + .in_dim2_pitch = 8, + .in_dim1_edge1 = 0, + .in_dim1_edge2 = 0, + .in_dim2_edge1 = 0, + .in_dim2_edge2 = 0, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 2, + .out_dim1_size = 4, + .out_dim1_pitch = 4, + .out_dim2_size = 2, + .out_dim2_pitch = 8, + .out_dim3_size = 16, + .coeff_dim1_size = 1, + .coeff_dim2_size = 1, + .coeff_dim3_size = 1024, + .coeff_dim4_size = 512, + .coeff_dim1_pitch = 1, + .coeff_dim2_pitch = 1, + .coeff_dim3_pitch = 1024, + .bias_dim1_size = 512, + .bias_dim2_size = 1, + .outscale_dim1_size = 512, + .outscale_dim2_size = 1, + .input_buffer_size = 8192, + .coeff_buffer_size = 16384, + .output_buffer_size = 128, + .bias_buffer_size = 2048, + .outscale_buffer_size = 1024, + .input_ping_dram = 0, + .input_pong_dram = 0, + 
.coeff_dram = 0, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 16, + .n_tiles = 32, + .n_tile_size_last = 16, + .height_tiles = 2, + .output_rows = 2, + .input_rows = 2, + .kernel_w = 1, + .kernel_h = 1, + .stride_x = 1, + .stride_y = 1, + .padding = 0, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 26, + .layer_name = "conv_3x3_s2_ic512_oc512", + .kernel_name = "3x3j2d1_dma", + .config_key = "512_4_4_512_3_3_2_2_2_2_1_1", + .src_dim1_size = 4, + .src_dim2_size = 4, + .src_dim3_size = 512, + .src_dim1_pitch = 4, + .src_dim2_pitch = 16, + .dst_dim1_size = 2, + .dst_dim2_size = 2, + .dst_dim3_size = 512, + .dst_dim1_pitch = 2, + .dst_dim2_pitch = 4, + .in_dim1_size = 4, + .in_dim1_pitch = 6, + .in_dim2_size = 5, + .in_dim2_pitch = 30, + .in_dim1_edge1 = 1, + .in_dim1_edge2 = 1, + .in_dim2_edge1 = 1, + .in_dim2_edge2 = 1, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 7, + .in_rows_firstdma = 4, + .out_dim1_size = 2, + .out_dim1_pitch = 2, + .out_dim2_size = 2, + .out_dim2_pitch = 4, + .out_dim3_size = 4, + .coeff_dim1_size = 3, + .coeff_dim2_size = 3, + .coeff_dim3_size = 512, + .coeff_dim4_size = 512, + .coeff_dim1_pitch = 3, + .coeff_dim2_pitch = 9, + .coeff_dim3_pitch = 4608, + .bias_dim1_size = 512, + .bias_dim2_size = 1, + .outscale_dim1_size = 512, + .outscale_dim2_size = 1, + .input_buffer_size = 15360, + .coeff_buffer_size = 18432, + .output_buffer_size = 16, + .bias_buffer_size = 2048, + .outscale_buffer_size = 1024, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 1, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 4, + .n_tiles = 128, + .n_tile_size_last = 4, + .height_tiles = 1, + .output_rows = 2, + .input_rows = 5, + .kernel_w = 3, + .kernel_h = 3, + .stride_x = 2, + 
.stride_y = 2, + .padding = 1, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 27, + .layer_name = "conv_1x1_s1_ic512_oc2048", + .kernel_name = "1x1j1d1_dma", + .config_key = "512_2_2_2048_1_1_2_2_1_1_0_1", + .src_dim1_size = 2, + .src_dim2_size = 2, + .src_dim3_size = 512, + .src_dim1_pitch = 2, + .src_dim2_pitch = 4, + .dst_dim1_size = 2, + .dst_dim2_size = 2, + .dst_dim3_size = 2048, + .dst_dim1_pitch = 2, + .dst_dim2_pitch = 4, + .in_dim1_size = 2, + .in_dim1_pitch = 2, + .in_dim2_size = 2, + .in_dim2_pitch = 4, + .in_dim1_edge1 = 0, + .in_dim1_edge2 = 0, + .in_dim2_edge1 = 0, + .in_dim2_edge2 = 0, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 2, + .out_dim1_size = 2, + .out_dim1_pitch = 2, + .out_dim2_size = 2, + .out_dim2_pitch = 4, + .out_dim3_size = 32, + .coeff_dim1_size = 1, + .coeff_dim2_size = 1, + .coeff_dim3_size = 512, + .coeff_dim4_size = 2048, + .coeff_dim1_pitch = 1, + .coeff_dim2_pitch = 1, + .coeff_dim3_pitch = 512, + .bias_dim1_size = 2048, + .bias_dim2_size = 1, + .outscale_dim1_size = 2048, + .outscale_dim2_size = 1, + .input_buffer_size = 2048, + .coeff_buffer_size = 16384, + .output_buffer_size = 128, + .bias_buffer_size = 8192, + .outscale_buffer_size = 4096, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 32, + .n_tiles = 64, + .n_tile_size_last = 32, + .height_tiles = 1, + .output_rows = 2, + .input_rows = 2, + .kernel_w = 1, + .kernel_h = 1, + .stride_x = 1, + .stride_y = 1, + .padding = 0, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 28, + .layer_name = "conv_1x1_s1_ic2048_oc512", + .kernel_name = 
"1x1j1d1_dma", + .config_key = "2048_2_2_512_1_1_2_2_1_1_0_1", + .src_dim1_size = 2, + .src_dim2_size = 2, + .src_dim3_size = 2048, + .src_dim1_pitch = 2, + .src_dim2_pitch = 4, + .dst_dim1_size = 2, + .dst_dim2_size = 2, + .dst_dim3_size = 512, + .dst_dim1_pitch = 2, + .dst_dim2_pitch = 4, + .in_dim1_size = 2, + .in_dim1_pitch = 2, + .in_dim2_size = 2, + .in_dim2_pitch = 4, + .in_dim1_edge1 = 0, + .in_dim1_edge2 = 0, + .in_dim2_edge1 = 0, + .in_dim2_edge2 = 0, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 2, + .out_dim1_size = 2, + .out_dim1_pitch = 2, + .out_dim2_size = 2, + .out_dim2_pitch = 4, + .out_dim3_size = 8, + .coeff_dim1_size = 1, + .coeff_dim2_size = 1, + .coeff_dim3_size = 2048, + .coeff_dim4_size = 512, + .coeff_dim1_pitch = 1, + .coeff_dim2_pitch = 1, + .coeff_dim3_pitch = 2048, + .bias_dim1_size = 512, + .bias_dim2_size = 1, + .outscale_dim1_size = 512, + .outscale_dim2_size = 1, + .input_buffer_size = 8192, + .coeff_buffer_size = 16384, + .output_buffer_size = 32, + .bias_buffer_size = 2048, + .outscale_buffer_size = 1024, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 8, + .n_tiles = 64, + .n_tile_size_last = 8, + .height_tiles = 1, + .output_rows = 2, + .input_rows = 2, + .kernel_w = 1, + .kernel_h = 1, + .stride_x = 1, + .stride_y = 1, + .padding = 0, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, +}; + +/* Return the NUM_CONV_LAYERS constant. */ static inline int get_num_conv_layers(void) { return NUM_CONV_LAYERS; } + +/* Positional lookup: layer_id is used directly as the index into CONV_LAYER_CONFIGS (it is not compared with each entry's .layer_id field). Returns NULL when layer_id is outside [0, NUM_CONV_LAYERS). */ static inline const conv_layer_config_t* get_conv_config(int layer_id) { + if (layer_id < 0 || layer_id >= NUM_CONV_LAYERS) return NULL; + return &CONV_LAYER_CONFIGS[layer_id]; +} + +/* Linear search of CONV_LAYER_CONFIGS; returns the first entry for which all twelve arguments match, or NULL. Argument -> field compared: ic/ih/iw -> src_dim3/2/1_size, oc/oh/ow -> dst_dim3/2/1_size, kh/kw -> coeff_dim2/1_size, sy/sx -> stride_y/x, pad -> padding, dil -> dilation. */ static inline const conv_layer_config_t* get_layer_config_by_params( + int ic, int ih, int iw, + int oc, int 
kh, int kw, + int oh, int ow, + int sy, int sx, + int pad, int dil) +{ + for (int i = 0; i < NUM_CONV_LAYERS; i++) { + const conv_layer_config_t* cfg = &CONV_LAYER_CONFIGS[i]; + if (cfg->src_dim3_size == ic && + cfg->src_dim2_size == ih && + cfg->src_dim1_size == iw && + cfg->dst_dim3_size == oc && + cfg->coeff_dim2_size == kh && + cfg->coeff_dim1_size == kw && + cfg->dst_dim2_size == oh && + cfg->dst_dim1_size == ow && + cfg->stride_y == sy && + cfg->stride_x == sx && + cfg->padding == pad && + cfg->dilation == dil) + return cfg; /* first entry matching every compared field wins */ + } + return NULL; /* no entry matched */ +} + +/* Look up a conv config by its config_key string. Returns NULL when config_key is NULL or no entry has an identical key; entries whose own config_key is NULL are skipped. The first exact match in array order is returned. */ static inline const conv_layer_config_t* get_layer_config_by_key(const char* config_key) { + if (config_key == NULL) return NULL; + for (int i = 0; i < NUM_CONV_LAYERS; i++) { + const conv_layer_config_t* cfg = &CONV_LAYER_CONFIGS[i]; + if (cfg->config_key != NULL) { + /* hand-rolled string equality; presumably written inline to avoid depending on <string.h> -- confirm */ const char* a = config_key; + const char* b = cfg->config_key; + while (*a && *b && *a == *b) { a++; b++; } /* walk the common prefix */ + if (*a == '\0' && *b == '\0') return cfg; /* both strings ended together: exact match */ + } + } + return NULL; +} + +/* ====================================================================== */ +/* MaxPool configurations */ +/* ====================================================================== */ + +/* Configuration record for one maxpool layer; instances live in MAXPOOL_LAYER_CONFIGS. Field meanings are not documented in this header -- NOTE(review): names suggest source/destination geometry, kernel/stride/pad, per-tile sizes, channel and height tiling, buffer sizes and ping/pong buffer placement; confirm against the generator. */ typedef struct { + int layer_id; + const char* layer_name; + const char* config_key; + + int src_width; int src_height; int channels; + int dst_width; int dst_height; + + int src_row_pitch; int src_plane_pitch; + int dst_row_pitch; int dst_plane_pitch; + + int kernel_h; int kernel_w; + int stride_h; int stride_w; + int pad_h; int pad_w; + + int in_tile_w; int in_tile_rows; int in_tile_plane; + int in_data_offset; + int out_tile_w; int out_tile_rows; int out_tile_plane; + + int c_tile_size; int c_tiles; int c_tile_size_last; + int height_tiles; int output_rows; int input_rows; + + int input_buffer_size; int output_buffer_size; + + int input_ping_dram; int input_pong_dram; + int output_ping_dram; int output_pong_dram; +} maxpool_layer_config_t; + +#define NUM_MAXPOOL_LAYERS 1 /* bound used when indexing and scanning MAXPOOL_LAYER_CONFIGS; must not exceed its entry count */ + +static 
const maxpool_layer_config_t MAXPOOL_LAYER_CONFIGS[] = { + { + .layer_id = 0, + .layer_name = "maxpool_3x3s2_c64_32x32", + .config_key = "64_32_32_3_3_2_2_1_1", + .src_width = 32, + .src_height = 32, + .channels = 64, + .dst_width = 16, + .dst_height = 16, + .src_row_pitch = 32, + .src_plane_pitch = 1024, + .dst_row_pitch = 16, + .dst_plane_pitch = 256, + .kernel_h = 3, + .kernel_w = 3, + .stride_h = 2, + .stride_w = 2, + .pad_h = 1, + .pad_w = 1, + .in_tile_w = 34, + .in_tile_rows = 5, + .in_tile_plane = 170, + .in_data_offset = 35, + .out_tile_w = 16, + .out_tile_rows = 1, + .out_tile_plane = 16, + .c_tile_size = 44, + .c_tiles = 2, + .c_tile_size_last = 20, + .height_tiles = 16, + .output_rows = 1, + .input_rows = 3, + .input_buffer_size = 29920, + .output_buffer_size = 2816, + .input_ping_dram = 0, + .input_pong_dram = 1, + .output_ping_dram = 1, + .output_pong_dram = 0, + }, +}; + +static inline int get_num_maxpool_layers(void) { return NUM_MAXPOOL_LAYERS; } + +static inline const maxpool_layer_config_t* get_maxpool_config(int layer_id) { + if (layer_id < 0 || layer_id >= NUM_MAXPOOL_LAYERS) return NULL; + return &MAXPOOL_LAYER_CONFIGS[layer_id]; +} + +static inline const maxpool_layer_config_t* get_maxpool_config_by_params( + int channels, int src_height, int src_width, + int kernel_h, int kernel_w, + int stride_h, int stride_w, + int pad_h, int pad_w) +{ + for (int i = 0; i < NUM_MAXPOOL_LAYERS; i++) { + const maxpool_layer_config_t* c = &MAXPOOL_LAYER_CONFIGS[i]; + if (c->channels == channels && + c->src_height == src_height && + c->src_width == src_width && + c->kernel_h == kernel_h && + c->kernel_w == kernel_w && + c->stride_h == stride_h && + c->stride_w == stride_w && + c->pad_h == pad_h && + c->pad_w == pad_w) + return c; + } + return NULL; +} + +#endif /* LAYER_CONFIGS_H */ diff --git a/backends/cadence/vision/config_generator/layer_configs_4k.h b/backends/cadence/vision/config_generator/layer_configs_4k.h new file mode 100644 index 
00000000000..481adbd63b9 --- /dev/null +++ b/backends/cadence/vision/config_generator/layer_configs_4k.h @@ -0,0 +1,2403 @@ +/* + * layer_configs.h + * + * Auto-generated conv2d + maxpool layer configurations + * Generated from PTE extraction by generate_combined_configs.py + * + * DO NOT EDIT MANUALLY + */ + +#ifndef LAYER_CONFIGS_H +#define LAYER_CONFIGS_H + +#include +#include /* for NULL */ + +#define IDMA_BUFFER_SIZE_DRAM0 (4096) /* 4 KB */ +#define IDMA_BUFFER_SIZE_DRAM1 (4096) /* 4 KB */ + +/* ====================================================================== */ +/* Conv2d configurations */ +/* ====================================================================== */ + +typedef struct { + int layer_id; + const char* layer_name; + const char* kernel_name; + const char* config_key; + + int src_dim1_size; int src_dim2_size; int src_dim3_size; + int src_dim1_pitch; int src_dim2_pitch; + + int dst_dim1_size; int dst_dim2_size; int dst_dim3_size; + int dst_dim1_pitch; int dst_dim2_pitch; + + int in_dim1_size; int in_dim1_pitch; + int in_dim2_size; int in_dim2_pitch; + int in_dim1_edge1; int in_dim1_edge2; + int in_dim2_edge1; int in_dim2_edge2; + int in_dim3_edge1; int in_dim3_edge2; + int in_data_offset; int in_rows_firstdma; + + int out_dim1_size; int out_dim1_pitch; + int out_dim2_size; int out_dim2_pitch; + int out_dim3_size; + + int coeff_dim1_size; int coeff_dim2_size; + int coeff_dim3_size; int coeff_dim4_size; + int coeff_dim1_pitch; int coeff_dim2_pitch; int coeff_dim3_pitch; + + int bias_dim1_size; int bias_dim2_size; + int outscale_dim1_size; int outscale_dim2_size; + + int input_buffer_size; int coeff_buffer_size; int output_buffer_size; + int bias_buffer_size; int outscale_buffer_size; + + int input_ping_dram; int input_pong_dram; int coeff_dram; + int output_ping_dram; int output_pong_dram; + int bias_dram; int outscale_dram; + + int n_tile_size; int n_tiles; int n_tile_size_last; + int height_tiles; int output_rows; int input_rows; + + int 
kernel_w; int kernel_h; + int stride_x; int stride_y; + int padding; int dilation; + int accum_shift; int relu_max; int relu_min; + int output_shift; int output_scale; int flags; + int input_zero_point; +} conv_layer_config_t; + +#define NUM_CONV_LAYERS 29 + +static const conv_layer_config_t CONV_LAYER_CONFIGS[] = { + { + .layer_id = 0, + .layer_name = "conv_7x7_s2_ic3_oc64", + .kernel_name = "7x7j2d1_dma", + .config_key = "3_64_64_64_7_7_32_32_2_2_3_1", + .src_dim1_size = 64, + .src_dim2_size = 64, + .src_dim3_size = 3, + .src_dim1_pitch = 64, + .src_dim2_pitch = 4096, + .dst_dim1_size = 32, + .dst_dim2_size = 32, + .dst_dim3_size = 64, + .dst_dim1_pitch = 32, + .dst_dim2_pitch = 1024, + .in_dim1_size = 64, + .in_dim1_pitch = 70, + .in_dim2_size = 9, + .in_dim2_pitch = 630, + .in_dim1_edge1 = 3, + .in_dim1_edge2 = 3, + .in_dim2_edge1 = 3, + .in_dim2_edge2 = 3, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 213, + .in_rows_firstdma = 6, + .out_dim1_size = 32, + .out_dim1_pitch = 32, + .out_dim2_size = 2, + .out_dim2_pitch = 64, + .out_dim3_size = 8, + .coeff_dim1_size = 7, + .coeff_dim2_size = 7, + .coeff_dim3_size = 3, + .coeff_dim4_size = 64, + .coeff_dim1_pitch = 7, + .coeff_dim2_pitch = 49, + .coeff_dim3_pitch = 147, + .bias_dim1_size = 64, + .bias_dim2_size = 1, + .outscale_dim1_size = 64, + .outscale_dim2_size = 1, + .input_buffer_size = 1890, + .coeff_buffer_size = 1176, + .output_buffer_size = 512, + .bias_buffer_size = 256, + .outscale_buffer_size = 128, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 1, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 8, + .n_tiles = 8, + .n_tile_size_last = 8, + .height_tiles = 16, + .output_rows = 2, + .input_rows = 9, + .kernel_w = 7, + .kernel_h = 7, + .stride_x = 2, + .stride_y = 2, + .padding = 3, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, 
+ .input_zero_point = 0, + }, + { + .layer_id = 1, + .layer_name = "conv_3x3_s1_ic64_oc64", + .kernel_name = "3x3j1d1_no_dma", + .config_key = "64_16_16_64_3_3_16_16_1_1_1_1", + .src_dim1_size = 16, + .src_dim2_size = 16, + .src_dim3_size = 64, + .src_dim1_pitch = 16, + .src_dim2_pitch = 256, + .dst_dim1_size = 16, + .dst_dim2_size = 16, + .dst_dim3_size = 64, + .dst_dim1_pitch = 16, + .dst_dim2_pitch = 256, + .in_dim1_size = 16, + .in_dim1_pitch = 18, + .in_dim2_size = 16, + .in_dim2_pitch = 324, + .in_dim1_edge1 = 1, + .in_dim1_edge2 = 1, + .in_dim2_edge1 = 1, + .in_dim2_edge2 = 1, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 16, + .out_dim1_size = 16, + .out_dim1_pitch = 16, + .out_dim2_size = 16, + .out_dim2_pitch = 256, + .out_dim3_size = 64, + .coeff_dim1_size = 3, + .coeff_dim2_size = 3, + .coeff_dim3_size = 64, + .coeff_dim4_size = 64, + .coeff_dim1_pitch = 3, + .coeff_dim2_pitch = 9, + .coeff_dim3_pitch = 576, + .bias_dim1_size = 64, + .bias_dim2_size = 1, + .outscale_dim1_size = 64, + .outscale_dim2_size = 1, + .input_buffer_size = 20736, + .coeff_buffer_size = 36864, + .output_buffer_size = 16384, + .bias_buffer_size = 256, + .outscale_buffer_size = 128, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 0, + .output_pong_dram = 0, + .bias_dram = 0, + .outscale_dram = 0, + .n_tile_size = 64, + .n_tiles = 1, + .n_tile_size_last = 64, + .height_tiles = 1, + .output_rows = 16, + .input_rows = 16, + .kernel_w = 3, + .kernel_h = 3, + .stride_x = 1, + .stride_y = 1, + .padding = 1, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 2, + .layer_name = "conv_1x1_s2_ic64_oc128", + .kernel_name = "1x1j2d1_dma", + .config_key = "64_16_16_128_1_1_8_8_2_2_0_1", + .src_dim1_size = 16, + .src_dim2_size = 16, + .src_dim3_size = 64, + .src_dim1_pitch = 16, + 
.src_dim2_pitch = 256, + .dst_dim1_size = 8, + .dst_dim2_size = 8, + .dst_dim3_size = 128, + .dst_dim1_pitch = 8, + .dst_dim2_pitch = 64, + .in_dim1_size = 16, + .in_dim1_pitch = 16, + .in_dim2_size = 3, + .in_dim2_pitch = 48, + .in_dim1_edge1 = 0, + .in_dim1_edge2 = 0, + .in_dim2_edge1 = 0, + .in_dim2_edge2 = 0, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 3, + .out_dim1_size = 8, + .out_dim1_pitch = 8, + .out_dim2_size = 2, + .out_dim2_pitch = 16, + .out_dim3_size = 8, + .coeff_dim1_size = 1, + .coeff_dim2_size = 1, + .coeff_dim3_size = 64, + .coeff_dim4_size = 128, + .coeff_dim1_pitch = 1, + .coeff_dim2_pitch = 1, + .coeff_dim3_pitch = 64, + .bias_dim1_size = 128, + .bias_dim2_size = 1, + .outscale_dim1_size = 128, + .outscale_dim2_size = 1, + .input_buffer_size = 3072, + .coeff_buffer_size = 512, + .output_buffer_size = 128, + .bias_buffer_size = 512, + .outscale_buffer_size = 256, + .input_ping_dram = 0, + .input_pong_dram = 1, + .coeff_dram = 0, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 8, + .n_tiles = 16, + .n_tile_size_last = 8, + .height_tiles = 4, + .output_rows = 2, + .input_rows = 3, + .kernel_w = 1, + .kernel_h = 1, + .stride_x = 2, + .stride_y = 2, + .padding = 0, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 3, + .layer_name = "conv_3x3_s2_ic64_oc128", + .kernel_name = "3x3j2d1_no_dma", + .config_key = "64_16_16_128_3_3_8_8_2_2_1_1", + .src_dim1_size = 16, + .src_dim2_size = 16, + .src_dim3_size = 64, + .src_dim1_pitch = 16, + .src_dim2_pitch = 256, + .dst_dim1_size = 8, + .dst_dim2_size = 8, + .dst_dim3_size = 128, + .dst_dim1_pitch = 8, + .dst_dim2_pitch = 64, + .in_dim1_size = 16, + .in_dim1_pitch = 18, + .in_dim2_size = 16, + .in_dim2_pitch = 324, + .in_dim1_edge1 = 1, + .in_dim1_edge2 = 1, + .in_dim2_edge1 = 
1, + .in_dim2_edge2 = 1, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 16, + .out_dim1_size = 8, + .out_dim1_pitch = 8, + .out_dim2_size = 8, + .out_dim2_pitch = 64, + .out_dim3_size = 128, + .coeff_dim1_size = 3, + .coeff_dim2_size = 3, + .coeff_dim3_size = 64, + .coeff_dim4_size = 128, + .coeff_dim1_pitch = 3, + .coeff_dim2_pitch = 9, + .coeff_dim3_pitch = 576, + .bias_dim1_size = 128, + .bias_dim2_size = 1, + .outscale_dim1_size = 128, + .outscale_dim2_size = 1, + .input_buffer_size = 20736, + .coeff_buffer_size = 73728, + .output_buffer_size = 8192, + .bias_buffer_size = 512, + .outscale_buffer_size = 256, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 0, + .output_pong_dram = 0, + .bias_dram = 0, + .outscale_dram = 0, + .n_tile_size = 128, + .n_tiles = 1, + .n_tile_size_last = 128, + .height_tiles = 1, + .output_rows = 8, + .input_rows = 16, + .kernel_w = 3, + .kernel_h = 3, + .stride_x = 2, + .stride_y = 2, + .padding = 1, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 4, + .layer_name = "conv_3x3_s1_ic128_oc128", + .kernel_name = "3x3j1d1_no_dma", + .config_key = "128_8_8_128_3_3_8_8_1_1_1_1", + .src_dim1_size = 8, + .src_dim2_size = 8, + .src_dim3_size = 128, + .src_dim1_pitch = 8, + .src_dim2_pitch = 64, + .dst_dim1_size = 8, + .dst_dim2_size = 8, + .dst_dim3_size = 128, + .dst_dim1_pitch = 8, + .dst_dim2_pitch = 64, + .in_dim1_size = 8, + .in_dim1_pitch = 10, + .in_dim2_size = 8, + .in_dim2_pitch = 100, + .in_dim1_edge1 = 1, + .in_dim1_edge2 = 1, + .in_dim2_edge1 = 1, + .in_dim2_edge2 = 1, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 8, + .out_dim1_size = 8, + .out_dim1_pitch = 8, + .out_dim2_size = 8, + .out_dim2_pitch = 64, + .out_dim3_size = 128, + .coeff_dim1_size = 3, + .coeff_dim2_size = 3, + 
.coeff_dim3_size = 128, + .coeff_dim4_size = 128, + .coeff_dim1_pitch = 3, + .coeff_dim2_pitch = 9, + .coeff_dim3_pitch = 1152, + .bias_dim1_size = 128, + .bias_dim2_size = 1, + .outscale_dim1_size = 128, + .outscale_dim2_size = 1, + .input_buffer_size = 12800, + .coeff_buffer_size = 147456, + .output_buffer_size = 8192, + .bias_buffer_size = 512, + .outscale_buffer_size = 256, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 0, + .output_pong_dram = 0, + .bias_dram = 0, + .outscale_dram = 0, + .n_tile_size = 128, + .n_tiles = 1, + .n_tile_size_last = 128, + .height_tiles = 1, + .output_rows = 8, + .input_rows = 8, + .kernel_w = 3, + .kernel_h = 3, + .stride_x = 1, + .stride_y = 1, + .padding = 1, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 5, + .layer_name = "conv_1x1_s2_ic128_oc256", + .kernel_name = "1x1j2d1_no_dma", + .config_key = "128_8_8_256_1_1_4_4_2_2_0_1", + .src_dim1_size = 8, + .src_dim2_size = 8, + .src_dim3_size = 128, + .src_dim1_pitch = 8, + .src_dim2_pitch = 64, + .dst_dim1_size = 4, + .dst_dim2_size = 4, + .dst_dim3_size = 256, + .dst_dim1_pitch = 4, + .dst_dim2_pitch = 16, + .in_dim1_size = 8, + .in_dim1_pitch = 8, + .in_dim2_size = 8, + .in_dim2_pitch = 64, + .in_dim1_edge1 = 0, + .in_dim1_edge2 = 0, + .in_dim2_edge1 = 0, + .in_dim2_edge2 = 0, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 8, + .out_dim1_size = 4, + .out_dim1_pitch = 4, + .out_dim2_size = 4, + .out_dim2_pitch = 16, + .out_dim3_size = 256, + .coeff_dim1_size = 1, + .coeff_dim2_size = 1, + .coeff_dim3_size = 128, + .coeff_dim4_size = 256, + .coeff_dim1_pitch = 1, + .coeff_dim2_pitch = 1, + .coeff_dim3_pitch = 128, + .bias_dim1_size = 256, + .bias_dim2_size = 1, + .outscale_dim1_size = 256, + .outscale_dim2_size = 1, + .input_buffer_size = 8192, + .coeff_buffer_size = 
32768, + .output_buffer_size = 4096, + .bias_buffer_size = 1024, + .outscale_buffer_size = 512, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 0, + .output_pong_dram = 0, + .bias_dram = 0, + .outscale_dram = 0, + .n_tile_size = 256, + .n_tiles = 1, + .n_tile_size_last = 256, + .height_tiles = 1, + .output_rows = 4, + .input_rows = 8, + .kernel_w = 1, + .kernel_h = 1, + .stride_x = 2, + .stride_y = 2, + .padding = 0, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 6, + .layer_name = "conv_3x3_s2_ic128_oc256", + .kernel_name = "3x3j2d1_no_dma", + .config_key = "128_8_8_256_3_3_4_4_2_2_1_1", + .src_dim1_size = 8, + .src_dim2_size = 8, + .src_dim3_size = 128, + .src_dim1_pitch = 8, + .src_dim2_pitch = 64, + .dst_dim1_size = 4, + .dst_dim2_size = 4, + .dst_dim3_size = 256, + .dst_dim1_pitch = 4, + .dst_dim2_pitch = 16, + .in_dim1_size = 8, + .in_dim1_pitch = 10, + .in_dim2_size = 8, + .in_dim2_pitch = 100, + .in_dim1_edge1 = 1, + .in_dim1_edge2 = 1, + .in_dim2_edge1 = 1, + .in_dim2_edge2 = 1, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 8, + .out_dim1_size = 4, + .out_dim1_pitch = 4, + .out_dim2_size = 4, + .out_dim2_pitch = 16, + .out_dim3_size = 256, + .coeff_dim1_size = 3, + .coeff_dim2_size = 3, + .coeff_dim3_size = 128, + .coeff_dim4_size = 256, + .coeff_dim1_pitch = 3, + .coeff_dim2_pitch = 9, + .coeff_dim3_pitch = 1152, + .bias_dim1_size = 256, + .bias_dim2_size = 1, + .outscale_dim1_size = 256, + .outscale_dim2_size = 1, + .input_buffer_size = 12800, + .coeff_buffer_size = 294912, + .output_buffer_size = 4096, + .bias_buffer_size = 1024, + .outscale_buffer_size = 512, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 0, + .output_pong_dram = 0, + .bias_dram = 0, + .outscale_dram = 0, + .n_tile_size = 256, + 
.n_tiles = 1, + .n_tile_size_last = 256, + .height_tiles = 1, + .output_rows = 4, + .input_rows = 8, + .kernel_w = 3, + .kernel_h = 3, + .stride_x = 2, + .stride_y = 2, + .padding = 1, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 7, + .layer_name = "conv_3x3_s1_ic256_oc256", + .kernel_name = "3x3j1d1_no_dma", + .config_key = "256_4_4_256_3_3_4_4_1_1_1_1", + .src_dim1_size = 4, + .src_dim2_size = 4, + .src_dim3_size = 256, + .src_dim1_pitch = 4, + .src_dim2_pitch = 16, + .dst_dim1_size = 4, + .dst_dim2_size = 4, + .dst_dim3_size = 256, + .dst_dim1_pitch = 4, + .dst_dim2_pitch = 16, + .in_dim1_size = 4, + .in_dim1_pitch = 6, + .in_dim2_size = 4, + .in_dim2_pitch = 36, + .in_dim1_edge1 = 1, + .in_dim1_edge2 = 1, + .in_dim2_edge1 = 1, + .in_dim2_edge2 = 1, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 4, + .out_dim1_size = 4, + .out_dim1_pitch = 4, + .out_dim2_size = 4, + .out_dim2_pitch = 16, + .out_dim3_size = 256, + .coeff_dim1_size = 3, + .coeff_dim2_size = 3, + .coeff_dim3_size = 256, + .coeff_dim4_size = 256, + .coeff_dim1_pitch = 3, + .coeff_dim2_pitch = 9, + .coeff_dim3_pitch = 2304, + .bias_dim1_size = 256, + .bias_dim2_size = 1, + .outscale_dim1_size = 256, + .outscale_dim2_size = 1, + .input_buffer_size = 9216, + .coeff_buffer_size = 589824, + .output_buffer_size = 4096, + .bias_buffer_size = 1024, + .outscale_buffer_size = 512, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 0, + .output_pong_dram = 0, + .bias_dram = 0, + .outscale_dram = 0, + .n_tile_size = 256, + .n_tiles = 1, + .n_tile_size_last = 256, + .height_tiles = 1, + .output_rows = 4, + .input_rows = 4, + .kernel_w = 3, + .kernel_h = 3, + .stride_x = 1, + .stride_y = 1, + .padding = 1, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + 
.output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 8, + .layer_name = "conv_1x1_s2_ic256_oc512", + .kernel_name = "1x1j2d1_no_dma", + .config_key = "256_4_4_512_1_1_2_2_2_2_0_1", + .src_dim1_size = 4, + .src_dim2_size = 4, + .src_dim3_size = 256, + .src_dim1_pitch = 4, + .src_dim2_pitch = 16, + .dst_dim1_size = 2, + .dst_dim2_size = 2, + .dst_dim3_size = 512, + .dst_dim1_pitch = 2, + .dst_dim2_pitch = 4, + .in_dim1_size = 4, + .in_dim1_pitch = 4, + .in_dim2_size = 4, + .in_dim2_pitch = 16, + .in_dim1_edge1 = 0, + .in_dim1_edge2 = 0, + .in_dim2_edge1 = 0, + .in_dim2_edge2 = 0, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 4, + .out_dim1_size = 2, + .out_dim1_pitch = 2, + .out_dim2_size = 2, + .out_dim2_pitch = 4, + .out_dim3_size = 512, + .coeff_dim1_size = 1, + .coeff_dim2_size = 1, + .coeff_dim3_size = 256, + .coeff_dim4_size = 512, + .coeff_dim1_pitch = 1, + .coeff_dim2_pitch = 1, + .coeff_dim3_pitch = 256, + .bias_dim1_size = 512, + .bias_dim2_size = 1, + .outscale_dim1_size = 512, + .outscale_dim2_size = 1, + .input_buffer_size = 4096, + .coeff_buffer_size = 131072, + .output_buffer_size = 2048, + .bias_buffer_size = 2048, + .outscale_buffer_size = 1024, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 0, + .output_pong_dram = 0, + .bias_dram = 0, + .outscale_dram = 0, + .n_tile_size = 512, + .n_tiles = 1, + .n_tile_size_last = 512, + .height_tiles = 1, + .output_rows = 2, + .input_rows = 4, + .kernel_w = 1, + .kernel_h = 1, + .stride_x = 2, + .stride_y = 2, + .padding = 0, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 9, + .layer_name = "conv_3x3_s2_ic256_oc512", + .kernel_name = "3x3j2d1_no_dma", + .config_key = "256_4_4_512_3_3_2_2_2_2_1_1", + .src_dim1_size = 4, + .src_dim2_size = 4, + .src_dim3_size = 256, + 
.src_dim1_pitch = 4, + .src_dim2_pitch = 16, + .dst_dim1_size = 2, + .dst_dim2_size = 2, + .dst_dim3_size = 512, + .dst_dim1_pitch = 2, + .dst_dim2_pitch = 4, + .in_dim1_size = 4, + .in_dim1_pitch = 6, + .in_dim2_size = 4, + .in_dim2_pitch = 36, + .in_dim1_edge1 = 1, + .in_dim1_edge2 = 1, + .in_dim2_edge1 = 1, + .in_dim2_edge2 = 1, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 4, + .out_dim1_size = 2, + .out_dim1_pitch = 2, + .out_dim2_size = 2, + .out_dim2_pitch = 4, + .out_dim3_size = 512, + .coeff_dim1_size = 3, + .coeff_dim2_size = 3, + .coeff_dim3_size = 256, + .coeff_dim4_size = 512, + .coeff_dim1_pitch = 3, + .coeff_dim2_pitch = 9, + .coeff_dim3_pitch = 2304, + .bias_dim1_size = 512, + .bias_dim2_size = 1, + .outscale_dim1_size = 512, + .outscale_dim2_size = 1, + .input_buffer_size = 9216, + .coeff_buffer_size = 1179648, + .output_buffer_size = 2048, + .bias_buffer_size = 2048, + .outscale_buffer_size = 1024, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 0, + .output_pong_dram = 0, + .bias_dram = 0, + .outscale_dram = 0, + .n_tile_size = 512, + .n_tiles = 1, + .n_tile_size_last = 512, + .height_tiles = 1, + .output_rows = 2, + .input_rows = 4, + .kernel_w = 3, + .kernel_h = 3, + .stride_x = 2, + .stride_y = 2, + .padding = 1, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 10, + .layer_name = "conv_3x3_s1_ic512_oc512", + .kernel_name = "3x3j1d1_no_dma", + .config_key = "512_2_2_512_3_3_2_2_1_1_1_1", + .src_dim1_size = 2, + .src_dim2_size = 2, + .src_dim3_size = 512, + .src_dim1_pitch = 2, + .src_dim2_pitch = 4, + .dst_dim1_size = 2, + .dst_dim2_size = 2, + .dst_dim3_size = 512, + .dst_dim1_pitch = 2, + .dst_dim2_pitch = 4, + .in_dim1_size = 2, + .in_dim1_pitch = 4, + .in_dim2_size = 2, + .in_dim2_pitch = 16, + .in_dim1_edge1 = 1, + 
.in_dim1_edge2 = 1, + .in_dim2_edge1 = 1, + .in_dim2_edge2 = 1, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 2, + .out_dim1_size = 2, + .out_dim1_pitch = 2, + .out_dim2_size = 2, + .out_dim2_pitch = 4, + .out_dim3_size = 512, + .coeff_dim1_size = 3, + .coeff_dim2_size = 3, + .coeff_dim3_size = 512, + .coeff_dim4_size = 512, + .coeff_dim1_pitch = 3, + .coeff_dim2_pitch = 9, + .coeff_dim3_pitch = 4608, + .bias_dim1_size = 512, + .bias_dim2_size = 1, + .outscale_dim1_size = 512, + .outscale_dim2_size = 1, + .input_buffer_size = 8192, + .coeff_buffer_size = 2359296, + .output_buffer_size = 2048, + .bias_buffer_size = 2048, + .outscale_buffer_size = 1024, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 0, + .output_pong_dram = 0, + .bias_dram = 0, + .outscale_dram = 0, + .n_tile_size = 512, + .n_tiles = 1, + .n_tile_size_last = 512, + .height_tiles = 1, + .output_rows = 2, + .input_rows = 2, + .kernel_w = 3, + .kernel_h = 3, + .stride_x = 1, + .stride_y = 1, + .padding = 1, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 11, + .layer_name = "conv_1x1_s1_ic64_oc256", + .kernel_name = "1x1j1d1_dma", + .config_key = "64_16_16_256_1_1_16_16_1_1_0_1", + .src_dim1_size = 16, + .src_dim2_size = 16, + .src_dim3_size = 64, + .src_dim1_pitch = 16, + .src_dim2_pitch = 256, + .dst_dim1_size = 16, + .dst_dim2_size = 16, + .dst_dim3_size = 256, + .dst_dim1_pitch = 16, + .dst_dim2_pitch = 256, + .in_dim1_size = 16, + .in_dim1_pitch = 16, + .in_dim2_size = 2, + .in_dim2_pitch = 32, + .in_dim1_edge1 = 0, + .in_dim1_edge2 = 0, + .in_dim2_edge1 = 0, + .in_dim2_edge2 = 0, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 2, + .out_dim1_size = 16, + .out_dim1_pitch = 16, + .out_dim2_size = 2, + .out_dim2_pitch = 32, + .out_dim3_size = 16, + 
.coeff_dim1_size = 1, + .coeff_dim2_size = 1, + .coeff_dim3_size = 64, + .coeff_dim4_size = 256, + .coeff_dim1_pitch = 1, + .coeff_dim2_pitch = 1, + .coeff_dim3_pitch = 64, + .bias_dim1_size = 256, + .bias_dim2_size = 1, + .outscale_dim1_size = 256, + .outscale_dim2_size = 1, + .input_buffer_size = 2048, + .coeff_buffer_size = 1024, + .output_buffer_size = 512, + .bias_buffer_size = 1024, + .outscale_buffer_size = 512, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 1, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 16, + .n_tiles = 16, + .n_tile_size_last = 16, + .height_tiles = 8, + .output_rows = 2, + .input_rows = 2, + .kernel_w = 1, + .kernel_h = 1, + .stride_x = 1, + .stride_y = 1, + .padding = 0, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 12, + .layer_name = "conv_1x1_s1_ic64_oc64", + .kernel_name = "1x1j1d1_dma", + .config_key = "64_16_16_64_1_1_16_16_1_1_0_1", + .src_dim1_size = 16, + .src_dim2_size = 16, + .src_dim3_size = 64, + .src_dim1_pitch = 16, + .src_dim2_pitch = 256, + .dst_dim1_size = 16, + .dst_dim2_size = 16, + .dst_dim3_size = 64, + .dst_dim1_pitch = 16, + .dst_dim2_pitch = 256, + .in_dim1_size = 16, + .in_dim1_pitch = 16, + .in_dim2_size = 2, + .in_dim2_pitch = 32, + .in_dim1_edge1 = 0, + .in_dim1_edge2 = 0, + .in_dim2_edge1 = 0, + .in_dim2_edge2 = 0, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 2, + .out_dim1_size = 16, + .out_dim1_pitch = 16, + .out_dim2_size = 2, + .out_dim2_pitch = 32, + .out_dim3_size = 16, + .coeff_dim1_size = 1, + .coeff_dim2_size = 1, + .coeff_dim3_size = 64, + .coeff_dim4_size = 64, + .coeff_dim1_pitch = 1, + .coeff_dim2_pitch = 1, + .coeff_dim3_pitch = 64, + .bias_dim1_size = 64, + .bias_dim2_size = 1, + .outscale_dim1_size = 64, + .outscale_dim2_size = 1, + 
.input_buffer_size = 2048, + .coeff_buffer_size = 1024, + .output_buffer_size = 512, + .bias_buffer_size = 256, + .outscale_buffer_size = 128, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 1, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 16, + .n_tiles = 4, + .n_tile_size_last = 16, + .height_tiles = 8, + .output_rows = 2, + .input_rows = 2, + .kernel_w = 1, + .kernel_h = 1, + .stride_x = 1, + .stride_y = 1, + .padding = 0, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 13, + .layer_name = "conv_1x1_s1_ic256_oc64", + .kernel_name = "1x1j1d1_no_dma", + .config_key = "256_16_16_64_1_1_16_16_1_1_0_1", + .src_dim1_size = 16, + .src_dim2_size = 16, + .src_dim3_size = 256, + .src_dim1_pitch = 16, + .src_dim2_pitch = 256, + .dst_dim1_size = 16, + .dst_dim2_size = 16, + .dst_dim3_size = 64, + .dst_dim1_pitch = 16, + .dst_dim2_pitch = 256, + .in_dim1_size = 16, + .in_dim1_pitch = 16, + .in_dim2_size = 16, + .in_dim2_pitch = 256, + .in_dim1_edge1 = 0, + .in_dim1_edge2 = 0, + .in_dim2_edge1 = 0, + .in_dim2_edge2 = 0, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 16, + .out_dim1_size = 16, + .out_dim1_pitch = 16, + .out_dim2_size = 16, + .out_dim2_pitch = 256, + .out_dim3_size = 64, + .coeff_dim1_size = 1, + .coeff_dim2_size = 1, + .coeff_dim3_size = 256, + .coeff_dim4_size = 64, + .coeff_dim1_pitch = 1, + .coeff_dim2_pitch = 1, + .coeff_dim3_pitch = 256, + .bias_dim1_size = 64, + .bias_dim2_size = 1, + .outscale_dim1_size = 64, + .outscale_dim2_size = 1, + .input_buffer_size = 65536, + .coeff_buffer_size = 16384, + .output_buffer_size = 16384, + .bias_buffer_size = 256, + .outscale_buffer_size = 128, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 0, + .output_pong_dram = 0, + 
.bias_dram = 0, + .outscale_dram = 0, + .n_tile_size = 64, + .n_tiles = 1, + .n_tile_size_last = 64, + .height_tiles = 1, + .output_rows = 16, + .input_rows = 16, + .kernel_w = 1, + .kernel_h = 1, + .stride_x = 1, + .stride_y = 1, + .padding = 0, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 14, + .layer_name = "conv_1x1_s2_ic256_oc512", + .kernel_name = "1x1j2d1_no_dma", + .config_key = "256_16_16_512_1_1_8_8_2_2_0_1", + .src_dim1_size = 16, + .src_dim2_size = 16, + .src_dim3_size = 256, + .src_dim1_pitch = 16, + .src_dim2_pitch = 256, + .dst_dim1_size = 8, + .dst_dim2_size = 8, + .dst_dim3_size = 512, + .dst_dim1_pitch = 8, + .dst_dim2_pitch = 64, + .in_dim1_size = 16, + .in_dim1_pitch = 16, + .in_dim2_size = 16, + .in_dim2_pitch = 256, + .in_dim1_edge1 = 0, + .in_dim1_edge2 = 0, + .in_dim2_edge1 = 0, + .in_dim2_edge2 = 0, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 16, + .out_dim1_size = 8, + .out_dim1_pitch = 8, + .out_dim2_size = 8, + .out_dim2_pitch = 64, + .out_dim3_size = 512, + .coeff_dim1_size = 1, + .coeff_dim2_size = 1, + .coeff_dim3_size = 256, + .coeff_dim4_size = 512, + .coeff_dim1_pitch = 1, + .coeff_dim2_pitch = 1, + .coeff_dim3_pitch = 256, + .bias_dim1_size = 512, + .bias_dim2_size = 1, + .outscale_dim1_size = 512, + .outscale_dim2_size = 1, + .input_buffer_size = 65536, + .coeff_buffer_size = 131072, + .output_buffer_size = 32768, + .bias_buffer_size = 2048, + .outscale_buffer_size = 1024, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 0, + .output_pong_dram = 0, + .bias_dram = 0, + .outscale_dram = 0, + .n_tile_size = 512, + .n_tiles = 1, + .n_tile_size_last = 512, + .height_tiles = 1, + .output_rows = 8, + .input_rows = 16, + .kernel_w = 1, + .kernel_h = 1, + .stride_x = 2, + .stride_y = 2, + .padding = 0, + .dilation = 1, + 
.accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 15, + .layer_name = "conv_1x1_s1_ic256_oc128", + .kernel_name = "1x1j1d1_no_dma", + .config_key = "256_16_16_128_1_1_16_16_1_1_0_1", + .src_dim1_size = 16, + .src_dim2_size = 16, + .src_dim3_size = 256, + .src_dim1_pitch = 16, + .src_dim2_pitch = 256, + .dst_dim1_size = 16, + .dst_dim2_size = 16, + .dst_dim3_size = 128, + .dst_dim1_pitch = 16, + .dst_dim2_pitch = 256, + .in_dim1_size = 16, + .in_dim1_pitch = 16, + .in_dim2_size = 16, + .in_dim2_pitch = 256, + .in_dim1_edge1 = 0, + .in_dim1_edge2 = 0, + .in_dim2_edge1 = 0, + .in_dim2_edge2 = 0, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 16, + .out_dim1_size = 16, + .out_dim1_pitch = 16, + .out_dim2_size = 16, + .out_dim2_pitch = 256, + .out_dim3_size = 128, + .coeff_dim1_size = 1, + .coeff_dim2_size = 1, + .coeff_dim3_size = 256, + .coeff_dim4_size = 128, + .coeff_dim1_pitch = 1, + .coeff_dim2_pitch = 1, + .coeff_dim3_pitch = 256, + .bias_dim1_size = 128, + .bias_dim2_size = 1, + .outscale_dim1_size = 128, + .outscale_dim2_size = 1, + .input_buffer_size = 65536, + .coeff_buffer_size = 32768, + .output_buffer_size = 32768, + .bias_buffer_size = 512, + .outscale_buffer_size = 256, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 0, + .output_pong_dram = 0, + .bias_dram = 0, + .outscale_dram = 0, + .n_tile_size = 128, + .n_tiles = 1, + .n_tile_size_last = 128, + .height_tiles = 1, + .output_rows = 16, + .input_rows = 16, + .kernel_w = 1, + .kernel_h = 1, + .stride_x = 1, + .stride_y = 1, + .padding = 0, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 16, + .layer_name = "conv_3x3_s2_ic128_oc128", + .kernel_name = "3x3j2d1_no_dma", + .config_key = 
"128_16_16_128_3_3_8_8_2_2_1_1", + .src_dim1_size = 16, + .src_dim2_size = 16, + .src_dim3_size = 128, + .src_dim1_pitch = 16, + .src_dim2_pitch = 256, + .dst_dim1_size = 8, + .dst_dim2_size = 8, + .dst_dim3_size = 128, + .dst_dim1_pitch = 8, + .dst_dim2_pitch = 64, + .in_dim1_size = 16, + .in_dim1_pitch = 18, + .in_dim2_size = 16, + .in_dim2_pitch = 324, + .in_dim1_edge1 = 1, + .in_dim1_edge2 = 1, + .in_dim2_edge1 = 1, + .in_dim2_edge2 = 1, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 16, + .out_dim1_size = 8, + .out_dim1_pitch = 8, + .out_dim2_size = 8, + .out_dim2_pitch = 64, + .out_dim3_size = 128, + .coeff_dim1_size = 3, + .coeff_dim2_size = 3, + .coeff_dim3_size = 128, + .coeff_dim4_size = 128, + .coeff_dim1_pitch = 3, + .coeff_dim2_pitch = 9, + .coeff_dim3_pitch = 1152, + .bias_dim1_size = 128, + .bias_dim2_size = 1, + .outscale_dim1_size = 128, + .outscale_dim2_size = 1, + .input_buffer_size = 41472, + .coeff_buffer_size = 147456, + .output_buffer_size = 8192, + .bias_buffer_size = 512, + .outscale_buffer_size = 256, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 0, + .output_pong_dram = 0, + .bias_dram = 0, + .outscale_dram = 0, + .n_tile_size = 128, + .n_tiles = 1, + .n_tile_size_last = 128, + .height_tiles = 1, + .output_rows = 8, + .input_rows = 16, + .kernel_w = 3, + .kernel_h = 3, + .stride_x = 2, + .stride_y = 2, + .padding = 1, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 17, + .layer_name = "conv_1x1_s1_ic128_oc512", + .kernel_name = "1x1j1d1_dma", + .config_key = "128_8_8_512_1_1_8_8_1_1_0_1", + .src_dim1_size = 8, + .src_dim2_size = 8, + .src_dim3_size = 128, + .src_dim1_pitch = 8, + .src_dim2_pitch = 64, + .dst_dim1_size = 8, + .dst_dim2_size = 8, + .dst_dim3_size = 512, + .dst_dim1_pitch = 8, + .dst_dim2_pitch = 64, + 
.in_dim1_size = 8, + .in_dim1_pitch = 8, + .in_dim2_size = 2, + .in_dim2_pitch = 16, + .in_dim1_edge1 = 0, + .in_dim1_edge2 = 0, + .in_dim2_edge1 = 0, + .in_dim2_edge2 = 0, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 2, + .out_dim1_size = 8, + .out_dim1_pitch = 8, + .out_dim2_size = 2, + .out_dim2_pitch = 16, + .out_dim3_size = 4, + .coeff_dim1_size = 1, + .coeff_dim2_size = 1, + .coeff_dim3_size = 128, + .coeff_dim4_size = 512, + .coeff_dim1_pitch = 1, + .coeff_dim2_pitch = 1, + .coeff_dim3_pitch = 128, + .bias_dim1_size = 512, + .bias_dim2_size = 1, + .outscale_dim1_size = 512, + .outscale_dim2_size = 1, + .input_buffer_size = 2048, + .coeff_buffer_size = 512, + .output_buffer_size = 64, + .bias_buffer_size = 2048, + .outscale_buffer_size = 1024, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 1, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 4, + .n_tiles = 128, + .n_tile_size_last = 4, + .height_tiles = 4, + .output_rows = 2, + .input_rows = 2, + .kernel_w = 1, + .kernel_h = 1, + .stride_x = 1, + .stride_y = 1, + .padding = 0, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 18, + .layer_name = "conv_1x1_s1_ic512_oc128", + .kernel_name = "1x1j1d1_no_dma", + .config_key = "512_8_8_128_1_1_8_8_1_1_0_1", + .src_dim1_size = 8, + .src_dim2_size = 8, + .src_dim3_size = 512, + .src_dim1_pitch = 8, + .src_dim2_pitch = 64, + .dst_dim1_size = 8, + .dst_dim2_size = 8, + .dst_dim3_size = 128, + .dst_dim1_pitch = 8, + .dst_dim2_pitch = 64, + .in_dim1_size = 8, + .in_dim1_pitch = 8, + .in_dim2_size = 8, + .in_dim2_pitch = 64, + .in_dim1_edge1 = 0, + .in_dim1_edge2 = 0, + .in_dim2_edge1 = 0, + .in_dim2_edge2 = 0, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 8, + .out_dim1_size = 8, + 
.out_dim1_pitch = 8, + .out_dim2_size = 8, + .out_dim2_pitch = 64, + .out_dim3_size = 128, + .coeff_dim1_size = 1, + .coeff_dim2_size = 1, + .coeff_dim3_size = 512, + .coeff_dim4_size = 128, + .coeff_dim1_pitch = 1, + .coeff_dim2_pitch = 1, + .coeff_dim3_pitch = 512, + .bias_dim1_size = 128, + .bias_dim2_size = 1, + .outscale_dim1_size = 128, + .outscale_dim2_size = 1, + .input_buffer_size = 32768, + .coeff_buffer_size = 65536, + .output_buffer_size = 8192, + .bias_buffer_size = 512, + .outscale_buffer_size = 256, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 0, + .output_pong_dram = 0, + .bias_dram = 0, + .outscale_dram = 0, + .n_tile_size = 128, + .n_tiles = 1, + .n_tile_size_last = 128, + .height_tiles = 1, + .output_rows = 8, + .input_rows = 8, + .kernel_w = 1, + .kernel_h = 1, + .stride_x = 1, + .stride_y = 1, + .padding = 0, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 19, + .layer_name = "conv_1x1_s2_ic512_oc1024", + .kernel_name = "1x1j2d1_no_dma", + .config_key = "512_8_8_1024_1_1_4_4_2_2_0_1", + .src_dim1_size = 8, + .src_dim2_size = 8, + .src_dim3_size = 512, + .src_dim1_pitch = 8, + .src_dim2_pitch = 64, + .dst_dim1_size = 4, + .dst_dim2_size = 4, + .dst_dim3_size = 1024, + .dst_dim1_pitch = 4, + .dst_dim2_pitch = 16, + .in_dim1_size = 8, + .in_dim1_pitch = 8, + .in_dim2_size = 8, + .in_dim2_pitch = 64, + .in_dim1_edge1 = 0, + .in_dim1_edge2 = 0, + .in_dim2_edge1 = 0, + .in_dim2_edge2 = 0, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 8, + .out_dim1_size = 4, + .out_dim1_pitch = 4, + .out_dim2_size = 4, + .out_dim2_pitch = 16, + .out_dim3_size = 1024, + .coeff_dim1_size = 1, + .coeff_dim2_size = 1, + .coeff_dim3_size = 512, + .coeff_dim4_size = 1024, + .coeff_dim1_pitch = 1, + .coeff_dim2_pitch = 1, + .coeff_dim3_pitch = 512, + 
.bias_dim1_size = 1024, + .bias_dim2_size = 1, + .outscale_dim1_size = 1024, + .outscale_dim2_size = 1, + .input_buffer_size = 32768, + .coeff_buffer_size = 524288, + .output_buffer_size = 16384, + .bias_buffer_size = 4096, + .outscale_buffer_size = 2048, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 0, + .output_pong_dram = 0, + .bias_dram = 0, + .outscale_dram = 0, + .n_tile_size = 1024, + .n_tiles = 1, + .n_tile_size_last = 1024, + .height_tiles = 1, + .output_rows = 4, + .input_rows = 8, + .kernel_w = 1, + .kernel_h = 1, + .stride_x = 2, + .stride_y = 2, + .padding = 0, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 20, + .layer_name = "conv_1x1_s1_ic512_oc256", + .kernel_name = "1x1j1d1_no_dma", + .config_key = "512_8_8_256_1_1_8_8_1_1_0_1", + .src_dim1_size = 8, + .src_dim2_size = 8, + .src_dim3_size = 512, + .src_dim1_pitch = 8, + .src_dim2_pitch = 64, + .dst_dim1_size = 8, + .dst_dim2_size = 8, + .dst_dim3_size = 256, + .dst_dim1_pitch = 8, + .dst_dim2_pitch = 64, + .in_dim1_size = 8, + .in_dim1_pitch = 8, + .in_dim2_size = 8, + .in_dim2_pitch = 64, + .in_dim1_edge1 = 0, + .in_dim1_edge2 = 0, + .in_dim2_edge1 = 0, + .in_dim2_edge2 = 0, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 8, + .out_dim1_size = 8, + .out_dim1_pitch = 8, + .out_dim2_size = 8, + .out_dim2_pitch = 64, + .out_dim3_size = 256, + .coeff_dim1_size = 1, + .coeff_dim2_size = 1, + .coeff_dim3_size = 512, + .coeff_dim4_size = 256, + .coeff_dim1_pitch = 1, + .coeff_dim2_pitch = 1, + .coeff_dim3_pitch = 512, + .bias_dim1_size = 256, + .bias_dim2_size = 1, + .outscale_dim1_size = 256, + .outscale_dim2_size = 1, + .input_buffer_size = 32768, + .coeff_buffer_size = 131072, + .output_buffer_size = 16384, + .bias_buffer_size = 1024, + .outscale_buffer_size = 512, + .input_ping_dram = 0, 
+ .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 0, + .output_pong_dram = 0, + .bias_dram = 0, + .outscale_dram = 0, + .n_tile_size = 256, + .n_tiles = 1, + .n_tile_size_last = 256, + .height_tiles = 1, + .output_rows = 8, + .input_rows = 8, + .kernel_w = 1, + .kernel_h = 1, + .stride_x = 1, + .stride_y = 1, + .padding = 0, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 21, + .layer_name = "conv_3x3_s2_ic256_oc256", + .kernel_name = "3x3j2d1_no_dma", + .config_key = "256_8_8_256_3_3_4_4_2_2_1_1", + .src_dim1_size = 8, + .src_dim2_size = 8, + .src_dim3_size = 256, + .src_dim1_pitch = 8, + .src_dim2_pitch = 64, + .dst_dim1_size = 4, + .dst_dim2_size = 4, + .dst_dim3_size = 256, + .dst_dim1_pitch = 4, + .dst_dim2_pitch = 16, + .in_dim1_size = 8, + .in_dim1_pitch = 10, + .in_dim2_size = 8, + .in_dim2_pitch = 100, + .in_dim1_edge1 = 1, + .in_dim1_edge2 = 1, + .in_dim2_edge1 = 1, + .in_dim2_edge2 = 1, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 8, + .out_dim1_size = 4, + .out_dim1_pitch = 4, + .out_dim2_size = 4, + .out_dim2_pitch = 16, + .out_dim3_size = 256, + .coeff_dim1_size = 3, + .coeff_dim2_size = 3, + .coeff_dim3_size = 256, + .coeff_dim4_size = 256, + .coeff_dim1_pitch = 3, + .coeff_dim2_pitch = 9, + .coeff_dim3_pitch = 2304, + .bias_dim1_size = 256, + .bias_dim2_size = 1, + .outscale_dim1_size = 256, + .outscale_dim2_size = 1, + .input_buffer_size = 25600, + .coeff_buffer_size = 589824, + .output_buffer_size = 4096, + .bias_buffer_size = 1024, + .outscale_buffer_size = 512, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 0, + .output_pong_dram = 0, + .bias_dram = 0, + .outscale_dram = 0, + .n_tile_size = 256, + .n_tiles = 1, + .n_tile_size_last = 256, + .height_tiles = 1, + .output_rows = 4, + .input_rows = 8, + .kernel_w = 3, + 
.kernel_h = 3, + .stride_x = 2, + .stride_y = 2, + .padding = 1, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 22, + .layer_name = "conv_1x1_s1_ic256_oc1024", + .kernel_name = "1x1j1d1_no_dma", + .config_key = "256_4_4_1024_1_1_4_4_1_1_0_1", + .src_dim1_size = 4, + .src_dim2_size = 4, + .src_dim3_size = 256, + .src_dim1_pitch = 4, + .src_dim2_pitch = 16, + .dst_dim1_size = 4, + .dst_dim2_size = 4, + .dst_dim3_size = 1024, + .dst_dim1_pitch = 4, + .dst_dim2_pitch = 16, + .in_dim1_size = 4, + .in_dim1_pitch = 4, + .in_dim2_size = 4, + .in_dim2_pitch = 16, + .in_dim1_edge1 = 0, + .in_dim1_edge2 = 0, + .in_dim2_edge1 = 0, + .in_dim2_edge2 = 0, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 4, + .out_dim1_size = 4, + .out_dim1_pitch = 4, + .out_dim2_size = 4, + .out_dim2_pitch = 16, + .out_dim3_size = 1024, + .coeff_dim1_size = 1, + .coeff_dim2_size = 1, + .coeff_dim3_size = 256, + .coeff_dim4_size = 1024, + .coeff_dim1_pitch = 1, + .coeff_dim2_pitch = 1, + .coeff_dim3_pitch = 256, + .bias_dim1_size = 1024, + .bias_dim2_size = 1, + .outscale_dim1_size = 1024, + .outscale_dim2_size = 1, + .input_buffer_size = 4096, + .coeff_buffer_size = 262144, + .output_buffer_size = 16384, + .bias_buffer_size = 4096, + .outscale_buffer_size = 2048, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 0, + .output_pong_dram = 0, + .bias_dram = 0, + .outscale_dram = 0, + .n_tile_size = 1024, + .n_tiles = 1, + .n_tile_size_last = 1024, + .height_tiles = 1, + .output_rows = 4, + .input_rows = 4, + .kernel_w = 1, + .kernel_h = 1, + .stride_x = 1, + .stride_y = 1, + .padding = 0, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 23, + .layer_name = 
"conv_1x1_s1_ic1024_oc256", + .kernel_name = "1x1j1d1_no_dma", + .config_key = "1024_4_4_256_1_1_4_4_1_1_0_1", + .src_dim1_size = 4, + .src_dim2_size = 4, + .src_dim3_size = 1024, + .src_dim1_pitch = 4, + .src_dim2_pitch = 16, + .dst_dim1_size = 4, + .dst_dim2_size = 4, + .dst_dim3_size = 256, + .dst_dim1_pitch = 4, + .dst_dim2_pitch = 16, + .in_dim1_size = 4, + .in_dim1_pitch = 4, + .in_dim2_size = 4, + .in_dim2_pitch = 16, + .in_dim1_edge1 = 0, + .in_dim1_edge2 = 0, + .in_dim2_edge1 = 0, + .in_dim2_edge2 = 0, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 4, + .out_dim1_size = 4, + .out_dim1_pitch = 4, + .out_dim2_size = 4, + .out_dim2_pitch = 16, + .out_dim3_size = 256, + .coeff_dim1_size = 1, + .coeff_dim2_size = 1, + .coeff_dim3_size = 1024, + .coeff_dim4_size = 256, + .coeff_dim1_pitch = 1, + .coeff_dim2_pitch = 1, + .coeff_dim3_pitch = 1024, + .bias_dim1_size = 256, + .bias_dim2_size = 1, + .outscale_dim1_size = 256, + .outscale_dim2_size = 1, + .input_buffer_size = 16384, + .coeff_buffer_size = 262144, + .output_buffer_size = 4096, + .bias_buffer_size = 1024, + .outscale_buffer_size = 512, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 0, + .output_pong_dram = 0, + .bias_dram = 0, + .outscale_dram = 0, + .n_tile_size = 256, + .n_tiles = 1, + .n_tile_size_last = 256, + .height_tiles = 1, + .output_rows = 4, + .input_rows = 4, + .kernel_w = 1, + .kernel_h = 1, + .stride_x = 1, + .stride_y = 1, + .padding = 0, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 24, + .layer_name = "conv_1x1_s2_ic1024_oc2048", + .kernel_name = "1x1j2d1_no_dma", + .config_key = "1024_4_4_2048_1_1_2_2_2_2_0_1", + .src_dim1_size = 4, + .src_dim2_size = 4, + .src_dim3_size = 1024, + .src_dim1_pitch = 4, + .src_dim2_pitch = 16, + .dst_dim1_size = 2, + .dst_dim2_size = 2, + 
.dst_dim3_size = 2048, + .dst_dim1_pitch = 2, + .dst_dim2_pitch = 4, + .in_dim1_size = 4, + .in_dim1_pitch = 4, + .in_dim2_size = 4, + .in_dim2_pitch = 16, + .in_dim1_edge1 = 0, + .in_dim1_edge2 = 0, + .in_dim2_edge1 = 0, + .in_dim2_edge2 = 0, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 4, + .out_dim1_size = 2, + .out_dim1_pitch = 2, + .out_dim2_size = 2, + .out_dim2_pitch = 4, + .out_dim3_size = 2048, + .coeff_dim1_size = 1, + .coeff_dim2_size = 1, + .coeff_dim3_size = 1024, + .coeff_dim4_size = 2048, + .coeff_dim1_pitch = 1, + .coeff_dim2_pitch = 1, + .coeff_dim3_pitch = 1024, + .bias_dim1_size = 2048, + .bias_dim2_size = 1, + .outscale_dim1_size = 2048, + .outscale_dim2_size = 1, + .input_buffer_size = 16384, + .coeff_buffer_size = 2097152, + .output_buffer_size = 8192, + .bias_buffer_size = 8192, + .outscale_buffer_size = 4096, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 0, + .output_pong_dram = 0, + .bias_dram = 0, + .outscale_dram = 0, + .n_tile_size = 2048, + .n_tiles = 1, + .n_tile_size_last = 2048, + .height_tiles = 1, + .output_rows = 2, + .input_rows = 4, + .kernel_w = 1, + .kernel_h = 1, + .stride_x = 2, + .stride_y = 2, + .padding = 0, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 25, + .layer_name = "conv_1x1_s1_ic1024_oc512", + .kernel_name = "1x1j1d1_no_dma", + .config_key = "1024_4_4_512_1_1_4_4_1_1_0_1", + .src_dim1_size = 4, + .src_dim2_size = 4, + .src_dim3_size = 1024, + .src_dim1_pitch = 4, + .src_dim2_pitch = 16, + .dst_dim1_size = 4, + .dst_dim2_size = 4, + .dst_dim3_size = 512, + .dst_dim1_pitch = 4, + .dst_dim2_pitch = 16, + .in_dim1_size = 4, + .in_dim1_pitch = 4, + .in_dim2_size = 4, + .in_dim2_pitch = 16, + .in_dim1_edge1 = 0, + .in_dim1_edge2 = 0, + .in_dim2_edge1 = 0, + .in_dim2_edge2 = 0, + .in_dim3_edge1 = 0, + 
.in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 4, + .out_dim1_size = 4, + .out_dim1_pitch = 4, + .out_dim2_size = 4, + .out_dim2_pitch = 16, + .out_dim3_size = 512, + .coeff_dim1_size = 1, + .coeff_dim2_size = 1, + .coeff_dim3_size = 1024, + .coeff_dim4_size = 512, + .coeff_dim1_pitch = 1, + .coeff_dim2_pitch = 1, + .coeff_dim3_pitch = 1024, + .bias_dim1_size = 512, + .bias_dim2_size = 1, + .outscale_dim1_size = 512, + .outscale_dim2_size = 1, + .input_buffer_size = 16384, + .coeff_buffer_size = 524288, + .output_buffer_size = 8192, + .bias_buffer_size = 2048, + .outscale_buffer_size = 1024, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 0, + .output_pong_dram = 0, + .bias_dram = 0, + .outscale_dram = 0, + .n_tile_size = 512, + .n_tiles = 1, + .n_tile_size_last = 512, + .height_tiles = 1, + .output_rows = 4, + .input_rows = 4, + .kernel_w = 1, + .kernel_h = 1, + .stride_x = 1, + .stride_y = 1, + .padding = 0, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 26, + .layer_name = "conv_3x3_s2_ic512_oc512", + .kernel_name = "3x3j2d1_no_dma", + .config_key = "512_4_4_512_3_3_2_2_2_2_1_1", + .src_dim1_size = 4, + .src_dim2_size = 4, + .src_dim3_size = 512, + .src_dim1_pitch = 4, + .src_dim2_pitch = 16, + .dst_dim1_size = 2, + .dst_dim2_size = 2, + .dst_dim3_size = 512, + .dst_dim1_pitch = 2, + .dst_dim2_pitch = 4, + .in_dim1_size = 4, + .in_dim1_pitch = 6, + .in_dim2_size = 4, + .in_dim2_pitch = 36, + .in_dim1_edge1 = 1, + .in_dim1_edge2 = 1, + .in_dim2_edge1 = 1, + .in_dim2_edge2 = 1, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 4, + .out_dim1_size = 2, + .out_dim1_pitch = 2, + .out_dim2_size = 2, + .out_dim2_pitch = 4, + .out_dim3_size = 512, + .coeff_dim1_size = 3, + .coeff_dim2_size = 3, + .coeff_dim3_size = 512, + .coeff_dim4_size = 
512, + .coeff_dim1_pitch = 3, + .coeff_dim2_pitch = 9, + .coeff_dim3_pitch = 4608, + .bias_dim1_size = 512, + .bias_dim2_size = 1, + .outscale_dim1_size = 512, + .outscale_dim2_size = 1, + .input_buffer_size = 18432, + .coeff_buffer_size = 2359296, + .output_buffer_size = 2048, + .bias_buffer_size = 2048, + .outscale_buffer_size = 1024, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 0, + .output_pong_dram = 0, + .bias_dram = 0, + .outscale_dram = 0, + .n_tile_size = 512, + .n_tiles = 1, + .n_tile_size_last = 512, + .height_tiles = 1, + .output_rows = 2, + .input_rows = 4, + .kernel_w = 3, + .kernel_h = 3, + .stride_x = 2, + .stride_y = 2, + .padding = 1, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 27, + .layer_name = "conv_1x1_s1_ic512_oc2048", + .kernel_name = "1x1j1d1_no_dma", + .config_key = "512_2_2_2048_1_1_2_2_1_1_0_1", + .src_dim1_size = 2, + .src_dim2_size = 2, + .src_dim3_size = 512, + .src_dim1_pitch = 2, + .src_dim2_pitch = 4, + .dst_dim1_size = 2, + .dst_dim2_size = 2, + .dst_dim3_size = 2048, + .dst_dim1_pitch = 2, + .dst_dim2_pitch = 4, + .in_dim1_size = 2, + .in_dim1_pitch = 2, + .in_dim2_size = 2, + .in_dim2_pitch = 4, + .in_dim1_edge1 = 0, + .in_dim1_edge2 = 0, + .in_dim2_edge1 = 0, + .in_dim2_edge2 = 0, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 2, + .out_dim1_size = 2, + .out_dim1_pitch = 2, + .out_dim2_size = 2, + .out_dim2_pitch = 4, + .out_dim3_size = 2048, + .coeff_dim1_size = 1, + .coeff_dim2_size = 1, + .coeff_dim3_size = 512, + .coeff_dim4_size = 2048, + .coeff_dim1_pitch = 1, + .coeff_dim2_pitch = 1, + .coeff_dim3_pitch = 512, + .bias_dim1_size = 2048, + .bias_dim2_size = 1, + .outscale_dim1_size = 2048, + .outscale_dim2_size = 1, + .input_buffer_size = 2048, + .coeff_buffer_size = 1048576, + .output_buffer_size = 8192, 
+ .bias_buffer_size = 8192, + .outscale_buffer_size = 4096, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 0, + .output_pong_dram = 0, + .bias_dram = 0, + .outscale_dram = 0, + .n_tile_size = 2048, + .n_tiles = 1, + .n_tile_size_last = 2048, + .height_tiles = 1, + .output_rows = 2, + .input_rows = 2, + .kernel_w = 1, + .kernel_h = 1, + .stride_x = 1, + .stride_y = 1, + .padding = 0, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 28, + .layer_name = "conv_1x1_s1_ic2048_oc512", + .kernel_name = "1x1j1d1_no_dma", + .config_key = "2048_2_2_512_1_1_2_2_1_1_0_1", + .src_dim1_size = 2, + .src_dim2_size = 2, + .src_dim3_size = 2048, + .src_dim1_pitch = 2, + .src_dim2_pitch = 4, + .dst_dim1_size = 2, + .dst_dim2_size = 2, + .dst_dim3_size = 512, + .dst_dim1_pitch = 2, + .dst_dim2_pitch = 4, + .in_dim1_size = 2, + .in_dim1_pitch = 2, + .in_dim2_size = 2, + .in_dim2_pitch = 4, + .in_dim1_edge1 = 0, + .in_dim1_edge2 = 0, + .in_dim2_edge1 = 0, + .in_dim2_edge2 = 0, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 2, + .out_dim1_size = 2, + .out_dim1_pitch = 2, + .out_dim2_size = 2, + .out_dim2_pitch = 4, + .out_dim3_size = 512, + .coeff_dim1_size = 1, + .coeff_dim2_size = 1, + .coeff_dim3_size = 2048, + .coeff_dim4_size = 512, + .coeff_dim1_pitch = 1, + .coeff_dim2_pitch = 1, + .coeff_dim3_pitch = 2048, + .bias_dim1_size = 512, + .bias_dim2_size = 1, + .outscale_dim1_size = 512, + .outscale_dim2_size = 1, + .input_buffer_size = 8192, + .coeff_buffer_size = 1048576, + .output_buffer_size = 2048, + .bias_buffer_size = 2048, + .outscale_buffer_size = 1024, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 0, + .output_pong_dram = 0, + .bias_dram = 0, + .outscale_dram = 0, + .n_tile_size = 512, + .n_tiles = 1, + .n_tile_size_last = 
512, + .height_tiles = 1, + .output_rows = 2, + .input_rows = 2, + .kernel_w = 1, + .kernel_h = 1, + .stride_x = 1, + .stride_y = 1, + .padding = 0, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, +}; + +static inline int get_num_conv_layers(void) { return NUM_CONV_LAYERS; } + +static inline const conv_layer_config_t* get_conv_config(int layer_id) { + if (layer_id < 0 || layer_id >= NUM_CONV_LAYERS) return NULL; + return &CONV_LAYER_CONFIGS[layer_id]; +} + +static inline const conv_layer_config_t* get_layer_config_by_params( + int ic, int ih, int iw, + int oc, int kh, int kw, + int oh, int ow, + int sy, int sx, + int pad, int dil) +{ + for (int i = 0; i < NUM_CONV_LAYERS; i++) { + const conv_layer_config_t* cfg = &CONV_LAYER_CONFIGS[i]; + if (cfg->src_dim3_size == ic && + cfg->src_dim2_size == ih && + cfg->src_dim1_size == iw && + cfg->dst_dim3_size == oc && + cfg->coeff_dim2_size == kh && + cfg->coeff_dim1_size == kw && + cfg->dst_dim2_size == oh && + cfg->dst_dim1_size == ow && + cfg->stride_y == sy && + cfg->stride_x == sx && + cfg->padding == pad && + cfg->dilation == dil) + return cfg; + } + return NULL; +} + +static inline const conv_layer_config_t* get_layer_config_by_key(const char* config_key) { + if (config_key == NULL) return NULL; + for (int i = 0; i < NUM_CONV_LAYERS; i++) { + const conv_layer_config_t* cfg = &CONV_LAYER_CONFIGS[i]; + if (cfg->config_key != NULL) { + const char* a = config_key; + const char* b = cfg->config_key; + while (*a && *b && *a == *b) { a++; b++; } + if (*a == '\0' && *b == '\0') return cfg; + } + } + return NULL; +} + +/* ====================================================================== */ +/* MaxPool configurations */ +/* ====================================================================== */ + +typedef struct { + int layer_id; + const char* layer_name; + const char* config_key; + + int src_width; int 
src_height; int channels; + int dst_width; int dst_height; + + int src_row_pitch; int src_plane_pitch; + int dst_row_pitch; int dst_plane_pitch; + + int kernel_h; int kernel_w; + int stride_h; int stride_w; + int pad_h; int pad_w; + + int in_tile_w; int in_tile_rows; int in_tile_plane; + int in_data_offset; + int out_tile_w; int out_tile_rows; int out_tile_plane; + + int c_tile_size; int c_tiles; int c_tile_size_last; + int height_tiles; int output_rows; int input_rows; + + int input_buffer_size; int output_buffer_size; + + int input_ping_dram; int input_pong_dram; + int output_ping_dram; int output_pong_dram; +} maxpool_layer_config_t; + +#define NUM_MAXPOOL_LAYERS 1 + +static const maxpool_layer_config_t MAXPOOL_LAYER_CONFIGS[] = { + { + .layer_id = 0, + .layer_name = "maxpool_3x3s2_c64_32x32", + .config_key = "64_32_32_3_3_2_2_1_1", + .src_width = 32, + .src_height = 32, + .channels = 64, + .dst_width = 16, + .dst_height = 16, + .src_row_pitch = 32, + .src_plane_pitch = 1024, + .dst_row_pitch = 16, + .dst_plane_pitch = 256, + .kernel_h = 3, + .kernel_w = 3, + .stride_h = 2, + .stride_w = 2, + .pad_h = 1, + .pad_w = 1, + .in_tile_w = 34, + .in_tile_rows = 5, + .in_tile_plane = 170, + .in_data_offset = 35, + .out_tile_w = 16, + .out_tile_rows = 1, + .out_tile_plane = 16, + .c_tile_size = 5, + .c_tiles = 13, + .c_tile_size_last = 4, + .height_tiles = 16, + .output_rows = 1, + .input_rows = 3, + .input_buffer_size = 3400, + .output_buffer_size = 320, + .input_ping_dram = 0, + .input_pong_dram = 1, + .output_ping_dram = 1, + .output_pong_dram = 0, + }, +}; + +static inline int get_num_maxpool_layers(void) { return NUM_MAXPOOL_LAYERS; } + +static inline const maxpool_layer_config_t* get_maxpool_config(int layer_id) { + if (layer_id < 0 || layer_id >= NUM_MAXPOOL_LAYERS) return NULL; + return &MAXPOOL_LAYER_CONFIGS[layer_id]; +} + +static inline const maxpool_layer_config_t* get_maxpool_config_by_params( + int channels, int src_height, int src_width, + int kernel_h, 
int kernel_w, + int stride_h, int stride_w, + int pad_h, int pad_w) +{ + for (int i = 0; i < NUM_MAXPOOL_LAYERS; i++) { + const maxpool_layer_config_t* c = &MAXPOOL_LAYER_CONFIGS[i]; + if (c->channels == channels && + c->src_height == src_height && + c->src_width == src_width && + c->kernel_h == kernel_h && + c->kernel_w == kernel_w && + c->stride_h == stride_h && + c->stride_w == stride_w && + c->pad_h == pad_h && + c->pad_w == pad_w) + return c; + } + return NULL; +} + +#endif /* LAYER_CONFIGS_H */ diff --git a/backends/cadence/vision/config_generator/layer_configs_61k.h b/backends/cadence/vision/config_generator/layer_configs_61k.h new file mode 100644 index 00000000000..3f47d4533eb --- /dev/null +++ b/backends/cadence/vision/config_generator/layer_configs_61k.h @@ -0,0 +1,2403 @@ +/* + * layer_configs.h + * + * Auto-generated conv2d + maxpool layer configurations + * Generated from PTE extraction by generate_combined_configs.py + * + * DO NOT EDIT MANUALLY + */ + +#ifndef LAYER_CONFIGS_H +#define LAYER_CONFIGS_H + +#include +#include /* for NULL */ + +#define IDMA_BUFFER_SIZE_DRAM0 (62976) /* 61 KB */ +#define IDMA_BUFFER_SIZE_DRAM1 (62976) /* 61 KB */ + +/* ====================================================================== */ +/* Conv2d configurations */ +/* ====================================================================== */ + +typedef struct { + int layer_id; + const char* layer_name; + const char* kernel_name; + const char* config_key; + + int src_dim1_size; int src_dim2_size; int src_dim3_size; + int src_dim1_pitch; int src_dim2_pitch; + + int dst_dim1_size; int dst_dim2_size; int dst_dim3_size; + int dst_dim1_pitch; int dst_dim2_pitch; + + int in_dim1_size; int in_dim1_pitch; + int in_dim2_size; int in_dim2_pitch; + int in_dim1_edge1; int in_dim1_edge2; + int in_dim2_edge1; int in_dim2_edge2; + int in_dim3_edge1; int in_dim3_edge2; + int in_data_offset; int in_rows_firstdma; + + int out_dim1_size; int out_dim1_pitch; + int out_dim2_size; int 
out_dim2_pitch; + int out_dim3_size; + + int coeff_dim1_size; int coeff_dim2_size; + int coeff_dim3_size; int coeff_dim4_size; + int coeff_dim1_pitch; int coeff_dim2_pitch; int coeff_dim3_pitch; + + int bias_dim1_size; int bias_dim2_size; + int outscale_dim1_size; int outscale_dim2_size; + + int input_buffer_size; int coeff_buffer_size; int output_buffer_size; + int bias_buffer_size; int outscale_buffer_size; + + int input_ping_dram; int input_pong_dram; int coeff_dram; + int output_ping_dram; int output_pong_dram; + int bias_dram; int outscale_dram; + + int n_tile_size; int n_tiles; int n_tile_size_last; + int height_tiles; int output_rows; int input_rows; + + int kernel_w; int kernel_h; + int stride_x; int stride_y; + int padding; int dilation; + int accum_shift; int relu_max; int relu_min; + int output_shift; int output_scale; int flags; + int input_zero_point; +} conv_layer_config_t; + +#define NUM_CONV_LAYERS 29 + +static const conv_layer_config_t CONV_LAYER_CONFIGS[] = { + { + .layer_id = 0, + .layer_name = "conv_7x7_s2_ic3_oc64", + .kernel_name = "7x7j2d1_dma", + .config_key = "3_64_64_64_7_7_32_32_2_2_3_1", + .src_dim1_size = 64, + .src_dim2_size = 64, + .src_dim3_size = 3, + .src_dim1_pitch = 64, + .src_dim2_pitch = 4096, + .dst_dim1_size = 32, + .dst_dim2_size = 32, + .dst_dim3_size = 64, + .dst_dim1_pitch = 32, + .dst_dim2_pitch = 1024, + .in_dim1_size = 64, + .in_dim1_pitch = 70, + .in_dim2_size = 35, + .in_dim2_pitch = 2450, + .in_dim1_edge1 = 3, + .in_dim1_edge2 = 3, + .in_dim2_edge1 = 3, + .in_dim2_edge2 = 3, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 213, + .in_rows_firstdma = 32, + .out_dim1_size = 32, + .out_dim1_pitch = 32, + .out_dim2_size = 15, + .out_dim2_pitch = 480, + .out_dim3_size = 64, + .coeff_dim1_size = 7, + .coeff_dim2_size = 7, + .coeff_dim3_size = 3, + .coeff_dim4_size = 64, + .coeff_dim1_pitch = 7, + .coeff_dim2_pitch = 49, + .coeff_dim3_pitch = 147, + .bias_dim1_size = 64, + .bias_dim2_size = 1, + 
.outscale_dim1_size = 64, + .outscale_dim2_size = 1, + .input_buffer_size = 7350, + .coeff_buffer_size = 9408, + .output_buffer_size = 30720, + .bias_buffer_size = 256, + .outscale_buffer_size = 128, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 64, + .n_tiles = 1, + .n_tile_size_last = 64, + .height_tiles = 3, + .output_rows = 15, + .input_rows = 35, + .kernel_w = 7, + .kernel_h = 7, + .stride_x = 2, + .stride_y = 2, + .padding = 3, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 1, + .layer_name = "conv_3x3_s1_ic64_oc64", + .kernel_name = "3x3j1d1_dma", + .config_key = "64_16_16_64_3_3_16_16_1_1_1_1", + .src_dim1_size = 16, + .src_dim2_size = 16, + .src_dim3_size = 64, + .src_dim1_pitch = 16, + .src_dim2_pitch = 256, + .dst_dim1_size = 16, + .dst_dim2_size = 16, + .dst_dim3_size = 64, + .dst_dim1_pitch = 16, + .dst_dim2_pitch = 256, + .in_dim1_size = 16, + .in_dim1_pitch = 18, + .in_dim2_size = 18, + .in_dim2_pitch = 324, + .in_dim1_edge1 = 1, + .in_dim1_edge2 = 1, + .in_dim2_edge1 = 1, + .in_dim2_edge2 = 1, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 19, + .in_rows_firstdma = 17, + .out_dim1_size = 16, + .out_dim1_pitch = 16, + .out_dim2_size = 16, + .out_dim2_pitch = 256, + .out_dim3_size = 64, + .coeff_dim1_size = 3, + .coeff_dim2_size = 3, + .coeff_dim3_size = 64, + .coeff_dim4_size = 64, + .coeff_dim1_pitch = 3, + .coeff_dim2_pitch = 9, + .coeff_dim3_pitch = 576, + .bias_dim1_size = 64, + .bias_dim2_size = 1, + .outscale_dim1_size = 64, + .outscale_dim2_size = 1, + .input_buffer_size = 20736, + .coeff_buffer_size = 36864, + .output_buffer_size = 16384, + .bias_buffer_size = 256, + .outscale_buffer_size = 128, + .input_ping_dram = 0, + .input_pong_dram = 1, + .coeff_dram = 0, + 
.output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 64, + .n_tiles = 1, + .n_tile_size_last = 64, + .height_tiles = 1, + .output_rows = 16, + .input_rows = 18, + .kernel_w = 3, + .kernel_h = 3, + .stride_x = 1, + .stride_y = 1, + .padding = 1, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 2, + .layer_name = "conv_1x1_s2_ic64_oc128", + .kernel_name = "1x1j2d1_dma", + .config_key = "64_16_16_128_1_1_8_8_2_2_0_1", + .src_dim1_size = 16, + .src_dim2_size = 16, + .src_dim3_size = 64, + .src_dim1_pitch = 16, + .src_dim2_pitch = 256, + .dst_dim1_size = 8, + .dst_dim2_size = 8, + .dst_dim3_size = 128, + .dst_dim1_pitch = 8, + .dst_dim2_pitch = 64, + .in_dim1_size = 16, + .in_dim1_pitch = 16, + .in_dim2_size = 15, + .in_dim2_pitch = 240, + .in_dim1_edge1 = 0, + .in_dim1_edge2 = 0, + .in_dim2_edge1 = 0, + .in_dim2_edge2 = 0, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 15, + .out_dim1_size = 8, + .out_dim1_pitch = 8, + .out_dim2_size = 8, + .out_dim2_pitch = 64, + .out_dim3_size = 128, + .coeff_dim1_size = 1, + .coeff_dim2_size = 1, + .coeff_dim3_size = 64, + .coeff_dim4_size = 128, + .coeff_dim1_pitch = 1, + .coeff_dim2_pitch = 1, + .coeff_dim3_pitch = 64, + .bias_dim1_size = 128, + .bias_dim2_size = 1, + .outscale_dim1_size = 128, + .outscale_dim2_size = 1, + .input_buffer_size = 15360, + .coeff_buffer_size = 8192, + .output_buffer_size = 8192, + .bias_buffer_size = 512, + .outscale_buffer_size = 256, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 128, + .n_tiles = 1, + .n_tile_size_last = 128, + .height_tiles = 1, + .output_rows = 8, + .input_rows = 15, + .kernel_w = 1, + .kernel_h = 1, + .stride_x = 2, + .stride_y = 2, + 
.padding = 0, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 3, + .layer_name = "conv_3x3_s2_ic64_oc128", + .kernel_name = "3x3j2d1_dma", + .config_key = "64_16_16_128_3_3_8_8_2_2_1_1", + .src_dim1_size = 16, + .src_dim2_size = 16, + .src_dim3_size = 64, + .src_dim1_pitch = 16, + .src_dim2_pitch = 256, + .dst_dim1_size = 8, + .dst_dim2_size = 8, + .dst_dim3_size = 128, + .dst_dim1_pitch = 8, + .dst_dim2_pitch = 64, + .in_dim1_size = 16, + .in_dim1_pitch = 18, + .in_dim2_size = 5, + .in_dim2_pitch = 90, + .in_dim1_edge1 = 1, + .in_dim1_edge2 = 1, + .in_dim2_edge1 = 1, + .in_dim2_edge2 = 1, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 19, + .in_rows_firstdma = 4, + .out_dim1_size = 8, + .out_dim1_pitch = 8, + .out_dim2_size = 2, + .out_dim2_pitch = 16, + .out_dim3_size = 64, + .coeff_dim1_size = 3, + .coeff_dim2_size = 3, + .coeff_dim3_size = 64, + .coeff_dim4_size = 128, + .coeff_dim1_pitch = 3, + .coeff_dim2_pitch = 9, + .coeff_dim3_pitch = 576, + .bias_dim1_size = 128, + .bias_dim2_size = 1, + .outscale_dim1_size = 128, + .outscale_dim2_size = 1, + .input_buffer_size = 5760, + .coeff_buffer_size = 36864, + .output_buffer_size = 1024, + .bias_buffer_size = 512, + .outscale_buffer_size = 256, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 64, + .n_tiles = 2, + .n_tile_size_last = 64, + .height_tiles = 4, + .output_rows = 2, + .input_rows = 5, + .kernel_w = 3, + .kernel_h = 3, + .stride_x = 2, + .stride_y = 2, + .padding = 1, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 4, + .layer_name = "conv_3x3_s1_ic128_oc128", + .kernel_name = "3x3j1d1_dma", + .config_key = 
"128_8_8_128_3_3_8_8_1_1_1_1", + .src_dim1_size = 8, + .src_dim2_size = 8, + .src_dim3_size = 128, + .src_dim1_pitch = 8, + .src_dim2_pitch = 64, + .dst_dim1_size = 8, + .dst_dim2_size = 8, + .dst_dim3_size = 128, + .dst_dim1_pitch = 8, + .dst_dim2_pitch = 64, + .in_dim1_size = 8, + .in_dim1_pitch = 10, + .in_dim2_size = 4, + .in_dim2_pitch = 40, + .in_dim1_edge1 = 1, + .in_dim1_edge2 = 1, + .in_dim2_edge1 = 1, + .in_dim2_edge2 = 1, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 11, + .in_rows_firstdma = 3, + .out_dim1_size = 8, + .out_dim1_pitch = 8, + .out_dim2_size = 2, + .out_dim2_pitch = 16, + .out_dim3_size = 32, + .coeff_dim1_size = 3, + .coeff_dim2_size = 3, + .coeff_dim3_size = 128, + .coeff_dim4_size = 128, + .coeff_dim1_pitch = 3, + .coeff_dim2_pitch = 9, + .coeff_dim3_pitch = 1152, + .bias_dim1_size = 128, + .bias_dim2_size = 1, + .outscale_dim1_size = 128, + .outscale_dim2_size = 1, + .input_buffer_size = 5120, + .coeff_buffer_size = 36864, + .output_buffer_size = 512, + .bias_buffer_size = 512, + .outscale_buffer_size = 256, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 32, + .n_tiles = 4, + .n_tile_size_last = 32, + .height_tiles = 4, + .output_rows = 2, + .input_rows = 4, + .kernel_w = 3, + .kernel_h = 3, + .stride_x = 1, + .stride_y = 1, + .padding = 1, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 5, + .layer_name = "conv_1x1_s2_ic128_oc256", + .kernel_name = "1x1j2d1_dma", + .config_key = "128_8_8_256_1_1_4_4_2_2_0_1", + .src_dim1_size = 8, + .src_dim2_size = 8, + .src_dim3_size = 128, + .src_dim1_pitch = 8, + .src_dim2_pitch = 64, + .dst_dim1_size = 4, + .dst_dim2_size = 4, + .dst_dim3_size = 256, + .dst_dim1_pitch = 4, + .dst_dim2_pitch = 16, + .in_dim1_size = 8, + 
.in_dim1_pitch = 8, + .in_dim2_size = 7, + .in_dim2_pitch = 56, + .in_dim1_edge1 = 0, + .in_dim1_edge2 = 0, + .in_dim2_edge1 = 0, + .in_dim2_edge2 = 0, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 7, + .out_dim1_size = 4, + .out_dim1_pitch = 4, + .out_dim2_size = 4, + .out_dim2_pitch = 16, + .out_dim3_size = 256, + .coeff_dim1_size = 1, + .coeff_dim2_size = 1, + .coeff_dim3_size = 128, + .coeff_dim4_size = 256, + .coeff_dim1_pitch = 1, + .coeff_dim2_pitch = 1, + .coeff_dim3_pitch = 128, + .bias_dim1_size = 256, + .bias_dim2_size = 1, + .outscale_dim1_size = 256, + .outscale_dim2_size = 1, + .input_buffer_size = 7168, + .coeff_buffer_size = 32768, + .output_buffer_size = 4096, + .bias_buffer_size = 1024, + .outscale_buffer_size = 512, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 256, + .n_tiles = 1, + .n_tile_size_last = 256, + .height_tiles = 1, + .output_rows = 4, + .input_rows = 7, + .kernel_w = 1, + .kernel_h = 1, + .stride_x = 2, + .stride_y = 2, + .padding = 0, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 6, + .layer_name = "conv_3x3_s2_ic128_oc256", + .kernel_name = "3x3j2d1_dma", + .config_key = "128_8_8_256_3_3_4_4_2_2_1_1", + .src_dim1_size = 8, + .src_dim2_size = 8, + .src_dim3_size = 128, + .src_dim1_pitch = 8, + .src_dim2_pitch = 64, + .dst_dim1_size = 4, + .dst_dim2_size = 4, + .dst_dim3_size = 256, + .dst_dim1_pitch = 4, + .dst_dim2_pitch = 16, + .in_dim1_size = 8, + .in_dim1_pitch = 10, + .in_dim2_size = 5, + .in_dim2_pitch = 50, + .in_dim1_edge1 = 1, + .in_dim1_edge2 = 1, + .in_dim2_edge1 = 1, + .in_dim2_edge2 = 1, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 11, + .in_rows_firstdma = 4, + .out_dim1_size = 4, + .out_dim1_pitch = 
4, + .out_dim2_size = 2, + .out_dim2_pitch = 8, + .out_dim3_size = 32, + .coeff_dim1_size = 3, + .coeff_dim2_size = 3, + .coeff_dim3_size = 128, + .coeff_dim4_size = 256, + .coeff_dim1_pitch = 3, + .coeff_dim2_pitch = 9, + .coeff_dim3_pitch = 1152, + .bias_dim1_size = 256, + .bias_dim2_size = 1, + .outscale_dim1_size = 256, + .outscale_dim2_size = 1, + .input_buffer_size = 6400, + .coeff_buffer_size = 36864, + .output_buffer_size = 256, + .bias_buffer_size = 1024, + .outscale_buffer_size = 512, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 32, + .n_tiles = 8, + .n_tile_size_last = 32, + .height_tiles = 2, + .output_rows = 2, + .input_rows = 5, + .kernel_w = 3, + .kernel_h = 3, + .stride_x = 2, + .stride_y = 2, + .padding = 1, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 7, + .layer_name = "conv_3x3_s1_ic256_oc256", + .kernel_name = "3x3j1d1_dma", + .config_key = "256_4_4_256_3_3_4_4_1_1_1_1", + .src_dim1_size = 4, + .src_dim2_size = 4, + .src_dim3_size = 256, + .src_dim1_pitch = 4, + .src_dim2_pitch = 16, + .dst_dim1_size = 4, + .dst_dim2_size = 4, + .dst_dim3_size = 256, + .dst_dim1_pitch = 4, + .dst_dim2_pitch = 16, + .in_dim1_size = 4, + .in_dim1_pitch = 6, + .in_dim2_size = 4, + .in_dim2_pitch = 24, + .in_dim1_edge1 = 1, + .in_dim1_edge2 = 1, + .in_dim2_edge1 = 1, + .in_dim2_edge2 = 1, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 7, + .in_rows_firstdma = 3, + .out_dim1_size = 4, + .out_dim1_pitch = 4, + .out_dim2_size = 2, + .out_dim2_pitch = 8, + .out_dim3_size = 16, + .coeff_dim1_size = 3, + .coeff_dim2_size = 3, + .coeff_dim3_size = 256, + .coeff_dim4_size = 256, + .coeff_dim1_pitch = 3, + .coeff_dim2_pitch = 9, + .coeff_dim3_pitch = 2304, + .bias_dim1_size = 256, + .bias_dim2_size 
= 1, + .outscale_dim1_size = 256, + .outscale_dim2_size = 1, + .input_buffer_size = 6144, + .coeff_buffer_size = 36864, + .output_buffer_size = 128, + .bias_buffer_size = 1024, + .outscale_buffer_size = 512, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 16, + .n_tiles = 16, + .n_tile_size_last = 16, + .height_tiles = 2, + .output_rows = 2, + .input_rows = 4, + .kernel_w = 3, + .kernel_h = 3, + .stride_x = 1, + .stride_y = 1, + .padding = 1, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 8, + .layer_name = "conv_1x1_s2_ic256_oc512", + .kernel_name = "1x1j2d1_dma", + .config_key = "256_4_4_512_1_1_2_2_2_2_0_1", + .src_dim1_size = 4, + .src_dim2_size = 4, + .src_dim3_size = 256, + .src_dim1_pitch = 4, + .src_dim2_pitch = 16, + .dst_dim1_size = 2, + .dst_dim2_size = 2, + .dst_dim3_size = 512, + .dst_dim1_pitch = 2, + .dst_dim2_pitch = 4, + .in_dim1_size = 4, + .in_dim1_pitch = 4, + .in_dim2_size = 3, + .in_dim2_pitch = 12, + .in_dim1_edge1 = 0, + .in_dim1_edge2 = 0, + .in_dim2_edge1 = 0, + .in_dim2_edge2 = 0, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 3, + .out_dim1_size = 2, + .out_dim1_pitch = 2, + .out_dim2_size = 2, + .out_dim2_pitch = 4, + .out_dim3_size = 128, + .coeff_dim1_size = 1, + .coeff_dim2_size = 1, + .coeff_dim3_size = 256, + .coeff_dim4_size = 512, + .coeff_dim1_pitch = 1, + .coeff_dim2_pitch = 1, + .coeff_dim3_pitch = 256, + .bias_dim1_size = 512, + .bias_dim2_size = 1, + .outscale_dim1_size = 512, + .outscale_dim2_size = 1, + .input_buffer_size = 3072, + .coeff_buffer_size = 32768, + .output_buffer_size = 512, + .bias_buffer_size = 2048, + .outscale_buffer_size = 1024, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + 
.output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 128, + .n_tiles = 4, + .n_tile_size_last = 128, + .height_tiles = 1, + .output_rows = 2, + .input_rows = 3, + .kernel_w = 1, + .kernel_h = 1, + .stride_x = 2, + .stride_y = 2, + .padding = 0, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 9, + .layer_name = "conv_3x3_s2_ic256_oc512", + .kernel_name = "3x3j2d1_dma", + .config_key = "256_4_4_512_3_3_2_2_2_2_1_1", + .src_dim1_size = 4, + .src_dim2_size = 4, + .src_dim3_size = 256, + .src_dim1_pitch = 4, + .src_dim2_pitch = 16, + .dst_dim1_size = 2, + .dst_dim2_size = 2, + .dst_dim3_size = 512, + .dst_dim1_pitch = 2, + .dst_dim2_pitch = 4, + .in_dim1_size = 4, + .in_dim1_pitch = 6, + .in_dim2_size = 5, + .in_dim2_pitch = 30, + .in_dim1_edge1 = 1, + .in_dim1_edge2 = 1, + .in_dim2_edge1 = 1, + .in_dim2_edge2 = 1, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 7, + .in_rows_firstdma = 4, + .out_dim1_size = 2, + .out_dim1_pitch = 2, + .out_dim2_size = 2, + .out_dim2_pitch = 4, + .out_dim3_size = 16, + .coeff_dim1_size = 3, + .coeff_dim2_size = 3, + .coeff_dim3_size = 256, + .coeff_dim4_size = 512, + .coeff_dim1_pitch = 3, + .coeff_dim2_pitch = 9, + .coeff_dim3_pitch = 2304, + .bias_dim1_size = 512, + .bias_dim2_size = 1, + .outscale_dim1_size = 512, + .outscale_dim2_size = 1, + .input_buffer_size = 7680, + .coeff_buffer_size = 36864, + .output_buffer_size = 64, + .bias_buffer_size = 2048, + .outscale_buffer_size = 1024, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 16, + .n_tiles = 32, + .n_tile_size_last = 16, + .height_tiles = 1, + .output_rows = 2, + .input_rows = 5, + .kernel_w = 3, + .kernel_h = 3, + .stride_x = 2, + .stride_y = 2, + .padding 
= 1, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 10, + .layer_name = "conv_3x3_s1_ic512_oc512", + .kernel_name = "3x3j1d1_dma", + .config_key = "512_2_2_512_3_3_2_2_1_1_1_1", + .src_dim1_size = 2, + .src_dim2_size = 2, + .src_dim3_size = 512, + .src_dim1_pitch = 2, + .src_dim2_pitch = 4, + .dst_dim1_size = 2, + .dst_dim2_size = 2, + .dst_dim3_size = 512, + .dst_dim1_pitch = 2, + .dst_dim2_pitch = 4, + .in_dim1_size = 2, + .in_dim1_pitch = 4, + .in_dim2_size = 4, + .in_dim2_pitch = 16, + .in_dim1_edge1 = 1, + .in_dim1_edge2 = 1, + .in_dim2_edge1 = 1, + .in_dim2_edge2 = 1, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 5, + .in_rows_firstdma = 3, + .out_dim1_size = 2, + .out_dim1_pitch = 2, + .out_dim2_size = 2, + .out_dim2_pitch = 4, + .out_dim3_size = 8, + .coeff_dim1_size = 3, + .coeff_dim2_size = 3, + .coeff_dim3_size = 512, + .coeff_dim4_size = 512, + .coeff_dim1_pitch = 3, + .coeff_dim2_pitch = 9, + .coeff_dim3_pitch = 4608, + .bias_dim1_size = 512, + .bias_dim2_size = 1, + .outscale_dim1_size = 512, + .outscale_dim2_size = 1, + .input_buffer_size = 8192, + .coeff_buffer_size = 36864, + .output_buffer_size = 32, + .bias_buffer_size = 2048, + .outscale_buffer_size = 1024, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 8, + .n_tiles = 64, + .n_tile_size_last = 8, + .height_tiles = 1, + .output_rows = 2, + .input_rows = 4, + .kernel_w = 3, + .kernel_h = 3, + .stride_x = 1, + .stride_y = 1, + .padding = 1, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 11, + .layer_name = "conv_1x1_s1_ic64_oc256", + .kernel_name = "1x1j1d1_dma", + .config_key = 
"64_16_16_256_1_1_16_16_1_1_0_1", + .src_dim1_size = 16, + .src_dim2_size = 16, + .src_dim3_size = 64, + .src_dim1_pitch = 16, + .src_dim2_pitch = 256, + .dst_dim1_size = 16, + .dst_dim2_size = 16, + .dst_dim3_size = 256, + .dst_dim1_pitch = 16, + .dst_dim2_pitch = 256, + .in_dim1_size = 16, + .in_dim1_pitch = 16, + .in_dim2_size = 7, + .in_dim2_pitch = 112, + .in_dim1_edge1 = 0, + .in_dim1_edge2 = 0, + .in_dim2_edge1 = 0, + .in_dim2_edge2 = 0, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 7, + .out_dim1_size = 16, + .out_dim1_pitch = 16, + .out_dim2_size = 7, + .out_dim2_pitch = 112, + .out_dim3_size = 256, + .coeff_dim1_size = 1, + .coeff_dim2_size = 1, + .coeff_dim3_size = 64, + .coeff_dim4_size = 256, + .coeff_dim1_pitch = 1, + .coeff_dim2_pitch = 1, + .coeff_dim3_pitch = 64, + .bias_dim1_size = 256, + .bias_dim2_size = 1, + .outscale_dim1_size = 256, + .outscale_dim2_size = 1, + .input_buffer_size = 7168, + .coeff_buffer_size = 16384, + .output_buffer_size = 28672, + .bias_buffer_size = 1024, + .outscale_buffer_size = 512, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 256, + .n_tiles = 1, + .n_tile_size_last = 256, + .height_tiles = 3, + .output_rows = 7, + .input_rows = 7, + .kernel_w = 1, + .kernel_h = 1, + .stride_x = 1, + .stride_y = 1, + .padding = 0, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 12, + .layer_name = "conv_1x1_s1_ic64_oc64", + .kernel_name = "1x1j1d1_dma", + .config_key = "64_16_16_64_1_1_16_16_1_1_0_1", + .src_dim1_size = 16, + .src_dim2_size = 16, + .src_dim3_size = 64, + .src_dim1_pitch = 16, + .src_dim2_pitch = 256, + .dst_dim1_size = 16, + .dst_dim2_size = 16, + .dst_dim3_size = 64, + .dst_dim1_pitch = 16, + .dst_dim2_pitch = 256, + 
.in_dim1_size = 16, + .in_dim1_pitch = 16, + .in_dim2_size = 16, + .in_dim2_pitch = 256, + .in_dim1_edge1 = 0, + .in_dim1_edge2 = 0, + .in_dim2_edge1 = 0, + .in_dim2_edge2 = 0, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 16, + .out_dim1_size = 16, + .out_dim1_pitch = 16, + .out_dim2_size = 16, + .out_dim2_pitch = 256, + .out_dim3_size = 64, + .coeff_dim1_size = 1, + .coeff_dim2_size = 1, + .coeff_dim3_size = 64, + .coeff_dim4_size = 64, + .coeff_dim1_pitch = 1, + .coeff_dim2_pitch = 1, + .coeff_dim3_pitch = 64, + .bias_dim1_size = 64, + .bias_dim2_size = 1, + .outscale_dim1_size = 64, + .outscale_dim2_size = 1, + .input_buffer_size = 16384, + .coeff_buffer_size = 4096, + .output_buffer_size = 16384, + .bias_buffer_size = 256, + .outscale_buffer_size = 128, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 64, + .n_tiles = 1, + .n_tile_size_last = 64, + .height_tiles = 1, + .output_rows = 16, + .input_rows = 16, + .kernel_w = 1, + .kernel_h = 1, + .stride_x = 1, + .stride_y = 1, + .padding = 0, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 13, + .layer_name = "conv_1x1_s1_ic256_oc64", + .kernel_name = "1x1j1d1_dma", + .config_key = "256_16_16_64_1_1_16_16_1_1_0_1", + .src_dim1_size = 16, + .src_dim2_size = 16, + .src_dim3_size = 256, + .src_dim1_pitch = 16, + .src_dim2_pitch = 256, + .dst_dim1_size = 16, + .dst_dim2_size = 16, + .dst_dim3_size = 64, + .dst_dim1_pitch = 16, + .dst_dim2_pitch = 256, + .in_dim1_size = 16, + .in_dim1_pitch = 16, + .in_dim2_size = 10, + .in_dim2_pitch = 160, + .in_dim1_edge1 = 0, + .in_dim1_edge2 = 0, + .in_dim2_edge1 = 0, + .in_dim2_edge2 = 0, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 10, + 
.out_dim1_size = 16, + .out_dim1_pitch = 16, + .out_dim2_size = 10, + .out_dim2_pitch = 160, + .out_dim3_size = 64, + .coeff_dim1_size = 1, + .coeff_dim2_size = 1, + .coeff_dim3_size = 256, + .coeff_dim4_size = 64, + .coeff_dim1_pitch = 1, + .coeff_dim2_pitch = 1, + .coeff_dim3_pitch = 256, + .bias_dim1_size = 64, + .bias_dim2_size = 1, + .outscale_dim1_size = 64, + .outscale_dim2_size = 1, + .input_buffer_size = 40960, + .coeff_buffer_size = 16384, + .output_buffer_size = 10240, + .bias_buffer_size = 256, + .outscale_buffer_size = 128, + .input_ping_dram = 0, + .input_pong_dram = 1, + .coeff_dram = 0, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 64, + .n_tiles = 1, + .n_tile_size_last = 64, + .height_tiles = 2, + .output_rows = 10, + .input_rows = 10, + .kernel_w = 1, + .kernel_h = 1, + .stride_x = 1, + .stride_y = 1, + .padding = 0, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 14, + .layer_name = "conv_1x1_s2_ic256_oc512", + .kernel_name = "1x1j2d1_dma", + .config_key = "256_16_16_512_1_1_8_8_2_2_0_1", + .src_dim1_size = 16, + .src_dim2_size = 16, + .src_dim3_size = 256, + .src_dim1_pitch = 16, + .src_dim2_pitch = 256, + .dst_dim1_size = 8, + .dst_dim2_size = 8, + .dst_dim3_size = 512, + .dst_dim1_pitch = 8, + .dst_dim2_pitch = 64, + .in_dim1_size = 16, + .in_dim1_pitch = 16, + .in_dim2_size = 3, + .in_dim2_pitch = 48, + .in_dim1_edge1 = 0, + .in_dim1_edge2 = 0, + .in_dim2_edge1 = 0, + .in_dim2_edge2 = 0, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 3, + .out_dim1_size = 8, + .out_dim1_pitch = 8, + .out_dim2_size = 2, + .out_dim2_pitch = 16, + .out_dim3_size = 128, + .coeff_dim1_size = 1, + .coeff_dim2_size = 1, + .coeff_dim3_size = 256, + .coeff_dim4_size = 512, + .coeff_dim1_pitch = 1, + .coeff_dim2_pitch = 1, + 
.coeff_dim3_pitch = 256, + .bias_dim1_size = 512, + .bias_dim2_size = 1, + .outscale_dim1_size = 512, + .outscale_dim2_size = 1, + .input_buffer_size = 12288, + .coeff_buffer_size = 32768, + .output_buffer_size = 2048, + .bias_buffer_size = 2048, + .outscale_buffer_size = 1024, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 128, + .n_tiles = 4, + .n_tile_size_last = 128, + .height_tiles = 4, + .output_rows = 2, + .input_rows = 3, + .kernel_w = 1, + .kernel_h = 1, + .stride_x = 2, + .stride_y = 2, + .padding = 0, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 15, + .layer_name = "conv_1x1_s1_ic256_oc128", + .kernel_name = "1x1j1d1_dma", + .config_key = "256_16_16_128_1_1_16_16_1_1_0_1", + .src_dim1_size = 16, + .src_dim2_size = 16, + .src_dim3_size = 256, + .src_dim1_pitch = 16, + .src_dim2_pitch = 256, + .dst_dim1_size = 16, + .dst_dim2_size = 16, + .dst_dim3_size = 128, + .dst_dim1_pitch = 16, + .dst_dim2_pitch = 256, + .in_dim1_size = 16, + .in_dim1_pitch = 16, + .in_dim2_size = 7, + .in_dim2_pitch = 112, + .in_dim1_edge1 = 0, + .in_dim1_edge2 = 0, + .in_dim2_edge1 = 0, + .in_dim2_edge2 = 0, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 7, + .out_dim1_size = 16, + .out_dim1_pitch = 16, + .out_dim2_size = 7, + .out_dim2_pitch = 112, + .out_dim3_size = 128, + .coeff_dim1_size = 1, + .coeff_dim2_size = 1, + .coeff_dim3_size = 256, + .coeff_dim4_size = 128, + .coeff_dim1_pitch = 1, + .coeff_dim2_pitch = 1, + .coeff_dim3_pitch = 256, + .bias_dim1_size = 128, + .bias_dim2_size = 1, + .outscale_dim1_size = 128, + .outscale_dim2_size = 1, + .input_buffer_size = 28672, + .coeff_buffer_size = 32768, + .output_buffer_size = 14336, + .bias_buffer_size = 512, + 
.outscale_buffer_size = 256, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 1, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 128, + .n_tiles = 1, + .n_tile_size_last = 128, + .height_tiles = 3, + .output_rows = 7, + .input_rows = 7, + .kernel_w = 1, + .kernel_h = 1, + .stride_x = 1, + .stride_y = 1, + .padding = 0, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 16, + .layer_name = "conv_3x3_s2_ic128_oc128", + .kernel_name = "3x3j2d1_dma", + .config_key = "128_16_16_128_3_3_8_8_2_2_1_1", + .src_dim1_size = 16, + .src_dim2_size = 16, + .src_dim3_size = 128, + .src_dim1_pitch = 16, + .src_dim2_pitch = 256, + .dst_dim1_size = 8, + .dst_dim2_size = 8, + .dst_dim3_size = 128, + .dst_dim1_pitch = 8, + .dst_dim2_pitch = 64, + .in_dim1_size = 16, + .in_dim1_pitch = 18, + .in_dim2_size = 5, + .in_dim2_pitch = 90, + .in_dim1_edge1 = 1, + .in_dim1_edge2 = 1, + .in_dim2_edge1 = 1, + .in_dim2_edge2 = 1, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 19, + .in_rows_firstdma = 4, + .out_dim1_size = 8, + .out_dim1_pitch = 8, + .out_dim2_size = 2, + .out_dim2_pitch = 16, + .out_dim3_size = 32, + .coeff_dim1_size = 3, + .coeff_dim2_size = 3, + .coeff_dim3_size = 128, + .coeff_dim4_size = 128, + .coeff_dim1_pitch = 3, + .coeff_dim2_pitch = 9, + .coeff_dim3_pitch = 1152, + .bias_dim1_size = 128, + .bias_dim2_size = 1, + .outscale_dim1_size = 128, + .outscale_dim2_size = 1, + .input_buffer_size = 11520, + .coeff_buffer_size = 36864, + .output_buffer_size = 512, + .bias_buffer_size = 512, + .outscale_buffer_size = 256, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 32, + .n_tiles = 4, + .n_tile_size_last = 32, + .height_tiles = 4, + 
.output_rows = 2, + .input_rows = 5, + .kernel_w = 3, + .kernel_h = 3, + .stride_x = 2, + .stride_y = 2, + .padding = 1, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 17, + .layer_name = "conv_1x1_s1_ic128_oc512", + .kernel_name = "1x1j1d1_dma", + .config_key = "128_8_8_512_1_1_8_8_1_1_0_1", + .src_dim1_size = 8, + .src_dim2_size = 8, + .src_dim3_size = 128, + .src_dim1_pitch = 8, + .src_dim2_pitch = 64, + .dst_dim1_size = 8, + .dst_dim2_size = 8, + .dst_dim3_size = 512, + .dst_dim1_pitch = 8, + .dst_dim2_pitch = 64, + .in_dim1_size = 8, + .in_dim1_pitch = 8, + .in_dim2_size = 2, + .in_dim2_pitch = 16, + .in_dim1_edge1 = 0, + .in_dim1_edge2 = 0, + .in_dim2_edge1 = 0, + .in_dim2_edge2 = 0, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 2, + .out_dim1_size = 8, + .out_dim1_pitch = 8, + .out_dim2_size = 2, + .out_dim2_pitch = 16, + .out_dim3_size = 256, + .coeff_dim1_size = 1, + .coeff_dim2_size = 1, + .coeff_dim3_size = 128, + .coeff_dim4_size = 512, + .coeff_dim1_pitch = 1, + .coeff_dim2_pitch = 1, + .coeff_dim3_pitch = 128, + .bias_dim1_size = 512, + .bias_dim2_size = 1, + .outscale_dim1_size = 512, + .outscale_dim2_size = 1, + .input_buffer_size = 2048, + .coeff_buffer_size = 32768, + .output_buffer_size = 4096, + .bias_buffer_size = 2048, + .outscale_buffer_size = 1024, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 256, + .n_tiles = 2, + .n_tile_size_last = 256, + .height_tiles = 4, + .output_rows = 2, + .input_rows = 2, + .kernel_w = 1, + .kernel_h = 1, + .stride_x = 1, + .stride_y = 1, + .padding = 0, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { 
+ .layer_id = 18, + .layer_name = "conv_1x1_s1_ic512_oc128", + .kernel_name = "1x1j1d1_dma", + .config_key = "512_8_8_128_1_1_8_8_1_1_0_1", + .src_dim1_size = 8, + .src_dim2_size = 8, + .src_dim3_size = 512, + .src_dim1_pitch = 8, + .src_dim2_pitch = 64, + .dst_dim1_size = 8, + .dst_dim2_size = 8, + .dst_dim3_size = 128, + .dst_dim1_pitch = 8, + .dst_dim2_pitch = 64, + .in_dim1_size = 8, + .in_dim1_pitch = 8, + .in_dim2_size = 2, + .in_dim2_pitch = 16, + .in_dim1_edge1 = 0, + .in_dim1_edge2 = 0, + .in_dim2_edge1 = 0, + .in_dim2_edge2 = 0, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 2, + .out_dim1_size = 8, + .out_dim1_pitch = 8, + .out_dim2_size = 2, + .out_dim2_pitch = 16, + .out_dim3_size = 64, + .coeff_dim1_size = 1, + .coeff_dim2_size = 1, + .coeff_dim3_size = 512, + .coeff_dim4_size = 128, + .coeff_dim1_pitch = 1, + .coeff_dim2_pitch = 1, + .coeff_dim3_pitch = 512, + .bias_dim1_size = 128, + .bias_dim2_size = 1, + .outscale_dim1_size = 128, + .outscale_dim2_size = 1, + .input_buffer_size = 8192, + .coeff_buffer_size = 32768, + .output_buffer_size = 1024, + .bias_buffer_size = 512, + .outscale_buffer_size = 256, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 64, + .n_tiles = 2, + .n_tile_size_last = 64, + .height_tiles = 4, + .output_rows = 2, + .input_rows = 2, + .kernel_w = 1, + .kernel_h = 1, + .stride_x = 1, + .stride_y = 1, + .padding = 0, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 19, + .layer_name = "conv_1x1_s2_ic512_oc1024", + .kernel_name = "1x1j2d1_dma", + .config_key = "512_8_8_1024_1_1_4_4_2_2_0_1", + .src_dim1_size = 8, + .src_dim2_size = 8, + .src_dim3_size = 512, + .src_dim1_pitch = 8, + .src_dim2_pitch = 64, + .dst_dim1_size = 4, + 
.dst_dim2_size = 4, + .dst_dim3_size = 1024, + .dst_dim1_pitch = 4, + .dst_dim2_pitch = 16, + .in_dim1_size = 8, + .in_dim1_pitch = 8, + .in_dim2_size = 3, + .in_dim2_pitch = 24, + .in_dim1_edge1 = 0, + .in_dim1_edge2 = 0, + .in_dim2_edge1 = 0, + .in_dim2_edge2 = 0, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 3, + .out_dim1_size = 4, + .out_dim1_pitch = 4, + .out_dim2_size = 2, + .out_dim2_pitch = 8, + .out_dim3_size = 64, + .coeff_dim1_size = 1, + .coeff_dim2_size = 1, + .coeff_dim3_size = 512, + .coeff_dim4_size = 1024, + .coeff_dim1_pitch = 1, + .coeff_dim2_pitch = 1, + .coeff_dim3_pitch = 512, + .bias_dim1_size = 1024, + .bias_dim2_size = 1, + .outscale_dim1_size = 1024, + .outscale_dim2_size = 1, + .input_buffer_size = 12288, + .coeff_buffer_size = 32768, + .output_buffer_size = 512, + .bias_buffer_size = 4096, + .outscale_buffer_size = 2048, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 64, + .n_tiles = 16, + .n_tile_size_last = 64, + .height_tiles = 2, + .output_rows = 2, + .input_rows = 3, + .kernel_w = 1, + .kernel_h = 1, + .stride_x = 2, + .stride_y = 2, + .padding = 0, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 20, + .layer_name = "conv_1x1_s1_ic512_oc256", + .kernel_name = "1x1j1d1_dma", + .config_key = "512_8_8_256_1_1_8_8_1_1_0_1", + .src_dim1_size = 8, + .src_dim2_size = 8, + .src_dim3_size = 512, + .src_dim1_pitch = 8, + .src_dim2_pitch = 64, + .dst_dim1_size = 8, + .dst_dim2_size = 8, + .dst_dim3_size = 256, + .dst_dim1_pitch = 8, + .dst_dim2_pitch = 64, + .in_dim1_size = 8, + .in_dim1_pitch = 8, + .in_dim2_size = 2, + .in_dim2_pitch = 16, + .in_dim1_edge1 = 0, + .in_dim1_edge2 = 0, + .in_dim2_edge1 = 0, + .in_dim2_edge2 = 0, + .in_dim3_edge1 = 
0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 2, + .out_dim1_size = 8, + .out_dim1_pitch = 8, + .out_dim2_size = 2, + .out_dim2_pitch = 16, + .out_dim3_size = 64, + .coeff_dim1_size = 1, + .coeff_dim2_size = 1, + .coeff_dim3_size = 512, + .coeff_dim4_size = 256, + .coeff_dim1_pitch = 1, + .coeff_dim2_pitch = 1, + .coeff_dim3_pitch = 512, + .bias_dim1_size = 256, + .bias_dim2_size = 1, + .outscale_dim1_size = 256, + .outscale_dim2_size = 1, + .input_buffer_size = 8192, + .coeff_buffer_size = 32768, + .output_buffer_size = 1024, + .bias_buffer_size = 1024, + .outscale_buffer_size = 512, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 64, + .n_tiles = 4, + .n_tile_size_last = 64, + .height_tiles = 4, + .output_rows = 2, + .input_rows = 2, + .kernel_w = 1, + .kernel_h = 1, + .stride_x = 1, + .stride_y = 1, + .padding = 0, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 21, + .layer_name = "conv_3x3_s2_ic256_oc256", + .kernel_name = "3x3j2d1_dma", + .config_key = "256_8_8_256_3_3_4_4_2_2_1_1", + .src_dim1_size = 8, + .src_dim2_size = 8, + .src_dim3_size = 256, + .src_dim1_pitch = 8, + .src_dim2_pitch = 64, + .dst_dim1_size = 4, + .dst_dim2_size = 4, + .dst_dim3_size = 256, + .dst_dim1_pitch = 4, + .dst_dim2_pitch = 16, + .in_dim1_size = 8, + .in_dim1_pitch = 10, + .in_dim2_size = 5, + .in_dim2_pitch = 50, + .in_dim1_edge1 = 1, + .in_dim1_edge2 = 1, + .in_dim2_edge1 = 1, + .in_dim2_edge2 = 1, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 11, + .in_rows_firstdma = 4, + .out_dim1_size = 4, + .out_dim1_pitch = 4, + .out_dim2_size = 2, + .out_dim2_pitch = 8, + .out_dim3_size = 16, + .coeff_dim1_size = 3, + .coeff_dim2_size = 3, + .coeff_dim3_size = 256, + .coeff_dim4_size = 256, + 
.coeff_dim1_pitch = 3, + .coeff_dim2_pitch = 9, + .coeff_dim3_pitch = 2304, + .bias_dim1_size = 256, + .bias_dim2_size = 1, + .outscale_dim1_size = 256, + .outscale_dim2_size = 1, + .input_buffer_size = 12800, + .coeff_buffer_size = 36864, + .output_buffer_size = 128, + .bias_buffer_size = 1024, + .outscale_buffer_size = 512, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 16, + .n_tiles = 16, + .n_tile_size_last = 16, + .height_tiles = 2, + .output_rows = 2, + .input_rows = 5, + .kernel_w = 3, + .kernel_h = 3, + .stride_x = 2, + .stride_y = 2, + .padding = 1, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 22, + .layer_name = "conv_1x1_s1_ic256_oc1024", + .kernel_name = "1x1j1d1_dma", + .config_key = "256_4_4_1024_1_1_4_4_1_1_0_1", + .src_dim1_size = 4, + .src_dim2_size = 4, + .src_dim3_size = 256, + .src_dim1_pitch = 4, + .src_dim2_pitch = 16, + .dst_dim1_size = 4, + .dst_dim2_size = 4, + .dst_dim3_size = 1024, + .dst_dim1_pitch = 4, + .dst_dim2_pitch = 16, + .in_dim1_size = 4, + .in_dim1_pitch = 4, + .in_dim2_size = 2, + .in_dim2_pitch = 8, + .in_dim1_edge1 = 0, + .in_dim1_edge2 = 0, + .in_dim2_edge1 = 0, + .in_dim2_edge2 = 0, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 2, + .out_dim1_size = 4, + .out_dim1_pitch = 4, + .out_dim2_size = 2, + .out_dim2_pitch = 8, + .out_dim3_size = 128, + .coeff_dim1_size = 1, + .coeff_dim2_size = 1, + .coeff_dim3_size = 256, + .coeff_dim4_size = 1024, + .coeff_dim1_pitch = 1, + .coeff_dim2_pitch = 1, + .coeff_dim3_pitch = 256, + .bias_dim1_size = 1024, + .bias_dim2_size = 1, + .outscale_dim1_size = 1024, + .outscale_dim2_size = 1, + .input_buffer_size = 2048, + .coeff_buffer_size = 32768, + .output_buffer_size = 1024, + 
.bias_buffer_size = 4096, + .outscale_buffer_size = 2048, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 128, + .n_tiles = 8, + .n_tile_size_last = 128, + .height_tiles = 2, + .output_rows = 2, + .input_rows = 2, + .kernel_w = 1, + .kernel_h = 1, + .stride_x = 1, + .stride_y = 1, + .padding = 0, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 23, + .layer_name = "conv_1x1_s1_ic1024_oc256", + .kernel_name = "1x1j1d1_dma", + .config_key = "1024_4_4_256_1_1_4_4_1_1_0_1", + .src_dim1_size = 4, + .src_dim2_size = 4, + .src_dim3_size = 1024, + .src_dim1_pitch = 4, + .src_dim2_pitch = 16, + .dst_dim1_size = 4, + .dst_dim2_size = 4, + .dst_dim3_size = 256, + .dst_dim1_pitch = 4, + .dst_dim2_pitch = 16, + .in_dim1_size = 4, + .in_dim1_pitch = 4, + .in_dim2_size = 2, + .in_dim2_pitch = 8, + .in_dim1_edge1 = 0, + .in_dim1_edge2 = 0, + .in_dim2_edge1 = 0, + .in_dim2_edge2 = 0, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 2, + .out_dim1_size = 4, + .out_dim1_pitch = 4, + .out_dim2_size = 2, + .out_dim2_pitch = 8, + .out_dim3_size = 32, + .coeff_dim1_size = 1, + .coeff_dim2_size = 1, + .coeff_dim3_size = 1024, + .coeff_dim4_size = 256, + .coeff_dim1_pitch = 1, + .coeff_dim2_pitch = 1, + .coeff_dim3_pitch = 1024, + .bias_dim1_size = 256, + .bias_dim2_size = 1, + .outscale_dim1_size = 256, + .outscale_dim2_size = 1, + .input_buffer_size = 8192, + .coeff_buffer_size = 32768, + .output_buffer_size = 256, + .bias_buffer_size = 1024, + .outscale_buffer_size = 512, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 32, + .n_tiles = 8, + .n_tile_size_last = 32, + 
.height_tiles = 2, + .output_rows = 2, + .input_rows = 2, + .kernel_w = 1, + .kernel_h = 1, + .stride_x = 1, + .stride_y = 1, + .padding = 0, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 24, + .layer_name = "conv_1x1_s2_ic1024_oc2048", + .kernel_name = "1x1j2d1_dma", + .config_key = "1024_4_4_2048_1_1_2_2_2_2_0_1", + .src_dim1_size = 4, + .src_dim2_size = 4, + .src_dim3_size = 1024, + .src_dim1_pitch = 4, + .src_dim2_pitch = 16, + .dst_dim1_size = 2, + .dst_dim2_size = 2, + .dst_dim3_size = 2048, + .dst_dim1_pitch = 2, + .dst_dim2_pitch = 4, + .in_dim1_size = 4, + .in_dim1_pitch = 4, + .in_dim2_size = 3, + .in_dim2_pitch = 12, + .in_dim1_edge1 = 0, + .in_dim1_edge2 = 0, + .in_dim2_edge1 = 0, + .in_dim2_edge2 = 0, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 3, + .out_dim1_size = 2, + .out_dim1_pitch = 2, + .out_dim2_size = 2, + .out_dim2_pitch = 4, + .out_dim3_size = 32, + .coeff_dim1_size = 1, + .coeff_dim2_size = 1, + .coeff_dim3_size = 1024, + .coeff_dim4_size = 2048, + .coeff_dim1_pitch = 1, + .coeff_dim2_pitch = 1, + .coeff_dim3_pitch = 1024, + .bias_dim1_size = 2048, + .bias_dim2_size = 1, + .outscale_dim1_size = 2048, + .outscale_dim2_size = 1, + .input_buffer_size = 12288, + .coeff_buffer_size = 32768, + .output_buffer_size = 128, + .bias_buffer_size = 8192, + .outscale_buffer_size = 4096, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 32, + .n_tiles = 64, + .n_tile_size_last = 32, + .height_tiles = 1, + .output_rows = 2, + .input_rows = 3, + .kernel_w = 1, + .kernel_h = 1, + .stride_x = 2, + .stride_y = 2, + .padding = 0, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + 
.input_zero_point = 0, + }, + { + .layer_id = 25, + .layer_name = "conv_1x1_s1_ic1024_oc512", + .kernel_name = "1x1j1d1_dma", + .config_key = "1024_4_4_512_1_1_4_4_1_1_0_1", + .src_dim1_size = 4, + .src_dim2_size = 4, + .src_dim3_size = 1024, + .src_dim1_pitch = 4, + .src_dim2_pitch = 16, + .dst_dim1_size = 4, + .dst_dim2_size = 4, + .dst_dim3_size = 512, + .dst_dim1_pitch = 4, + .dst_dim2_pitch = 16, + .in_dim1_size = 4, + .in_dim1_pitch = 4, + .in_dim2_size = 2, + .in_dim2_pitch = 8, + .in_dim1_edge1 = 0, + .in_dim1_edge2 = 0, + .in_dim2_edge1 = 0, + .in_dim2_edge2 = 0, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 2, + .out_dim1_size = 4, + .out_dim1_pitch = 4, + .out_dim2_size = 2, + .out_dim2_pitch = 8, + .out_dim3_size = 32, + .coeff_dim1_size = 1, + .coeff_dim2_size = 1, + .coeff_dim3_size = 1024, + .coeff_dim4_size = 512, + .coeff_dim1_pitch = 1, + .coeff_dim2_pitch = 1, + .coeff_dim3_pitch = 1024, + .bias_dim1_size = 512, + .bias_dim2_size = 1, + .outscale_dim1_size = 512, + .outscale_dim2_size = 1, + .input_buffer_size = 8192, + .coeff_buffer_size = 32768, + .output_buffer_size = 256, + .bias_buffer_size = 2048, + .outscale_buffer_size = 1024, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 32, + .n_tiles = 16, + .n_tile_size_last = 32, + .height_tiles = 2, + .output_rows = 2, + .input_rows = 2, + .kernel_w = 1, + .kernel_h = 1, + .stride_x = 1, + .stride_y = 1, + .padding = 0, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 26, + .layer_name = "conv_3x3_s2_ic512_oc512", + .kernel_name = "3x3j2d1_dma", + .config_key = "512_4_4_512_3_3_2_2_2_2_1_1", + .src_dim1_size = 4, + .src_dim2_size = 4, + .src_dim3_size = 512, + .src_dim1_pitch = 4, + .src_dim2_pitch = 16, 
+ .dst_dim1_size = 2, + .dst_dim2_size = 2, + .dst_dim3_size = 512, + .dst_dim1_pitch = 2, + .dst_dim2_pitch = 4, + .in_dim1_size = 4, + .in_dim1_pitch = 6, + .in_dim2_size = 5, + .in_dim2_pitch = 30, + .in_dim1_edge1 = 1, + .in_dim1_edge2 = 1, + .in_dim2_edge1 = 1, + .in_dim2_edge2 = 1, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 7, + .in_rows_firstdma = 4, + .out_dim1_size = 2, + .out_dim1_pitch = 2, + .out_dim2_size = 2, + .out_dim2_pitch = 4, + .out_dim3_size = 8, + .coeff_dim1_size = 3, + .coeff_dim2_size = 3, + .coeff_dim3_size = 512, + .coeff_dim4_size = 512, + .coeff_dim1_pitch = 3, + .coeff_dim2_pitch = 9, + .coeff_dim3_pitch = 4608, + .bias_dim1_size = 512, + .bias_dim2_size = 1, + .outscale_dim1_size = 512, + .outscale_dim2_size = 1, + .input_buffer_size = 15360, + .coeff_buffer_size = 36864, + .output_buffer_size = 32, + .bias_buffer_size = 2048, + .outscale_buffer_size = 1024, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 1, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 8, + .n_tiles = 64, + .n_tile_size_last = 8, + .height_tiles = 1, + .output_rows = 2, + .input_rows = 5, + .kernel_w = 3, + .kernel_h = 3, + .stride_x = 2, + .stride_y = 2, + .padding = 1, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 27, + .layer_name = "conv_1x1_s1_ic512_oc2048", + .kernel_name = "1x1j1d1_dma", + .config_key = "512_2_2_2048_1_1_2_2_1_1_0_1", + .src_dim1_size = 2, + .src_dim2_size = 2, + .src_dim3_size = 512, + .src_dim1_pitch = 2, + .src_dim2_pitch = 4, + .dst_dim1_size = 2, + .dst_dim2_size = 2, + .dst_dim3_size = 2048, + .dst_dim1_pitch = 2, + .dst_dim2_pitch = 4, + .in_dim1_size = 2, + .in_dim1_pitch = 2, + .in_dim2_size = 2, + .in_dim2_pitch = 4, + .in_dim1_edge1 = 0, + .in_dim1_edge2 = 0, + .in_dim2_edge1 = 0, + .in_dim2_edge2 = 0, + 
.in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 2, + .out_dim1_size = 2, + .out_dim1_pitch = 2, + .out_dim2_size = 2, + .out_dim2_pitch = 4, + .out_dim3_size = 64, + .coeff_dim1_size = 1, + .coeff_dim2_size = 1, + .coeff_dim3_size = 512, + .coeff_dim4_size = 2048, + .coeff_dim1_pitch = 1, + .coeff_dim2_pitch = 1, + .coeff_dim3_pitch = 512, + .bias_dim1_size = 2048, + .bias_dim2_size = 1, + .outscale_dim1_size = 2048, + .outscale_dim2_size = 1, + .input_buffer_size = 2048, + .coeff_buffer_size = 32768, + .output_buffer_size = 256, + .bias_buffer_size = 8192, + .outscale_buffer_size = 4096, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 64, + .n_tiles = 32, + .n_tile_size_last = 64, + .height_tiles = 1, + .output_rows = 2, + .input_rows = 2, + .kernel_w = 1, + .kernel_h = 1, + .stride_x = 1, + .stride_y = 1, + .padding = 0, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 28, + .layer_name = "conv_1x1_s1_ic2048_oc512", + .kernel_name = "1x1j1d1_dma", + .config_key = "2048_2_2_512_1_1_2_2_1_1_0_1", + .src_dim1_size = 2, + .src_dim2_size = 2, + .src_dim3_size = 2048, + .src_dim1_pitch = 2, + .src_dim2_pitch = 4, + .dst_dim1_size = 2, + .dst_dim2_size = 2, + .dst_dim3_size = 512, + .dst_dim1_pitch = 2, + .dst_dim2_pitch = 4, + .in_dim1_size = 2, + .in_dim1_pitch = 2, + .in_dim2_size = 2, + .in_dim2_pitch = 4, + .in_dim1_edge1 = 0, + .in_dim1_edge2 = 0, + .in_dim2_edge1 = 0, + .in_dim2_edge2 = 0, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 2, + .out_dim1_size = 2, + .out_dim1_pitch = 2, + .out_dim2_size = 2, + .out_dim2_pitch = 4, + .out_dim3_size = 16, + .coeff_dim1_size = 1, + .coeff_dim2_size = 1, + .coeff_dim3_size = 2048, + 
.coeff_dim4_size = 512, + .coeff_dim1_pitch = 1, + .coeff_dim2_pitch = 1, + .coeff_dim3_pitch = 2048, + .bias_dim1_size = 512, + .bias_dim2_size = 1, + .outscale_dim1_size = 512, + .outscale_dim2_size = 1, + .input_buffer_size = 8192, + .coeff_buffer_size = 32768, + .output_buffer_size = 64, + .bias_buffer_size = 2048, + .outscale_buffer_size = 1024, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 16, + .n_tiles = 32, + .n_tile_size_last = 16, + .height_tiles = 1, + .output_rows = 2, + .input_rows = 2, + .kernel_w = 1, + .kernel_h = 1, + .stride_x = 1, + .stride_y = 1, + .padding = 0, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, +}; + /* Returns the number of conv2d entries, i.e. the NUM_CONV_LAYERS macro. */ +static inline int get_num_conv_layers(void) { return NUM_CONV_LAYERS; } + /* Returns a pointer to CONV_LAYER_CONFIGS[layer_id], or NULL when layer_id is outside [0, NUM_CONV_LAYERS). The lookup is positional: it indexes the array and does not compare against each entry's .layer_id field. */ +static inline const conv_layer_config_t* get_conv_config(int layer_id) { + if (layer_id < 0 || layer_id >= NUM_CONV_LAYERS) return NULL; + return &CONV_LAYER_CONFIGS[layer_id]; +} + /* Linear search of CONV_LAYER_CONFIGS. Returns the first entry whose src dims (ic=dim3, ih=dim2, iw=dim1), dst dims (oc=dim3, oh=dim2, ow=dim1), kernel size (kh=coeff_dim2_size, kw=coeff_dim1_size), strides (sy, sx), padding and dilation all equal the arguments; returns NULL when no entry matches. */ +static inline const conv_layer_config_t* get_layer_config_by_params( + int ic, int ih, int iw, + int oc, int kh, int kw, + int oh, int ow, + int sy, int sx, + int pad, int dil) +{ + for (int i = 0; i < NUM_CONV_LAYERS; i++) { + const conv_layer_config_t* cfg = &CONV_LAYER_CONFIGS[i]; + if (cfg->src_dim3_size == ic && + cfg->src_dim2_size == ih && + cfg->src_dim1_size == iw && + cfg->dst_dim3_size == oc && + cfg->coeff_dim2_size == kh && + cfg->coeff_dim1_size == kw && + cfg->dst_dim2_size == oh && + cfg->dst_dim1_size == ow && + cfg->stride_y == sy && + cfg->stride_x == sx && + cfg->padding == pad && + cfg->dilation == dil) + return cfg; + } + return NULL; +} + /* Linear search of CONV_LAYER_CONFIGS by key string. Returns the first entry whose .config_key is character-for-character identical to config_key; returns NULL when config_key is NULL or nothing matches. Entries with a NULL .config_key are skipped. The comparison is an inline loop; no library string-compare call is used. */ +static inline const conv_layer_config_t* get_layer_config_by_key(const char* config_key) { + if (config_key == NULL) return NULL; + for (int i = 0; i < NUM_CONV_LAYERS; i++) { + const
conv_layer_config_t* cfg = &CONV_LAYER_CONFIGS[i]; + if (cfg->config_key != NULL) { + const char* a = config_key; + const char* b = cfg->config_key; + while (*a && *b && *a == *b) { a++; b++; } /* advance while both strings have characters left and they agree */ + if (*a == '\0' && *b == '\0') return cfg; /* match only if both strings ended at the same position */ + } + } + return NULL; +} + +/* ====================================================================== */ +/* MaxPool configurations */ +/* ====================================================================== */ + /* Per-layer maxpool record: identification strings, source/destination geometry and pitches, kernel/stride/pad, tile geometry, channel and height tiling counts, buffer sizes, and 0/1 DRAM selectors for the ping/pong buffers. NOTE(review): the units of the *_buffer_size fields and the exact meaning of the *_dram values are not visible in this header -- presumably bytes and a DRAM bank index; confirm against the generator script. */ +typedef struct { + int layer_id; + const char* layer_name; + const char* config_key; + + int src_width; int src_height; int channels; + int dst_width; int dst_height; + + int src_row_pitch; int src_plane_pitch; + int dst_row_pitch; int dst_plane_pitch; + + int kernel_h; int kernel_w; + int stride_h; int stride_w; + int pad_h; int pad_w; + + int in_tile_w; int in_tile_rows; int in_tile_plane; + int in_data_offset; + int out_tile_w; int out_tile_rows; int out_tile_plane; + + int c_tile_size; int c_tiles; int c_tile_size_last; + int height_tiles; int output_rows; int input_rows; + + int input_buffer_size; int output_buffer_size; + + int input_ping_dram; int input_pong_dram; + int output_ping_dram; int output_pong_dram; +} maxpool_layer_config_t; + /* Entry count of MAXPOOL_LAYER_CONFIGS; the accessor functions use it as their bound, so it must stay equal to the number of initializers in the table. */ +#define NUM_MAXPOOL_LAYERS 1 + /* Maxpool layer table, read by get_maxpool_config() (by index) and get_maxpool_config_by_params() (by geometry). */ +static const maxpool_layer_config_t MAXPOOL_LAYER_CONFIGS[] = { + { + .layer_id = 0, + .layer_name = "maxpool_3x3s2_c64_32x32", + .config_key = "64_32_32_3_3_2_2_1_1", + .src_width = 32, + .src_height = 32, + .channels = 64, + .dst_width = 16, + .dst_height = 16, + .src_row_pitch = 32, + .src_plane_pitch = 1024, + .dst_row_pitch = 16, + .dst_plane_pitch = 256, + .kernel_h = 3, + .kernel_w = 3, + .stride_h = 2, + .stride_w = 2, + .pad_h = 1, + .pad_w = 1, + .in_tile_w = 34, + .in_tile_rows = 5, + .in_tile_plane = 170, + .in_data_offset = 35, + .out_tile_w = 16, + .out_tile_rows = 1, + .out_tile_plane = 16, + .c_tile_size = 64, + .c_tiles = 1, + .c_tile_size_last = 64, + .height_tiles = 16, + .output_rows = 1, + .input_rows = 3, +
.input_buffer_size = 43520, + .output_buffer_size = 4096, + .input_ping_dram = 0, + .input_pong_dram = 1, + .output_ping_dram = 1, + .output_pong_dram = 0, + }, +}; + /* Returns the number of maxpool entries, i.e. the NUM_MAXPOOL_LAYERS macro. */ +static inline int get_num_maxpool_layers(void) { return NUM_MAXPOOL_LAYERS; } + /* Returns a pointer to MAXPOOL_LAYER_CONFIGS[layer_id], or NULL when layer_id is outside [0, NUM_MAXPOOL_LAYERS). The lookup is positional: it indexes the array and does not compare against each entry's .layer_id field. */ +static inline const maxpool_layer_config_t* get_maxpool_config(int layer_id) { + if (layer_id < 0 || layer_id >= NUM_MAXPOOL_LAYERS) return NULL; + return &MAXPOOL_LAYER_CONFIGS[layer_id]; +} + /* Linear search of MAXPOOL_LAYER_CONFIGS. Returns the first entry whose channels, source height/width, kernel height/width, stride height/width and pad height/width all equal the arguments; returns NULL when no entry matches. Destination size is not part of the match. */ +static inline const maxpool_layer_config_t* get_maxpool_config_by_params( + int channels, int src_height, int src_width, + int kernel_h, int kernel_w, + int stride_h, int stride_w, + int pad_h, int pad_w) +{ + for (int i = 0; i < NUM_MAXPOOL_LAYERS; i++) { + const maxpool_layer_config_t* c = &MAXPOOL_LAYER_CONFIGS[i]; + if (c->channels == channels && + c->src_height == src_height && + c->src_width == src_width && + c->kernel_h == kernel_h && + c->kernel_w == kernel_w && + c->stride_h == stride_h && + c->stride_w == stride_w && + c->pad_h == pad_h && + c->pad_w == pad_w) + return c; + } + return NULL; +} + +#endif /* LAYER_CONFIGS_H */ diff --git a/backends/cadence/vision/config_generator/layer_configs_8k.h b/backends/cadence/vision/config_generator/layer_configs_8k.h new file mode 100644 index 00000000000..de165698252 --- /dev/null +++ b/backends/cadence/vision/config_generator/layer_configs_8k.h @@ -0,0 +1,2403 @@ +/* + * layer_configs.h + * + * Auto-generated conv2d + maxpool layer configurations + * Generated from PTE extraction by generate_combined_configs.py + * + * DO NOT EDIT MANUALLY + */ + /* NOTE(review): this guard name is also used by the preceding layer_configs variant in this patch, so at most one variant can take effect per translation unit -- presumably intentional (variants are alternatives); confirm. */ +#ifndef LAYER_CONFIGS_H +#define LAYER_CONFIGS_H + /* NOTE(review): the two include directives below carry no header name in this copy of the patch; the angle-bracket names look stripped in transit. The second is presumably stddef.h given its "for NULL" remark; the first cannot be determined from here -- verify against the generator output. */ +#include +#include /* for NULL */ + +#define IDMA_BUFFER_SIZE_DRAM0 (8192) /* 8 KB */ +#define IDMA_BUFFER_SIZE_DRAM1 (8192) /* 8 KB */ + +/* ====================================================================== */ +/* Conv2d configurations */ +/* ====================================================================== */ + +typedef struct { + int layer_id; + const char* layer_name; +
const char* kernel_name; + const char* config_key; + + int src_dim1_size; int src_dim2_size; int src_dim3_size; + int src_dim1_pitch; int src_dim2_pitch; + + int dst_dim1_size; int dst_dim2_size; int dst_dim3_size; + int dst_dim1_pitch; int dst_dim2_pitch; + + int in_dim1_size; int in_dim1_pitch; + int in_dim2_size; int in_dim2_pitch; + int in_dim1_edge1; int in_dim1_edge2; + int in_dim2_edge1; int in_dim2_edge2; + int in_dim3_edge1; int in_dim3_edge2; + int in_data_offset; int in_rows_firstdma; + + int out_dim1_size; int out_dim1_pitch; + int out_dim2_size; int out_dim2_pitch; + int out_dim3_size; + + int coeff_dim1_size; int coeff_dim2_size; + int coeff_dim3_size; int coeff_dim4_size; + int coeff_dim1_pitch; int coeff_dim2_pitch; int coeff_dim3_pitch; + + int bias_dim1_size; int bias_dim2_size; + int outscale_dim1_size; int outscale_dim2_size; + + int input_buffer_size; int coeff_buffer_size; int output_buffer_size; + int bias_buffer_size; int outscale_buffer_size; + + int input_ping_dram; int input_pong_dram; int coeff_dram; + int output_ping_dram; int output_pong_dram; + int bias_dram; int outscale_dram; + + int n_tile_size; int n_tiles; int n_tile_size_last; + int height_tiles; int output_rows; int input_rows; + + int kernel_w; int kernel_h; + int stride_x; int stride_y; + int padding; int dilation; + int accum_shift; int relu_max; int relu_min; + int output_shift; int output_scale; int flags; + int input_zero_point; +} conv_layer_config_t; + +#define NUM_CONV_LAYERS 29 + +static const conv_layer_config_t CONV_LAYER_CONFIGS[] = { + { + .layer_id = 0, + .layer_name = "conv_7x7_s2_ic3_oc64", + .kernel_name = "7x7j2d1_dma", + .config_key = "3_64_64_64_7_7_32_32_2_2_3_1", + .src_dim1_size = 64, + .src_dim2_size = 64, + .src_dim3_size = 3, + .src_dim1_pitch = 64, + .src_dim2_pitch = 4096, + .dst_dim1_size = 32, + .dst_dim2_size = 32, + .dst_dim3_size = 64, + .dst_dim1_pitch = 32, + .dst_dim2_pitch = 1024, + .in_dim1_size = 64, + .in_dim1_pitch = 70, + .in_dim2_size 
= 9, + .in_dim2_pitch = 630, + .in_dim1_edge1 = 3, + .in_dim1_edge2 = 3, + .in_dim2_edge1 = 3, + .in_dim2_edge2 = 3, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 213, + .in_rows_firstdma = 6, + .out_dim1_size = 32, + .out_dim1_pitch = 32, + .out_dim2_size = 2, + .out_dim2_pitch = 64, + .out_dim3_size = 32, + .coeff_dim1_size = 7, + .coeff_dim2_size = 7, + .coeff_dim3_size = 3, + .coeff_dim4_size = 64, + .coeff_dim1_pitch = 7, + .coeff_dim2_pitch = 49, + .coeff_dim3_pitch = 147, + .bias_dim1_size = 64, + .bias_dim2_size = 1, + .outscale_dim1_size = 64, + .outscale_dim2_size = 1, + .input_buffer_size = 1890, + .coeff_buffer_size = 4704, + .output_buffer_size = 2048, + .bias_buffer_size = 256, + .outscale_buffer_size = 128, + .input_ping_dram = 0, + .input_pong_dram = 1, + .coeff_dram = 0, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 32, + .n_tiles = 2, + .n_tile_size_last = 32, + .height_tiles = 16, + .output_rows = 2, + .input_rows = 9, + .kernel_w = 7, + .kernel_h = 7, + .stride_x = 2, + .stride_y = 2, + .padding = 3, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 1, + .layer_name = "conv_3x3_s1_ic64_oc64", + .kernel_name = "3x3j1d1_dma", + .config_key = "64_16_16_64_3_3_16_16_1_1_1_1", + .src_dim1_size = 16, + .src_dim2_size = 16, + .src_dim3_size = 64, + .src_dim1_pitch = 16, + .src_dim2_pitch = 256, + .dst_dim1_size = 16, + .dst_dim2_size = 16, + .dst_dim3_size = 64, + .dst_dim1_pitch = 16, + .dst_dim2_pitch = 256, + .in_dim1_size = 16, + .in_dim1_pitch = 18, + .in_dim2_size = 4, + .in_dim2_pitch = 72, + .in_dim1_edge1 = 1, + .in_dim1_edge2 = 1, + .in_dim2_edge1 = 1, + .in_dim2_edge2 = 1, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 19, + .in_rows_firstdma = 3, + .out_dim1_size = 16, + .out_dim1_pitch = 16, + .out_dim2_size = 2, + 
.out_dim2_pitch = 32, + .out_dim3_size = 4, + .coeff_dim1_size = 3, + .coeff_dim2_size = 3, + .coeff_dim3_size = 64, + .coeff_dim4_size = 64, + .coeff_dim1_pitch = 3, + .coeff_dim2_pitch = 9, + .coeff_dim3_pitch = 576, + .bias_dim1_size = 64, + .bias_dim2_size = 1, + .outscale_dim1_size = 64, + .outscale_dim2_size = 1, + .input_buffer_size = 4608, + .coeff_buffer_size = 2304, + .output_buffer_size = 128, + .bias_buffer_size = 256, + .outscale_buffer_size = 128, + .input_ping_dram = 0, + .input_pong_dram = 1, + .coeff_dram = 0, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 4, + .n_tiles = 16, + .n_tile_size_last = 4, + .height_tiles = 8, + .output_rows = 2, + .input_rows = 4, + .kernel_w = 3, + .kernel_h = 3, + .stride_x = 1, + .stride_y = 1, + .padding = 1, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 2, + .layer_name = "conv_1x1_s2_ic64_oc128", + .kernel_name = "1x1j2d1_dma", + .config_key = "64_16_16_128_1_1_8_8_2_2_0_1", + .src_dim1_size = 16, + .src_dim2_size = 16, + .src_dim3_size = 64, + .src_dim1_pitch = 16, + .src_dim2_pitch = 256, + .dst_dim1_size = 8, + .dst_dim2_size = 8, + .dst_dim3_size = 128, + .dst_dim1_pitch = 8, + .dst_dim2_pitch = 64, + .in_dim1_size = 16, + .in_dim1_pitch = 16, + .in_dim2_size = 3, + .in_dim2_pitch = 48, + .in_dim1_edge1 = 0, + .in_dim1_edge2 = 0, + .in_dim2_edge1 = 0, + .in_dim2_edge2 = 0, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 3, + .out_dim1_size = 8, + .out_dim1_pitch = 8, + .out_dim2_size = 2, + .out_dim2_pitch = 16, + .out_dim3_size = 64, + .coeff_dim1_size = 1, + .coeff_dim2_size = 1, + .coeff_dim3_size = 64, + .coeff_dim4_size = 128, + .coeff_dim1_pitch = 1, + .coeff_dim2_pitch = 1, + .coeff_dim3_pitch = 64, + .bias_dim1_size = 128, + .bias_dim2_size = 1, + .outscale_dim1_size = 
128, + .outscale_dim2_size = 1, + .input_buffer_size = 3072, + .coeff_buffer_size = 4096, + .output_buffer_size = 1024, + .bias_buffer_size = 512, + .outscale_buffer_size = 256, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 1, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 64, + .n_tiles = 2, + .n_tile_size_last = 64, + .height_tiles = 4, + .output_rows = 2, + .input_rows = 3, + .kernel_w = 1, + .kernel_h = 1, + .stride_x = 2, + .stride_y = 2, + .padding = 0, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 3, + .layer_name = "conv_3x3_s2_ic64_oc128", + .kernel_name = "3x3j2d1_dma", + .config_key = "64_16_16_128_3_3_8_8_2_2_1_1", + .src_dim1_size = 16, + .src_dim2_size = 16, + .src_dim3_size = 64, + .src_dim1_pitch = 16, + .src_dim2_pitch = 256, + .dst_dim1_size = 8, + .dst_dim2_size = 8, + .dst_dim3_size = 128, + .dst_dim1_pitch = 8, + .dst_dim2_pitch = 64, + .in_dim1_size = 16, + .in_dim1_pitch = 18, + .in_dim2_size = 5, + .in_dim2_pitch = 90, + .in_dim1_edge1 = 1, + .in_dim1_edge2 = 1, + .in_dim2_edge1 = 1, + .in_dim2_edge2 = 1, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 19, + .in_rows_firstdma = 4, + .out_dim1_size = 8, + .out_dim1_pitch = 8, + .out_dim2_size = 2, + .out_dim2_pitch = 16, + .out_dim3_size = 4, + .coeff_dim1_size = 3, + .coeff_dim2_size = 3, + .coeff_dim3_size = 64, + .coeff_dim4_size = 128, + .coeff_dim1_pitch = 3, + .coeff_dim2_pitch = 9, + .coeff_dim3_pitch = 576, + .bias_dim1_size = 128, + .bias_dim2_size = 1, + .outscale_dim1_size = 128, + .outscale_dim2_size = 1, + .input_buffer_size = 5760, + .coeff_buffer_size = 2304, + .output_buffer_size = 64, + .bias_buffer_size = 512, + .outscale_buffer_size = 256, + .input_ping_dram = 0, + .input_pong_dram = 1, + .coeff_dram = 0, + .output_ping_dram = 1, + .output_pong_dram = 
1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 4, + .n_tiles = 32, + .n_tile_size_last = 4, + .height_tiles = 4, + .output_rows = 2, + .input_rows = 5, + .kernel_w = 3, + .kernel_h = 3, + .stride_x = 2, + .stride_y = 2, + .padding = 1, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 4, + .layer_name = "conv_3x3_s1_ic128_oc128", + .kernel_name = "3x3j1d1_dma", + .config_key = "128_8_8_128_3_3_8_8_1_1_1_1", + .src_dim1_size = 8, + .src_dim2_size = 8, + .src_dim3_size = 128, + .src_dim1_pitch = 8, + .src_dim2_pitch = 64, + .dst_dim1_size = 8, + .dst_dim2_size = 8, + .dst_dim3_size = 128, + .dst_dim1_pitch = 8, + .dst_dim2_pitch = 64, + .in_dim1_size = 8, + .in_dim1_pitch = 10, + .in_dim2_size = 4, + .in_dim2_pitch = 40, + .in_dim1_edge1 = 1, + .in_dim1_edge2 = 1, + .in_dim2_edge1 = 1, + .in_dim2_edge2 = 1, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 11, + .in_rows_firstdma = 3, + .out_dim1_size = 8, + .out_dim1_pitch = 8, + .out_dim2_size = 2, + .out_dim2_pitch = 16, + .out_dim3_size = 2, + .coeff_dim1_size = 3, + .coeff_dim2_size = 3, + .coeff_dim3_size = 128, + .coeff_dim4_size = 128, + .coeff_dim1_pitch = 3, + .coeff_dim2_pitch = 9, + .coeff_dim3_pitch = 1152, + .bias_dim1_size = 128, + .bias_dim2_size = 1, + .outscale_dim1_size = 128, + .outscale_dim2_size = 1, + .input_buffer_size = 5120, + .coeff_buffer_size = 2304, + .output_buffer_size = 32, + .bias_buffer_size = 512, + .outscale_buffer_size = 256, + .input_ping_dram = 0, + .input_pong_dram = 1, + .coeff_dram = 0, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 2, + .n_tiles = 64, + .n_tile_size_last = 2, + .height_tiles = 4, + .output_rows = 2, + .input_rows = 4, + .kernel_w = 3, + .kernel_h = 3, + .stride_x = 1, + .stride_y = 1, + .padding = 1, + .dilation = 1, + .accum_shift = 8, + 
.relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 5, + .layer_name = "conv_1x1_s2_ic128_oc256", + .kernel_name = "1x1j2d1_dma", + .config_key = "128_8_8_256_1_1_4_4_2_2_0_1", + .src_dim1_size = 8, + .src_dim2_size = 8, + .src_dim3_size = 128, + .src_dim1_pitch = 8, + .src_dim2_pitch = 64, + .dst_dim1_size = 4, + .dst_dim2_size = 4, + .dst_dim3_size = 256, + .dst_dim1_pitch = 4, + .dst_dim2_pitch = 16, + .in_dim1_size = 8, + .in_dim1_pitch = 8, + .in_dim2_size = 3, + .in_dim2_pitch = 24, + .in_dim1_edge1 = 0, + .in_dim1_edge2 = 0, + .in_dim2_edge1 = 0, + .in_dim2_edge2 = 0, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 3, + .out_dim1_size = 4, + .out_dim1_pitch = 4, + .out_dim2_size = 2, + .out_dim2_pitch = 8, + .out_dim3_size = 32, + .coeff_dim1_size = 1, + .coeff_dim2_size = 1, + .coeff_dim3_size = 128, + .coeff_dim4_size = 256, + .coeff_dim1_pitch = 1, + .coeff_dim2_pitch = 1, + .coeff_dim3_pitch = 128, + .bias_dim1_size = 256, + .bias_dim2_size = 1, + .outscale_dim1_size = 256, + .outscale_dim2_size = 1, + .input_buffer_size = 3072, + .coeff_buffer_size = 4096, + .output_buffer_size = 256, + .bias_buffer_size = 1024, + .outscale_buffer_size = 512, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 1, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 32, + .n_tiles = 8, + .n_tile_size_last = 32, + .height_tiles = 2, + .output_rows = 2, + .input_rows = 3, + .kernel_w = 1, + .kernel_h = 1, + .stride_x = 2, + .stride_y = 2, + .padding = 0, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 6, + .layer_name = "conv_3x3_s2_ic128_oc256", + .kernel_name = "3x3j2d1_dma", + .config_key = "128_8_8_256_3_3_4_4_2_2_1_1", + .src_dim1_size = 8, + 
.src_dim2_size = 8, + .src_dim3_size = 128, + .src_dim1_pitch = 8, + .src_dim2_pitch = 64, + .dst_dim1_size = 4, + .dst_dim2_size = 4, + .dst_dim3_size = 256, + .dst_dim1_pitch = 4, + .dst_dim2_pitch = 16, + .in_dim1_size = 8, + .in_dim1_pitch = 10, + .in_dim2_size = 5, + .in_dim2_pitch = 50, + .in_dim1_edge1 = 1, + .in_dim1_edge2 = 1, + .in_dim2_edge1 = 1, + .in_dim2_edge2 = 1, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 11, + .in_rows_firstdma = 4, + .out_dim1_size = 4, + .out_dim1_pitch = 4, + .out_dim2_size = 2, + .out_dim2_pitch = 8, + .out_dim3_size = 1, + .coeff_dim1_size = 3, + .coeff_dim2_size = 3, + .coeff_dim3_size = 128, + .coeff_dim4_size = 256, + .coeff_dim1_pitch = 3, + .coeff_dim2_pitch = 9, + .coeff_dim3_pitch = 1152, + .bias_dim1_size = 256, + .bias_dim2_size = 1, + .outscale_dim1_size = 256, + .outscale_dim2_size = 1, + .input_buffer_size = 6400, + .coeff_buffer_size = 1152, + .output_buffer_size = 8, + .bias_buffer_size = 1024, + .outscale_buffer_size = 512, + .input_ping_dram = 0, + .input_pong_dram = 1, + .coeff_dram = 0, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 1, + .n_tiles = 256, + .n_tile_size_last = 1, + .height_tiles = 2, + .output_rows = 2, + .input_rows = 5, + .kernel_w = 3, + .kernel_h = 3, + .stride_x = 2, + .stride_y = 2, + .padding = 1, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 7, + .layer_name = "conv_3x3_s1_ic256_oc256", + .kernel_name = "3x3j1d1_no_dma", + .config_key = "256_4_4_256_3_3_4_4_1_1_1_1", + .src_dim1_size = 4, + .src_dim2_size = 4, + .src_dim3_size = 256, + .src_dim1_pitch = 4, + .src_dim2_pitch = 16, + .dst_dim1_size = 4, + .dst_dim2_size = 4, + .dst_dim3_size = 256, + .dst_dim1_pitch = 4, + .dst_dim2_pitch = 16, + .in_dim1_size = 4, + .in_dim1_pitch = 6, + .in_dim2_size = 4, + .in_dim2_pitch = 
36, + .in_dim1_edge1 = 1, + .in_dim1_edge2 = 1, + .in_dim2_edge1 = 1, + .in_dim2_edge2 = 1, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 4, + .out_dim1_size = 4, + .out_dim1_pitch = 4, + .out_dim2_size = 4, + .out_dim2_pitch = 16, + .out_dim3_size = 256, + .coeff_dim1_size = 3, + .coeff_dim2_size = 3, + .coeff_dim3_size = 256, + .coeff_dim4_size = 256, + .coeff_dim1_pitch = 3, + .coeff_dim2_pitch = 9, + .coeff_dim3_pitch = 2304, + .bias_dim1_size = 256, + .bias_dim2_size = 1, + .outscale_dim1_size = 256, + .outscale_dim2_size = 1, + .input_buffer_size = 9216, + .coeff_buffer_size = 589824, + .output_buffer_size = 4096, + .bias_buffer_size = 1024, + .outscale_buffer_size = 512, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 0, + .output_pong_dram = 0, + .bias_dram = 0, + .outscale_dram = 0, + .n_tile_size = 256, + .n_tiles = 1, + .n_tile_size_last = 256, + .height_tiles = 1, + .output_rows = 4, + .input_rows = 4, + .kernel_w = 3, + .kernel_h = 3, + .stride_x = 1, + .stride_y = 1, + .padding = 1, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 8, + .layer_name = "conv_1x1_s2_ic256_oc512", + .kernel_name = "1x1j2d1_dma", + .config_key = "256_4_4_512_1_1_2_2_2_2_0_1", + .src_dim1_size = 4, + .src_dim2_size = 4, + .src_dim3_size = 256, + .src_dim1_pitch = 4, + .src_dim2_pitch = 16, + .dst_dim1_size = 2, + .dst_dim2_size = 2, + .dst_dim3_size = 512, + .dst_dim1_pitch = 2, + .dst_dim2_pitch = 4, + .in_dim1_size = 4, + .in_dim1_pitch = 4, + .in_dim2_size = 3, + .in_dim2_pitch = 12, + .in_dim1_edge1 = 0, + .in_dim1_edge2 = 0, + .in_dim2_edge1 = 0, + .in_dim2_edge2 = 0, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 3, + .out_dim1_size = 2, + .out_dim1_pitch = 2, + .out_dim2_size = 2, + .out_dim2_pitch = 4, + 
.out_dim3_size = 16, + .coeff_dim1_size = 1, + .coeff_dim2_size = 1, + .coeff_dim3_size = 256, + .coeff_dim4_size = 512, + .coeff_dim1_pitch = 1, + .coeff_dim2_pitch = 1, + .coeff_dim3_pitch = 256, + .bias_dim1_size = 512, + .bias_dim2_size = 1, + .outscale_dim1_size = 512, + .outscale_dim2_size = 1, + .input_buffer_size = 3072, + .coeff_buffer_size = 4096, + .output_buffer_size = 64, + .bias_buffer_size = 2048, + .outscale_buffer_size = 1024, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 1, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 16, + .n_tiles = 32, + .n_tile_size_last = 16, + .height_tiles = 1, + .output_rows = 2, + .input_rows = 3, + .kernel_w = 1, + .kernel_h = 1, + .stride_x = 2, + .stride_y = 2, + .padding = 0, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 9, + .layer_name = "conv_3x3_s2_ic256_oc512", + .kernel_name = "3x3j2d1_no_dma", + .config_key = "256_4_4_512_3_3_2_2_2_2_1_1", + .src_dim1_size = 4, + .src_dim2_size = 4, + .src_dim3_size = 256, + .src_dim1_pitch = 4, + .src_dim2_pitch = 16, + .dst_dim1_size = 2, + .dst_dim2_size = 2, + .dst_dim3_size = 512, + .dst_dim1_pitch = 2, + .dst_dim2_pitch = 4, + .in_dim1_size = 4, + .in_dim1_pitch = 6, + .in_dim2_size = 4, + .in_dim2_pitch = 36, + .in_dim1_edge1 = 1, + .in_dim1_edge2 = 1, + .in_dim2_edge1 = 1, + .in_dim2_edge2 = 1, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 4, + .out_dim1_size = 2, + .out_dim1_pitch = 2, + .out_dim2_size = 2, + .out_dim2_pitch = 4, + .out_dim3_size = 512, + .coeff_dim1_size = 3, + .coeff_dim2_size = 3, + .coeff_dim3_size = 256, + .coeff_dim4_size = 512, + .coeff_dim1_pitch = 3, + .coeff_dim2_pitch = 9, + .coeff_dim3_pitch = 2304, + .bias_dim1_size = 512, + .bias_dim2_size = 1, + .outscale_dim1_size = 512, + 
.outscale_dim2_size = 1, + .input_buffer_size = 9216, + .coeff_buffer_size = 1179648, + .output_buffer_size = 2048, + .bias_buffer_size = 2048, + .outscale_buffer_size = 1024, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 0, + .output_pong_dram = 0, + .bias_dram = 0, + .outscale_dram = 0, + .n_tile_size = 512, + .n_tiles = 1, + .n_tile_size_last = 512, + .height_tiles = 1, + .output_rows = 2, + .input_rows = 4, + .kernel_w = 3, + .kernel_h = 3, + .stride_x = 2, + .stride_y = 2, + .padding = 1, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 10, + .layer_name = "conv_3x3_s1_ic512_oc512", + .kernel_name = "3x3j1d1_no_dma", + .config_key = "512_2_2_512_3_3_2_2_1_1_1_1", + .src_dim1_size = 2, + .src_dim2_size = 2, + .src_dim3_size = 512, + .src_dim1_pitch = 2, + .src_dim2_pitch = 4, + .dst_dim1_size = 2, + .dst_dim2_size = 2, + .dst_dim3_size = 512, + .dst_dim1_pitch = 2, + .dst_dim2_pitch = 4, + .in_dim1_size = 2, + .in_dim1_pitch = 4, + .in_dim2_size = 2, + .in_dim2_pitch = 16, + .in_dim1_edge1 = 1, + .in_dim1_edge2 = 1, + .in_dim2_edge1 = 1, + .in_dim2_edge2 = 1, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 2, + .out_dim1_size = 2, + .out_dim1_pitch = 2, + .out_dim2_size = 2, + .out_dim2_pitch = 4, + .out_dim3_size = 512, + .coeff_dim1_size = 3, + .coeff_dim2_size = 3, + .coeff_dim3_size = 512, + .coeff_dim4_size = 512, + .coeff_dim1_pitch = 3, + .coeff_dim2_pitch = 9, + .coeff_dim3_pitch = 4608, + .bias_dim1_size = 512, + .bias_dim2_size = 1, + .outscale_dim1_size = 512, + .outscale_dim2_size = 1, + .input_buffer_size = 8192, + .coeff_buffer_size = 2359296, + .output_buffer_size = 2048, + .bias_buffer_size = 2048, + .outscale_buffer_size = 1024, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 0, + 
.output_pong_dram = 0, + .bias_dram = 0, + .outscale_dram = 0, + .n_tile_size = 512, + .n_tiles = 1, + .n_tile_size_last = 512, + .height_tiles = 1, + .output_rows = 2, + .input_rows = 2, + .kernel_w = 3, + .kernel_h = 3, + .stride_x = 1, + .stride_y = 1, + .padding = 1, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 11, + .layer_name = "conv_1x1_s1_ic64_oc256", + .kernel_name = "1x1j1d1_dma", + .config_key = "64_16_16_256_1_1_16_16_1_1_0_1", + .src_dim1_size = 16, + .src_dim2_size = 16, + .src_dim3_size = 64, + .src_dim1_pitch = 16, + .src_dim2_pitch = 256, + .dst_dim1_size = 16, + .dst_dim2_size = 16, + .dst_dim3_size = 256, + .dst_dim1_pitch = 16, + .dst_dim2_pitch = 256, + .in_dim1_size = 16, + .in_dim1_pitch = 16, + .in_dim2_size = 2, + .in_dim2_pitch = 32, + .in_dim1_edge1 = 0, + .in_dim1_edge2 = 0, + .in_dim2_edge1 = 0, + .in_dim2_edge2 = 0, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 2, + .out_dim1_size = 16, + .out_dim1_pitch = 16, + .out_dim2_size = 2, + .out_dim2_pitch = 32, + .out_dim3_size = 64, + .coeff_dim1_size = 1, + .coeff_dim2_size = 1, + .coeff_dim3_size = 64, + .coeff_dim4_size = 256, + .coeff_dim1_pitch = 1, + .coeff_dim2_pitch = 1, + .coeff_dim3_pitch = 64, + .bias_dim1_size = 256, + .bias_dim2_size = 1, + .outscale_dim1_size = 256, + .outscale_dim2_size = 1, + .input_buffer_size = 2048, + .coeff_buffer_size = 4096, + .output_buffer_size = 2048, + .bias_buffer_size = 1024, + .outscale_buffer_size = 512, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 64, + .n_tiles = 4, + .n_tile_size_last = 64, + .height_tiles = 8, + .output_rows = 2, + .input_rows = 2, + .kernel_w = 1, + .kernel_h = 1, + .stride_x = 1, + .stride_y = 1, + .padding = 0, + 
.dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 12, + .layer_name = "conv_1x1_s1_ic64_oc64", + .kernel_name = "1x1j1d1_dma", + .config_key = "64_16_16_64_1_1_16_16_1_1_0_1", + .src_dim1_size = 16, + .src_dim2_size = 16, + .src_dim3_size = 64, + .src_dim1_pitch = 16, + .src_dim2_pitch = 256, + .dst_dim1_size = 16, + .dst_dim2_size = 16, + .dst_dim3_size = 64, + .dst_dim1_pitch = 16, + .dst_dim2_pitch = 256, + .in_dim1_size = 16, + .in_dim1_pitch = 16, + .in_dim2_size = 2, + .in_dim2_pitch = 32, + .in_dim1_edge1 = 0, + .in_dim1_edge2 = 0, + .in_dim2_edge1 = 0, + .in_dim2_edge2 = 0, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 2, + .out_dim1_size = 16, + .out_dim1_pitch = 16, + .out_dim2_size = 2, + .out_dim2_pitch = 32, + .out_dim3_size = 64, + .coeff_dim1_size = 1, + .coeff_dim2_size = 1, + .coeff_dim3_size = 64, + .coeff_dim4_size = 64, + .coeff_dim1_pitch = 1, + .coeff_dim2_pitch = 1, + .coeff_dim3_pitch = 64, + .bias_dim1_size = 64, + .bias_dim2_size = 1, + .outscale_dim1_size = 64, + .outscale_dim2_size = 1, + .input_buffer_size = 2048, + .coeff_buffer_size = 4096, + .output_buffer_size = 2048, + .bias_buffer_size = 256, + .outscale_buffer_size = 128, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 64, + .n_tiles = 1, + .n_tile_size_last = 64, + .height_tiles = 8, + .output_rows = 2, + .input_rows = 2, + .kernel_w = 1, + .kernel_h = 1, + .stride_x = 1, + .stride_y = 1, + .padding = 0, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 13, + .layer_name = "conv_1x1_s1_ic256_oc64", + .kernel_name = "1x1j1d1_no_dma", + .config_key = 
"256_16_16_64_1_1_16_16_1_1_0_1", + .src_dim1_size = 16, + .src_dim2_size = 16, + .src_dim3_size = 256, + .src_dim1_pitch = 16, + .src_dim2_pitch = 256, + .dst_dim1_size = 16, + .dst_dim2_size = 16, + .dst_dim3_size = 64, + .dst_dim1_pitch = 16, + .dst_dim2_pitch = 256, + .in_dim1_size = 16, + .in_dim1_pitch = 16, + .in_dim2_size = 16, + .in_dim2_pitch = 256, + .in_dim1_edge1 = 0, + .in_dim1_edge2 = 0, + .in_dim2_edge1 = 0, + .in_dim2_edge2 = 0, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 16, + .out_dim1_size = 16, + .out_dim1_pitch = 16, + .out_dim2_size = 16, + .out_dim2_pitch = 256, + .out_dim3_size = 64, + .coeff_dim1_size = 1, + .coeff_dim2_size = 1, + .coeff_dim3_size = 256, + .coeff_dim4_size = 64, + .coeff_dim1_pitch = 1, + .coeff_dim2_pitch = 1, + .coeff_dim3_pitch = 256, + .bias_dim1_size = 64, + .bias_dim2_size = 1, + .outscale_dim1_size = 64, + .outscale_dim2_size = 1, + .input_buffer_size = 65536, + .coeff_buffer_size = 16384, + .output_buffer_size = 16384, + .bias_buffer_size = 256, + .outscale_buffer_size = 128, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 0, + .output_pong_dram = 0, + .bias_dram = 0, + .outscale_dram = 0, + .n_tile_size = 64, + .n_tiles = 1, + .n_tile_size_last = 64, + .height_tiles = 1, + .output_rows = 16, + .input_rows = 16, + .kernel_w = 1, + .kernel_h = 1, + .stride_x = 1, + .stride_y = 1, + .padding = 0, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 14, + .layer_name = "conv_1x1_s2_ic256_oc512", + .kernel_name = "1x1j2d1_no_dma", + .config_key = "256_16_16_512_1_1_8_8_2_2_0_1", + .src_dim1_size = 16, + .src_dim2_size = 16, + .src_dim3_size = 256, + .src_dim1_pitch = 16, + .src_dim2_pitch = 256, + .dst_dim1_size = 8, + .dst_dim2_size = 8, + .dst_dim3_size = 512, + .dst_dim1_pitch = 8, + .dst_dim2_pitch = 64, + 
.in_dim1_size = 16, + .in_dim1_pitch = 16, + .in_dim2_size = 16, + .in_dim2_pitch = 256, + .in_dim1_edge1 = 0, + .in_dim1_edge2 = 0, + .in_dim2_edge1 = 0, + .in_dim2_edge2 = 0, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 16, + .out_dim1_size = 8, + .out_dim1_pitch = 8, + .out_dim2_size = 8, + .out_dim2_pitch = 64, + .out_dim3_size = 512, + .coeff_dim1_size = 1, + .coeff_dim2_size = 1, + .coeff_dim3_size = 256, + .coeff_dim4_size = 512, + .coeff_dim1_pitch = 1, + .coeff_dim2_pitch = 1, + .coeff_dim3_pitch = 256, + .bias_dim1_size = 512, + .bias_dim2_size = 1, + .outscale_dim1_size = 512, + .outscale_dim2_size = 1, + .input_buffer_size = 65536, + .coeff_buffer_size = 131072, + .output_buffer_size = 32768, + .bias_buffer_size = 2048, + .outscale_buffer_size = 1024, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 0, + .output_pong_dram = 0, + .bias_dram = 0, + .outscale_dram = 0, + .n_tile_size = 512, + .n_tiles = 1, + .n_tile_size_last = 512, + .height_tiles = 1, + .output_rows = 8, + .input_rows = 16, + .kernel_w = 1, + .kernel_h = 1, + .stride_x = 2, + .stride_y = 2, + .padding = 0, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 15, + .layer_name = "conv_1x1_s1_ic256_oc128", + .kernel_name = "1x1j1d1_no_dma", + .config_key = "256_16_16_128_1_1_16_16_1_1_0_1", + .src_dim1_size = 16, + .src_dim2_size = 16, + .src_dim3_size = 256, + .src_dim1_pitch = 16, + .src_dim2_pitch = 256, + .dst_dim1_size = 16, + .dst_dim2_size = 16, + .dst_dim3_size = 128, + .dst_dim1_pitch = 16, + .dst_dim2_pitch = 256, + .in_dim1_size = 16, + .in_dim1_pitch = 16, + .in_dim2_size = 16, + .in_dim2_pitch = 256, + .in_dim1_edge1 = 0, + .in_dim1_edge2 = 0, + .in_dim2_edge1 = 0, + .in_dim2_edge2 = 0, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + 
.in_rows_firstdma = 16, + .out_dim1_size = 16, + .out_dim1_pitch = 16, + .out_dim2_size = 16, + .out_dim2_pitch = 256, + .out_dim3_size = 128, + .coeff_dim1_size = 1, + .coeff_dim2_size = 1, + .coeff_dim3_size = 256, + .coeff_dim4_size = 128, + .coeff_dim1_pitch = 1, + .coeff_dim2_pitch = 1, + .coeff_dim3_pitch = 256, + .bias_dim1_size = 128, + .bias_dim2_size = 1, + .outscale_dim1_size = 128, + .outscale_dim2_size = 1, + .input_buffer_size = 65536, + .coeff_buffer_size = 32768, + .output_buffer_size = 32768, + .bias_buffer_size = 512, + .outscale_buffer_size = 256, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 0, + .output_pong_dram = 0, + .bias_dram = 0, + .outscale_dram = 0, + .n_tile_size = 128, + .n_tiles = 1, + .n_tile_size_last = 128, + .height_tiles = 1, + .output_rows = 16, + .input_rows = 16, + .kernel_w = 1, + .kernel_h = 1, + .stride_x = 1, + .stride_y = 1, + .padding = 0, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 16, + .layer_name = "conv_3x3_s2_ic128_oc128", + .kernel_name = "3x3j2d1_no_dma", + .config_key = "128_16_16_128_3_3_8_8_2_2_1_1", + .src_dim1_size = 16, + .src_dim2_size = 16, + .src_dim3_size = 128, + .src_dim1_pitch = 16, + .src_dim2_pitch = 256, + .dst_dim1_size = 8, + .dst_dim2_size = 8, + .dst_dim3_size = 128, + .dst_dim1_pitch = 8, + .dst_dim2_pitch = 64, + .in_dim1_size = 16, + .in_dim1_pitch = 18, + .in_dim2_size = 16, + .in_dim2_pitch = 324, + .in_dim1_edge1 = 1, + .in_dim1_edge2 = 1, + .in_dim2_edge1 = 1, + .in_dim2_edge2 = 1, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 16, + .out_dim1_size = 8, + .out_dim1_pitch = 8, + .out_dim2_size = 8, + .out_dim2_pitch = 64, + .out_dim3_size = 128, + .coeff_dim1_size = 3, + .coeff_dim2_size = 3, + .coeff_dim3_size = 128, + .coeff_dim4_size = 128, + .coeff_dim1_pitch = 3, + 
.coeff_dim2_pitch = 9, + .coeff_dim3_pitch = 1152, + .bias_dim1_size = 128, + .bias_dim2_size = 1, + .outscale_dim1_size = 128, + .outscale_dim2_size = 1, + .input_buffer_size = 41472, + .coeff_buffer_size = 147456, + .output_buffer_size = 8192, + .bias_buffer_size = 512, + .outscale_buffer_size = 256, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 0, + .output_pong_dram = 0, + .bias_dram = 0, + .outscale_dram = 0, + .n_tile_size = 128, + .n_tiles = 1, + .n_tile_size_last = 128, + .height_tiles = 1, + .output_rows = 8, + .input_rows = 16, + .kernel_w = 3, + .kernel_h = 3, + .stride_x = 2, + .stride_y = 2, + .padding = 1, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 17, + .layer_name = "conv_1x1_s1_ic128_oc512", + .kernel_name = "1x1j1d1_dma", + .config_key = "128_8_8_512_1_1_8_8_1_1_0_1", + .src_dim1_size = 8, + .src_dim2_size = 8, + .src_dim3_size = 128, + .src_dim1_pitch = 8, + .src_dim2_pitch = 64, + .dst_dim1_size = 8, + .dst_dim2_size = 8, + .dst_dim3_size = 512, + .dst_dim1_pitch = 8, + .dst_dim2_pitch = 64, + .in_dim1_size = 8, + .in_dim1_pitch = 8, + .in_dim2_size = 2, + .in_dim2_pitch = 16, + .in_dim1_edge1 = 0, + .in_dim1_edge2 = 0, + .in_dim2_edge1 = 0, + .in_dim2_edge2 = 0, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 2, + .out_dim1_size = 8, + .out_dim1_pitch = 8, + .out_dim2_size = 2, + .out_dim2_pitch = 16, + .out_dim3_size = 32, + .coeff_dim1_size = 1, + .coeff_dim2_size = 1, + .coeff_dim3_size = 128, + .coeff_dim4_size = 512, + .coeff_dim1_pitch = 1, + .coeff_dim2_pitch = 1, + .coeff_dim3_pitch = 128, + .bias_dim1_size = 512, + .bias_dim2_size = 1, + .outscale_dim1_size = 512, + .outscale_dim2_size = 1, + .input_buffer_size = 2048, + .coeff_buffer_size = 4096, + .output_buffer_size = 512, + .bias_buffer_size = 2048, + 
.outscale_buffer_size = 1024, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 32, + .n_tiles = 16, + .n_tile_size_last = 32, + .height_tiles = 4, + .output_rows = 2, + .input_rows = 2, + .kernel_w = 1, + .kernel_h = 1, + .stride_x = 1, + .stride_y = 1, + .padding = 0, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 18, + .layer_name = "conv_1x1_s1_ic512_oc128", + .kernel_name = "1x1j1d1_no_dma", + .config_key = "512_8_8_128_1_1_8_8_1_1_0_1", + .src_dim1_size = 8, + .src_dim2_size = 8, + .src_dim3_size = 512, + .src_dim1_pitch = 8, + .src_dim2_pitch = 64, + .dst_dim1_size = 8, + .dst_dim2_size = 8, + .dst_dim3_size = 128, + .dst_dim1_pitch = 8, + .dst_dim2_pitch = 64, + .in_dim1_size = 8, + .in_dim1_pitch = 8, + .in_dim2_size = 8, + .in_dim2_pitch = 64, + .in_dim1_edge1 = 0, + .in_dim1_edge2 = 0, + .in_dim2_edge1 = 0, + .in_dim2_edge2 = 0, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 8, + .out_dim1_size = 8, + .out_dim1_pitch = 8, + .out_dim2_size = 8, + .out_dim2_pitch = 64, + .out_dim3_size = 128, + .coeff_dim1_size = 1, + .coeff_dim2_size = 1, + .coeff_dim3_size = 512, + .coeff_dim4_size = 128, + .coeff_dim1_pitch = 1, + .coeff_dim2_pitch = 1, + .coeff_dim3_pitch = 512, + .bias_dim1_size = 128, + .bias_dim2_size = 1, + .outscale_dim1_size = 128, + .outscale_dim2_size = 1, + .input_buffer_size = 32768, + .coeff_buffer_size = 65536, + .output_buffer_size = 8192, + .bias_buffer_size = 512, + .outscale_buffer_size = 256, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 0, + .output_pong_dram = 0, + .bias_dram = 0, + .outscale_dram = 0, + .n_tile_size = 128, + .n_tiles = 1, + .n_tile_size_last = 128, + .height_tiles = 1, + 
.output_rows = 8, + .input_rows = 8, + .kernel_w = 1, + .kernel_h = 1, + .stride_x = 1, + .stride_y = 1, + .padding = 0, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 19, + .layer_name = "conv_1x1_s2_ic512_oc1024", + .kernel_name = "1x1j2d1_no_dma", + .config_key = "512_8_8_1024_1_1_4_4_2_2_0_1", + .src_dim1_size = 8, + .src_dim2_size = 8, + .src_dim3_size = 512, + .src_dim1_pitch = 8, + .src_dim2_pitch = 64, + .dst_dim1_size = 4, + .dst_dim2_size = 4, + .dst_dim3_size = 1024, + .dst_dim1_pitch = 4, + .dst_dim2_pitch = 16, + .in_dim1_size = 8, + .in_dim1_pitch = 8, + .in_dim2_size = 8, + .in_dim2_pitch = 64, + .in_dim1_edge1 = 0, + .in_dim1_edge2 = 0, + .in_dim2_edge1 = 0, + .in_dim2_edge2 = 0, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 8, + .out_dim1_size = 4, + .out_dim1_pitch = 4, + .out_dim2_size = 4, + .out_dim2_pitch = 16, + .out_dim3_size = 1024, + .coeff_dim1_size = 1, + .coeff_dim2_size = 1, + .coeff_dim3_size = 512, + .coeff_dim4_size = 1024, + .coeff_dim1_pitch = 1, + .coeff_dim2_pitch = 1, + .coeff_dim3_pitch = 512, + .bias_dim1_size = 1024, + .bias_dim2_size = 1, + .outscale_dim1_size = 1024, + .outscale_dim2_size = 1, + .input_buffer_size = 32768, + .coeff_buffer_size = 524288, + .output_buffer_size = 16384, + .bias_buffer_size = 4096, + .outscale_buffer_size = 2048, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 0, + .output_pong_dram = 0, + .bias_dram = 0, + .outscale_dram = 0, + .n_tile_size = 1024, + .n_tiles = 1, + .n_tile_size_last = 1024, + .height_tiles = 1, + .output_rows = 4, + .input_rows = 8, + .kernel_w = 1, + .kernel_h = 1, + .stride_x = 2, + .stride_y = 2, + .padding = 0, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point 
= 0, + }, + { + .layer_id = 20, + .layer_name = "conv_1x1_s1_ic512_oc256", + .kernel_name = "1x1j1d1_no_dma", + .config_key = "512_8_8_256_1_1_8_8_1_1_0_1", + .src_dim1_size = 8, + .src_dim2_size = 8, + .src_dim3_size = 512, + .src_dim1_pitch = 8, + .src_dim2_pitch = 64, + .dst_dim1_size = 8, + .dst_dim2_size = 8, + .dst_dim3_size = 256, + .dst_dim1_pitch = 8, + .dst_dim2_pitch = 64, + .in_dim1_size = 8, + .in_dim1_pitch = 8, + .in_dim2_size = 8, + .in_dim2_pitch = 64, + .in_dim1_edge1 = 0, + .in_dim1_edge2 = 0, + .in_dim2_edge1 = 0, + .in_dim2_edge2 = 0, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 8, + .out_dim1_size = 8, + .out_dim1_pitch = 8, + .out_dim2_size = 8, + .out_dim2_pitch = 64, + .out_dim3_size = 256, + .coeff_dim1_size = 1, + .coeff_dim2_size = 1, + .coeff_dim3_size = 512, + .coeff_dim4_size = 256, + .coeff_dim1_pitch = 1, + .coeff_dim2_pitch = 1, + .coeff_dim3_pitch = 512, + .bias_dim1_size = 256, + .bias_dim2_size = 1, + .outscale_dim1_size = 256, + .outscale_dim2_size = 1, + .input_buffer_size = 32768, + .coeff_buffer_size = 131072, + .output_buffer_size = 16384, + .bias_buffer_size = 1024, + .outscale_buffer_size = 512, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 0, + .output_pong_dram = 0, + .bias_dram = 0, + .outscale_dram = 0, + .n_tile_size = 256, + .n_tiles = 1, + .n_tile_size_last = 256, + .height_tiles = 1, + .output_rows = 8, + .input_rows = 8, + .kernel_w = 1, + .kernel_h = 1, + .stride_x = 1, + .stride_y = 1, + .padding = 0, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 21, + .layer_name = "conv_3x3_s2_ic256_oc256", + .kernel_name = "3x3j2d1_no_dma", + .config_key = "256_8_8_256_3_3_4_4_2_2_1_1", + .src_dim1_size = 8, + .src_dim2_size = 8, + .src_dim3_size = 256, + .src_dim1_pitch = 8, + .src_dim2_pitch = 64, + 
.dst_dim1_size = 4, + .dst_dim2_size = 4, + .dst_dim3_size = 256, + .dst_dim1_pitch = 4, + .dst_dim2_pitch = 16, + .in_dim1_size = 8, + .in_dim1_pitch = 10, + .in_dim2_size = 8, + .in_dim2_pitch = 100, + .in_dim1_edge1 = 1, + .in_dim1_edge2 = 1, + .in_dim2_edge1 = 1, + .in_dim2_edge2 = 1, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 8, + .out_dim1_size = 4, + .out_dim1_pitch = 4, + .out_dim2_size = 4, + .out_dim2_pitch = 16, + .out_dim3_size = 256, + .coeff_dim1_size = 3, + .coeff_dim2_size = 3, + .coeff_dim3_size = 256, + .coeff_dim4_size = 256, + .coeff_dim1_pitch = 3, + .coeff_dim2_pitch = 9, + .coeff_dim3_pitch = 2304, + .bias_dim1_size = 256, + .bias_dim2_size = 1, + .outscale_dim1_size = 256, + .outscale_dim2_size = 1, + .input_buffer_size = 25600, + .coeff_buffer_size = 589824, + .output_buffer_size = 4096, + .bias_buffer_size = 1024, + .outscale_buffer_size = 512, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 0, + .output_pong_dram = 0, + .bias_dram = 0, + .outscale_dram = 0, + .n_tile_size = 256, + .n_tiles = 1, + .n_tile_size_last = 256, + .height_tiles = 1, + .output_rows = 4, + .input_rows = 8, + .kernel_w = 3, + .kernel_h = 3, + .stride_x = 2, + .stride_y = 2, + .padding = 1, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 22, + .layer_name = "conv_1x1_s1_ic256_oc1024", + .kernel_name = "1x1j1d1_dma", + .config_key = "256_4_4_1024_1_1_4_4_1_1_0_1", + .src_dim1_size = 4, + .src_dim2_size = 4, + .src_dim3_size = 256, + .src_dim1_pitch = 4, + .src_dim2_pitch = 16, + .dst_dim1_size = 4, + .dst_dim2_size = 4, + .dst_dim3_size = 1024, + .dst_dim1_pitch = 4, + .dst_dim2_pitch = 16, + .in_dim1_size = 4, + .in_dim1_pitch = 4, + .in_dim2_size = 2, + .in_dim2_pitch = 8, + .in_dim1_edge1 = 0, + .in_dim1_edge2 = 0, + .in_dim2_edge1 = 0, + 
.in_dim2_edge2 = 0, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 2, + .out_dim1_size = 4, + .out_dim1_pitch = 4, + .out_dim2_size = 2, + .out_dim2_pitch = 8, + .out_dim3_size = 16, + .coeff_dim1_size = 1, + .coeff_dim2_size = 1, + .coeff_dim3_size = 256, + .coeff_dim4_size = 1024, + .coeff_dim1_pitch = 1, + .coeff_dim2_pitch = 1, + .coeff_dim3_pitch = 256, + .bias_dim1_size = 1024, + .bias_dim2_size = 1, + .outscale_dim1_size = 1024, + .outscale_dim2_size = 1, + .input_buffer_size = 2048, + .coeff_buffer_size = 4096, + .output_buffer_size = 128, + .bias_buffer_size = 4096, + .outscale_buffer_size = 2048, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 16, + .n_tiles = 64, + .n_tile_size_last = 16, + .height_tiles = 2, + .output_rows = 2, + .input_rows = 2, + .kernel_w = 1, + .kernel_h = 1, + .stride_x = 1, + .stride_y = 1, + .padding = 0, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 23, + .layer_name = "conv_1x1_s1_ic1024_oc256", + .kernel_name = "1x1j1d1_no_dma", + .config_key = "1024_4_4_256_1_1_4_4_1_1_0_1", + .src_dim1_size = 4, + .src_dim2_size = 4, + .src_dim3_size = 1024, + .src_dim1_pitch = 4, + .src_dim2_pitch = 16, + .dst_dim1_size = 4, + .dst_dim2_size = 4, + .dst_dim3_size = 256, + .dst_dim1_pitch = 4, + .dst_dim2_pitch = 16, + .in_dim1_size = 4, + .in_dim1_pitch = 4, + .in_dim2_size = 4, + .in_dim2_pitch = 16, + .in_dim1_edge1 = 0, + .in_dim1_edge2 = 0, + .in_dim2_edge1 = 0, + .in_dim2_edge2 = 0, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 4, + .out_dim1_size = 4, + .out_dim1_pitch = 4, + .out_dim2_size = 4, + .out_dim2_pitch = 16, + .out_dim3_size = 256, + .coeff_dim1_size = 1, + .coeff_dim2_size = 1, + 
.coeff_dim3_size = 1024, + .coeff_dim4_size = 256, + .coeff_dim1_pitch = 1, + .coeff_dim2_pitch = 1, + .coeff_dim3_pitch = 1024, + .bias_dim1_size = 256, + .bias_dim2_size = 1, + .outscale_dim1_size = 256, + .outscale_dim2_size = 1, + .input_buffer_size = 16384, + .coeff_buffer_size = 262144, + .output_buffer_size = 4096, + .bias_buffer_size = 1024, + .outscale_buffer_size = 512, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 0, + .output_pong_dram = 0, + .bias_dram = 0, + .outscale_dram = 0, + .n_tile_size = 256, + .n_tiles = 1, + .n_tile_size_last = 256, + .height_tiles = 1, + .output_rows = 4, + .input_rows = 4, + .kernel_w = 1, + .kernel_h = 1, + .stride_x = 1, + .stride_y = 1, + .padding = 0, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 24, + .layer_name = "conv_1x1_s2_ic1024_oc2048", + .kernel_name = "1x1j2d1_no_dma", + .config_key = "1024_4_4_2048_1_1_2_2_2_2_0_1", + .src_dim1_size = 4, + .src_dim2_size = 4, + .src_dim3_size = 1024, + .src_dim1_pitch = 4, + .src_dim2_pitch = 16, + .dst_dim1_size = 2, + .dst_dim2_size = 2, + .dst_dim3_size = 2048, + .dst_dim1_pitch = 2, + .dst_dim2_pitch = 4, + .in_dim1_size = 4, + .in_dim1_pitch = 4, + .in_dim2_size = 4, + .in_dim2_pitch = 16, + .in_dim1_edge1 = 0, + .in_dim1_edge2 = 0, + .in_dim2_edge1 = 0, + .in_dim2_edge2 = 0, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 4, + .out_dim1_size = 2, + .out_dim1_pitch = 2, + .out_dim2_size = 2, + .out_dim2_pitch = 4, + .out_dim3_size = 2048, + .coeff_dim1_size = 1, + .coeff_dim2_size = 1, + .coeff_dim3_size = 1024, + .coeff_dim4_size = 2048, + .coeff_dim1_pitch = 1, + .coeff_dim2_pitch = 1, + .coeff_dim3_pitch = 1024, + .bias_dim1_size = 2048, + .bias_dim2_size = 1, + .outscale_dim1_size = 2048, + .outscale_dim2_size = 1, + .input_buffer_size = 16384, + 
.coeff_buffer_size = 2097152, + .output_buffer_size = 8192, + .bias_buffer_size = 8192, + .outscale_buffer_size = 4096, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 0, + .output_pong_dram = 0, + .bias_dram = 0, + .outscale_dram = 0, + .n_tile_size = 2048, + .n_tiles = 1, + .n_tile_size_last = 2048, + .height_tiles = 1, + .output_rows = 2, + .input_rows = 4, + .kernel_w = 1, + .kernel_h = 1, + .stride_x = 2, + .stride_y = 2, + .padding = 0, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 25, + .layer_name = "conv_1x1_s1_ic1024_oc512", + .kernel_name = "1x1j1d1_no_dma", + .config_key = "1024_4_4_512_1_1_4_4_1_1_0_1", + .src_dim1_size = 4, + .src_dim2_size = 4, + .src_dim3_size = 1024, + .src_dim1_pitch = 4, + .src_dim2_pitch = 16, + .dst_dim1_size = 4, + .dst_dim2_size = 4, + .dst_dim3_size = 512, + .dst_dim1_pitch = 4, + .dst_dim2_pitch = 16, + .in_dim1_size = 4, + .in_dim1_pitch = 4, + .in_dim2_size = 4, + .in_dim2_pitch = 16, + .in_dim1_edge1 = 0, + .in_dim1_edge2 = 0, + .in_dim2_edge1 = 0, + .in_dim2_edge2 = 0, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 4, + .out_dim1_size = 4, + .out_dim1_pitch = 4, + .out_dim2_size = 4, + .out_dim2_pitch = 16, + .out_dim3_size = 512, + .coeff_dim1_size = 1, + .coeff_dim2_size = 1, + .coeff_dim3_size = 1024, + .coeff_dim4_size = 512, + .coeff_dim1_pitch = 1, + .coeff_dim2_pitch = 1, + .coeff_dim3_pitch = 1024, + .bias_dim1_size = 512, + .bias_dim2_size = 1, + .outscale_dim1_size = 512, + .outscale_dim2_size = 1, + .input_buffer_size = 16384, + .coeff_buffer_size = 524288, + .output_buffer_size = 8192, + .bias_buffer_size = 2048, + .outscale_buffer_size = 1024, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 0, + .output_pong_dram = 0, + .bias_dram = 0, + .outscale_dram = 
0, + .n_tile_size = 512, + .n_tiles = 1, + .n_tile_size_last = 512, + .height_tiles = 1, + .output_rows = 4, + .input_rows = 4, + .kernel_w = 1, + .kernel_h = 1, + .stride_x = 1, + .stride_y = 1, + .padding = 0, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 26, + .layer_name = "conv_3x3_s2_ic512_oc512", + .kernel_name = "3x3j2d1_no_dma", + .config_key = "512_4_4_512_3_3_2_2_2_2_1_1", + .src_dim1_size = 4, + .src_dim2_size = 4, + .src_dim3_size = 512, + .src_dim1_pitch = 4, + .src_dim2_pitch = 16, + .dst_dim1_size = 2, + .dst_dim2_size = 2, + .dst_dim3_size = 512, + .dst_dim1_pitch = 2, + .dst_dim2_pitch = 4, + .in_dim1_size = 4, + .in_dim1_pitch = 6, + .in_dim2_size = 4, + .in_dim2_pitch = 36, + .in_dim1_edge1 = 1, + .in_dim1_edge2 = 1, + .in_dim2_edge1 = 1, + .in_dim2_edge2 = 1, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 4, + .out_dim1_size = 2, + .out_dim1_pitch = 2, + .out_dim2_size = 2, + .out_dim2_pitch = 4, + .out_dim3_size = 512, + .coeff_dim1_size = 3, + .coeff_dim2_size = 3, + .coeff_dim3_size = 512, + .coeff_dim4_size = 512, + .coeff_dim1_pitch = 3, + .coeff_dim2_pitch = 9, + .coeff_dim3_pitch = 4608, + .bias_dim1_size = 512, + .bias_dim2_size = 1, + .outscale_dim1_size = 512, + .outscale_dim2_size = 1, + .input_buffer_size = 18432, + .coeff_buffer_size = 2359296, + .output_buffer_size = 2048, + .bias_buffer_size = 2048, + .outscale_buffer_size = 1024, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 0, + .output_pong_dram = 0, + .bias_dram = 0, + .outscale_dram = 0, + .n_tile_size = 512, + .n_tiles = 1, + .n_tile_size_last = 512, + .height_tiles = 1, + .output_rows = 2, + .input_rows = 4, + .kernel_w = 3, + .kernel_h = 3, + .stride_x = 2, + .stride_y = 2, + .padding = 1, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min 
= 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 27, + .layer_name = "conv_1x1_s1_ic512_oc2048", + .kernel_name = "1x1j1d1_no_dma", + .config_key = "512_2_2_2048_1_1_2_2_1_1_0_1", + .src_dim1_size = 2, + .src_dim2_size = 2, + .src_dim3_size = 512, + .src_dim1_pitch = 2, + .src_dim2_pitch = 4, + .dst_dim1_size = 2, + .dst_dim2_size = 2, + .dst_dim3_size = 2048, + .dst_dim1_pitch = 2, + .dst_dim2_pitch = 4, + .in_dim1_size = 2, + .in_dim1_pitch = 2, + .in_dim2_size = 2, + .in_dim2_pitch = 4, + .in_dim1_edge1 = 0, + .in_dim1_edge2 = 0, + .in_dim2_edge1 = 0, + .in_dim2_edge2 = 0, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 2, + .out_dim1_size = 2, + .out_dim1_pitch = 2, + .out_dim2_size = 2, + .out_dim2_pitch = 4, + .out_dim3_size = 2048, + .coeff_dim1_size = 1, + .coeff_dim2_size = 1, + .coeff_dim3_size = 512, + .coeff_dim4_size = 2048, + .coeff_dim1_pitch = 1, + .coeff_dim2_pitch = 1, + .coeff_dim3_pitch = 512, + .bias_dim1_size = 2048, + .bias_dim2_size = 1, + .outscale_dim1_size = 2048, + .outscale_dim2_size = 1, + .input_buffer_size = 2048, + .coeff_buffer_size = 1048576, + .output_buffer_size = 8192, + .bias_buffer_size = 8192, + .outscale_buffer_size = 4096, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 0, + .output_pong_dram = 0, + .bias_dram = 0, + .outscale_dram = 0, + .n_tile_size = 2048, + .n_tiles = 1, + .n_tile_size_last = 2048, + .height_tiles = 1, + .output_rows = 2, + .input_rows = 2, + .kernel_w = 1, + .kernel_h = 1, + .stride_x = 1, + .stride_y = 1, + .padding = 0, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 28, + .layer_name = "conv_1x1_s1_ic2048_oc512", + .kernel_name = "1x1j1d1_no_dma", + .config_key = "2048_2_2_512_1_1_2_2_1_1_0_1", + .src_dim1_size = 2, + 
.src_dim2_size = 2, + .src_dim3_size = 2048, + .src_dim1_pitch = 2, + .src_dim2_pitch = 4, + .dst_dim1_size = 2, + .dst_dim2_size = 2, + .dst_dim3_size = 512, + .dst_dim1_pitch = 2, + .dst_dim2_pitch = 4, + .in_dim1_size = 2, + .in_dim1_pitch = 2, + .in_dim2_size = 2, + .in_dim2_pitch = 4, + .in_dim1_edge1 = 0, + .in_dim1_edge2 = 0, + .in_dim2_edge1 = 0, + .in_dim2_edge2 = 0, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 2, + .out_dim1_size = 2, + .out_dim1_pitch = 2, + .out_dim2_size = 2, + .out_dim2_pitch = 4, + .out_dim3_size = 512, + .coeff_dim1_size = 1, + .coeff_dim2_size = 1, + .coeff_dim3_size = 2048, + .coeff_dim4_size = 512, + .coeff_dim1_pitch = 1, + .coeff_dim2_pitch = 1, + .coeff_dim3_pitch = 2048, + .bias_dim1_size = 512, + .bias_dim2_size = 1, + .outscale_dim1_size = 512, + .outscale_dim2_size = 1, + .input_buffer_size = 8192, + .coeff_buffer_size = 1048576, + .output_buffer_size = 2048, + .bias_buffer_size = 2048, + .outscale_buffer_size = 1024, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 0, + .output_pong_dram = 0, + .bias_dram = 0, + .outscale_dram = 0, + .n_tile_size = 512, + .n_tiles = 1, + .n_tile_size_last = 512, + .height_tiles = 1, + .output_rows = 2, + .input_rows = 2, + .kernel_w = 1, + .kernel_h = 1, + .stride_x = 1, + .stride_y = 1, + .padding = 0, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, +}; + +static inline int get_num_conv_layers(void) { return NUM_CONV_LAYERS; } + +static inline const conv_layer_config_t* get_conv_config(int layer_id) { + if (layer_id < 0 || layer_id >= NUM_CONV_LAYERS) return NULL; + return &CONV_LAYER_CONFIGS[layer_id]; +} + +static inline const conv_layer_config_t* get_layer_config_by_params( + int ic, int ih, int iw, + int oc, int kh, int kw, + int oh, int ow, + int sy, int sx, + int pad, int dil) +{ + for 
(int i = 0; i < NUM_CONV_LAYERS; i++) { + const conv_layer_config_t* cfg = &CONV_LAYER_CONFIGS[i]; + if (cfg->src_dim3_size == ic && + cfg->src_dim2_size == ih && + cfg->src_dim1_size == iw && + cfg->dst_dim3_size == oc && + cfg->coeff_dim2_size == kh && + cfg->coeff_dim1_size == kw && + cfg->dst_dim2_size == oh && + cfg->dst_dim1_size == ow && + cfg->stride_y == sy && + cfg->stride_x == sx && + cfg->padding == pad && + cfg->dilation == dil) + return cfg; + } + return NULL; +} + +static inline const conv_layer_config_t* get_layer_config_by_key(const char* config_key) { + if (config_key == NULL) return NULL; + for (int i = 0; i < NUM_CONV_LAYERS; i++) { + const conv_layer_config_t* cfg = &CONV_LAYER_CONFIGS[i]; + if (cfg->config_key != NULL) { + const char* a = config_key; + const char* b = cfg->config_key; + while (*a && *b && *a == *b) { a++; b++; } + if (*a == '\0' && *b == '\0') return cfg; + } + } + return NULL; +} + +/* ====================================================================== */ +/* MaxPool configurations */ +/* ====================================================================== */ + +typedef struct { + int layer_id; + const char* layer_name; + const char* config_key; + + int src_width; int src_height; int channels; + int dst_width; int dst_height; + + int src_row_pitch; int src_plane_pitch; + int dst_row_pitch; int dst_plane_pitch; + + int kernel_h; int kernel_w; + int stride_h; int stride_w; + int pad_h; int pad_w; + + int in_tile_w; int in_tile_rows; int in_tile_plane; + int in_data_offset; + int out_tile_w; int out_tile_rows; int out_tile_plane; + + int c_tile_size; int c_tiles; int c_tile_size_last; + int height_tiles; int output_rows; int input_rows; + + int input_buffer_size; int output_buffer_size; + + int input_ping_dram; int input_pong_dram; + int output_ping_dram; int output_pong_dram; +} maxpool_layer_config_t; + +#define NUM_MAXPOOL_LAYERS 1 + +static const maxpool_layer_config_t MAXPOOL_LAYER_CONFIGS[] = { + { + .layer_id = 0, + 
.layer_name = "maxpool_3x3s2_c64_32x32", + .config_key = "64_32_32_3_3_2_2_1_1", + .src_width = 32, + .src_height = 32, + .channels = 64, + .dst_width = 16, + .dst_height = 16, + .src_row_pitch = 32, + .src_plane_pitch = 1024, + .dst_row_pitch = 16, + .dst_plane_pitch = 256, + .kernel_h = 3, + .kernel_w = 3, + .stride_h = 2, + .stride_w = 2, + .pad_h = 1, + .pad_w = 1, + .in_tile_w = 34, + .in_tile_rows = 5, + .in_tile_plane = 170, + .in_data_offset = 35, + .out_tile_w = 16, + .out_tile_rows = 1, + .out_tile_plane = 16, + .c_tile_size = 11, + .c_tiles = 6, + .c_tile_size_last = 9, + .height_tiles = 16, + .output_rows = 1, + .input_rows = 3, + .input_buffer_size = 7480, + .output_buffer_size = 704, + .input_ping_dram = 0, + .input_pong_dram = 1, + .output_ping_dram = 1, + .output_pong_dram = 0, + }, +}; + +static inline int get_num_maxpool_layers(void) { return NUM_MAXPOOL_LAYERS; } + +static inline const maxpool_layer_config_t* get_maxpool_config(int layer_id) { + if (layer_id < 0 || layer_id >= NUM_MAXPOOL_LAYERS) return NULL; + return &MAXPOOL_LAYER_CONFIGS[layer_id]; +} + +static inline const maxpool_layer_config_t* get_maxpool_config_by_params( + int channels, int src_height, int src_width, + int kernel_h, int kernel_w, + int stride_h, int stride_w, + int pad_h, int pad_w) +{ + for (int i = 0; i < NUM_MAXPOOL_LAYERS; i++) { + const maxpool_layer_config_t* c = &MAXPOOL_LAYER_CONFIGS[i]; + if (c->channels == channels && + c->src_height == src_height && + c->src_width == src_width && + c->kernel_h == kernel_h && + c->kernel_w == kernel_w && + c->stride_h == stride_h && + c->stride_w == stride_w && + c->pad_h == pad_h && + c->pad_w == pad_w) + return c; + } + return NULL; +} + +#endif /* LAYER_CONFIGS_H */ diff --git a/backends/cadence/vision/config_generator/layer_configs_cache.h b/backends/cadence/vision/config_generator/layer_configs_cache.h new file mode 100644 index 00000000000..c88a0b41f81 --- /dev/null +++ 
b/backends/cadence/vision/config_generator/layer_configs_cache.h @@ -0,0 +1,2403 @@ +/* + * layer_configs.h + * + * Auto-generated conv2d + maxpool layer configurations + * Generated from PTE extraction by generate_combined_configs.py + * + * DO NOT EDIT MANUALLY + */ + +#ifndef LAYER_CONFIGS_H +#define LAYER_CONFIGS_H + +#include +#include /* for NULL */ + +#define IDMA_BUFFER_SIZE_DRAM0 (0) /* 0 KB */ +#define IDMA_BUFFER_SIZE_DRAM1 (0) /* 0 KB */ + +/* ====================================================================== */ +/* Conv2d configurations */ +/* ====================================================================== */ + +typedef struct { + int layer_id; + const char* layer_name; + const char* kernel_name; + const char* config_key; + + int src_dim1_size; int src_dim2_size; int src_dim3_size; + int src_dim1_pitch; int src_dim2_pitch; + + int dst_dim1_size; int dst_dim2_size; int dst_dim3_size; + int dst_dim1_pitch; int dst_dim2_pitch; + + int in_dim1_size; int in_dim1_pitch; + int in_dim2_size; int in_dim2_pitch; + int in_dim1_edge1; int in_dim1_edge2; + int in_dim2_edge1; int in_dim2_edge2; + int in_dim3_edge1; int in_dim3_edge2; + int in_data_offset; int in_rows_firstdma; + + int out_dim1_size; int out_dim1_pitch; + int out_dim2_size; int out_dim2_pitch; + int out_dim3_size; + + int coeff_dim1_size; int coeff_dim2_size; + int coeff_dim3_size; int coeff_dim4_size; + int coeff_dim1_pitch; int coeff_dim2_pitch; int coeff_dim3_pitch; + + int bias_dim1_size; int bias_dim2_size; + int outscale_dim1_size; int outscale_dim2_size; + + int input_buffer_size; int coeff_buffer_size; int output_buffer_size; + int bias_buffer_size; int outscale_buffer_size; + + int input_ping_dram; int input_pong_dram; int coeff_dram; + int output_ping_dram; int output_pong_dram; + int bias_dram; int outscale_dram; + + int n_tile_size; int n_tiles; int n_tile_size_last; + int height_tiles; int output_rows; int input_rows; + + int kernel_w; int kernel_h; + int stride_x; int 
stride_y; + int padding; int dilation; + int accum_shift; int relu_max; int relu_min; + int output_shift; int output_scale; int flags; + int input_zero_point; +} conv_layer_config_t; + +#define NUM_CONV_LAYERS 29 + +static const conv_layer_config_t CONV_LAYER_CONFIGS[] = { + { + .layer_id = 0, + .layer_name = "conv_7x7_s2_ic3_oc64", + .kernel_name = "7x7j2d1_no_dma", + .config_key = "3_64_64_64_7_7_32_32_2_2_3_1", + .src_dim1_size = 64, + .src_dim2_size = 64, + .src_dim3_size = 3, + .src_dim1_pitch = 64, + .src_dim2_pitch = 4096, + .dst_dim1_size = 32, + .dst_dim2_size = 32, + .dst_dim3_size = 64, + .dst_dim1_pitch = 32, + .dst_dim2_pitch = 1024, + .in_dim1_size = 64, + .in_dim1_pitch = 70, + .in_dim2_size = 35, + .in_dim2_pitch = 2450, + .in_dim1_edge1 = 3, + .in_dim1_edge2 = 3, + .in_dim2_edge1 = 3, + .in_dim2_edge2 = 3, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 213, + .in_rows_firstdma = 32, + .out_dim1_size = 32, + .out_dim1_pitch = 32, + .out_dim2_size = 15, + .out_dim2_pitch = 480, + .out_dim3_size = 64, + .coeff_dim1_size = 7, + .coeff_dim2_size = 7, + .coeff_dim3_size = 3, + .coeff_dim4_size = 64, + .coeff_dim1_pitch = 7, + .coeff_dim2_pitch = 49, + .coeff_dim3_pitch = 147, + .bias_dim1_size = 64, + .bias_dim2_size = 1, + .outscale_dim1_size = 64, + .outscale_dim2_size = 1, + .input_buffer_size = 7350, + .coeff_buffer_size = 9408, + .output_buffer_size = 30720, + .bias_buffer_size = 256, + .outscale_buffer_size = 128, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 64, + .n_tiles = 1, + .n_tile_size_last = 64, + .height_tiles = 3, + .output_rows = 15, + .input_rows = 35, + .kernel_w = 7, + .kernel_h = 7, + .stride_x = 2, + .stride_y = 2, + .padding = 3, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, 
+ { + .layer_id = 1, + .layer_name = "conv_3x3_s1_ic64_oc64", + .kernel_name = "3x3j1d1_no_dma", + .config_key = "64_16_16_64_3_3_16_16_1_1_1_1", + .src_dim1_size = 16, + .src_dim2_size = 16, + .src_dim3_size = 64, + .src_dim1_pitch = 16, + .src_dim2_pitch = 256, + .dst_dim1_size = 16, + .dst_dim2_size = 16, + .dst_dim3_size = 64, + .dst_dim1_pitch = 16, + .dst_dim2_pitch = 256, + .in_dim1_size = 16, + .in_dim1_pitch = 18, + .in_dim2_size = 18, + .in_dim2_pitch = 324, + .in_dim1_edge1 = 1, + .in_dim1_edge2 = 1, + .in_dim2_edge1 = 1, + .in_dim2_edge2 = 1, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 19, + .in_rows_firstdma = 17, + .out_dim1_size = 16, + .out_dim1_pitch = 16, + .out_dim2_size = 16, + .out_dim2_pitch = 256, + .out_dim3_size = 64, + .coeff_dim1_size = 3, + .coeff_dim2_size = 3, + .coeff_dim3_size = 64, + .coeff_dim4_size = 64, + .coeff_dim1_pitch = 3, + .coeff_dim2_pitch = 9, + .coeff_dim3_pitch = 576, + .bias_dim1_size = 64, + .bias_dim2_size = 1, + .outscale_dim1_size = 64, + .outscale_dim2_size = 1, + .input_buffer_size = 20736, + .coeff_buffer_size = 36864, + .output_buffer_size = 16384, + .bias_buffer_size = 256, + .outscale_buffer_size = 128, + .input_ping_dram = 0, + .input_pong_dram = 1, + .coeff_dram = 0, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 64, + .n_tiles = 1, + .n_tile_size_last = 64, + .height_tiles = 1, + .output_rows = 16, + .input_rows = 18, + .kernel_w = 3, + .kernel_h = 3, + .stride_x = 1, + .stride_y = 1, + .padding = 1, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 2, + .layer_name = "conv_1x1_s2_ic64_oc128", + .kernel_name = "1x1j2d1_no_dma", + .config_key = "64_16_16_128_1_1_8_8_2_2_0_1", + .src_dim1_size = 16, + .src_dim2_size = 16, + .src_dim3_size = 64, + .src_dim1_pitch = 16, + .src_dim2_pitch = 256, + 
.dst_dim1_size = 8, + .dst_dim2_size = 8, + .dst_dim3_size = 128, + .dst_dim1_pitch = 8, + .dst_dim2_pitch = 64, + .in_dim1_size = 16, + .in_dim1_pitch = 16, + .in_dim2_size = 15, + .in_dim2_pitch = 240, + .in_dim1_edge1 = 0, + .in_dim1_edge2 = 0, + .in_dim2_edge1 = 0, + .in_dim2_edge2 = 0, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 15, + .out_dim1_size = 8, + .out_dim1_pitch = 8, + .out_dim2_size = 8, + .out_dim2_pitch = 64, + .out_dim3_size = 128, + .coeff_dim1_size = 1, + .coeff_dim2_size = 1, + .coeff_dim3_size = 64, + .coeff_dim4_size = 128, + .coeff_dim1_pitch = 1, + .coeff_dim2_pitch = 1, + .coeff_dim3_pitch = 64, + .bias_dim1_size = 128, + .bias_dim2_size = 1, + .outscale_dim1_size = 128, + .outscale_dim2_size = 1, + .input_buffer_size = 15360, + .coeff_buffer_size = 8192, + .output_buffer_size = 8192, + .bias_buffer_size = 512, + .outscale_buffer_size = 256, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 128, + .n_tiles = 1, + .n_tile_size_last = 128, + .height_tiles = 1, + .output_rows = 8, + .input_rows = 15, + .kernel_w = 1, + .kernel_h = 1, + .stride_x = 2, + .stride_y = 2, + .padding = 0, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 3, + .layer_name = "conv_3x3_s2_ic64_oc128", + .kernel_name = "3x3j2d1_no_dma", + .config_key = "64_16_16_128_3_3_8_8_2_2_1_1", + .src_dim1_size = 16, + .src_dim2_size = 16, + .src_dim3_size = 64, + .src_dim1_pitch = 16, + .src_dim2_pitch = 256, + .dst_dim1_size = 8, + .dst_dim2_size = 8, + .dst_dim3_size = 128, + .dst_dim1_pitch = 8, + .dst_dim2_pitch = 64, + .in_dim1_size = 16, + .in_dim1_pitch = 18, + .in_dim2_size = 5, + .in_dim2_pitch = 90, + .in_dim1_edge1 = 1, + .in_dim1_edge2 = 1, + .in_dim2_edge1 = 1, + 
.in_dim2_edge2 = 1, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 19, + .in_rows_firstdma = 4, + .out_dim1_size = 8, + .out_dim1_pitch = 8, + .out_dim2_size = 2, + .out_dim2_pitch = 16, + .out_dim3_size = 64, + .coeff_dim1_size = 3, + .coeff_dim2_size = 3, + .coeff_dim3_size = 64, + .coeff_dim4_size = 128, + .coeff_dim1_pitch = 3, + .coeff_dim2_pitch = 9, + .coeff_dim3_pitch = 576, + .bias_dim1_size = 128, + .bias_dim2_size = 1, + .outscale_dim1_size = 128, + .outscale_dim2_size = 1, + .input_buffer_size = 5760, + .coeff_buffer_size = 36864, + .output_buffer_size = 1024, + .bias_buffer_size = 512, + .outscale_buffer_size = 256, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 64, + .n_tiles = 2, + .n_tile_size_last = 64, + .height_tiles = 4, + .output_rows = 2, + .input_rows = 5, + .kernel_w = 3, + .kernel_h = 3, + .stride_x = 2, + .stride_y = 2, + .padding = 1, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 4, + .layer_name = "conv_3x3_s1_ic128_oc128", + .kernel_name = "3x3j1d1_no_dma", + .config_key = "128_8_8_128_3_3_8_8_1_1_1_1", + .src_dim1_size = 8, + .src_dim2_size = 8, + .src_dim3_size = 128, + .src_dim1_pitch = 8, + .src_dim2_pitch = 64, + .dst_dim1_size = 8, + .dst_dim2_size = 8, + .dst_dim3_size = 128, + .dst_dim1_pitch = 8, + .dst_dim2_pitch = 64, + .in_dim1_size = 8, + .in_dim1_pitch = 10, + .in_dim2_size = 4, + .in_dim2_pitch = 40, + .in_dim1_edge1 = 1, + .in_dim1_edge2 = 1, + .in_dim2_edge1 = 1, + .in_dim2_edge2 = 1, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 11, + .in_rows_firstdma = 3, + .out_dim1_size = 8, + .out_dim1_pitch = 8, + .out_dim2_size = 2, + .out_dim2_pitch = 16, + .out_dim3_size = 32, + .coeff_dim1_size = 3, + .coeff_dim2_size = 3, + 
.coeff_dim3_size = 128, + .coeff_dim4_size = 128, + .coeff_dim1_pitch = 3, + .coeff_dim2_pitch = 9, + .coeff_dim3_pitch = 1152, + .bias_dim1_size = 128, + .bias_dim2_size = 1, + .outscale_dim1_size = 128, + .outscale_dim2_size = 1, + .input_buffer_size = 5120, + .coeff_buffer_size = 36864, + .output_buffer_size = 512, + .bias_buffer_size = 512, + .outscale_buffer_size = 256, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 32, + .n_tiles = 4, + .n_tile_size_last = 32, + .height_tiles = 4, + .output_rows = 2, + .input_rows = 4, + .kernel_w = 3, + .kernel_h = 3, + .stride_x = 1, + .stride_y = 1, + .padding = 1, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 5, + .layer_name = "conv_1x1_s2_ic128_oc256", + .kernel_name = "1x1j2d1_no_dma", + .config_key = "128_8_8_256_1_1_4_4_2_2_0_1", + .src_dim1_size = 8, + .src_dim2_size = 8, + .src_dim3_size = 128, + .src_dim1_pitch = 8, + .src_dim2_pitch = 64, + .dst_dim1_size = 4, + .dst_dim2_size = 4, + .dst_dim3_size = 256, + .dst_dim1_pitch = 4, + .dst_dim2_pitch = 16, + .in_dim1_size = 8, + .in_dim1_pitch = 8, + .in_dim2_size = 7, + .in_dim2_pitch = 56, + .in_dim1_edge1 = 0, + .in_dim1_edge2 = 0, + .in_dim2_edge1 = 0, + .in_dim2_edge2 = 0, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 7, + .out_dim1_size = 4, + .out_dim1_pitch = 4, + .out_dim2_size = 4, + .out_dim2_pitch = 16, + .out_dim3_size = 256, + .coeff_dim1_size = 1, + .coeff_dim2_size = 1, + .coeff_dim3_size = 128, + .coeff_dim4_size = 256, + .coeff_dim1_pitch = 1, + .coeff_dim2_pitch = 1, + .coeff_dim3_pitch = 128, + .bias_dim1_size = 256, + .bias_dim2_size = 1, + .outscale_dim1_size = 256, + .outscale_dim2_size = 1, + .input_buffer_size = 7168, + .coeff_buffer_size = 
32768, + .output_buffer_size = 4096, + .bias_buffer_size = 1024, + .outscale_buffer_size = 512, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 256, + .n_tiles = 1, + .n_tile_size_last = 256, + .height_tiles = 1, + .output_rows = 4, + .input_rows = 7, + .kernel_w = 1, + .kernel_h = 1, + .stride_x = 2, + .stride_y = 2, + .padding = 0, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 6, + .layer_name = "conv_3x3_s2_ic128_oc256", + .kernel_name = "3x3j2d1_no_dma", + .config_key = "128_8_8_256_3_3_4_4_2_2_1_1", + .src_dim1_size = 8, + .src_dim2_size = 8, + .src_dim3_size = 128, + .src_dim1_pitch = 8, + .src_dim2_pitch = 64, + .dst_dim1_size = 4, + .dst_dim2_size = 4, + .dst_dim3_size = 256, + .dst_dim1_pitch = 4, + .dst_dim2_pitch = 16, + .in_dim1_size = 8, + .in_dim1_pitch = 10, + .in_dim2_size = 5, + .in_dim2_pitch = 50, + .in_dim1_edge1 = 1, + .in_dim1_edge2 = 1, + .in_dim2_edge1 = 1, + .in_dim2_edge2 = 1, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 11, + .in_rows_firstdma = 4, + .out_dim1_size = 4, + .out_dim1_pitch = 4, + .out_dim2_size = 2, + .out_dim2_pitch = 8, + .out_dim3_size = 32, + .coeff_dim1_size = 3, + .coeff_dim2_size = 3, + .coeff_dim3_size = 128, + .coeff_dim4_size = 256, + .coeff_dim1_pitch = 3, + .coeff_dim2_pitch = 9, + .coeff_dim3_pitch = 1152, + .bias_dim1_size = 256, + .bias_dim2_size = 1, + .outscale_dim1_size = 256, + .outscale_dim2_size = 1, + .input_buffer_size = 6400, + .coeff_buffer_size = 36864, + .output_buffer_size = 256, + .bias_buffer_size = 1024, + .outscale_buffer_size = 512, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 32, + .n_tiles 
= 8, + .n_tile_size_last = 32, + .height_tiles = 2, + .output_rows = 2, + .input_rows = 5, + .kernel_w = 3, + .kernel_h = 3, + .stride_x = 2, + .stride_y = 2, + .padding = 1, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 7, + .layer_name = "conv_3x3_s1_ic256_oc256", + .kernel_name = "3x3j1d1_no_dma", + .config_key = "256_4_4_256_3_3_4_4_1_1_1_1", + .src_dim1_size = 4, + .src_dim2_size = 4, + .src_dim3_size = 256, + .src_dim1_pitch = 4, + .src_dim2_pitch = 16, + .dst_dim1_size = 4, + .dst_dim2_size = 4, + .dst_dim3_size = 256, + .dst_dim1_pitch = 4, + .dst_dim2_pitch = 16, + .in_dim1_size = 4, + .in_dim1_pitch = 6, + .in_dim2_size = 4, + .in_dim2_pitch = 24, + .in_dim1_edge1 = 1, + .in_dim1_edge2 = 1, + .in_dim2_edge1 = 1, + .in_dim2_edge2 = 1, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 7, + .in_rows_firstdma = 3, + .out_dim1_size = 4, + .out_dim1_pitch = 4, + .out_dim2_size = 2, + .out_dim2_pitch = 8, + .out_dim3_size = 16, + .coeff_dim1_size = 3, + .coeff_dim2_size = 3, + .coeff_dim3_size = 256, + .coeff_dim4_size = 256, + .coeff_dim1_pitch = 3, + .coeff_dim2_pitch = 9, + .coeff_dim3_pitch = 2304, + .bias_dim1_size = 256, + .bias_dim2_size = 1, + .outscale_dim1_size = 256, + .outscale_dim2_size = 1, + .input_buffer_size = 6144, + .coeff_buffer_size = 36864, + .output_buffer_size = 128, + .bias_buffer_size = 1024, + .outscale_buffer_size = 512, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 16, + .n_tiles = 16, + .n_tile_size_last = 16, + .height_tiles = 2, + .output_rows = 2, + .input_rows = 4, + .kernel_w = 3, + .kernel_h = 3, + .stride_x = 1, + .stride_y = 1, + .padding = 1, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 
0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 8, + .layer_name = "conv_1x1_s2_ic256_oc512", + .kernel_name = "1x1j2d1_no_dma", + .config_key = "256_4_4_512_1_1_2_2_2_2_0_1", + .src_dim1_size = 4, + .src_dim2_size = 4, + .src_dim3_size = 256, + .src_dim1_pitch = 4, + .src_dim2_pitch = 16, + .dst_dim1_size = 2, + .dst_dim2_size = 2, + .dst_dim3_size = 512, + .dst_dim1_pitch = 2, + .dst_dim2_pitch = 4, + .in_dim1_size = 4, + .in_dim1_pitch = 4, + .in_dim2_size = 3, + .in_dim2_pitch = 12, + .in_dim1_edge1 = 0, + .in_dim1_edge2 = 0, + .in_dim2_edge1 = 0, + .in_dim2_edge2 = 0, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 3, + .out_dim1_size = 2, + .out_dim1_pitch = 2, + .out_dim2_size = 2, + .out_dim2_pitch = 4, + .out_dim3_size = 128, + .coeff_dim1_size = 1, + .coeff_dim2_size = 1, + .coeff_dim3_size = 256, + .coeff_dim4_size = 512, + .coeff_dim1_pitch = 1, + .coeff_dim2_pitch = 1, + .coeff_dim3_pitch = 256, + .bias_dim1_size = 512, + .bias_dim2_size = 1, + .outscale_dim1_size = 512, + .outscale_dim2_size = 1, + .input_buffer_size = 3072, + .coeff_buffer_size = 32768, + .output_buffer_size = 512, + .bias_buffer_size = 2048, + .outscale_buffer_size = 1024, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 128, + .n_tiles = 4, + .n_tile_size_last = 128, + .height_tiles = 1, + .output_rows = 2, + .input_rows = 3, + .kernel_w = 1, + .kernel_h = 1, + .stride_x = 2, + .stride_y = 2, + .padding = 0, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 9, + .layer_name = "conv_3x3_s2_ic256_oc512", + .kernel_name = "3x3j2d1_no_dma", + .config_key = "256_4_4_512_3_3_2_2_2_2_1_1", + .src_dim1_size = 4, + .src_dim2_size = 4, + .src_dim3_size = 256, + .src_dim1_pitch = 4, + 
.src_dim2_pitch = 16, + .dst_dim1_size = 2, + .dst_dim2_size = 2, + .dst_dim3_size = 512, + .dst_dim1_pitch = 2, + .dst_dim2_pitch = 4, + .in_dim1_size = 4, + .in_dim1_pitch = 6, + .in_dim2_size = 5, + .in_dim2_pitch = 30, + .in_dim1_edge1 = 1, + .in_dim1_edge2 = 1, + .in_dim2_edge1 = 1, + .in_dim2_edge2 = 1, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 7, + .in_rows_firstdma = 4, + .out_dim1_size = 2, + .out_dim1_pitch = 2, + .out_dim2_size = 2, + .out_dim2_pitch = 4, + .out_dim3_size = 16, + .coeff_dim1_size = 3, + .coeff_dim2_size = 3, + .coeff_dim3_size = 256, + .coeff_dim4_size = 512, + .coeff_dim1_pitch = 3, + .coeff_dim2_pitch = 9, + .coeff_dim3_pitch = 2304, + .bias_dim1_size = 512, + .bias_dim2_size = 1, + .outscale_dim1_size = 512, + .outscale_dim2_size = 1, + .input_buffer_size = 7680, + .coeff_buffer_size = 36864, + .output_buffer_size = 64, + .bias_buffer_size = 2048, + .outscale_buffer_size = 1024, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 16, + .n_tiles = 32, + .n_tile_size_last = 16, + .height_tiles = 1, + .output_rows = 2, + .input_rows = 5, + .kernel_w = 3, + .kernel_h = 3, + .stride_x = 2, + .stride_y = 2, + .padding = 1, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 10, + .layer_name = "conv_3x3_s1_ic512_oc512", + .kernel_name = "3x3j1d1_no_dma", + .config_key = "512_2_2_512_3_3_2_2_1_1_1_1", + .src_dim1_size = 2, + .src_dim2_size = 2, + .src_dim3_size = 512, + .src_dim1_pitch = 2, + .src_dim2_pitch = 4, + .dst_dim1_size = 2, + .dst_dim2_size = 2, + .dst_dim3_size = 512, + .dst_dim1_pitch = 2, + .dst_dim2_pitch = 4, + .in_dim1_size = 2, + .in_dim1_pitch = 4, + .in_dim2_size = 4, + .in_dim2_pitch = 16, + .in_dim1_edge1 = 1, + .in_dim1_edge2 = 1, + .in_dim2_edge1 = 1, 
+ .in_dim2_edge2 = 1, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 5, + .in_rows_firstdma = 3, + .out_dim1_size = 2, + .out_dim1_pitch = 2, + .out_dim2_size = 2, + .out_dim2_pitch = 4, + .out_dim3_size = 8, + .coeff_dim1_size = 3, + .coeff_dim2_size = 3, + .coeff_dim3_size = 512, + .coeff_dim4_size = 512, + .coeff_dim1_pitch = 3, + .coeff_dim2_pitch = 9, + .coeff_dim3_pitch = 4608, + .bias_dim1_size = 512, + .bias_dim2_size = 1, + .outscale_dim1_size = 512, + .outscale_dim2_size = 1, + .input_buffer_size = 8192, + .coeff_buffer_size = 36864, + .output_buffer_size = 32, + .bias_buffer_size = 2048, + .outscale_buffer_size = 1024, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 8, + .n_tiles = 64, + .n_tile_size_last = 8, + .height_tiles = 1, + .output_rows = 2, + .input_rows = 4, + .kernel_w = 3, + .kernel_h = 3, + .stride_x = 1, + .stride_y = 1, + .padding = 1, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 11, + .layer_name = "conv_1x1_s1_ic64_oc256", + .kernel_name = "1x1j1d1_no_dma", + .config_key = "64_16_16_256_1_1_16_16_1_1_0_1", + .src_dim1_size = 16, + .src_dim2_size = 16, + .src_dim3_size = 64, + .src_dim1_pitch = 16, + .src_dim2_pitch = 256, + .dst_dim1_size = 16, + .dst_dim2_size = 16, + .dst_dim3_size = 256, + .dst_dim1_pitch = 16, + .dst_dim2_pitch = 256, + .in_dim1_size = 16, + .in_dim1_pitch = 16, + .in_dim2_size = 7, + .in_dim2_pitch = 112, + .in_dim1_edge1 = 0, + .in_dim1_edge2 = 0, + .in_dim2_edge1 = 0, + .in_dim2_edge2 = 0, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 7, + .out_dim1_size = 16, + .out_dim1_pitch = 16, + .out_dim2_size = 7, + .out_dim2_pitch = 112, + .out_dim3_size = 256, + .coeff_dim1_size = 1, + .coeff_dim2_size = 
1, + .coeff_dim3_size = 64, + .coeff_dim4_size = 256, + .coeff_dim1_pitch = 1, + .coeff_dim2_pitch = 1, + .coeff_dim3_pitch = 64, + .bias_dim1_size = 256, + .bias_dim2_size = 1, + .outscale_dim1_size = 256, + .outscale_dim2_size = 1, + .input_buffer_size = 7168, + .coeff_buffer_size = 16384, + .output_buffer_size = 28672, + .bias_buffer_size = 1024, + .outscale_buffer_size = 512, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 256, + .n_tiles = 1, + .n_tile_size_last = 256, + .height_tiles = 3, + .output_rows = 7, + .input_rows = 7, + .kernel_w = 1, + .kernel_h = 1, + .stride_x = 1, + .stride_y = 1, + .padding = 0, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 12, + .layer_name = "conv_1x1_s1_ic64_oc64", + .kernel_name = "1x1j1d1_no_dma", + .config_key = "64_16_16_64_1_1_16_16_1_1_0_1", + .src_dim1_size = 16, + .src_dim2_size = 16, + .src_dim3_size = 64, + .src_dim1_pitch = 16, + .src_dim2_pitch = 256, + .dst_dim1_size = 16, + .dst_dim2_size = 16, + .dst_dim3_size = 64, + .dst_dim1_pitch = 16, + .dst_dim2_pitch = 256, + .in_dim1_size = 16, + .in_dim1_pitch = 16, + .in_dim2_size = 16, + .in_dim2_pitch = 256, + .in_dim1_edge1 = 0, + .in_dim1_edge2 = 0, + .in_dim2_edge1 = 0, + .in_dim2_edge2 = 0, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 16, + .out_dim1_size = 16, + .out_dim1_pitch = 16, + .out_dim2_size = 16, + .out_dim2_pitch = 256, + .out_dim3_size = 64, + .coeff_dim1_size = 1, + .coeff_dim2_size = 1, + .coeff_dim3_size = 64, + .coeff_dim4_size = 64, + .coeff_dim1_pitch = 1, + .coeff_dim2_pitch = 1, + .coeff_dim3_pitch = 64, + .bias_dim1_size = 64, + .bias_dim2_size = 1, + .outscale_dim1_size = 64, + .outscale_dim2_size = 1, + .input_buffer_size = 16384, + 
.coeff_buffer_size = 4096, + .output_buffer_size = 16384, + .bias_buffer_size = 256, + .outscale_buffer_size = 128, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 64, + .n_tiles = 1, + .n_tile_size_last = 64, + .height_tiles = 1, + .output_rows = 16, + .input_rows = 16, + .kernel_w = 1, + .kernel_h = 1, + .stride_x = 1, + .stride_y = 1, + .padding = 0, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 13, + .layer_name = "conv_1x1_s1_ic256_oc64", + .kernel_name = "1x1j1d1_no_dma", + .config_key = "256_16_16_64_1_1_16_16_1_1_0_1", + .src_dim1_size = 16, + .src_dim2_size = 16, + .src_dim3_size = 256, + .src_dim1_pitch = 16, + .src_dim2_pitch = 256, + .dst_dim1_size = 16, + .dst_dim2_size = 16, + .dst_dim3_size = 64, + .dst_dim1_pitch = 16, + .dst_dim2_pitch = 256, + .in_dim1_size = 16, + .in_dim1_pitch = 16, + .in_dim2_size = 10, + .in_dim2_pitch = 160, + .in_dim1_edge1 = 0, + .in_dim1_edge2 = 0, + .in_dim2_edge1 = 0, + .in_dim2_edge2 = 0, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 10, + .out_dim1_size = 16, + .out_dim1_pitch = 16, + .out_dim2_size = 10, + .out_dim2_pitch = 160, + .out_dim3_size = 64, + .coeff_dim1_size = 1, + .coeff_dim2_size = 1, + .coeff_dim3_size = 256, + .coeff_dim4_size = 64, + .coeff_dim1_pitch = 1, + .coeff_dim2_pitch = 1, + .coeff_dim3_pitch = 256, + .bias_dim1_size = 64, + .bias_dim2_size = 1, + .outscale_dim1_size = 64, + .outscale_dim2_size = 1, + .input_buffer_size = 40960, + .coeff_buffer_size = 16384, + .output_buffer_size = 10240, + .bias_buffer_size = 256, + .outscale_buffer_size = 128, + .input_ping_dram = 0, + .input_pong_dram = 1, + .coeff_dram = 0, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 
1, + .n_tile_size = 64, + .n_tiles = 1, + .n_tile_size_last = 64, + .height_tiles = 2, + .output_rows = 10, + .input_rows = 10, + .kernel_w = 1, + .kernel_h = 1, + .stride_x = 1, + .stride_y = 1, + .padding = 0, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 14, + .layer_name = "conv_1x1_s2_ic256_oc512", + .kernel_name = "1x1j2d1_no_dma", + .config_key = "256_16_16_512_1_1_8_8_2_2_0_1", + .src_dim1_size = 16, + .src_dim2_size = 16, + .src_dim3_size = 256, + .src_dim1_pitch = 16, + .src_dim2_pitch = 256, + .dst_dim1_size = 8, + .dst_dim2_size = 8, + .dst_dim3_size = 512, + .dst_dim1_pitch = 8, + .dst_dim2_pitch = 64, + .in_dim1_size = 16, + .in_dim1_pitch = 16, + .in_dim2_size = 3, + .in_dim2_pitch = 48, + .in_dim1_edge1 = 0, + .in_dim1_edge2 = 0, + .in_dim2_edge1 = 0, + .in_dim2_edge2 = 0, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 3, + .out_dim1_size = 8, + .out_dim1_pitch = 8, + .out_dim2_size = 2, + .out_dim2_pitch = 16, + .out_dim3_size = 128, + .coeff_dim1_size = 1, + .coeff_dim2_size = 1, + .coeff_dim3_size = 256, + .coeff_dim4_size = 512, + .coeff_dim1_pitch = 1, + .coeff_dim2_pitch = 1, + .coeff_dim3_pitch = 256, + .bias_dim1_size = 512, + .bias_dim2_size = 1, + .outscale_dim1_size = 512, + .outscale_dim2_size = 1, + .input_buffer_size = 12288, + .coeff_buffer_size = 32768, + .output_buffer_size = 2048, + .bias_buffer_size = 2048, + .outscale_buffer_size = 1024, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 128, + .n_tiles = 4, + .n_tile_size_last = 128, + .height_tiles = 4, + .output_rows = 2, + .input_rows = 3, + .kernel_w = 1, + .kernel_h = 1, + .stride_x = 2, + .stride_y = 2, + .padding = 0, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + 
.relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 15, + .layer_name = "conv_1x1_s1_ic256_oc128", + .kernel_name = "1x1j1d1_no_dma", + .config_key = "256_16_16_128_1_1_16_16_1_1_0_1", + .src_dim1_size = 16, + .src_dim2_size = 16, + .src_dim3_size = 256, + .src_dim1_pitch = 16, + .src_dim2_pitch = 256, + .dst_dim1_size = 16, + .dst_dim2_size = 16, + .dst_dim3_size = 128, + .dst_dim1_pitch = 16, + .dst_dim2_pitch = 256, + .in_dim1_size = 16, + .in_dim1_pitch = 16, + .in_dim2_size = 7, + .in_dim2_pitch = 112, + .in_dim1_edge1 = 0, + .in_dim1_edge2 = 0, + .in_dim2_edge1 = 0, + .in_dim2_edge2 = 0, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 7, + .out_dim1_size = 16, + .out_dim1_pitch = 16, + .out_dim2_size = 7, + .out_dim2_pitch = 112, + .out_dim3_size = 128, + .coeff_dim1_size = 1, + .coeff_dim2_size = 1, + .coeff_dim3_size = 256, + .coeff_dim4_size = 128, + .coeff_dim1_pitch = 1, + .coeff_dim2_pitch = 1, + .coeff_dim3_pitch = 256, + .bias_dim1_size = 128, + .bias_dim2_size = 1, + .outscale_dim1_size = 128, + .outscale_dim2_size = 1, + .input_buffer_size = 28672, + .coeff_buffer_size = 32768, + .output_buffer_size = 14336, + .bias_buffer_size = 512, + .outscale_buffer_size = 256, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 1, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 128, + .n_tiles = 1, + .n_tile_size_last = 128, + .height_tiles = 3, + .output_rows = 7, + .input_rows = 7, + .kernel_w = 1, + .kernel_h = 1, + .stride_x = 1, + .stride_y = 1, + .padding = 0, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 16, + .layer_name = "conv_3x3_s2_ic128_oc128", + .kernel_name = "3x3j2d1_no_dma", + .config_key = "128_16_16_128_3_3_8_8_2_2_1_1", + 
.src_dim1_size = 16, + .src_dim2_size = 16, + .src_dim3_size = 128, + .src_dim1_pitch = 16, + .src_dim2_pitch = 256, + .dst_dim1_size = 8, + .dst_dim2_size = 8, + .dst_dim3_size = 128, + .dst_dim1_pitch = 8, + .dst_dim2_pitch = 64, + .in_dim1_size = 16, + .in_dim1_pitch = 18, + .in_dim2_size = 5, + .in_dim2_pitch = 90, + .in_dim1_edge1 = 1, + .in_dim1_edge2 = 1, + .in_dim2_edge1 = 1, + .in_dim2_edge2 = 1, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 19, + .in_rows_firstdma = 4, + .out_dim1_size = 8, + .out_dim1_pitch = 8, + .out_dim2_size = 2, + .out_dim2_pitch = 16, + .out_dim3_size = 32, + .coeff_dim1_size = 3, + .coeff_dim2_size = 3, + .coeff_dim3_size = 128, + .coeff_dim4_size = 128, + .coeff_dim1_pitch = 3, + .coeff_dim2_pitch = 9, + .coeff_dim3_pitch = 1152, + .bias_dim1_size = 128, + .bias_dim2_size = 1, + .outscale_dim1_size = 128, + .outscale_dim2_size = 1, + .input_buffer_size = 11520, + .coeff_buffer_size = 36864, + .output_buffer_size = 512, + .bias_buffer_size = 512, + .outscale_buffer_size = 256, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 32, + .n_tiles = 4, + .n_tile_size_last = 32, + .height_tiles = 4, + .output_rows = 2, + .input_rows = 5, + .kernel_w = 3, + .kernel_h = 3, + .stride_x = 2, + .stride_y = 2, + .padding = 1, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 17, + .layer_name = "conv_1x1_s1_ic128_oc512", + .kernel_name = "1x1j1d1_no_dma", + .config_key = "128_8_8_512_1_1_8_8_1_1_0_1", + .src_dim1_size = 8, + .src_dim2_size = 8, + .src_dim3_size = 128, + .src_dim1_pitch = 8, + .src_dim2_pitch = 64, + .dst_dim1_size = 8, + .dst_dim2_size = 8, + .dst_dim3_size = 512, + .dst_dim1_pitch = 8, + .dst_dim2_pitch = 64, + .in_dim1_size = 8, + .in_dim1_pitch = 8, + 
.in_dim2_size = 2, + .in_dim2_pitch = 16, + .in_dim1_edge1 = 0, + .in_dim1_edge2 = 0, + .in_dim2_edge1 = 0, + .in_dim2_edge2 = 0, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 2, + .out_dim1_size = 8, + .out_dim1_pitch = 8, + .out_dim2_size = 2, + .out_dim2_pitch = 16, + .out_dim3_size = 256, + .coeff_dim1_size = 1, + .coeff_dim2_size = 1, + .coeff_dim3_size = 128, + .coeff_dim4_size = 512, + .coeff_dim1_pitch = 1, + .coeff_dim2_pitch = 1, + .coeff_dim3_pitch = 128, + .bias_dim1_size = 512, + .bias_dim2_size = 1, + .outscale_dim1_size = 512, + .outscale_dim2_size = 1, + .input_buffer_size = 2048, + .coeff_buffer_size = 32768, + .output_buffer_size = 4096, + .bias_buffer_size = 2048, + .outscale_buffer_size = 1024, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 256, + .n_tiles = 2, + .n_tile_size_last = 256, + .height_tiles = 4, + .output_rows = 2, + .input_rows = 2, + .kernel_w = 1, + .kernel_h = 1, + .stride_x = 1, + .stride_y = 1, + .padding = 0, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 18, + .layer_name = "conv_1x1_s1_ic512_oc128", + .kernel_name = "1x1j1d1_no_dma", + .config_key = "512_8_8_128_1_1_8_8_1_1_0_1", + .src_dim1_size = 8, + .src_dim2_size = 8, + .src_dim3_size = 512, + .src_dim1_pitch = 8, + .src_dim2_pitch = 64, + .dst_dim1_size = 8, + .dst_dim2_size = 8, + .dst_dim3_size = 128, + .dst_dim1_pitch = 8, + .dst_dim2_pitch = 64, + .in_dim1_size = 8, + .in_dim1_pitch = 8, + .in_dim2_size = 2, + .in_dim2_pitch = 16, + .in_dim1_edge1 = 0, + .in_dim1_edge2 = 0, + .in_dim2_edge1 = 0, + .in_dim2_edge2 = 0, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 2, + .out_dim1_size = 8, + .out_dim1_pitch = 8, + .out_dim2_size 
= 2, + .out_dim2_pitch = 16, + .out_dim3_size = 64, + .coeff_dim1_size = 1, + .coeff_dim2_size = 1, + .coeff_dim3_size = 512, + .coeff_dim4_size = 128, + .coeff_dim1_pitch = 1, + .coeff_dim2_pitch = 1, + .coeff_dim3_pitch = 512, + .bias_dim1_size = 128, + .bias_dim2_size = 1, + .outscale_dim1_size = 128, + .outscale_dim2_size = 1, + .input_buffer_size = 8192, + .coeff_buffer_size = 32768, + .output_buffer_size = 1024, + .bias_buffer_size = 512, + .outscale_buffer_size = 256, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 64, + .n_tiles = 2, + .n_tile_size_last = 64, + .height_tiles = 4, + .output_rows = 2, + .input_rows = 2, + .kernel_w = 1, + .kernel_h = 1, + .stride_x = 1, + .stride_y = 1, + .padding = 0, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 19, + .layer_name = "conv_1x1_s2_ic512_oc1024", + .kernel_name = "1x1j2d1_no_dma", + .config_key = "512_8_8_1024_1_1_4_4_2_2_0_1", + .src_dim1_size = 8, + .src_dim2_size = 8, + .src_dim3_size = 512, + .src_dim1_pitch = 8, + .src_dim2_pitch = 64, + .dst_dim1_size = 4, + .dst_dim2_size = 4, + .dst_dim3_size = 1024, + .dst_dim1_pitch = 4, + .dst_dim2_pitch = 16, + .in_dim1_size = 8, + .in_dim1_pitch = 8, + .in_dim2_size = 3, + .in_dim2_pitch = 24, + .in_dim1_edge1 = 0, + .in_dim1_edge2 = 0, + .in_dim2_edge1 = 0, + .in_dim2_edge2 = 0, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 3, + .out_dim1_size = 4, + .out_dim1_pitch = 4, + .out_dim2_size = 2, + .out_dim2_pitch = 8, + .out_dim3_size = 64, + .coeff_dim1_size = 1, + .coeff_dim2_size = 1, + .coeff_dim3_size = 512, + .coeff_dim4_size = 1024, + .coeff_dim1_pitch = 1, + .coeff_dim2_pitch = 1, + .coeff_dim3_pitch = 512, + .bias_dim1_size = 1024, + .bias_dim2_size = 1, + 
.outscale_dim1_size = 1024, + .outscale_dim2_size = 1, + .input_buffer_size = 12288, + .coeff_buffer_size = 32768, + .output_buffer_size = 512, + .bias_buffer_size = 4096, + .outscale_buffer_size = 2048, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 64, + .n_tiles = 16, + .n_tile_size_last = 64, + .height_tiles = 2, + .output_rows = 2, + .input_rows = 3, + .kernel_w = 1, + .kernel_h = 1, + .stride_x = 2, + .stride_y = 2, + .padding = 0, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 20, + .layer_name = "conv_1x1_s1_ic512_oc256", + .kernel_name = "1x1j1d1_no_dma", + .config_key = "512_8_8_256_1_1_8_8_1_1_0_1", + .src_dim1_size = 8, + .src_dim2_size = 8, + .src_dim3_size = 512, + .src_dim1_pitch = 8, + .src_dim2_pitch = 64, + .dst_dim1_size = 8, + .dst_dim2_size = 8, + .dst_dim3_size = 256, + .dst_dim1_pitch = 8, + .dst_dim2_pitch = 64, + .in_dim1_size = 8, + .in_dim1_pitch = 8, + .in_dim2_size = 2, + .in_dim2_pitch = 16, + .in_dim1_edge1 = 0, + .in_dim1_edge2 = 0, + .in_dim2_edge1 = 0, + .in_dim2_edge2 = 0, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 2, + .out_dim1_size = 8, + .out_dim1_pitch = 8, + .out_dim2_size = 2, + .out_dim2_pitch = 16, + .out_dim3_size = 64, + .coeff_dim1_size = 1, + .coeff_dim2_size = 1, + .coeff_dim3_size = 512, + .coeff_dim4_size = 256, + .coeff_dim1_pitch = 1, + .coeff_dim2_pitch = 1, + .coeff_dim3_pitch = 512, + .bias_dim1_size = 256, + .bias_dim2_size = 1, + .outscale_dim1_size = 256, + .outscale_dim2_size = 1, + .input_buffer_size = 8192, + .coeff_buffer_size = 32768, + .output_buffer_size = 1024, + .bias_buffer_size = 1024, + .outscale_buffer_size = 512, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + 
.output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 64, + .n_tiles = 4, + .n_tile_size_last = 64, + .height_tiles = 4, + .output_rows = 2, + .input_rows = 2, + .kernel_w = 1, + .kernel_h = 1, + .stride_x = 1, + .stride_y = 1, + .padding = 0, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 21, + .layer_name = "conv_3x3_s2_ic256_oc256", + .kernel_name = "3x3j2d1_no_dma", + .config_key = "256_8_8_256_3_3_4_4_2_2_1_1", + .src_dim1_size = 8, + .src_dim2_size = 8, + .src_dim3_size = 256, + .src_dim1_pitch = 8, + .src_dim2_pitch = 64, + .dst_dim1_size = 4, + .dst_dim2_size = 4, + .dst_dim3_size = 256, + .dst_dim1_pitch = 4, + .dst_dim2_pitch = 16, + .in_dim1_size = 8, + .in_dim1_pitch = 10, + .in_dim2_size = 5, + .in_dim2_pitch = 50, + .in_dim1_edge1 = 1, + .in_dim1_edge2 = 1, + .in_dim2_edge1 = 1, + .in_dim2_edge2 = 1, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 11, + .in_rows_firstdma = 4, + .out_dim1_size = 4, + .out_dim1_pitch = 4, + .out_dim2_size = 2, + .out_dim2_pitch = 8, + .out_dim3_size = 16, + .coeff_dim1_size = 3, + .coeff_dim2_size = 3, + .coeff_dim3_size = 256, + .coeff_dim4_size = 256, + .coeff_dim1_pitch = 3, + .coeff_dim2_pitch = 9, + .coeff_dim3_pitch = 2304, + .bias_dim1_size = 256, + .bias_dim2_size = 1, + .outscale_dim1_size = 256, + .outscale_dim2_size = 1, + .input_buffer_size = 12800, + .coeff_buffer_size = 36864, + .output_buffer_size = 128, + .bias_buffer_size = 1024, + .outscale_buffer_size = 512, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 16, + .n_tiles = 16, + .n_tile_size_last = 16, + .height_tiles = 2, + .output_rows = 2, + .input_rows = 5, + .kernel_w = 3, + .kernel_h = 3, + .stride_x = 2, + .stride_y = 2, + 
.padding = 1, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 22, + .layer_name = "conv_1x1_s1_ic256_oc1024", + .kernel_name = "1x1j1d1_no_dma", + .config_key = "256_4_4_1024_1_1_4_4_1_1_0_1", + .src_dim1_size = 4, + .src_dim2_size = 4, + .src_dim3_size = 256, + .src_dim1_pitch = 4, + .src_dim2_pitch = 16, + .dst_dim1_size = 4, + .dst_dim2_size = 4, + .dst_dim3_size = 1024, + .dst_dim1_pitch = 4, + .dst_dim2_pitch = 16, + .in_dim1_size = 4, + .in_dim1_pitch = 4, + .in_dim2_size = 2, + .in_dim2_pitch = 8, + .in_dim1_edge1 = 0, + .in_dim1_edge2 = 0, + .in_dim2_edge1 = 0, + .in_dim2_edge2 = 0, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 2, + .out_dim1_size = 4, + .out_dim1_pitch = 4, + .out_dim2_size = 2, + .out_dim2_pitch = 8, + .out_dim3_size = 128, + .coeff_dim1_size = 1, + .coeff_dim2_size = 1, + .coeff_dim3_size = 256, + .coeff_dim4_size = 1024, + .coeff_dim1_pitch = 1, + .coeff_dim2_pitch = 1, + .coeff_dim3_pitch = 256, + .bias_dim1_size = 1024, + .bias_dim2_size = 1, + .outscale_dim1_size = 1024, + .outscale_dim2_size = 1, + .input_buffer_size = 2048, + .coeff_buffer_size = 32768, + .output_buffer_size = 1024, + .bias_buffer_size = 4096, + .outscale_buffer_size = 2048, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 128, + .n_tiles = 8, + .n_tile_size_last = 128, + .height_tiles = 2, + .output_rows = 2, + .input_rows = 2, + .kernel_w = 1, + .kernel_h = 1, + .stride_x = 1, + .stride_y = 1, + .padding = 0, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 23, + .layer_name = "conv_1x1_s1_ic1024_oc256", + .kernel_name = "1x1j1d1_no_dma", + 
.config_key = "1024_4_4_256_1_1_4_4_1_1_0_1", + .src_dim1_size = 4, + .src_dim2_size = 4, + .src_dim3_size = 1024, + .src_dim1_pitch = 4, + .src_dim2_pitch = 16, + .dst_dim1_size = 4, + .dst_dim2_size = 4, + .dst_dim3_size = 256, + .dst_dim1_pitch = 4, + .dst_dim2_pitch = 16, + .in_dim1_size = 4, + .in_dim1_pitch = 4, + .in_dim2_size = 2, + .in_dim2_pitch = 8, + .in_dim1_edge1 = 0, + .in_dim1_edge2 = 0, + .in_dim2_edge1 = 0, + .in_dim2_edge2 = 0, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 2, + .out_dim1_size = 4, + .out_dim1_pitch = 4, + .out_dim2_size = 2, + .out_dim2_pitch = 8, + .out_dim3_size = 32, + .coeff_dim1_size = 1, + .coeff_dim2_size = 1, + .coeff_dim3_size = 1024, + .coeff_dim4_size = 256, + .coeff_dim1_pitch = 1, + .coeff_dim2_pitch = 1, + .coeff_dim3_pitch = 1024, + .bias_dim1_size = 256, + .bias_dim2_size = 1, + .outscale_dim1_size = 256, + .outscale_dim2_size = 1, + .input_buffer_size = 8192, + .coeff_buffer_size = 32768, + .output_buffer_size = 256, + .bias_buffer_size = 1024, + .outscale_buffer_size = 512, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 32, + .n_tiles = 8, + .n_tile_size_last = 32, + .height_tiles = 2, + .output_rows = 2, + .input_rows = 2, + .kernel_w = 1, + .kernel_h = 1, + .stride_x = 1, + .stride_y = 1, + .padding = 0, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 24, + .layer_name = "conv_1x1_s2_ic1024_oc2048", + .kernel_name = "1x1j2d1_no_dma", + .config_key = "1024_4_4_2048_1_1_2_2_2_2_0_1", + .src_dim1_size = 4, + .src_dim2_size = 4, + .src_dim3_size = 1024, + .src_dim1_pitch = 4, + .src_dim2_pitch = 16, + .dst_dim1_size = 2, + .dst_dim2_size = 2, + .dst_dim3_size = 2048, + .dst_dim1_pitch = 2, + .dst_dim2_pitch = 4, + 
.in_dim1_size = 4, + .in_dim1_pitch = 4, + .in_dim2_size = 3, + .in_dim2_pitch = 12, + .in_dim1_edge1 = 0, + .in_dim1_edge2 = 0, + .in_dim2_edge1 = 0, + .in_dim2_edge2 = 0, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 3, + .out_dim1_size = 2, + .out_dim1_pitch = 2, + .out_dim2_size = 2, + .out_dim2_pitch = 4, + .out_dim3_size = 32, + .coeff_dim1_size = 1, + .coeff_dim2_size = 1, + .coeff_dim3_size = 1024, + .coeff_dim4_size = 2048, + .coeff_dim1_pitch = 1, + .coeff_dim2_pitch = 1, + .coeff_dim3_pitch = 1024, + .bias_dim1_size = 2048, + .bias_dim2_size = 1, + .outscale_dim1_size = 2048, + .outscale_dim2_size = 1, + .input_buffer_size = 12288, + .coeff_buffer_size = 32768, + .output_buffer_size = 128, + .bias_buffer_size = 8192, + .outscale_buffer_size = 4096, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 32, + .n_tiles = 64, + .n_tile_size_last = 32, + .height_tiles = 1, + .output_rows = 2, + .input_rows = 3, + .kernel_w = 1, + .kernel_h = 1, + .stride_x = 2, + .stride_y = 2, + .padding = 0, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 25, + .layer_name = "conv_1x1_s1_ic1024_oc512", + .kernel_name = "1x1j1d1_no_dma", + .config_key = "1024_4_4_512_1_1_4_4_1_1_0_1", + .src_dim1_size = 4, + .src_dim2_size = 4, + .src_dim3_size = 1024, + .src_dim1_pitch = 4, + .src_dim2_pitch = 16, + .dst_dim1_size = 4, + .dst_dim2_size = 4, + .dst_dim3_size = 512, + .dst_dim1_pitch = 4, + .dst_dim2_pitch = 16, + .in_dim1_size = 4, + .in_dim1_pitch = 4, + .in_dim2_size = 2, + .in_dim2_pitch = 8, + .in_dim1_edge1 = 0, + .in_dim1_edge2 = 0, + .in_dim2_edge1 = 0, + .in_dim2_edge2 = 0, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 2, + 
.out_dim1_size = 4, + .out_dim1_pitch = 4, + .out_dim2_size = 2, + .out_dim2_pitch = 8, + .out_dim3_size = 32, + .coeff_dim1_size = 1, + .coeff_dim2_size = 1, + .coeff_dim3_size = 1024, + .coeff_dim4_size = 512, + .coeff_dim1_pitch = 1, + .coeff_dim2_pitch = 1, + .coeff_dim3_pitch = 1024, + .bias_dim1_size = 512, + .bias_dim2_size = 1, + .outscale_dim1_size = 512, + .outscale_dim2_size = 1, + .input_buffer_size = 8192, + .coeff_buffer_size = 32768, + .output_buffer_size = 256, + .bias_buffer_size = 2048, + .outscale_buffer_size = 1024, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 32, + .n_tiles = 16, + .n_tile_size_last = 32, + .height_tiles = 2, + .output_rows = 2, + .input_rows = 2, + .kernel_w = 1, + .kernel_h = 1, + .stride_x = 1, + .stride_y = 1, + .padding = 0, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 26, + .layer_name = "conv_3x3_s2_ic512_oc512", + .kernel_name = "3x3j2d1_no_dma", + .config_key = "512_4_4_512_3_3_2_2_2_2_1_1", + .src_dim1_size = 4, + .src_dim2_size = 4, + .src_dim3_size = 512, + .src_dim1_pitch = 4, + .src_dim2_pitch = 16, + .dst_dim1_size = 2, + .dst_dim2_size = 2, + .dst_dim3_size = 512, + .dst_dim1_pitch = 2, + .dst_dim2_pitch = 4, + .in_dim1_size = 4, + .in_dim1_pitch = 6, + .in_dim2_size = 5, + .in_dim2_pitch = 30, + .in_dim1_edge1 = 1, + .in_dim1_edge2 = 1, + .in_dim2_edge1 = 1, + .in_dim2_edge2 = 1, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 7, + .in_rows_firstdma = 4, + .out_dim1_size = 2, + .out_dim1_pitch = 2, + .out_dim2_size = 2, + .out_dim2_pitch = 4, + .out_dim3_size = 8, + .coeff_dim1_size = 3, + .coeff_dim2_size = 3, + .coeff_dim3_size = 512, + .coeff_dim4_size = 512, + .coeff_dim1_pitch = 3, + .coeff_dim2_pitch = 9, + .coeff_dim3_pitch = 
4608, + .bias_dim1_size = 512, + .bias_dim2_size = 1, + .outscale_dim1_size = 512, + .outscale_dim2_size = 1, + .input_buffer_size = 15360, + .coeff_buffer_size = 36864, + .output_buffer_size = 32, + .bias_buffer_size = 2048, + .outscale_buffer_size = 1024, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 1, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 8, + .n_tiles = 64, + .n_tile_size_last = 8, + .height_tiles = 1, + .output_rows = 2, + .input_rows = 5, + .kernel_w = 3, + .kernel_h = 3, + .stride_x = 2, + .stride_y = 2, + .padding = 1, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 27, + .layer_name = "conv_1x1_s1_ic512_oc2048", + .kernel_name = "1x1j1d1_no_dma", + .config_key = "512_2_2_2048_1_1_2_2_1_1_0_1", + .src_dim1_size = 2, + .src_dim2_size = 2, + .src_dim3_size = 512, + .src_dim1_pitch = 2, + .src_dim2_pitch = 4, + .dst_dim1_size = 2, + .dst_dim2_size = 2, + .dst_dim3_size = 2048, + .dst_dim1_pitch = 2, + .dst_dim2_pitch = 4, + .in_dim1_size = 2, + .in_dim1_pitch = 2, + .in_dim2_size = 2, + .in_dim2_pitch = 4, + .in_dim1_edge1 = 0, + .in_dim1_edge2 = 0, + .in_dim2_edge1 = 0, + .in_dim2_edge2 = 0, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 2, + .out_dim1_size = 2, + .out_dim1_pitch = 2, + .out_dim2_size = 2, + .out_dim2_pitch = 4, + .out_dim3_size = 64, + .coeff_dim1_size = 1, + .coeff_dim2_size = 1, + .coeff_dim3_size = 512, + .coeff_dim4_size = 2048, + .coeff_dim1_pitch = 1, + .coeff_dim2_pitch = 1, + .coeff_dim3_pitch = 512, + .bias_dim1_size = 2048, + .bias_dim2_size = 1, + .outscale_dim1_size = 2048, + .outscale_dim2_size = 1, + .input_buffer_size = 2048, + .coeff_buffer_size = 32768, + .output_buffer_size = 256, + .bias_buffer_size = 8192, + .outscale_buffer_size = 4096, + .input_ping_dram = 0, + 
.input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 64, + .n_tiles = 32, + .n_tile_size_last = 64, + .height_tiles = 1, + .output_rows = 2, + .input_rows = 2, + .kernel_w = 1, + .kernel_h = 1, + .stride_x = 1, + .stride_y = 1, + .padding = 0, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 28, + .layer_name = "conv_1x1_s1_ic2048_oc512", + .kernel_name = "1x1j1d1_no_dma", + .config_key = "2048_2_2_512_1_1_2_2_1_1_0_1", + .src_dim1_size = 2, + .src_dim2_size = 2, + .src_dim3_size = 2048, + .src_dim1_pitch = 2, + .src_dim2_pitch = 4, + .dst_dim1_size = 2, + .dst_dim2_size = 2, + .dst_dim3_size = 512, + .dst_dim1_pitch = 2, + .dst_dim2_pitch = 4, + .in_dim1_size = 2, + .in_dim1_pitch = 2, + .in_dim2_size = 2, + .in_dim2_pitch = 4, + .in_dim1_edge1 = 0, + .in_dim1_edge2 = 0, + .in_dim2_edge1 = 0, + .in_dim2_edge2 = 0, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 2, + .out_dim1_size = 2, + .out_dim1_pitch = 2, + .out_dim2_size = 2, + .out_dim2_pitch = 4, + .out_dim3_size = 16, + .coeff_dim1_size = 1, + .coeff_dim2_size = 1, + .coeff_dim3_size = 2048, + .coeff_dim4_size = 512, + .coeff_dim1_pitch = 1, + .coeff_dim2_pitch = 1, + .coeff_dim3_pitch = 2048, + .bias_dim1_size = 512, + .bias_dim2_size = 1, + .outscale_dim1_size = 512, + .outscale_dim2_size = 1, + .input_buffer_size = 8192, + .coeff_buffer_size = 32768, + .output_buffer_size = 64, + .bias_buffer_size = 2048, + .outscale_buffer_size = 1024, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 16, + .n_tiles = 32, + .n_tile_size_last = 16, + .height_tiles = 1, + .output_rows = 2, + .input_rows = 2, + .kernel_w = 1, + .kernel_h = 
1, + .stride_x = 1, + .stride_y = 1, + .padding = 0, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, +}; + /* Returns NUM_CONV_LAYERS, the same bound the conv lookup helpers in this header iterate over. */ +static inline int get_num_conv_layers(void) { return NUM_CONV_LAYERS; } + /* Positional lookup: returns &CONV_LAYER_CONFIGS[layer_id], or NULL when layer_id is outside [0, NUM_CONV_LAYERS). NOTE(review): the argument is used as an array index, so this assumes every entry's .layer_id equals its position in the table - confirm against the generator. */ +static inline const conv_layer_config_t* get_conv_config(int layer_id) { + if (layer_id < 0 || layer_id >= NUM_CONV_LAYERS) return NULL; + return &CONV_LAYER_CONFIGS[layer_id]; +} + /* Linear search of CONV_LAYER_CONFIGS for the first entry matching all twelve arguments. ic, ih, iw are compared with src_dim3_size, src_dim2_size, src_dim1_size; oc, oh, ow with dst_dim3_size, dst_dim2_size, dst_dim1_size; kh, kw with coeff_dim2_size, coeff_dim1_size; sy, sx with stride_y, stride_x; pad with padding; dil with dilation. Returns NULL when no entry matches. */ +static inline const conv_layer_config_t* get_layer_config_by_params( + int ic, int ih, int iw, + int oc, int kh, int kw, + int oh, int ow, + int sy, int sx, + int pad, int dil) +{ + for (int i = 0; i < NUM_CONV_LAYERS; i++) { + const conv_layer_config_t* cfg = &CONV_LAYER_CONFIGS[i]; + if (cfg->src_dim3_size == ic && + cfg->src_dim2_size == ih && + cfg->src_dim1_size == iw && + cfg->dst_dim3_size == oc && + cfg->coeff_dim2_size == kh && + cfg->coeff_dim1_size == kw && + cfg->dst_dim2_size == oh && + cfg->dst_dim1_size == ow && + cfg->stride_y == sy && + cfg->stride_x == sx && + cfg->padding == pad && + cfg->dilation == dil) + return cfg; + } + return NULL; +} + /* Linear search of CONV_LAYER_CONFIGS for the first entry whose config_key string is exactly equal to the argument. Returns NULL for a NULL argument or when nothing matches; entries whose config_key is NULL are skipped. The comparison is written out by hand instead of calling a library string routine. */ +static inline const conv_layer_config_t* get_layer_config_by_key(const char* config_key) { + if (config_key == NULL) return NULL; + for (int i = 0; i < NUM_CONV_LAYERS; i++) { + const conv_layer_config_t* cfg = &CONV_LAYER_CONFIGS[i]; + if (cfg->config_key != NULL) { + const char* a = config_key; + const char* b = cfg->config_key; + /* walk both strings over their common prefix */ while (*a && *b && *a == *b) { a++; b++; } + /* equal only when both strings ended at the same position */ if (*a == '\0' && *b == '\0') return cfg; + } + } + return NULL; +} + +/* ====================================================================== */ +/* MaxPool configurations */ +/* ====================================================================== */ + /* Configuration record for one MaxPool layer: an id, two descriptive strings, then integer geometry fields. The struct is closed further on as maxpool_layer_config_t. */ +typedef struct { + int layer_id; + const char* layer_name; + const char* config_key; + + int src_width; int src_height; int channels; + int dst_width; int dst_height; + + int src_row_pitch; int src_plane_pitch; 
+ int dst_row_pitch; int dst_plane_pitch; + + int kernel_h; int kernel_w; + int stride_h; int stride_w; + int pad_h; int pad_w; + + int in_tile_w; int in_tile_rows; int in_tile_plane; + int in_data_offset; + int out_tile_w; int out_tile_rows; int out_tile_plane; + + int c_tile_size; int c_tiles; int c_tile_size_last; + int height_tiles; int output_rows; int input_rows; + + int input_buffer_size; int output_buffer_size; + + int input_ping_dram; int input_pong_dram; + int output_ping_dram; int output_pong_dram; +} maxpool_layer_config_t; + /* Entry count used as the bound by the MaxPool lookup helpers; it matches the single initializer in MAXPOOL_LAYER_CONFIGS. */ +#define NUM_MAXPOOL_LAYERS 1 + /* MaxPool configuration table. It currently holds one entry (layer_id 0, config_key "64_32_32_3_3_2_2_1_1"). */ +static const maxpool_layer_config_t MAXPOOL_LAYER_CONFIGS[] = { + { + .layer_id = 0, + .layer_name = "maxpool_3x3s2_c64_32x32", + .config_key = "64_32_32_3_3_2_2_1_1", + .src_width = 32, + .src_height = 32, + .channels = 64, + .dst_width = 16, + .dst_height = 16, + .src_row_pitch = 32, + .src_plane_pitch = 1024, + .dst_row_pitch = 16, + .dst_plane_pitch = 256, + .kernel_h = 3, + .kernel_w = 3, + .stride_h = 2, + .stride_w = 2, + .pad_h = 1, + .pad_w = 1, + .in_tile_w = 34, + .in_tile_rows = 5, + .in_tile_plane = 170, + .in_data_offset = 35, + .out_tile_w = 16, + .out_tile_rows = 1, + .out_tile_plane = 16, + .c_tile_size = 64, + .c_tiles = 1, + .c_tile_size_last = 64, + .height_tiles = 16, + .output_rows = 1, + .input_rows = 3, + .input_buffer_size = 43520, + .output_buffer_size = 4096, + .input_ping_dram = 0, + .input_pong_dram = 1, + .output_ping_dram = 1, + .output_pong_dram = 0, + }, +}; + /* Returns NUM_MAXPOOL_LAYERS. */ +static inline int get_num_maxpool_layers(void) { return NUM_MAXPOOL_LAYERS; } + /* Positional lookup: returns &MAXPOOL_LAYER_CONFIGS[layer_id], or NULL when layer_id is outside [0, NUM_MAXPOOL_LAYERS). */ +static inline const maxpool_layer_config_t* get_maxpool_config(int layer_id) { + if (layer_id < 0 || layer_id >= NUM_MAXPOOL_LAYERS) return NULL; + return &MAXPOOL_LAYER_CONFIGS[layer_id]; +} + /* Linear search of MAXPOOL_LAYER_CONFIGS for the first entry whose channels, src_height, src_width, kernel_h, kernel_w, stride_h, stride_w, pad_h and pad_w all equal the arguments. The dst_width and dst_height fields are not part of the match. Returns NULL when no entry matches. */ +static inline const maxpool_layer_config_t* get_maxpool_config_by_params( + int channels, int src_height, int src_width, + int kernel_h, int kernel_w, + int stride_h, int stride_w, + int pad_h, int pad_w) +{ + for (int i = 0; i < 
NUM_MAXPOOL_LAYERS; i++) { + const maxpool_layer_config_t* c = &MAXPOOL_LAYER_CONFIGS[i]; + if (c->channels == channels && + c->src_height == src_height && + c->src_width == src_width && + c->kernel_h == kernel_h && + c->kernel_w == kernel_w && + c->stride_h == stride_h && + c->stride_w == stride_w && + c->pad_h == pad_h && + c->pad_w == pad_w) + return c; + } + return NULL; +} + +#endif /* LAYER_CONFIGS_H */ diff --git a/backends/cadence/vision/kernels/CMakeLists.txt b/backends/cadence/vision/kernels/CMakeLists.txt index fa7b2b5203b..dc8d73b5d5b 100644 --- a/backends/cadence/vision/kernels/CMakeLists.txt +++ b/backends/cadence/vision/kernels/CMakeLists.txt @@ -8,6 +8,7 @@ add_library( cadence_kernels kernels.cpp + ${EXECUTORCH_ROOT}/backends/cadence/generic/kernels/kernels.cpp ${EXECUTORCH_ROOT}/backends/cadence/vision/third-party/library/api/tensor_transposef.c ${EXECUTORCH_ROOT}/backends/cadence/vision/third-party/library/api/vsoftmaxf.c ${EXECUTORCH_ROOT}/backends/cadence/vision/third-party/library/tables/expf_tbl.c @@ -22,7 +23,8 @@ set(_common_include_directories target_include_directories( cadence_kernels - PUBLIC . ${EXECUTORCH_ROOT}/backends/cadence/vision/third-party/include + PUBLIC . 
${EXECUTORCH_ROOT}/backends/cadence/generic/kernels + ${EXECUTORCH_ROOT}/backends/cadence/vision/third-party/include ${EXECUTORCH_ROOT}/backends/cadence/vision/third-party/include_private ${_common_include_directories} ) diff --git a/backends/cadence/vision/operators/CMakeLists.txt b/backends/cadence/vision/operators/CMakeLists.txt index 38e4f97f841..7e458a56e31 100644 --- a/backends/cadence/vision/operators/CMakeLists.txt +++ b/backends/cadence/vision/operators/CMakeLists.txt @@ -25,6 +25,12 @@ set(_aten_ops__srcs "${CMAKE_CURRENT_SOURCE_DIR}/op_full.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/op_view_copy.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/op_softmax.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/op_max_pool2d_with_indices.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/maxpool/maxpool_exec_mxnj2.c" + "${CMAKE_CURRENT_SOURCE_DIR}/mean/mean_exec_dma.c" + "${CMAKE_CURRENT_SOURCE_DIR}/op_add.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/op_mean.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/op_quantized_relu_out.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/activation_ops_util.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/copy_ops_util.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/broadcast_util.cpp" @@ -41,7 +47,6 @@ set(_aten_ops__srcs "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_clone.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_div.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_hardtanh.cpp" - "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_max_pool2d_with_indices.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_mean.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_mul.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_permute_copy.cpp" @@ -69,41 +74,55 @@ set(_aten_ops__srcs ) add_library(aten_ops_cadence ${_aten_ops__srcs}) target_link_libraries(aten_ops_cadence PUBLIC executorch) -target_link_libraries(aten_ops_cadence PRIVATE cadence_kernels) +target_link_libraries(aten_ops_cadence PRIVATE xa_nnlib cadence_kernels idma) # Let files say "include ". 
-set(_common_include_directories - ${EXECUTORCH_ROOT}/.. ${EXECUTORCH_ROOT}/runtime/core/portable_type/c10 -) +set(_common_include_directories ${EXECUTORCH_ROOT}/.. +${EXECUTORCH_ROOT}/runtime/core/portable_type/c10) target_include_directories( - aten_ops_cadence - PUBLIC ${ROOT_DIR}/.. ${CMAKE_BINARY_DIR} ${_common_include_directories} - ${CMAKE_CURRENT_SOURCE_DIR}/../third-party + aten_ops_cadence + PUBLIC ${ROOT_DIR}/.. + ${CMAKE_BINARY_DIR} + ${_common_include_directories} + PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../third-party/include ) # Custom ops that are needed to run the test model. add_library( custom_ops - "op_quantized_linear_out.cpp" - "op_quantized_conv_out.cpp" - "op_quantized_relu_out.cpp" - "op_quantized_layer_norm.cpp" - "op_quantize_per_tensor.cpp" - "op_quantized_fully_connected_out.cpp" - "op_dequantize_per_tensor.cpp" - "op_quantized_matmul_out.cpp" - "op_requantize_out.cpp" - "op_im2row_out.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/op_quantized_conv_out.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/conv/conv_kernel_dispatcher.c" + "${CMAKE_CURRENT_SOURCE_DIR}/conv/conv_exec_7x7j2d1.c" + "${CMAKE_CURRENT_SOURCE_DIR}/conv/conv_exec_1x1j1d1.c" + "${CMAKE_CURRENT_SOURCE_DIR}/conv/conv_exec_1x1j2d1.c" + "${CMAKE_CURRENT_SOURCE_DIR}/conv/conv_exec_3x3j1d1.c" + "${CMAKE_CURRENT_SOURCE_DIR}/conv/conv_exec_3x3j2d1.c" + "${CMAKE_CURRENT_SOURCE_DIR}/op_quantized_relu_out.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/op_quantized_layer_norm.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/op_quantize_per_tensor.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/op_quantized_linear_out.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/op_quantized_fully_connected_out.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/op_dequantize_per_tensor.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/op_quantized_matmul_out.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/op_requantize_out.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/op_im2row_out.cpp" ) +target_link_libraries(custom_ops PUBLIC executorch) +target_link_libraries(custom_ops PRIVATE xa_nnlib cadence_kernels idma xai) + 
target_include_directories( - custom_ops PUBLIC ${ROOT_DIR}/.. ${CMAKE_BINARY_DIR} - ${_common_include_directories} + custom_ops + PUBLIC + ${ROOT_DIR}/.. + ${CMAKE_BINARY_DIR} + ${_common_include_directories} + PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR}/../third-party/include + ${CMAKE_CURRENT_SOURCE_DIR}/../third-party/libxai/include + ${CMAKE_CURRENT_SOURCE_DIR}/../third-party/libxai_common/include ) -target_link_libraries(custom_ops PUBLIC executorch) -target_link_libraries(custom_ops PRIVATE cadence_kernels) - # Generate C++ bindings to register kernels into both PyTorch (for AOT) and # Executorch (for runtime). Here select all ops in functions_vision.yaml gen_selected_ops( @@ -119,6 +138,3 @@ message("Generated cadence x86 files ${gen_command_sources}") gen_operators_lib( LIB_NAME "cadence_ops_lib" KERNEL_LIBS custom_ops DEPS aten_ops_cadence ) - -# Link custom_ops to the generated library to ensure the symbols are available -target_link_libraries(cadence_ops_lib PUBLIC custom_ops) diff --git a/backends/cadence/vision/operators/TARGETS b/backends/cadence/vision/operators/TARGETS new file mode 100644 index 00000000000..67f2bab681a --- /dev/null +++ b/backends/cadence/vision/operators/TARGETS @@ -0,0 +1,5 @@ +load("targets.bzl", "define_common_targets") + +oncall("odai_jarvis") + +define_common_targets() diff --git a/backends/cadence/vision/operators/conv/conv_exec_1x1j1d1.c b/backends/cadence/vision/operators/conv/conv_exec_1x1j1d1.c new file mode 100644 index 00000000000..ad92342bd48 --- /dev/null +++ b/backends/cadence/vision/operators/conv/conv_exec_1x1j1d1.c @@ -0,0 +1,1023 @@ +#include "kernel_executors.h" +#include "memory_manager.h" +#include "dma.h" +#include "utils.h" +#include +#include +#include + +// VQ (per-channel output scaling) DMA version +XAI_ERR_TYPE conv_exec_1x1j1d1VQ( + int8_t* src, + int8_t* dst, + int8_t* coeff_ptr, + int8_t* bias_ptr, + int8_t* outScale_ptr, + const conv_layer_config_t* config) +{ + // 
======================================================================== + // SECTION 1: DRAM Buffer Allocation + // ======================================================================== + int dram0_used = 0; + int dram1_used = 0; + + int8_t* p_input0 = allocate_dram_buffer(config->input_buffer_size, + config->input_ping_dram, + &dram0_used, &dram1_used); + int8_t* p_input1 = allocate_dram_buffer(config->input_buffer_size, + config->input_pong_dram, + &dram0_used, &dram1_used); + int8_t* p_coeff = allocate_dram_buffer(config->coeff_buffer_size, + config->coeff_dram, + &dram0_used, &dram1_used); + int8_t* p_output0 = allocate_dram_buffer(config->output_buffer_size, + config->output_ping_dram, + &dram0_used, &dram1_used); + int8_t* p_output1 = allocate_dram_buffer(config->output_buffer_size, + config->output_pong_dram, + &dram0_used, &dram1_used); + int8_t* p_bias = allocate_dram_buffer(config->bias_buffer_size, + config->bias_dram, + &dram0_used, &dram1_used); + int8_t* p_outscale = allocate_dram_buffer(config->outscale_buffer_size, + config->outscale_dram, + &dram0_used, &dram1_used); + + if (!p_input0 || !p_input1 || !p_coeff || + !p_output0 || !p_output1 || !p_bias || !p_outscale) { + return (-1); + } + + // ======================================================================== + // SECTION 2: Initialize XAI Tile Descriptors + // ======================================================================== + xai_tile3D tile_input; + xai_size3D frame_size_input; + xai_tile4D tile_coeff; + xai_array tile_bias; + xai_array tile_outscale; + xai_tile3D tile_output; + xai_cnn_conv_params params; + memset(&params, 0, sizeof(params)); + + // Transfer constant data (all buffers are 64-byte aligned by test harness) + dma_1dm(0, coeff_ptr, p_coeff, config->coeff_buffer_size); + dma_1dm(0, bias_ptr, p_bias, config->bias_buffer_size); + dma_1dm(0, outScale_ptr, p_outscale, config->outscale_buffer_size); + + // Initialize input buffer and load first tile + 
_proto_FillBuffer_I8(p_input0, config->input_zero_point, config->input_buffer_size); + + // First input transfer: load IN_ROWS_FIRSTDMA rows at offset 0 + dma_3dm(1, + (void*)src, + (void*)&p_input0[0], // No offset for 1x1 + config->src_dim1_pitch, + config->in_dim1_pitch, + config->src_dim2_pitch, + config->in_dim2_pitch, + config->src_dim1_size, + config->in_rows_firstdma, + config->src_dim3_size); + + // Wait for all initial DMA transfers to complete + idma_hw_wait_all(0); // coeff + bias + outscale on ch0 + idma_hw_wait_all(1); // input on ch1 + + // ======================================================================== + // Configure Input Tile Descriptor + // ======================================================================== + XAI_TILE3D_SET_BUFF_PTR(&tile_input, p_input0); + XAI_TILE3D_SET_BUFF_SIZE(&tile_input, config->input_buffer_size); + XAI_TILE3D_SET_DATA_PTR(&tile_input, &p_input0[0]); // No offset for 1x1 + XAI_TILE3D_SET_DATA_ORDER(&tile_input, XAI_WHD); + XAI_TILE3D_SET_TYPE(&tile_input, XAI_TILE3D_S8); + XAI_TILE3D_SET_FRAME_PTR(&tile_input, 0); + XAI_TILE3D_SET_STATUS_FLAGS(&tile_input, 0); + XAI_TILE3D_SET_DIM1_PITCH(&tile_input, config->in_dim1_pitch); + XAI_TILE3D_SET_DIM2_PITCH(&tile_input, config->in_dim2_pitch); + XAI_TILE3D_SET_DIM1_COORD(&tile_input, 0); + XAI_TILE3D_SET_DIM1(&tile_input, config->src_dim1_size); + XAI_TILE3D_SET_DIM1_EDGE1(&tile_input, 0); + XAI_TILE3D_SET_DIM1_EDGE2(&tile_input, 0); + // DIM2 = IN_ROWS_FIRSTDMA (no edge to subtract for 1x1) + XAI_TILE3D_SET_DIM2(&tile_input, config->in_rows_firstdma); + XAI_TILE3D_SET_DIM2_EDGE1(&tile_input, 0); + XAI_TILE3D_SET_DIM2_EDGE2(&tile_input, 0); + XAI_TILE3D_SET_DIM3_COORD(&tile_input, 0); + XAI_TILE3D_SET_DIM3(&tile_input, config->src_dim3_size); + XAI_TILE3D_SET_DIM3_EDGE1(&tile_input, 0); + XAI_TILE3D_SET_DIM3_EDGE2(&tile_input, 0); + + // Frame size for edge extension (not actually needed for 1x1, but harmless) + frame_size_input.dim1Size = config->src_dim1_size; 
+ frame_size_input.dim2Size = config->src_dim2_size; + frame_size_input.dim3Size = config->src_dim3_size; + + // ======================================================================== + // Configure Coefficient Tile Descriptor (4D: W×H×C×N) + // ======================================================================== + XAI_TILE4D_SET_BUFF_PTR(&tile_coeff, p_coeff); + XAI_TILE4D_SET_BUFF_SIZE(&tile_coeff, config->coeff_buffer_size); + XAI_TILE4D_SET_DATA_PTR(&tile_coeff, p_coeff); + XAI_TILE4D_SET_DATA_ORDER(&tile_coeff, XAI_WHDN); + XAI_TILE4D_SET_TYPE(&tile_coeff, XAI_TILE4D_S8); + XAI_TILE4D_SET_FRAME_PTR(&tile_coeff, 0); + XAI_TILE4D_SET_STATUS_FLAGS(&tile_coeff, 0); + XAI_TILE4D_SET_DIM1_PITCH(&tile_coeff, config->coeff_dim1_pitch); + XAI_TILE4D_SET_DIM2_PITCH(&tile_coeff, config->coeff_dim2_pitch); + XAI_TILE4D_SET_DIM3_PITCH(&tile_coeff, config->coeff_dim3_pitch); + XAI_TILE4D_SET_DIM1_COORD(&tile_coeff, 0); + XAI_TILE4D_SET_DIM1(&tile_coeff, config->coeff_dim1_size); + XAI_TILE4D_SET_DIM1_EDGE1(&tile_coeff, 0); + XAI_TILE4D_SET_DIM1_EDGE2(&tile_coeff, 0); + XAI_TILE4D_SET_DIM2_COORD(&tile_coeff, 0); + XAI_TILE4D_SET_DIM2(&tile_coeff, config->coeff_dim2_size); + XAI_TILE4D_SET_DIM2_EDGE1(&tile_coeff, 0); + XAI_TILE4D_SET_DIM2_EDGE2(&tile_coeff, 0); + XAI_TILE4D_SET_DIM3_COORD(&tile_coeff, 0); + XAI_TILE4D_SET_DIM3(&tile_coeff, config->coeff_dim3_size); + XAI_TILE4D_SET_DIM3_EDGE1(&tile_coeff, 0); + XAI_TILE4D_SET_DIM3_EDGE2(&tile_coeff, 0); + XAI_TILE4D_SET_DIM4_COORD(&tile_coeff, 0); + XAI_TILE4D_SET_DIM4(&tile_coeff, config->n_tile_size); + + // ======================================================================== + // Configure Bias Array + // ======================================================================== + XAI_ARRAY_SET_BUFF_PTR(&tile_bias, p_bias); + XAI_ARRAY_SET_BUFF_SIZE(&tile_bias, config->bias_buffer_size); + XAI_ARRAY_SET_DATA_PTR(&tile_bias, p_bias); + XAI_ARRAY_SET_WIDTH(&tile_bias, config->n_tile_size); + 
XAI_ARRAY_SET_HEIGHT(&tile_bias, 1); + XAI_ARRAY_SET_TYPE(&tile_bias, XAI_ARRAY_S32); + XAI_ARRAY_SET_CAPACITY(&tile_bias, config->n_tile_size); + + // ======================================================================== + // Configure Output Scale Array + // ======================================================================== + XAI_ARRAY_SET_BUFF_PTR(&tile_outscale, p_outscale); + XAI_ARRAY_SET_BUFF_SIZE(&tile_outscale, config->outscale_buffer_size); + XAI_ARRAY_SET_DATA_PTR(&tile_outscale, p_outscale); + XAI_ARRAY_SET_WIDTH(&tile_outscale, config->n_tile_size); + XAI_ARRAY_SET_HEIGHT(&tile_outscale, 1); + XAI_ARRAY_SET_TYPE(&tile_outscale, XAI_ARRAY_U16); + XAI_ARRAY_SET_CAPACITY(&tile_outscale, config->n_tile_size); + + // ======================================================================== + // Configure Output Tile Descriptor + // ======================================================================== + XAI_TILE3D_SET_BUFF_SIZE(&tile_output, config->output_buffer_size); + XAI_TILE3D_SET_DATA_ORDER(&tile_output, XAI_WHD); + XAI_TILE3D_SET_TYPE(&tile_output, XAI_TILE3D_S8); + XAI_TILE3D_SET_FRAME_PTR(&tile_output, 0); + XAI_TILE3D_SET_STATUS_FLAGS(&tile_output, 0); + XAI_TILE3D_SET_DIM1_PITCH(&tile_output, config->out_dim1_pitch); + XAI_TILE3D_SET_DIM2_PITCH(&tile_output, config->out_dim2_pitch); + XAI_TILE3D_SET_DIM1_COORD(&tile_output, 0); + XAI_TILE3D_SET_DIM1(&tile_output, config->dst_dim1_size); + XAI_TILE3D_SET_DIM1_EDGE1(&tile_output, 0); + XAI_TILE3D_SET_DIM1_EDGE2(&tile_output, 0); + XAI_TILE3D_SET_DIM2(&tile_output, config->output_rows); + XAI_TILE3D_SET_DIM2_EDGE1(&tile_output, 0); + XAI_TILE3D_SET_DIM2_EDGE2(&tile_output, 0); + XAI_TILE3D_SET_DIM3_COORD(&tile_output, 0); + XAI_TILE3D_SET_DIM3(&tile_output, config->n_tile_size); + XAI_TILE3D_SET_DIM3_EDGE1(&tile_output, 0); + XAI_TILE3D_SET_DIM3_EDGE2(&tile_output, 0); + + // ======================================================================== + // Configure Convolution Parameters + 
// ======================================================================== + XAI_CNN_CONV_SET_ACCUM_SHIFT(¶ms, config->accum_shift); + XAI_CNN_CONV_SET_DILATION(¶ms, config->dilation); + XAI_CNN_CONV_SET_FLAGS(¶ms, config->flags); + XAI_CNN_CONV_SET_OUTPUT_SCALE(¶ms, config->output_scale); + XAI_CNN_CONV_SET_OUTPUT_SHIFT(¶ms, config->output_shift); + XAI_CNN_CONV_SET_RELU_MAX(¶ms, config->relu_max); + XAI_CNN_CONV_SET_RELU_MIN(¶ms, config->relu_min); + XAI_CNN_CONV_SET_STRIDEX(¶ms, config->stride_x); + XAI_CNN_CONV_SET_STRIDEY(¶ms, config->stride_y); + + // ======================================================================== + // SECTION 3: Tiled Execution Loop (N-tiles × H-tiles) + // ======================================================================== + int last_tile = 1; + + for (int idx_n = 0; idx_n < config->n_tiles; idx_n++) { + int last_n_tile = (last_tile) && (idx_n == config->n_tiles - 1); + int current_n_size = (idx_n < config->n_tiles - 1) ? + config->n_tile_size : config->n_tile_size_last; + + // Update coefficient/bias/outscale for N-tile + XAI_TILE4D_SET_DIM4_COORD(&tile_coeff, config->n_tile_size * idx_n); + XAI_TILE4D_SET_DIM4(&tile_coeff, current_n_size); + + XAI_ARRAY_SET_DATA_PTR(&tile_bias, &p_bias[config->n_tile_size * 4 * idx_n]); + XAI_ARRAY_SET_WIDTH(&tile_bias, current_n_size); + XAI_ARRAY_SET_CAPACITY(&tile_bias, current_n_size); + + XAI_ARRAY_SET_DATA_PTR(&tile_outscale, &p_outscale[config->n_tile_size * 2 * idx_n]); + XAI_ARRAY_SET_WIDTH(&tile_outscale, current_n_size); + XAI_ARRAY_SET_CAPACITY(&tile_outscale, current_n_size); + + XAI_TILE3D_SET_DIM3_COORD(&tile_output, config->n_tile_size * idx_n); + XAI_TILE3D_SET_DIM3(&tile_output, current_n_size); + + // Process vertical tiles + for (int idx_h = 0; idx_h < config->height_tiles; idx_h++) { + int last_h_tile = (last_n_tile) && (idx_h == config->height_tiles - 1); + + // Calculate actual rows for this height tile (handle last tile edge case) + int current_output_rows = (idx_h < 
config->height_tiles - 1) ? + config->output_rows : + (config->dst_dim2_size - (config->output_rows * idx_h)); + // current_output_rows used below for DMA size // Used for potential future tile dimension adjustments + + // ================================================================ + // Prefetch Next Input Tile (Ping-Pong Buffering) + // ================================================================ + // Prefetch next input tile + // For stride-1: each output row comes from one input row, so load 2 input rows for 2 output rows + if (!last_h_tile) { + int temp_idx_h; + inc_iter_to_temp(&temp_idx_h, idx_h, config->height_tiles, 1); + _proto_FillBuffer_I8(p_input1, config->input_zero_point, config->input_buffer_size); + + dma_3dm(1, + &(src[(config->stride_y * config->output_rows * temp_idx_h) * config->src_dim1_size]), + &(p_input1[0]), // No offset for 1x1 kernel (IN_DATA_OFFSET = 0) + config->src_dim1_pitch, + config->in_dim1_pitch, + config->src_dim2_pitch, + config->in_dim2_pitch, + config->src_dim1_size, + config->input_rows, + config->src_dim3_size); + } + + // ================================================================ + // Update Tile Descriptors for Current Iteration + // ================================================================ + // Update tile descriptors for current height tile + // For 1x1 kernel: data always starts at buffer offset 0 (no edge padding) + XAI_TILE3D_SET_BUFF_PTR(&tile_input, p_input0); + XAI_TILE3D_SET_DATA_PTR(&tile_input, &(p_input0[0])); // IN_DATA_OFFSET = 0 for 1x1 kernel + XAI_TILE3D_SET_DIM2_COORD(&tile_input, (config->stride_y * config->output_rows)*(idx_h)); + XAI_TILE3D_SET_BUFF_PTR(&tile_output, p_output1); + XAI_TILE3D_SET_DATA_PTR(&tile_output, &(p_output1[0])); + XAI_TILE3D_SET_DIM2_COORD(&tile_output, (config->output_rows)*(idx_h)); + XAI_TILE3D_SET_DIM2(&tile_output, current_output_rows); + + // ================================================================ + // Perform Convolution (no edge extension 
needed for 1x1) + // ================================================================ + XAI_ERR_TYPE status = xaiConvolvedVQ3D_S_1x1j1d1_S8S8IX_MOW_WHD( + &(tile_input), + &(tile_coeff), + &(tile_bias), + &(tile_outscale), + &(tile_output), + &(params)); + + if (status != XAI_ERR_OK) { + return status; + } + + // Prefetch next coefficient tile (if needed) + if ((!(last_h_tile)) && ((idx_h) == (config->height_tiles - 1))) { + int temp_idx_n; + int temp_idx_h; + inc_iter_to_temp(&(temp_idx_n),idx_n, config->n_tiles, inc_iter_to_temp(&(temp_idx_h), idx_h, config->height_tiles, 1)); + dma_1dm(0,/* src */ (((coeff_ptr) + ((config->coeff_buffer_size)*(temp_idx_n)))), /* dst */ &(p_coeff[0]), /* row size */ (((temp_idx_n) < (config->n_tiles-1))?(config->coeff_buffer_size):(config->coeff_dim1_size * config->coeff_dim2_size * config->coeff_dim3_size * config->n_tile_size_last))); + } + + // ================================================================ + // Write Output Tile to System Memory + // ================================================================ + // Write output tile to system memory + // dma_2dm params: src, dst, src_pitch, dst_pitch, row_size, num_rows + // row_size = actual bytes for this tile (may be less for last height tile) + // num_rows = current_n_size (number of output channels in this N-tile) + int output_row_bytes = config->out_dim1_pitch * current_output_rows; + dma_2dm(0, + &(p_output1[0]), + &dst[((config->dst_dim2_pitch * config->n_tile_size)*(idx_n))+((config->out_dim1_pitch * config->output_rows)*(idx_h))], + config->out_dim2_pitch, + config->dst_dim2_pitch, + output_row_bytes, + current_n_size); + + swap_buffers(&(p_output0), &(p_output1)); + swap_buffers(&(p_input0), &(p_input1)); + } + } + + // Wait for final output DMA to complete before returning + idma_hw_wait_all(0); + + return XAI_ERR_OK; +} + +// VQ (per-channel output scaling) cache version +// All data stays in system memory and is accessed through processor cache +XAI_ERR_TYPE 
conv_exec_1x1j1d1VQ_cache( + int8_t* src, + int8_t* dst, + int8_t* coeff_ptr, + int8_t* bias_ptr, + int8_t* outScale_ptr, + const conv_layer_config_t* config) +{ + // ======================================================================== + // Setup source raw tile descriptor (points to raw input without padding) + // ======================================================================== + xai_tile3D src_raw; + XAI_TILE3D_SET_BUFF_PTR(&src_raw, src); + XAI_TILE3D_SET_BUFF_SIZE(&src_raw, config->src_dim2_pitch * config->src_dim3_size); + XAI_TILE3D_SET_DATA_PTR(&src_raw, src); + XAI_TILE3D_SET_DATA_ORDER(&src_raw, XAI_WHD); + XAI_TILE3D_SET_TYPE(&src_raw, XAI_TILE3D_S8); + XAI_TILE3D_SET_FRAME_PTR(&src_raw, 0); + XAI_TILE3D_SET_STATUS_FLAGS(&src_raw, 0); + XAI_TILE3D_SET_DIM1_PITCH(&src_raw, config->src_dim1_pitch); + XAI_TILE3D_SET_DIM2_PITCH(&src_raw, config->src_dim2_pitch); + XAI_TILE3D_SET_DIM1_COORD(&src_raw, 0); + XAI_TILE3D_SET_DIM1(&src_raw, config->src_dim1_size); + XAI_TILE3D_SET_DIM1_EDGE1(&src_raw, 0); + XAI_TILE3D_SET_DIM1_EDGE2(&src_raw, 0); + XAI_TILE3D_SET_DIM2_COORD(&src_raw, 0); + XAI_TILE3D_SET_DIM2(&src_raw, config->src_dim2_size); + XAI_TILE3D_SET_DIM2_EDGE1(&src_raw, 0); + XAI_TILE3D_SET_DIM2_EDGE2(&src_raw, 0); + XAI_TILE3D_SET_DIM3_COORD(&src_raw, 0); + XAI_TILE3D_SET_DIM3(&src_raw, config->src_dim3_size); + XAI_TILE3D_SET_DIM3_EDGE1(&src_raw, 0); + XAI_TILE3D_SET_DIM3_EDGE2(&src_raw, 0); + + // ======================================================================== + // Get padded input buffer from allocator (shared across cache kernels) + // For 1x1 convolution, edges are typically 0, but we still use the pattern + // ======================================================================== + int padded_dim1 = config->src_dim1_size + config->in_dim1_edge1 + config->in_dim1_edge2; + int dim1_pitch = (padded_dim1 + 2*XCHAL_IVPN_SIMD_WIDTH - 1) & ~(2*XCHAL_IVPN_SIMD_WIDTH - 1); + int padded_dim2 = config->src_dim2_size + 
config->in_dim2_edge1 + config->in_dim2_edge2; + int dim2_pitch = dim1_pitch * padded_dim2; + int input_buffer_size = dim2_pitch * config->src_dim3_size; + + // Get shared padded input buffer from allocator + int8_t* padded_input = get_cache_padded_input(); + + if (input_buffer_size > (int)get_cache_padded_input_size()) { + return XAI_ERR_DATASIZE; + } + + // Zero-fill the padded buffer + memset(padded_input, config->input_zero_point, input_buffer_size); + + // ======================================================================== + // Setup padded input tile descriptor + // ======================================================================== + int data_offset = (config->in_dim2_edge1 * dim1_pitch) + config->in_dim1_edge1; + + xai_tile3D tile_input; + XAI_TILE3D_SET_BUFF_PTR(&tile_input, padded_input); + XAI_TILE3D_SET_BUFF_SIZE(&tile_input, input_buffer_size); + XAI_TILE3D_SET_DATA_PTR(&tile_input, &padded_input[data_offset]); + XAI_TILE3D_SET_DATA_ORDER(&tile_input, XAI_WHD); + XAI_TILE3D_SET_TYPE(&tile_input, XAI_TILE3D_S8); + XAI_TILE3D_SET_FRAME_PTR(&tile_input, 0); + XAI_TILE3D_SET_STATUS_FLAGS(&tile_input, 0); + XAI_TILE3D_SET_DIM1_PITCH(&tile_input, dim1_pitch); + XAI_TILE3D_SET_DIM2_PITCH(&tile_input, dim2_pitch); + XAI_TILE3D_SET_DIM1_COORD(&tile_input, 0); + XAI_TILE3D_SET_DIM1(&tile_input, config->src_dim1_size); + XAI_TILE3D_SET_DIM1_EDGE1(&tile_input, config->in_dim1_edge1); + XAI_TILE3D_SET_DIM1_EDGE2(&tile_input, config->in_dim1_edge2); + XAI_TILE3D_SET_DIM2_COORD(&tile_input, 0); + XAI_TILE3D_SET_DIM2(&tile_input, config->src_dim2_size); + XAI_TILE3D_SET_DIM2_EDGE1(&tile_input, config->in_dim2_edge1); + XAI_TILE3D_SET_DIM2_EDGE2(&tile_input, config->in_dim2_edge2); + XAI_TILE3D_SET_DIM3_COORD(&tile_input, 0); + XAI_TILE3D_SET_DIM3(&tile_input, config->src_dim3_size); + XAI_TILE3D_SET_DIM3_EDGE1(&tile_input, 0); + XAI_TILE3D_SET_DIM3_EDGE2(&tile_input, 0); + + // ======================================================================== + // 
Copy raw input to padded buffer and extend edges + // ======================================================================== +#ifdef USE_DMA_FOR_CACHE_COPY + // Use DMA 3D transfer to copy input data into padded buffer at data_offset + dma_3dm(0, + /* src */ src, + /* dst */ &padded_input[data_offset], + /* src_row_pitch */ config->src_dim1_pitch, + /* dst_row_pitch */ dim1_pitch, + /* src_tile_pitch */ config->src_dim2_pitch, + /* dst_tile_pitch */ dim2_pitch, + /* row_sz */ config->src_dim1_size, + /* nrows */ config->src_dim2_size, + /* ntiles */ config->src_dim3_size); +#else + // Use library tile copy function (no DMA required) + // Safe manual copy: avoids SIMD overread near source buffer boundary + for (int d = 0; d < config->src_dim3_size; d++) { + for (int h = 0; h < config->src_dim2_size; h++) { + memcpy(&padded_input[data_offset + d * dim2_pitch + h * dim1_pitch], + &src[d * config->src_dim2_pitch + h * config->src_dim1_pitch], + config->src_dim1_size); + } + } + (void)src_raw; +#endif + + xai_size3D frame_size; + frame_size.dim1Size = config->dst_dim1_size * config->stride_x; + frame_size.dim2Size = config->dst_dim2_size * config->stride_y; + frame_size.dim3Size = config->src_dim3_size; + + xaiExtendEdgesConst3D_I8(&tile_input, config->input_zero_point, frame_size); + + // ======================================================================== + // Configure Coefficient Tile Descriptor (4D: W×H×C×N) + // ======================================================================== + xai_tile4D tile_coeff; + XAI_TILE4D_SET_BUFF_PTR(&tile_coeff, coeff_ptr); + XAI_TILE4D_SET_BUFF_SIZE(&tile_coeff, config->coeff_dim3_pitch * config->dst_dim3_size); + XAI_TILE4D_SET_DATA_PTR(&tile_coeff, coeff_ptr); + XAI_TILE4D_SET_DATA_ORDER(&tile_coeff, XAI_WHDN); + XAI_TILE4D_SET_TYPE(&tile_coeff, XAI_TILE4D_S8); + XAI_TILE4D_SET_FRAME_PTR(&tile_coeff, 0); + XAI_TILE4D_SET_STATUS_FLAGS(&tile_coeff, 0); + XAI_TILE4D_SET_DIM1_PITCH(&tile_coeff, config->coeff_dim1_pitch); + 
XAI_TILE4D_SET_DIM2_PITCH(&tile_coeff, config->coeff_dim2_pitch); + XAI_TILE4D_SET_DIM3_PITCH(&tile_coeff, config->coeff_dim3_pitch); + XAI_TILE4D_SET_DIM1_COORD(&tile_coeff, 0); + XAI_TILE4D_SET_DIM1(&tile_coeff, config->coeff_dim1_size); + XAI_TILE4D_SET_DIM1_EDGE1(&tile_coeff, 0); + XAI_TILE4D_SET_DIM1_EDGE2(&tile_coeff, 0); + XAI_TILE4D_SET_DIM2_COORD(&tile_coeff, 0); + XAI_TILE4D_SET_DIM2(&tile_coeff, config->coeff_dim2_size); + XAI_TILE4D_SET_DIM2_EDGE1(&tile_coeff, 0); + XAI_TILE4D_SET_DIM2_EDGE2(&tile_coeff, 0); + XAI_TILE4D_SET_DIM3_COORD(&tile_coeff, 0); + XAI_TILE4D_SET_DIM3(&tile_coeff, config->coeff_dim3_size); + XAI_TILE4D_SET_DIM3_EDGE1(&tile_coeff, 0); + XAI_TILE4D_SET_DIM3_EDGE2(&tile_coeff, 0); + XAI_TILE4D_SET_DIM4_COORD(&tile_coeff, 0); + XAI_TILE4D_SET_DIM4(&tile_coeff, config->dst_dim3_size); + + // ======================================================================== + // Configure Bias Array + // ======================================================================== + xai_array tile_bias; + XAI_ARRAY_SET_BUFF_PTR(&tile_bias, bias_ptr); + XAI_ARRAY_SET_BUFF_SIZE(&tile_bias, config->dst_dim3_size * 4); + XAI_ARRAY_SET_DATA_PTR(&tile_bias, bias_ptr); + XAI_ARRAY_SET_WIDTH(&tile_bias, config->dst_dim3_size); + XAI_ARRAY_SET_HEIGHT(&tile_bias, 1); + XAI_ARRAY_SET_TYPE(&tile_bias, XAI_ARRAY_S32); + XAI_ARRAY_SET_CAPACITY(&tile_bias, config->dst_dim3_size); + + // ======================================================================== + // Configure Output Scale Array + // ======================================================================== + xai_array tile_outscale; + XAI_ARRAY_SET_BUFF_PTR(&tile_outscale, outScale_ptr); + XAI_ARRAY_SET_BUFF_SIZE(&tile_outscale, config->dst_dim3_size * 2); + XAI_ARRAY_SET_DATA_PTR(&tile_outscale, outScale_ptr); + XAI_ARRAY_SET_WIDTH(&tile_outscale, config->dst_dim3_size); + XAI_ARRAY_SET_HEIGHT(&tile_outscale, 1); + XAI_ARRAY_SET_TYPE(&tile_outscale, XAI_ARRAY_U16); + 
XAI_ARRAY_SET_CAPACITY(&tile_outscale, config->dst_dim3_size); + + // ======================================================================== + // Configure Output Tile Descriptor (points to system memory) + // ======================================================================== + xai_tile3D tile_output; + XAI_TILE3D_SET_BUFF_PTR(&tile_output, dst); + XAI_TILE3D_SET_BUFF_SIZE(&tile_output, config->dst_dim2_pitch * config->dst_dim3_size); + XAI_TILE3D_SET_DATA_PTR(&tile_output, dst); + XAI_TILE3D_SET_DATA_ORDER(&tile_output, XAI_WHD); + XAI_TILE3D_SET_TYPE(&tile_output, XAI_TILE3D_S8); + XAI_TILE3D_SET_FRAME_PTR(&tile_output, 0); + XAI_TILE3D_SET_STATUS_FLAGS(&tile_output, 0); + XAI_TILE3D_SET_DIM1_PITCH(&tile_output, config->dst_dim1_pitch); + XAI_TILE3D_SET_DIM2_PITCH(&tile_output, config->dst_dim2_pitch); + XAI_TILE3D_SET_DIM1_COORD(&tile_output, 0); + XAI_TILE3D_SET_DIM1(&tile_output, config->dst_dim1_size); + XAI_TILE3D_SET_DIM1_EDGE1(&tile_output, 0); + XAI_TILE3D_SET_DIM1_EDGE2(&tile_output, 0); + XAI_TILE3D_SET_DIM2_COORD(&tile_output, 0); + XAI_TILE3D_SET_DIM2(&tile_output, config->dst_dim2_size); + XAI_TILE3D_SET_DIM2_EDGE1(&tile_output, 0); + XAI_TILE3D_SET_DIM2_EDGE2(&tile_output, 0); + XAI_TILE3D_SET_DIM3_COORD(&tile_output, 0); + XAI_TILE3D_SET_DIM3(&tile_output, config->dst_dim3_size); + XAI_TILE3D_SET_DIM3_EDGE1(&tile_output, 0); + XAI_TILE3D_SET_DIM3_EDGE2(&tile_output, 0); + + // ======================================================================== + // Configure Convolution Parameters + // ======================================================================== + xai_cnn_conv_params params; + memset(¶ms, 0, sizeof(params)); + XAI_CNN_CONV_SET_ACCUM_SHIFT(¶ms, config->accum_shift); + XAI_CNN_CONV_SET_DILATION(¶ms, config->dilation); + XAI_CNN_CONV_SET_FLAGS(¶ms, config->flags); + XAI_CNN_CONV_SET_OUTPUT_SCALE(¶ms, config->output_scale); + XAI_CNN_CONV_SET_OUTPUT_SHIFT(¶ms, config->output_shift); + XAI_CNN_CONV_SET_RELU_MAX(¶ms, 
config->relu_max); + XAI_CNN_CONV_SET_RELU_MIN(¶ms, config->relu_min); + XAI_CNN_CONV_SET_STRIDEX(¶ms, config->stride_x); + XAI_CNN_CONV_SET_STRIDEY(¶ms, config->stride_y); + + // ======================================================================== + // Execute convolution using generic system-memory API + // This version accesses data through the processor cache + // ======================================================================== + XAI_ERR_TYPE status = xaiConvolvedVQ3D(&tile_input, &tile_coeff, &tile_bias, + &tile_outscale, &tile_output, ¶ms); + + // Writeback output from cache to system memory for DMA coherency + xthal_dcache_region_writeback(dst, config->dst_dim2_pitch * config->dst_dim3_size); + + return status; +} + +// ============================================================================ +// Non-VQ (per-tensor output scaling) versions +// ============================================================================ + +// Non-VQ DMA version - per-tensor output scaling +XAI_ERR_TYPE conv_exec_1x1j1d1( + int8_t* src, + int8_t* dst, + int8_t* coeff_ptr, + int8_t* bias_ptr, + const conv_layer_config_t* config) +{ + // ======================================================================== + // SECTION 1: DRAM Buffer Allocation (no outscale buffer needed) + // ======================================================================== + int dram0_used = 0; + int dram1_used = 0; + + int8_t* p_input0 = allocate_dram_buffer(config->input_buffer_size, + config->input_ping_dram, + &dram0_used, &dram1_used); + int8_t* p_input1 = allocate_dram_buffer(config->input_buffer_size, + config->input_pong_dram, + &dram0_used, &dram1_used); + int8_t* p_coeff = allocate_dram_buffer(config->coeff_buffer_size, + config->coeff_dram, + &dram0_used, &dram1_used); + int8_t* p_output0 = allocate_dram_buffer(config->output_buffer_size, + config->output_ping_dram, + &dram0_used, &dram1_used); + int8_t* p_output1 = allocate_dram_buffer(config->output_buffer_size, + 
config->output_pong_dram, + &dram0_used, &dram1_used); + int8_t* p_bias = allocate_dram_buffer(config->bias_buffer_size, + config->bias_dram, + &dram0_used, &dram1_used); + + if (!p_input0 || !p_input1 || !p_coeff || + !p_output0 || !p_output1 || !p_bias) { + return (-1); + } + + // ======================================================================== + // SECTION 2: Initialize XAI Tile Descriptors + // ======================================================================== + xai_tile3D tile_input; + xai_size3D frame_size_input; + xai_tile4D tile_coeff; + xai_array tile_bias; + xai_tile3D tile_output; + xai_cnn_conv_params params; + memset(¶ms, 0, sizeof(params)); + + /* Initialize DMA engines */ + dma_3dm_init(1); + dma_2dm_init(0); + + // Transfer constant data (no outscale) + dma_1dm(0, coeff_ptr, p_coeff, config->coeff_buffer_size); + dma_1dm(0, bias_ptr, p_bias, config->bias_buffer_size); + + // Initialize input buffer and load first tile + _proto_FillBuffer_I8(p_input0, config->input_zero_point, config->input_buffer_size); + + // First input transfer + dma_3dm(1, + (void*)src, + (void*)&p_input0[0], + config->src_dim1_pitch, + config->in_dim1_pitch, + config->src_dim2_pitch, + config->in_dim2_pitch, + config->src_dim1_size, + config->in_rows_firstdma, + config->src_dim3_size); + + // Wait for all initial DMA transfers to complete + idma_hw_wait_all(0); // coeff + bias on ch0 + idma_hw_wait_all(1); // input on ch1 + + // ======================================================================== + // Configure Input Tile Descriptor + // ======================================================================== + XAI_TILE3D_SET_BUFF_PTR(&tile_input, p_input0); + XAI_TILE3D_SET_BUFF_SIZE(&tile_input, config->input_buffer_size); + XAI_TILE3D_SET_DATA_PTR(&tile_input, &p_input0[0]); + XAI_TILE3D_SET_DATA_ORDER(&tile_input, XAI_WHD); + XAI_TILE3D_SET_TYPE(&tile_input, XAI_TILE3D_S8); + XAI_TILE3D_SET_FRAME_PTR(&tile_input, 0); + 
XAI_TILE3D_SET_STATUS_FLAGS(&tile_input, 0); + XAI_TILE3D_SET_DIM1_PITCH(&tile_input, config->in_dim1_pitch); + XAI_TILE3D_SET_DIM2_PITCH(&tile_input, config->in_dim2_pitch); + XAI_TILE3D_SET_DIM1_COORD(&tile_input, 0); + XAI_TILE3D_SET_DIM1(&tile_input, config->src_dim1_size); + XAI_TILE3D_SET_DIM1_EDGE1(&tile_input, 0); + XAI_TILE3D_SET_DIM1_EDGE2(&tile_input, 0); + XAI_TILE3D_SET_DIM2(&tile_input, config->in_rows_firstdma); + XAI_TILE3D_SET_DIM2_EDGE1(&tile_input, 0); + XAI_TILE3D_SET_DIM2_EDGE2(&tile_input, 0); + XAI_TILE3D_SET_DIM3_COORD(&tile_input, 0); + XAI_TILE3D_SET_DIM3(&tile_input, config->src_dim3_size); + XAI_TILE3D_SET_DIM3_EDGE1(&tile_input, 0); + XAI_TILE3D_SET_DIM3_EDGE2(&tile_input, 0); + + frame_size_input.dim1Size = config->src_dim1_size; + frame_size_input.dim2Size = config->src_dim2_size; + frame_size_input.dim3Size = config->src_dim3_size; + + // ======================================================================== + // Configure Coefficient Tile Descriptor + // ======================================================================== + XAI_TILE4D_SET_BUFF_PTR(&tile_coeff, p_coeff); + XAI_TILE4D_SET_BUFF_SIZE(&tile_coeff, config->coeff_buffer_size); + XAI_TILE4D_SET_DATA_PTR(&tile_coeff, p_coeff); + XAI_TILE4D_SET_DATA_ORDER(&tile_coeff, XAI_WHDN); + XAI_TILE4D_SET_TYPE(&tile_coeff, XAI_TILE4D_S8); + XAI_TILE4D_SET_FRAME_PTR(&tile_coeff, 0); + XAI_TILE4D_SET_STATUS_FLAGS(&tile_coeff, 0); + XAI_TILE4D_SET_DIM1_PITCH(&tile_coeff, config->coeff_dim1_pitch); + XAI_TILE4D_SET_DIM2_PITCH(&tile_coeff, config->coeff_dim2_pitch); + XAI_TILE4D_SET_DIM3_PITCH(&tile_coeff, config->coeff_dim3_pitch); + XAI_TILE4D_SET_DIM1_COORD(&tile_coeff, 0); + XAI_TILE4D_SET_DIM1(&tile_coeff, config->coeff_dim1_size); + XAI_TILE4D_SET_DIM1_EDGE1(&tile_coeff, 0); + XAI_TILE4D_SET_DIM1_EDGE2(&tile_coeff, 0); + XAI_TILE4D_SET_DIM2_COORD(&tile_coeff, 0); + XAI_TILE4D_SET_DIM2(&tile_coeff, config->coeff_dim2_size); + XAI_TILE4D_SET_DIM2_EDGE1(&tile_coeff, 0); + 
XAI_TILE4D_SET_DIM2_EDGE2(&tile_coeff, 0); + XAI_TILE4D_SET_DIM3_COORD(&tile_coeff, 0); + XAI_TILE4D_SET_DIM3(&tile_coeff, config->coeff_dim3_size); + XAI_TILE4D_SET_DIM3_EDGE1(&tile_coeff, 0); + XAI_TILE4D_SET_DIM3_EDGE2(&tile_coeff, 0); + XAI_TILE4D_SET_DIM4_COORD(&tile_coeff, 0); + XAI_TILE4D_SET_DIM4(&tile_coeff, config->n_tile_size); + + // ======================================================================== + // Configure Bias Array + // ======================================================================== + XAI_ARRAY_SET_BUFF_PTR(&tile_bias, p_bias); + XAI_ARRAY_SET_BUFF_SIZE(&tile_bias, config->bias_buffer_size); + XAI_ARRAY_SET_DATA_PTR(&tile_bias, p_bias); + XAI_ARRAY_SET_WIDTH(&tile_bias, config->n_tile_size); + XAI_ARRAY_SET_HEIGHT(&tile_bias, 1); + XAI_ARRAY_SET_TYPE(&tile_bias, XAI_ARRAY_S32); + XAI_ARRAY_SET_CAPACITY(&tile_bias, config->n_tile_size); + + // ======================================================================== + // Configure Output Tile Descriptor + // ======================================================================== + XAI_TILE3D_SET_BUFF_SIZE(&tile_output, config->output_buffer_size); + XAI_TILE3D_SET_DATA_ORDER(&tile_output, XAI_WHD); + XAI_TILE3D_SET_TYPE(&tile_output, XAI_TILE3D_S8); + XAI_TILE3D_SET_FRAME_PTR(&tile_output, 0); + XAI_TILE3D_SET_STATUS_FLAGS(&tile_output, 0); + XAI_TILE3D_SET_DIM1_PITCH(&tile_output, config->out_dim1_pitch); + XAI_TILE3D_SET_DIM2_PITCH(&tile_output, config->out_dim2_pitch); + XAI_TILE3D_SET_DIM1_COORD(&tile_output, 0); + XAI_TILE3D_SET_DIM1(&tile_output, config->dst_dim1_size); + XAI_TILE3D_SET_DIM1_EDGE1(&tile_output, 0); + XAI_TILE3D_SET_DIM1_EDGE2(&tile_output, 0); + XAI_TILE3D_SET_DIM2(&tile_output, config->output_rows); + XAI_TILE3D_SET_DIM2_EDGE1(&tile_output, 0); + XAI_TILE3D_SET_DIM2_EDGE2(&tile_output, 0); + XAI_TILE3D_SET_DIM3_COORD(&tile_output, 0); + XAI_TILE3D_SET_DIM3(&tile_output, config->n_tile_size); + XAI_TILE3D_SET_DIM3_EDGE1(&tile_output, 0); + 
XAI_TILE3D_SET_DIM3_EDGE2(&tile_output, 0); + + // ======================================================================== + // Configure Convolution Parameters + // ======================================================================== + XAI_CNN_CONV_SET_ACCUM_SHIFT(¶ms, config->accum_shift); + XAI_CNN_CONV_SET_DILATION(¶ms, config->dilation); + XAI_CNN_CONV_SET_FLAGS(¶ms, config->flags); + XAI_CNN_CONV_SET_OUTPUT_SCALE(¶ms, config->output_scale); + XAI_CNN_CONV_SET_OUTPUT_SHIFT(¶ms, config->output_shift); + XAI_CNN_CONV_SET_RELU_MAX(¶ms, config->relu_max); + XAI_CNN_CONV_SET_RELU_MIN(¶ms, config->relu_min); + XAI_CNN_CONV_SET_STRIDEX(¶ms, config->stride_x); + XAI_CNN_CONV_SET_STRIDEY(¶ms, config->stride_y); + + // ======================================================================== + // SECTION 3: Tiled Execution Loop + // ======================================================================== + int last_tile = 1; + + for (int idx_n = 0; idx_n < config->n_tiles; idx_n++) { + int last_n_tile = (last_tile) && (idx_n == config->n_tiles - 1); + int current_n_size = (idx_n < config->n_tiles - 1) ? + config->n_tile_size : config->n_tile_size_last; + + // Update coefficient/bias for N-tile (no outscale) + XAI_TILE4D_SET_DIM4_COORD(&tile_coeff, config->n_tile_size * idx_n); + XAI_TILE4D_SET_DIM4(&tile_coeff, current_n_size); + + XAI_ARRAY_SET_DATA_PTR(&tile_bias, &p_bias[config->n_tile_size * 4 * idx_n]); + XAI_ARRAY_SET_WIDTH(&tile_bias, current_n_size); + XAI_ARRAY_SET_CAPACITY(&tile_bias, current_n_size); + + XAI_TILE3D_SET_DIM3_COORD(&tile_output, config->n_tile_size * idx_n); + XAI_TILE3D_SET_DIM3(&tile_output, current_n_size); + + for (int idx_h = 0; idx_h < config->height_tiles; idx_h++) { + int last_h_tile = (last_n_tile) && (idx_h == config->height_tiles - 1); + + int current_output_rows = (idx_h < config->height_tiles - 1) ? 
+ config->output_rows : + (config->dst_dim2_size - (config->output_rows * idx_h)); + + // ================================================================ + // Prefetch Next Input Tile + // ================================================================ + if (!last_h_tile) { + int temp_idx_h; + inc_iter_to_temp(&temp_idx_h, idx_h, config->height_tiles, 1); + _proto_FillBuffer_I8(p_input1, config->input_zero_point, config->input_buffer_size); + + dma_3dm(1, + &(src[(config->stride_y * config->output_rows * temp_idx_h) * config->src_dim1_size]), + &(p_input1[0]), + config->src_dim1_pitch, + config->in_dim1_pitch, + config->src_dim2_pitch, + config->in_dim2_pitch, + config->src_dim1_size, + config->input_rows, + config->src_dim3_size); + } + + // ================================================================ + // Update Tile Descriptors + // ================================================================ + XAI_TILE3D_SET_BUFF_PTR(&tile_input, p_input0); + XAI_TILE3D_SET_DATA_PTR(&tile_input, &(p_input0[0])); + XAI_TILE3D_SET_DIM2_COORD(&tile_input, (config->stride_y * config->output_rows)*(idx_h)); + XAI_TILE3D_SET_BUFF_PTR(&tile_output, p_output1); + XAI_TILE3D_SET_DATA_PTR(&tile_output, &(p_output1[0])); + XAI_TILE3D_SET_DIM2_COORD(&tile_output, (config->output_rows)*(idx_h)); + XAI_TILE3D_SET_DIM2(&tile_output, current_output_rows); + + // ================================================================ + // Perform Convolution (non-VQ API) + // ================================================================ + XAI_ERR_TYPE status = xaiConvolved3D_S_1x1j1d1_S8S8IX_MOW_WHD( + &(tile_input), + &(tile_coeff), + &(tile_bias), + &(tile_output), + &(params)); + + if (status != XAI_ERR_OK) { + return status; + } + + // Prefetch next coefficient tile + if ((!(last_h_tile)) && ((idx_h) == (config->height_tiles - 1))) { + int temp_idx_n; + int temp_idx_h; + inc_iter_to_temp(&(temp_idx_n),idx_n, config->n_tiles, inc_iter_to_temp(&(temp_idx_h), idx_h, config->height_tiles, 
1)); + dma_1dm(0, (coeff_ptr + (config->coeff_buffer_size * temp_idx_n)), &(p_coeff[0]), (((temp_idx_n) < (config->n_tiles-1))?(config->coeff_buffer_size):(config->coeff_dim1_size * config->coeff_dim2_size * config->coeff_dim3_size * config->n_tile_size_last))); + } + + // ================================================================ + // Write Output Tile to System Memory + // ================================================================ + int output_row_bytes = config->out_dim1_pitch * current_output_rows; + dma_2dm(0, + &(p_output1[0]), + &dst[((config->dst_dim2_pitch * config->n_tile_size)*(idx_n))+((config->out_dim1_pitch * config->output_rows)*(idx_h))], + config->out_dim2_pitch, + config->dst_dim2_pitch, + output_row_bytes, + current_n_size); + + swap_buffers(&(p_output0), &(p_output1)); + swap_buffers(&(p_input0), &(p_input1)); + } + } + + // Wait for final output DMA to complete before returning + idma_hw_wait_all(0); + + return XAI_ERR_OK; +} + +// Non-VQ cache version - per-tensor output scaling +XAI_ERR_TYPE conv_exec_1x1j1d1_cache( + int8_t* src, + int8_t* dst, + int8_t* coeff_ptr, + int8_t* bias_ptr, + const conv_layer_config_t* config) +{ + // ======================================================================== + // Setup source raw tile descriptor (points to raw input without padding) + // ======================================================================== + xai_tile3D src_raw; + XAI_TILE3D_SET_BUFF_PTR(&src_raw, src); + XAI_TILE3D_SET_BUFF_SIZE(&src_raw, config->src_dim2_pitch * config->src_dim3_size); + XAI_TILE3D_SET_DATA_PTR(&src_raw, src); + XAI_TILE3D_SET_DATA_ORDER(&src_raw, XAI_WHD); + XAI_TILE3D_SET_TYPE(&src_raw, XAI_TILE3D_S8); + XAI_TILE3D_SET_FRAME_PTR(&src_raw, 0); + XAI_TILE3D_SET_STATUS_FLAGS(&src_raw, 0); + XAI_TILE3D_SET_DIM1_PITCH(&src_raw, config->src_dim1_pitch); + XAI_TILE3D_SET_DIM2_PITCH(&src_raw, config->src_dim2_pitch); + XAI_TILE3D_SET_DIM1_COORD(&src_raw, 0); + XAI_TILE3D_SET_DIM1(&src_raw, 
config->src_dim1_size); + XAI_TILE3D_SET_DIM1_EDGE1(&src_raw, 0); + XAI_TILE3D_SET_DIM1_EDGE2(&src_raw, 0); + XAI_TILE3D_SET_DIM2_COORD(&src_raw, 0); + XAI_TILE3D_SET_DIM2(&src_raw, config->src_dim2_size); + XAI_TILE3D_SET_DIM2_EDGE1(&src_raw, 0); + XAI_TILE3D_SET_DIM2_EDGE2(&src_raw, 0); + XAI_TILE3D_SET_DIM3_COORD(&src_raw, 0); + XAI_TILE3D_SET_DIM3(&src_raw, config->src_dim3_size); + XAI_TILE3D_SET_DIM3_EDGE1(&src_raw, 0); + XAI_TILE3D_SET_DIM3_EDGE2(&src_raw, 0); + + // ======================================================================== + // Get padded input buffer from allocator (SIMD-aligned dim1_pitch required) + // ======================================================================== + int padded_dim1 = config->src_dim1_size + config->in_dim1_edge1 + config->in_dim1_edge2; + int dim1_pitch = (padded_dim1 + 2*XCHAL_IVPN_SIMD_WIDTH - 1) & ~(2*XCHAL_IVPN_SIMD_WIDTH - 1); + int padded_dim2 = config->src_dim2_size + config->in_dim2_edge1 + config->in_dim2_edge2; + int dim2_pitch = dim1_pitch * padded_dim2; + int input_buffer_size = dim2_pitch * config->src_dim3_size; + + int8_t* padded_input = get_cache_padded_input(); + + if (input_buffer_size > (int)get_cache_padded_input_size()) { + return XAI_ERR_DATASIZE; + } + + memset(padded_input, config->input_zero_point, input_buffer_size); + + // ======================================================================== + // Setup padded input tile descriptor + // ======================================================================== + int data_offset = (config->in_dim2_edge1 * dim1_pitch) + config->in_dim1_edge1; + + xai_tile3D tile_input; + XAI_TILE3D_SET_BUFF_PTR(&tile_input, padded_input); + XAI_TILE3D_SET_BUFF_SIZE(&tile_input, input_buffer_size); + XAI_TILE3D_SET_DATA_PTR(&tile_input, &padded_input[data_offset]); + XAI_TILE3D_SET_DATA_ORDER(&tile_input, XAI_WHD); + XAI_TILE3D_SET_TYPE(&tile_input, XAI_TILE3D_S8); + XAI_TILE3D_SET_FRAME_PTR(&tile_input, 0); + XAI_TILE3D_SET_STATUS_FLAGS(&tile_input, 0); + 
XAI_TILE3D_SET_DIM1_PITCH(&tile_input, dim1_pitch); + XAI_TILE3D_SET_DIM2_PITCH(&tile_input, dim2_pitch); + XAI_TILE3D_SET_DIM1_COORD(&tile_input, 0); + XAI_TILE3D_SET_DIM1(&tile_input, config->src_dim1_size); + XAI_TILE3D_SET_DIM1_EDGE1(&tile_input, config->in_dim1_edge1); + XAI_TILE3D_SET_DIM1_EDGE2(&tile_input, config->in_dim1_edge2); + XAI_TILE3D_SET_DIM2_COORD(&tile_input, 0); + XAI_TILE3D_SET_DIM2(&tile_input, config->src_dim2_size); + XAI_TILE3D_SET_DIM2_EDGE1(&tile_input, config->in_dim2_edge1); + XAI_TILE3D_SET_DIM2_EDGE2(&tile_input, config->in_dim2_edge2); + XAI_TILE3D_SET_DIM3_COORD(&tile_input, 0); + XAI_TILE3D_SET_DIM3(&tile_input, config->src_dim3_size); + XAI_TILE3D_SET_DIM3_EDGE1(&tile_input, 0); + XAI_TILE3D_SET_DIM3_EDGE2(&tile_input, 0); + + // Copy raw input to padded buffer and extend edges + // Safe manual copy: avoids SIMD overread near source buffer boundary + for (int d = 0; d < config->src_dim3_size; d++) { + for (int h = 0; h < config->src_dim2_size; h++) { + memcpy(&padded_input[data_offset + d * dim2_pitch + h * dim1_pitch], + &src[d * config->src_dim2_pitch + h * config->src_dim1_pitch], + config->src_dim1_size); + } + } + (void)src_raw; + + xai_size3D frame_size; + frame_size.dim1Size = config->dst_dim1_size * config->stride_x; + frame_size.dim2Size = config->dst_dim2_size * config->stride_y; + frame_size.dim3Size = config->src_dim3_size; + + xaiExtendEdgesConst3D_I8(&tile_input, config->input_zero_point, frame_size); + + // ======================================================================== + // Configure Coefficient Tile Descriptor + // ======================================================================== + xai_tile4D tile_coeff; + XAI_TILE4D_SET_BUFF_PTR(&tile_coeff, coeff_ptr); + XAI_TILE4D_SET_BUFF_SIZE(&tile_coeff, config->coeff_dim3_pitch * config->dst_dim3_size); + XAI_TILE4D_SET_DATA_PTR(&tile_coeff, coeff_ptr); + XAI_TILE4D_SET_DATA_ORDER(&tile_coeff, XAI_WHDN); + XAI_TILE4D_SET_TYPE(&tile_coeff, XAI_TILE4D_S8); + 
XAI_TILE4D_SET_FRAME_PTR(&tile_coeff, 0); + XAI_TILE4D_SET_STATUS_FLAGS(&tile_coeff, 0); + XAI_TILE4D_SET_DIM1_PITCH(&tile_coeff, config->coeff_dim1_pitch); + XAI_TILE4D_SET_DIM2_PITCH(&tile_coeff, config->coeff_dim2_pitch); + XAI_TILE4D_SET_DIM3_PITCH(&tile_coeff, config->coeff_dim3_pitch); + XAI_TILE4D_SET_DIM1_COORD(&tile_coeff, 0); + XAI_TILE4D_SET_DIM1(&tile_coeff, config->coeff_dim1_size); + XAI_TILE4D_SET_DIM1_EDGE1(&tile_coeff, 0); + XAI_TILE4D_SET_DIM1_EDGE2(&tile_coeff, 0); + XAI_TILE4D_SET_DIM2_COORD(&tile_coeff, 0); + XAI_TILE4D_SET_DIM2(&tile_coeff, config->coeff_dim2_size); + XAI_TILE4D_SET_DIM2_EDGE1(&tile_coeff, 0); + XAI_TILE4D_SET_DIM2_EDGE2(&tile_coeff, 0); + XAI_TILE4D_SET_DIM3_COORD(&tile_coeff, 0); + XAI_TILE4D_SET_DIM3(&tile_coeff, config->coeff_dim3_size); + XAI_TILE4D_SET_DIM3_EDGE1(&tile_coeff, 0); + XAI_TILE4D_SET_DIM3_EDGE2(&tile_coeff, 0); + XAI_TILE4D_SET_DIM4_COORD(&tile_coeff, 0); + XAI_TILE4D_SET_DIM4(&tile_coeff, config->dst_dim3_size); + + // ======================================================================== + // Configure Bias Array + // ======================================================================== + xai_array tile_bias; + XAI_ARRAY_SET_BUFF_PTR(&tile_bias, bias_ptr); + XAI_ARRAY_SET_BUFF_SIZE(&tile_bias, config->dst_dim3_size * 4); + XAI_ARRAY_SET_DATA_PTR(&tile_bias, bias_ptr); + XAI_ARRAY_SET_WIDTH(&tile_bias, config->dst_dim3_size); + XAI_ARRAY_SET_HEIGHT(&tile_bias, 1); + XAI_ARRAY_SET_TYPE(&tile_bias, XAI_ARRAY_S32); + XAI_ARRAY_SET_CAPACITY(&tile_bias, config->dst_dim3_size); + + // ======================================================================== + // Configure Output Tile Descriptor + // ======================================================================== + xai_tile3D tile_output; + XAI_TILE3D_SET_BUFF_PTR(&tile_output, dst); + XAI_TILE3D_SET_BUFF_SIZE(&tile_output, config->dst_dim2_pitch * config->dst_dim3_size); + XAI_TILE3D_SET_DATA_PTR(&tile_output, dst); + 
XAI_TILE3D_SET_DATA_ORDER(&tile_output, XAI_WHD); + XAI_TILE3D_SET_TYPE(&tile_output, XAI_TILE3D_S8); + XAI_TILE3D_SET_FRAME_PTR(&tile_output, 0); + XAI_TILE3D_SET_STATUS_FLAGS(&tile_output, 0); + XAI_TILE3D_SET_DIM1_PITCH(&tile_output, config->dst_dim1_pitch); + XAI_TILE3D_SET_DIM2_PITCH(&tile_output, config->dst_dim2_pitch); + XAI_TILE3D_SET_DIM1_COORD(&tile_output, 0); + XAI_TILE3D_SET_DIM1(&tile_output, config->dst_dim1_size); + XAI_TILE3D_SET_DIM1_EDGE1(&tile_output, 0); + XAI_TILE3D_SET_DIM1_EDGE2(&tile_output, 0); + XAI_TILE3D_SET_DIM2_COORD(&tile_output, 0); + XAI_TILE3D_SET_DIM2(&tile_output, config->dst_dim2_size); + XAI_TILE3D_SET_DIM2_EDGE1(&tile_output, 0); + XAI_TILE3D_SET_DIM2_EDGE2(&tile_output, 0); + XAI_TILE3D_SET_DIM3_COORD(&tile_output, 0); + XAI_TILE3D_SET_DIM3(&tile_output, config->dst_dim3_size); + XAI_TILE3D_SET_DIM3_EDGE1(&tile_output, 0); + XAI_TILE3D_SET_DIM3_EDGE2(&tile_output, 0); + + // ======================================================================== + // Configure Convolution Parameters + // ======================================================================== + xai_cnn_conv_params params; + memset(¶ms, 0, sizeof(params)); + XAI_CNN_CONV_SET_ACCUM_SHIFT(¶ms, config->accum_shift); + XAI_CNN_CONV_SET_DILATION(¶ms, config->dilation); + XAI_CNN_CONV_SET_FLAGS(¶ms, config->flags); + XAI_CNN_CONV_SET_OUTPUT_SCALE(¶ms, config->output_scale); + XAI_CNN_CONV_SET_OUTPUT_SHIFT(¶ms, config->output_shift); + XAI_CNN_CONV_SET_RELU_MAX(¶ms, config->relu_max); + XAI_CNN_CONV_SET_RELU_MIN(¶ms, config->relu_min); + XAI_CNN_CONV_SET_STRIDEX(¶ms, config->stride_x); + XAI_CNN_CONV_SET_STRIDEY(¶ms, config->stride_y); + + // ======================================================================== + // Execute convolution (non-VQ API) + // ======================================================================== + XAI_ERR_TYPE status = xaiConvolved3D(&tile_input, &tile_coeff, &tile_bias, + &tile_output, ¶ms); + + // Writeback output from cache to 
system memory for DMA coherency + xthal_dcache_region_writeback(dst, config->dst_dim2_pitch * config->dst_dim3_size); + + return status; +} \ No newline at end of file diff --git a/backends/cadence/vision/operators/conv/conv_exec_1x1j2d1.c b/backends/cadence/vision/operators/conv/conv_exec_1x1j2d1.c new file mode 100644 index 00000000000..2badf62536c --- /dev/null +++ b/backends/cadence/vision/operators/conv/conv_exec_1x1j2d1.c @@ -0,0 +1,1132 @@ +#include "kernel_executors.h" +#include "memory_manager.h" +#include "dma.h" +#include "utils.h" +#include +#include +#include + +// VQ (per-channel output scaling) DMA version +XAI_ERR_TYPE conv_exec_1x1j2d1VQ( + int8_t* src, + int8_t* dst, + int8_t* coeff_ptr, + int8_t* bias_ptr, + int8_t* outScale_ptr, + const conv_layer_config_t* config) +{ + // ======================================================================== + // SECTION 1: DRAM Buffer Allocation + // ======================================================================== + int dram0_used = 0; + int dram1_used = 0; + + int8_t* p_input0 = allocate_dram_buffer(config->input_buffer_size, + config->input_ping_dram, + &dram0_used, &dram1_used); + int8_t* p_input1 = allocate_dram_buffer(config->input_buffer_size, + config->input_pong_dram, + &dram0_used, &dram1_used); + int8_t* p_coeff = allocate_dram_buffer(config->coeff_buffer_size, + config->coeff_dram, + &dram0_used, &dram1_used); + int8_t* p_output0 = allocate_dram_buffer(config->output_buffer_size, + config->output_ping_dram, + &dram0_used, &dram1_used); + int8_t* p_output1 = allocate_dram_buffer(config->output_buffer_size, + config->output_pong_dram, + &dram0_used, &dram1_used); + int8_t* p_bias = allocate_dram_buffer(config->bias_buffer_size, + config->bias_dram, + &dram0_used, &dram1_used); + int8_t* p_outscale = allocate_dram_buffer(config->outscale_buffer_size, + config->outscale_dram, + &dram0_used, &dram1_used); + + if (!p_input0 || !p_input1 || !p_coeff || + !p_output0 || !p_output1 || !p_bias || 
!p_outscale) { + return (-1); + } + + // ======================================================================== + // SECTION 2: Initialize XAI Tile Descriptors + // ======================================================================== + xai_tile3D tile_input; + xai_size3D frame_size_input; + xai_tile4D tile_coeff; + xai_array tile_bias; + xai_array tile_outscale; + xai_tile3D tile_output; + xai_cnn_conv_params params; + memset(¶ms, 0, sizeof(params)); + + // Transfer constant data (all buffers are 64-byte aligned by test harness) + dma_1dm(0, coeff_ptr, p_coeff, config->coeff_buffer_size); + dma_1dm(0, bias_ptr, p_bias, config->bias_buffer_size); + dma_1dm(0, outScale_ptr, p_outscale, config->outscale_buffer_size); + + // Initialize input buffer and load first tile + // Note: For 1x1 kernel, IN_DATA_OFFSET = 0, no edge padding needed + _proto_FillBuffer_I8(p_input0, config->input_zero_point, config->input_buffer_size); + + // First DMA: load IN_ROWS_FIRSTDMA rows at offset 0 (no edge padding) + dma_3dm(1, + (void*)src, + (void*)&p_input0[config->in_data_offset], // in_data_offset = 0 for 1x1 + config->src_dim1_pitch, + config->in_dim1_pitch, + config->src_dim2_pitch, + config->in_dim2_pitch, + config->src_dim1_size, + config->in_rows_firstdma, + config->src_dim3_size); + + // Wait for all initial DMA transfers to complete + idma_hw_wait_all(0); // coeff + bias + outscale on ch0 + idma_hw_wait_all(1); // input on ch1 + + // ======================================================================== + // Configure Input Tile Descriptor + // For 1x1: no edge padding, IN_DATA_OFFSET = 0 + // ======================================================================== + XAI_TILE3D_SET_BUFF_PTR(&tile_input, p_input0); + XAI_TILE3D_SET_BUFF_SIZE(&tile_input, config->input_buffer_size); + XAI_TILE3D_SET_DATA_PTR(&tile_input, &p_input0[config->in_data_offset]); // 0 for 1x1 + XAI_TILE3D_SET_DATA_ORDER(&tile_input, XAI_WHD); + XAI_TILE3D_SET_TYPE(&tile_input, XAI_TILE3D_S8); + 
XAI_TILE3D_SET_FRAME_PTR(&tile_input, 0); + XAI_TILE3D_SET_STATUS_FLAGS(&tile_input, 0); + XAI_TILE3D_SET_DIM1_PITCH(&tile_input, config->in_dim1_pitch); + XAI_TILE3D_SET_DIM2_PITCH(&tile_input, config->in_dim2_pitch); + XAI_TILE3D_SET_DIM1_COORD(&tile_input, 0); + XAI_TILE3D_SET_DIM1(&tile_input, config->src_dim1_size); + XAI_TILE3D_SET_DIM1_EDGE1(&tile_input, config->in_dim1_edge1); // 0 for 1x1 + XAI_TILE3D_SET_DIM1_EDGE2(&tile_input, config->in_dim1_edge2); // 0 for 1x1 + // DIM2 = IN_ROWS_FIRSTDMA - EDGE1 = IN_ROWS_FIRSTDMA (edge=0 for 1x1) + XAI_TILE3D_SET_DIM2(&tile_input, config->in_rows_firstdma - config->in_dim2_edge1); + XAI_TILE3D_SET_DIM2_EDGE1(&tile_input, config->in_dim2_edge1); // 0 for 1x1 + XAI_TILE3D_SET_DIM2_EDGE2(&tile_input, config->in_dim2_edge2); // 0 for 1x1 + XAI_TILE3D_SET_DIM3_COORD(&tile_input, 0); + XAI_TILE3D_SET_DIM3(&tile_input, config->src_dim3_size); + XAI_TILE3D_SET_DIM3_EDGE1(&tile_input, config->in_dim3_edge1); + XAI_TILE3D_SET_DIM3_EDGE2(&tile_input, config->in_dim3_edge2); + + // Frame size for edge extension (even though edge=0, still needed) + frame_size_input.dim1Size = config->src_dim1_size; + frame_size_input.dim2Size = config->src_dim2_size; + frame_size_input.dim3Size = config->src_dim3_size; + + // ======================================================================== + // Configure Coefficient Tile Descriptor (4D: W×H×C×N) + // ======================================================================== + XAI_TILE4D_SET_BUFF_PTR(&tile_coeff, p_coeff); + XAI_TILE4D_SET_BUFF_SIZE(&tile_coeff, config->coeff_buffer_size); + XAI_TILE4D_SET_DATA_PTR(&tile_coeff, p_coeff); + XAI_TILE4D_SET_DATA_ORDER(&tile_coeff, XAI_WHDN); + XAI_TILE4D_SET_TYPE(&tile_coeff, XAI_TILE4D_S8); + XAI_TILE4D_SET_FRAME_PTR(&tile_coeff, 0); + XAI_TILE4D_SET_STATUS_FLAGS(&tile_coeff, 0); + XAI_TILE4D_SET_DIM1_PITCH(&tile_coeff, config->coeff_dim1_pitch); + XAI_TILE4D_SET_DIM2_PITCH(&tile_coeff, config->coeff_dim2_pitch); + 
XAI_TILE4D_SET_DIM3_PITCH(&tile_coeff, config->coeff_dim3_pitch); + XAI_TILE4D_SET_DIM1_COORD(&tile_coeff, 0); + XAI_TILE4D_SET_DIM1(&tile_coeff, config->coeff_dim1_size); + XAI_TILE4D_SET_DIM1_EDGE1(&tile_coeff, 0); + XAI_TILE4D_SET_DIM1_EDGE2(&tile_coeff, 0); + XAI_TILE4D_SET_DIM2_COORD(&tile_coeff, 0); + XAI_TILE4D_SET_DIM2(&tile_coeff, config->coeff_dim2_size); + XAI_TILE4D_SET_DIM2_EDGE1(&tile_coeff, 0); + XAI_TILE4D_SET_DIM2_EDGE2(&tile_coeff, 0); + XAI_TILE4D_SET_DIM3_COORD(&tile_coeff, 0); + XAI_TILE4D_SET_DIM3(&tile_coeff, config->coeff_dim3_size); + XAI_TILE4D_SET_DIM3_EDGE1(&tile_coeff, 0); + XAI_TILE4D_SET_DIM3_EDGE2(&tile_coeff, 0); + XAI_TILE4D_SET_DIM4_COORD(&tile_coeff, 0); + XAI_TILE4D_SET_DIM4(&tile_coeff, config->n_tile_size); + + // ======================================================================== + // Configure Bias Array + // ======================================================================== + XAI_ARRAY_SET_BUFF_PTR(&tile_bias, p_bias); + XAI_ARRAY_SET_BUFF_SIZE(&tile_bias, config->bias_buffer_size); + XAI_ARRAY_SET_DATA_PTR(&tile_bias, p_bias); + XAI_ARRAY_SET_WIDTH(&tile_bias, config->n_tile_size); + XAI_ARRAY_SET_HEIGHT(&tile_bias, 1); + XAI_ARRAY_SET_TYPE(&tile_bias, XAI_ARRAY_S32); + XAI_ARRAY_SET_CAPACITY(&tile_bias, config->n_tile_size); + + // ======================================================================== + // Configure Output Scale Array + // ======================================================================== + XAI_ARRAY_SET_BUFF_PTR(&tile_outscale, p_outscale); + XAI_ARRAY_SET_BUFF_SIZE(&tile_outscale, config->outscale_buffer_size); + XAI_ARRAY_SET_DATA_PTR(&tile_outscale, p_outscale); + XAI_ARRAY_SET_WIDTH(&tile_outscale, config->n_tile_size); + XAI_ARRAY_SET_HEIGHT(&tile_outscale, 1); + XAI_ARRAY_SET_TYPE(&tile_outscale, XAI_ARRAY_U16); + XAI_ARRAY_SET_CAPACITY(&tile_outscale, config->n_tile_size); + + // ======================================================================== + // Configure Output Tile 
Descriptor + // ======================================================================== + XAI_TILE3D_SET_BUFF_SIZE(&tile_output, config->output_buffer_size); + XAI_TILE3D_SET_DATA_ORDER(&tile_output, XAI_WHD); + XAI_TILE3D_SET_TYPE(&tile_output, XAI_TILE3D_S8); + XAI_TILE3D_SET_FRAME_PTR(&tile_output, 0); + XAI_TILE3D_SET_STATUS_FLAGS(&tile_output, 0); + XAI_TILE3D_SET_DIM1_PITCH(&tile_output, config->out_dim1_pitch); + XAI_TILE3D_SET_DIM2_PITCH(&tile_output, config->out_dim2_pitch); + XAI_TILE3D_SET_DIM1_COORD(&tile_output, 0); + XAI_TILE3D_SET_DIM1(&tile_output, config->dst_dim1_size); + XAI_TILE3D_SET_DIM1_EDGE1(&tile_output, 0); + XAI_TILE3D_SET_DIM1_EDGE2(&tile_output, 0); + XAI_TILE3D_SET_DIM2(&tile_output, config->output_rows); + XAI_TILE3D_SET_DIM2_EDGE1(&tile_output, 0); + XAI_TILE3D_SET_DIM2_EDGE2(&tile_output, 0); + XAI_TILE3D_SET_DIM3_COORD(&tile_output, 0); + XAI_TILE3D_SET_DIM3(&tile_output, config->n_tile_size); + XAI_TILE3D_SET_DIM3_EDGE1(&tile_output, 0); + XAI_TILE3D_SET_DIM3_EDGE2(&tile_output, 0); + + // ======================================================================== + // Configure Convolution Parameters + // ======================================================================== + XAI_CNN_CONV_SET_ACCUM_SHIFT(¶ms, config->accum_shift); + XAI_CNN_CONV_SET_DILATION(¶ms, config->dilation); + XAI_CNN_CONV_SET_FLAGS(¶ms, config->flags); + XAI_CNN_CONV_SET_OUTPUT_SCALE(¶ms, config->output_scale); + XAI_CNN_CONV_SET_OUTPUT_SHIFT(¶ms, config->output_shift); + XAI_CNN_CONV_SET_RELU_MAX(¶ms, config->relu_max); + XAI_CNN_CONV_SET_RELU_MIN(¶ms, config->relu_min); + XAI_CNN_CONV_SET_STRIDEX(¶ms, config->stride_x); + XAI_CNN_CONV_SET_STRIDEY(¶ms, config->stride_y); + + // ======================================================================== + // SECTION 3: Tiled Execution Loop (N-tiles × H-tiles) + // ======================================================================== + int last_tile = 1; + + for (int idx_n = 0; idx_n < config->n_tiles; 
idx_n++) { + int last_n_tile = (last_tile) && (idx_n == config->n_tiles - 1); + int current_n_size = (idx_n < config->n_tiles - 1) ? + config->n_tile_size : config->n_tile_size_last; + + // Update coefficient/bias/outscale for N-tile + XAI_TILE4D_SET_DIM4_COORD(&tile_coeff, config->n_tile_size * idx_n); + XAI_TILE4D_SET_DIM4(&tile_coeff, current_n_size); + + XAI_ARRAY_SET_DATA_PTR(&tile_bias, &p_bias[config->n_tile_size * 4 * idx_n]); + XAI_ARRAY_SET_WIDTH(&tile_bias, current_n_size); + XAI_ARRAY_SET_CAPACITY(&tile_bias, current_n_size); + + XAI_ARRAY_SET_DATA_PTR(&tile_outscale, &p_outscale[config->n_tile_size * 2 * idx_n]); + XAI_ARRAY_SET_WIDTH(&tile_outscale, current_n_size); + XAI_ARRAY_SET_CAPACITY(&tile_outscale, current_n_size); + + XAI_TILE3D_SET_DIM3_COORD(&tile_output, config->n_tile_size * idx_n); + XAI_TILE3D_SET_DIM3(&tile_output, current_n_size); + + // Process vertical tiles + for (int idx_h = 0; idx_h < config->height_tiles; idx_h++) { + int last_h_tile = (last_n_tile) && (idx_h == config->height_tiles - 1); + + // Calculate actual rows for this height tile (handle last tile edge case) + int current_output_rows = (idx_h < config->height_tiles - 1) ? 
+ config->output_rows : + (config->dst_dim2_size - (config->output_rows * idx_h)); + (void)current_output_rows; // Used for potential future tile dimension adjustments + + // ================================================================ + // Prefetch Next Input Tile (Ping-Pong Buffering) + // ================================================================ + if (!last_h_tile) { + int temp_idx_h; + inc_iter_to_temp(&temp_idx_h, idx_h, config->height_tiles, 1); + _proto_FillBuffer_I8(p_input1, config->input_zero_point, config->input_buffer_size); + + dma_3dm(1, + &(src[(config->stride_y * config->output_rows * temp_idx_h) * config->src_dim1_size]), + &(p_input1[0]), // No offset for 1x1 kernel (IN_DATA_OFFSET = 0) + config->src_dim1_pitch, + config->in_dim1_pitch, + config->src_dim2_pitch, + config->in_dim2_pitch, + config->src_dim1_size, + config->input_rows, + config->src_dim3_size); + } + + // ================================================================ + // Update Tile Descriptors for Current Iteration + // ================================================================ + // Update tile descriptors for current height tile + // For 1x1 kernel: data always starts at buffer offset 0 (no edge padding) + XAI_TILE3D_SET_BUFF_PTR(&tile_input, p_input0); + XAI_TILE3D_SET_DATA_PTR(&tile_input, &(p_input0[0])); // IN_DATA_OFFSET = 0 for 1x1 kernel + XAI_TILE3D_SET_DIM2_COORD(&tile_input, (config->stride_y * config->output_rows)*(idx_h)); + XAI_TILE3D_SET_BUFF_PTR(&tile_output, p_output1); + XAI_TILE3D_SET_DATA_PTR(&tile_output, &(p_output1[0])); + XAI_TILE3D_SET_DIM2_COORD(&tile_output, (config->output_rows)*(idx_h)); + XAI_TILE3D_SET_DIM2(&tile_output, current_output_rows); + + // ================================================================ + // Perform Convolution + // ================================================================ + XAI_ERR_TYPE status = xaiConvolvedVQ3D_S_1x1j2d1_S8S8IX_MOW_WHD( + &(tile_input), + &(tile_coeff), + &(tile_bias), + 
&(tile_outscale), + &(tile_output), + &(params)); + + if (status != XAI_ERR_OK) { + return status; + } + + // ================================================================ + // Prefetch next coefficient tile (if needed for multi-N-tile layers) + // ================================================================ + if ((!(last_h_tile)) && ((idx_h) == (config->height_tiles - 1))) { + int temp_idx_n; + int temp_idx_h; + + inc_iter_to_temp(&(temp_idx_n), idx_n, config->n_tiles, inc_iter_to_temp(&(temp_idx_h), idx_h, config->height_tiles, 1)); + + dma_1dm(0, + /* src */ (((coeff_ptr) + ((config->coeff_buffer_size)*(temp_idx_n)))), + /* dst */ &(p_coeff[0]), + /* row size */ (((temp_idx_n) < (config->n_tiles-1))?(config->coeff_buffer_size):(config->coeff_dim1_size * config->coeff_dim2_size * config->coeff_dim3_size * config->n_tile_size_last))); + } + + // ================================================================ + // Write Output Tile to System Memory + // ================================================================ + // Write output tile to system memory + // dma_2dm params: src, dst, src_pitch, dst_pitch, row_size, num_rows + // row_size = out_dim1_pitch * current_output_rows (actual valid bytes per channel) + // num_rows = current_n_size (number of output channels in this N-tile) + // Note: src_stride stays at out_dim2_pitch since local buffer has fixed pitch + int output_row_bytes = config->out_dim1_pitch * current_output_rows; + dma_2dm(0, + &(p_output1[0]), + &dst[((config->dst_dim2_pitch * config->n_tile_size)*(idx_n))+((config->out_dim1_pitch * config->output_rows)*(idx_h))], + config->out_dim2_pitch, + config->dst_dim2_pitch, + output_row_bytes, + current_n_size); + + swap_buffers(&(p_output0), &(p_output1)); + swap_buffers(&(p_input0), &(p_input1)); + } + } + + // Wait for final output DMA to complete before returning + idma_hw_wait_all(0); + + return XAI_ERR_OK; +} + +// VQ (per-channel output scaling) cache version +// All data stays in system 
memory and is accessed through processor cache +XAI_ERR_TYPE conv_exec_1x1j2d1VQ_cache( + int8_t* src, + int8_t* dst, + int8_t* coeff_ptr, + int8_t* bias_ptr, + int8_t* outScale_ptr, + const conv_layer_config_t* config) +{ + // ======================================================================== + // Setup source raw tile descriptor (points to raw input without padding) + // ======================================================================== + xai_tile3D src_raw; + XAI_TILE3D_SET_BUFF_PTR(&src_raw, src); + XAI_TILE3D_SET_BUFF_SIZE(&src_raw, config->src_dim2_pitch * config->src_dim3_size); + XAI_TILE3D_SET_DATA_PTR(&src_raw, src); + XAI_TILE3D_SET_DATA_ORDER(&src_raw, XAI_WHD); + XAI_TILE3D_SET_TYPE(&src_raw, XAI_TILE3D_S8); + XAI_TILE3D_SET_FRAME_PTR(&src_raw, 0); + XAI_TILE3D_SET_STATUS_FLAGS(&src_raw, 0); + XAI_TILE3D_SET_DIM1_PITCH(&src_raw, config->src_dim1_pitch); + XAI_TILE3D_SET_DIM2_PITCH(&src_raw, config->src_dim2_pitch); + XAI_TILE3D_SET_DIM1_COORD(&src_raw, 0); + XAI_TILE3D_SET_DIM1(&src_raw, config->src_dim1_size); + XAI_TILE3D_SET_DIM1_EDGE1(&src_raw, 0); + XAI_TILE3D_SET_DIM1_EDGE2(&src_raw, 0); + XAI_TILE3D_SET_DIM2_COORD(&src_raw, 0); + XAI_TILE3D_SET_DIM2(&src_raw, config->src_dim2_size); + XAI_TILE3D_SET_DIM2_EDGE1(&src_raw, 0); + XAI_TILE3D_SET_DIM2_EDGE2(&src_raw, 0); + XAI_TILE3D_SET_DIM3_COORD(&src_raw, 0); + XAI_TILE3D_SET_DIM3(&src_raw, config->src_dim3_size); + XAI_TILE3D_SET_DIM3_EDGE1(&src_raw, 0); + XAI_TILE3D_SET_DIM3_EDGE2(&src_raw, 0); + + // ======================================================================== + // Get padded input buffer from allocator (shared across cache kernels) + // For 1x1 convolution, edges are typically 0, but we still use the pattern + // ======================================================================== + int padded_dim1 = config->src_dim1_size + config->in_dim1_edge1 + config->in_dim1_edge2; + int dim1_pitch = (padded_dim1 + 2*XCHAL_IVPN_SIMD_WIDTH - 1) & ~(2*XCHAL_IVPN_SIMD_WIDTH - 1); + 
int padded_dim2 = config->src_dim2_size + config->in_dim2_edge1 + config->in_dim2_edge2; + int dim2_pitch = dim1_pitch * padded_dim2; + int input_buffer_size = dim2_pitch * config->src_dim3_size; + + // Get shared padded input buffer from allocator + int8_t* padded_input = get_cache_padded_input(); + + if (input_buffer_size > (int)get_cache_padded_input_size()) { + return XAI_ERR_DATASIZE; + } + + memset(padded_input, config->input_zero_point, input_buffer_size); + + // ======================================================================== + // Setup padded input tile descriptor + // ======================================================================== + int data_offset = (config->in_dim2_edge1 * dim1_pitch) + config->in_dim1_edge1; + + xai_tile3D tile_input; + XAI_TILE3D_SET_BUFF_PTR(&tile_input, padded_input); + XAI_TILE3D_SET_BUFF_SIZE(&tile_input, input_buffer_size); + XAI_TILE3D_SET_DATA_PTR(&tile_input, &padded_input[data_offset]); + XAI_TILE3D_SET_DATA_ORDER(&tile_input, XAI_WHD); + XAI_TILE3D_SET_TYPE(&tile_input, XAI_TILE3D_S8); + XAI_TILE3D_SET_FRAME_PTR(&tile_input, 0); + XAI_TILE3D_SET_STATUS_FLAGS(&tile_input, 0); + XAI_TILE3D_SET_DIM1_PITCH(&tile_input, dim1_pitch); + XAI_TILE3D_SET_DIM2_PITCH(&tile_input, dim2_pitch); + XAI_TILE3D_SET_DIM1_COORD(&tile_input, 0); + XAI_TILE3D_SET_DIM1(&tile_input, config->src_dim1_size); + XAI_TILE3D_SET_DIM1_EDGE1(&tile_input, config->in_dim1_edge1); + XAI_TILE3D_SET_DIM1_EDGE2(&tile_input, config->in_dim1_edge2); + XAI_TILE3D_SET_DIM2_COORD(&tile_input, 0); + XAI_TILE3D_SET_DIM2(&tile_input, config->src_dim2_size); + XAI_TILE3D_SET_DIM2_EDGE1(&tile_input, config->in_dim2_edge1); + XAI_TILE3D_SET_DIM2_EDGE2(&tile_input, config->in_dim2_edge2); + XAI_TILE3D_SET_DIM3_COORD(&tile_input, 0); + XAI_TILE3D_SET_DIM3(&tile_input, config->src_dim3_size); + XAI_TILE3D_SET_DIM3_EDGE1(&tile_input, 0); + XAI_TILE3D_SET_DIM3_EDGE2(&tile_input, 0); + + // ======================================================================== 
+ // Copy raw input to padded buffer and extend edges + // ======================================================================== +#ifdef USE_DMA_FOR_CACHE_COPY + // Use DMA 3D transfer to copy input data into padded buffer at data_offset + dma_3dm(0, + /* src */ src, + /* dst */ &padded_input[data_offset], + /* src_row_pitch */ config->src_dim1_pitch, + /* dst_row_pitch */ dim1_pitch, + /* src_tile_pitch */ config->src_dim2_pitch, + /* dst_tile_pitch */ dim2_pitch, + /* row_sz */ config->src_dim1_size, + /* nrows */ config->src_dim2_size, + /* ntiles */ config->src_dim3_size); +#else + // Use library tile copy function (no DMA required) + xaiCopyTile3D(&src_raw, &tile_input, true); +#endif + + xai_size3D frame_size; + frame_size.dim1Size = config->dst_dim1_size * config->stride_x; + frame_size.dim2Size = config->dst_dim2_size * config->stride_y; + frame_size.dim3Size = config->src_dim3_size; + + xaiExtendEdgesConst3D_I8(&tile_input, config->input_zero_point, frame_size); + + // ======================================================================== + // Pre-subsample input when numInCh > 64 (gather instruction workaround) + // The XAI 1x1j2d1 kernel uses gather instructions for numInCh > 64. + // Gather only works on local DRAM, not system memory (cache mode). + // Fix: apply stride manually, then call with stride=1 → dispatches to + // 1x1j1d1 kernel which has an aligned path that uses regular loads. 
+ // ======================================================================== + int need_prestride = (config->src_dim3_size > 2 * XCHAL_IVPN_SIMD_WIDTH) + && (config->stride_x > 1); + if (need_prestride) { + int pre_outW = config->dst_dim1_size; + int pre_outH = config->dst_dim2_size; + int pre_d1_pitch = (pre_outW + 2*XCHAL_IVPN_SIMD_WIDTH - 1) + & ~(2*XCHAL_IVPN_SIMD_WIDTH - 1); + int pre_d2_pitch = pre_d1_pitch * pre_outH; + int pre_buf_size = pre_d2_pitch * config->src_dim3_size; + int stride = config->stride_x; + + // Place pre-subsampled data after padded input (128-byte aligned) + int pre_offset = (input_buffer_size + 127) & ~127; + int8_t* pre_input = &padded_input[pre_offset]; + + if ((pre_offset + pre_buf_size) > (int)get_cache_padded_input_size()) { + return XAI_ERR_DATASIZE; + } + + memset(pre_input, config->input_zero_point, pre_buf_size); + + // Subsample: pick every stride-th pixel in W and H + int8_t* orig = &padded_input[data_offset]; + for (int d = 0; d < config->src_dim3_size; d++) { + for (int oy = 0; oy < pre_outH; oy++) { + for (int ox = 0; ox < pre_outW; ox++) { + pre_input[d * pre_d2_pitch + oy * pre_d1_pitch + ox] = + orig[d * dim2_pitch + (stride * oy) * dim1_pitch + stride * ox]; + } + } + } + + // Update tile_input to point to pre-subsampled data + XAI_TILE3D_SET_BUFF_PTR(&tile_input, pre_input); + XAI_TILE3D_SET_BUFF_SIZE(&tile_input, pre_buf_size); + XAI_TILE3D_SET_DATA_PTR(&tile_input, pre_input); + XAI_TILE3D_SET_DIM1_PITCH(&tile_input, pre_d1_pitch); + XAI_TILE3D_SET_DIM2_PITCH(&tile_input, pre_d2_pitch); + XAI_TILE3D_SET_DIM1(&tile_input, pre_outW); + XAI_TILE3D_SET_DIM2(&tile_input, pre_outH); + XAI_TILE3D_SET_DIM1_EDGE1(&tile_input, 0); + XAI_TILE3D_SET_DIM1_EDGE2(&tile_input, 0); + XAI_TILE3D_SET_DIM2_EDGE1(&tile_input, 0); + XAI_TILE3D_SET_DIM2_EDGE2(&tile_input, 0); + } + + // ======================================================================== + // Configure Coefficient Tile Descriptor (4D: W×H×C×N) + // 
======================================================================== + xai_tile4D tile_coeff; + XAI_TILE4D_SET_BUFF_PTR(&tile_coeff, coeff_ptr); + XAI_TILE4D_SET_BUFF_SIZE(&tile_coeff, config->coeff_dim3_pitch * config->dst_dim3_size); + XAI_TILE4D_SET_DATA_PTR(&tile_coeff, coeff_ptr); + XAI_TILE4D_SET_DATA_ORDER(&tile_coeff, XAI_WHDN); + XAI_TILE4D_SET_TYPE(&tile_coeff, XAI_TILE4D_S8); + XAI_TILE4D_SET_FRAME_PTR(&tile_coeff, 0); + XAI_TILE4D_SET_STATUS_FLAGS(&tile_coeff, 0); + XAI_TILE4D_SET_DIM1_PITCH(&tile_coeff, config->coeff_dim1_pitch); + XAI_TILE4D_SET_DIM2_PITCH(&tile_coeff, config->coeff_dim2_pitch); + XAI_TILE4D_SET_DIM3_PITCH(&tile_coeff, config->coeff_dim3_pitch); + XAI_TILE4D_SET_DIM1_COORD(&tile_coeff, 0); + XAI_TILE4D_SET_DIM1(&tile_coeff, config->coeff_dim1_size); + XAI_TILE4D_SET_DIM1_EDGE1(&tile_coeff, 0); + XAI_TILE4D_SET_DIM1_EDGE2(&tile_coeff, 0); + XAI_TILE4D_SET_DIM2_COORD(&tile_coeff, 0); + XAI_TILE4D_SET_DIM2(&tile_coeff, config->coeff_dim2_size); + XAI_TILE4D_SET_DIM2_EDGE1(&tile_coeff, 0); + XAI_TILE4D_SET_DIM2_EDGE2(&tile_coeff, 0); + XAI_TILE4D_SET_DIM3_COORD(&tile_coeff, 0); + XAI_TILE4D_SET_DIM3(&tile_coeff, config->coeff_dim3_size); + XAI_TILE4D_SET_DIM3_EDGE1(&tile_coeff, 0); + XAI_TILE4D_SET_DIM3_EDGE2(&tile_coeff, 0); + XAI_TILE4D_SET_DIM4_COORD(&tile_coeff, 0); + XAI_TILE4D_SET_DIM4(&tile_coeff, config->dst_dim3_size); + + // ======================================================================== + // Configure Bias Array + // ======================================================================== + xai_array tile_bias; + XAI_ARRAY_SET_BUFF_PTR(&tile_bias, bias_ptr); + XAI_ARRAY_SET_BUFF_SIZE(&tile_bias, config->dst_dim3_size * 4); + XAI_ARRAY_SET_DATA_PTR(&tile_bias, bias_ptr); + XAI_ARRAY_SET_WIDTH(&tile_bias, config->dst_dim3_size); + XAI_ARRAY_SET_HEIGHT(&tile_bias, 1); + XAI_ARRAY_SET_TYPE(&tile_bias, XAI_ARRAY_S32); + XAI_ARRAY_SET_CAPACITY(&tile_bias, config->dst_dim3_size); + + // 
======================================================================== + // Configure Output Scale Array + // ======================================================================== + xai_array tile_outscale; + XAI_ARRAY_SET_BUFF_PTR(&tile_outscale, outScale_ptr); + XAI_ARRAY_SET_BUFF_SIZE(&tile_outscale, config->dst_dim3_size * 2); + XAI_ARRAY_SET_DATA_PTR(&tile_outscale, outScale_ptr); + XAI_ARRAY_SET_WIDTH(&tile_outscale, config->dst_dim3_size); + XAI_ARRAY_SET_HEIGHT(&tile_outscale, 1); + XAI_ARRAY_SET_TYPE(&tile_outscale, XAI_ARRAY_U16); + XAI_ARRAY_SET_CAPACITY(&tile_outscale, config->dst_dim3_size); + + // ======================================================================== + // Configure Output Tile Descriptor + // ======================================================================== + xai_tile3D tile_output; + XAI_TILE3D_SET_BUFF_PTR(&tile_output, dst); + XAI_TILE3D_SET_BUFF_SIZE(&tile_output, config->dst_dim2_pitch * config->dst_dim3_size); + XAI_TILE3D_SET_DATA_PTR(&tile_output, dst); + XAI_TILE3D_SET_DATA_ORDER(&tile_output, XAI_WHD); + XAI_TILE3D_SET_TYPE(&tile_output, XAI_TILE3D_S8); + XAI_TILE3D_SET_FRAME_PTR(&tile_output, 0); + XAI_TILE3D_SET_STATUS_FLAGS(&tile_output, 0); + XAI_TILE3D_SET_DIM1_PITCH(&tile_output, config->dst_dim1_pitch); + XAI_TILE3D_SET_DIM2_PITCH(&tile_output, config->dst_dim2_pitch); + XAI_TILE3D_SET_DIM1_COORD(&tile_output, 0); + XAI_TILE3D_SET_DIM1(&tile_output, config->dst_dim1_size); + XAI_TILE3D_SET_DIM1_EDGE1(&tile_output, 0); + XAI_TILE3D_SET_DIM1_EDGE2(&tile_output, 0); + XAI_TILE3D_SET_DIM2_COORD(&tile_output, 0); + XAI_TILE3D_SET_DIM2(&tile_output, config->dst_dim2_size); + XAI_TILE3D_SET_DIM2_EDGE1(&tile_output, 0); + XAI_TILE3D_SET_DIM2_EDGE2(&tile_output, 0); + XAI_TILE3D_SET_DIM3_COORD(&tile_output, 0); + XAI_TILE3D_SET_DIM3(&tile_output, config->dst_dim3_size); + XAI_TILE3D_SET_DIM3_EDGE1(&tile_output, 0); + XAI_TILE3D_SET_DIM3_EDGE2(&tile_output, 0); + + // 
======================================================================== + // Configure Convolution Parameters + // ======================================================================== + xai_cnn_conv_params params; + memset(¶ms, 0, sizeof(params)); + XAI_CNN_CONV_SET_ACCUM_SHIFT(¶ms, config->accum_shift); + XAI_CNN_CONV_SET_DILATION(¶ms, config->dilation); + XAI_CNN_CONV_SET_FLAGS(¶ms, config->flags); + XAI_CNN_CONV_SET_OUTPUT_SCALE(¶ms, config->output_scale); + XAI_CNN_CONV_SET_OUTPUT_SHIFT(¶ms, config->output_shift); + XAI_CNN_CONV_SET_RELU_MAX(¶ms, config->relu_max); + XAI_CNN_CONV_SET_RELU_MIN(¶ms, config->relu_min); + // If pre-strided, override stride to 1 → dispatch to 1x1j1d1 (no gather) + if (need_prestride) { + XAI_CNN_CONV_SET_STRIDEX(¶ms, 1); + XAI_CNN_CONV_SET_STRIDEY(¶ms, 1); + } else { + XAI_CNN_CONV_SET_STRIDEX(¶ms, config->stride_x); + XAI_CNN_CONV_SET_STRIDEY(¶ms, config->stride_y); + } + + // ======================================================================== + // Execute convolution using generic system-memory API + // This version accesses data through the processor cache + // ======================================================================== + XAI_ERR_TYPE status = xaiConvolvedVQ3D(&tile_input, &tile_coeff, &tile_bias, + &tile_outscale, &tile_output, ¶ms); + + // Writeback output from cache to system memory for DMA coherency + xthal_dcache_region_writeback(dst, config->dst_dim2_pitch * config->dst_dim3_size); + + return status; +} + +// ============================================================================ +// Non-VQ (per-tensor output scaling) versions +// ============================================================================ + +// Non-VQ DMA version - per-tensor output scaling +XAI_ERR_TYPE conv_exec_1x1j2d1( + int8_t* src, + int8_t* dst, + int8_t* coeff_ptr, + int8_t* bias_ptr, + const conv_layer_config_t* config) +{ + // ======================================================================== + // SECTION 1: DRAM 
Buffer Allocation (no outscale buffer needed) + // ======================================================================== + int dram0_used = 0; + int dram1_used = 0; + + int8_t* p_input0 = allocate_dram_buffer(config->input_buffer_size, + config->input_ping_dram, + &dram0_used, &dram1_used); + int8_t* p_input1 = allocate_dram_buffer(config->input_buffer_size, + config->input_pong_dram, + &dram0_used, &dram1_used); + int8_t* p_coeff = allocate_dram_buffer(config->coeff_buffer_size, + config->coeff_dram, + &dram0_used, &dram1_used); + int8_t* p_output0 = allocate_dram_buffer(config->output_buffer_size, + config->output_ping_dram, + &dram0_used, &dram1_used); + int8_t* p_output1 = allocate_dram_buffer(config->output_buffer_size, + config->output_pong_dram, + &dram0_used, &dram1_used); + int8_t* p_bias = allocate_dram_buffer(config->bias_buffer_size, + config->bias_dram, + &dram0_used, &dram1_used); + + if (!p_input0 || !p_input1 || !p_coeff || + !p_output0 || !p_output1 || !p_bias) { + return (-1); + } + + // ======================================================================== + // SECTION 2: Initialize XAI Tile Descriptors + // ======================================================================== + xai_tile3D tile_input; + xai_size3D frame_size_input; + xai_tile4D tile_coeff; + xai_array tile_bias; + xai_tile3D tile_output; + xai_cnn_conv_params params; + memset(¶ms, 0, sizeof(params)); + + /* Initialize DMA engines */ + dma_3dm_init(1); + dma_2dm_init(0); + + // Transfer constant data (no outscale) + dma_1dm(0, coeff_ptr, p_coeff, config->coeff_buffer_size); + dma_1dm(0, bias_ptr, p_bias, config->bias_buffer_size); + + // Initialize input buffer and load first tile + _proto_FillBuffer_I8(p_input0, config->input_zero_point, config->input_buffer_size); + + // First DMA + dma_3dm(1, + (void*)src, + (void*)&p_input0[config->in_data_offset], + config->src_dim1_pitch, + config->in_dim1_pitch, + config->src_dim2_pitch, + config->in_dim2_pitch, + 
config->src_dim1_size, + config->in_rows_firstdma, + config->src_dim3_size); + + // Wait for all initial DMA transfers to complete + idma_hw_wait_all(0); // coeff + bias on ch0 + idma_hw_wait_all(1); // input on ch1 + + // ======================================================================== + // Configure Input Tile Descriptor + // ======================================================================== + XAI_TILE3D_SET_BUFF_PTR(&tile_input, p_input0); + XAI_TILE3D_SET_BUFF_SIZE(&tile_input, config->input_buffer_size); + XAI_TILE3D_SET_DATA_PTR(&tile_input, &p_input0[config->in_data_offset]); + XAI_TILE3D_SET_DATA_ORDER(&tile_input, XAI_WHD); + XAI_TILE3D_SET_TYPE(&tile_input, XAI_TILE3D_S8); + XAI_TILE3D_SET_FRAME_PTR(&tile_input, 0); + XAI_TILE3D_SET_STATUS_FLAGS(&tile_input, 0); + XAI_TILE3D_SET_DIM1_PITCH(&tile_input, config->in_dim1_pitch); + XAI_TILE3D_SET_DIM2_PITCH(&tile_input, config->in_dim2_pitch); + XAI_TILE3D_SET_DIM1_COORD(&tile_input, 0); + XAI_TILE3D_SET_DIM1(&tile_input, config->src_dim1_size); + XAI_TILE3D_SET_DIM1_EDGE1(&tile_input, config->in_dim1_edge1); + XAI_TILE3D_SET_DIM1_EDGE2(&tile_input, config->in_dim1_edge2); + XAI_TILE3D_SET_DIM2(&tile_input, config->in_rows_firstdma - config->in_dim2_edge1); + XAI_TILE3D_SET_DIM2_EDGE1(&tile_input, config->in_dim2_edge1); + XAI_TILE3D_SET_DIM2_EDGE2(&tile_input, config->in_dim2_edge2); + XAI_TILE3D_SET_DIM3_COORD(&tile_input, 0); + XAI_TILE3D_SET_DIM3(&tile_input, config->src_dim3_size); + XAI_TILE3D_SET_DIM3_EDGE1(&tile_input, config->in_dim3_edge1); + XAI_TILE3D_SET_DIM3_EDGE2(&tile_input, config->in_dim3_edge2); + + frame_size_input.dim1Size = config->src_dim1_size; + frame_size_input.dim2Size = config->src_dim2_size; + frame_size_input.dim3Size = config->src_dim3_size; + + // ======================================================================== + // Configure Coefficient Tile Descriptor + // ======================================================================== + 
XAI_TILE4D_SET_BUFF_PTR(&tile_coeff, p_coeff); + XAI_TILE4D_SET_BUFF_SIZE(&tile_coeff, config->coeff_buffer_size); + XAI_TILE4D_SET_DATA_PTR(&tile_coeff, p_coeff); + XAI_TILE4D_SET_DATA_ORDER(&tile_coeff, XAI_WHDN); + XAI_TILE4D_SET_TYPE(&tile_coeff, XAI_TILE4D_S8); + XAI_TILE4D_SET_FRAME_PTR(&tile_coeff, 0); + XAI_TILE4D_SET_STATUS_FLAGS(&tile_coeff, 0); + XAI_TILE4D_SET_DIM1_PITCH(&tile_coeff, config->coeff_dim1_pitch); + XAI_TILE4D_SET_DIM2_PITCH(&tile_coeff, config->coeff_dim2_pitch); + XAI_TILE4D_SET_DIM3_PITCH(&tile_coeff, config->coeff_dim3_pitch); + XAI_TILE4D_SET_DIM1_COORD(&tile_coeff, 0); + XAI_TILE4D_SET_DIM1(&tile_coeff, config->coeff_dim1_size); + XAI_TILE4D_SET_DIM1_EDGE1(&tile_coeff, 0); + XAI_TILE4D_SET_DIM1_EDGE2(&tile_coeff, 0); + XAI_TILE4D_SET_DIM2_COORD(&tile_coeff, 0); + XAI_TILE4D_SET_DIM2(&tile_coeff, config->coeff_dim2_size); + XAI_TILE4D_SET_DIM2_EDGE1(&tile_coeff, 0); + XAI_TILE4D_SET_DIM2_EDGE2(&tile_coeff, 0); + XAI_TILE4D_SET_DIM3_COORD(&tile_coeff, 0); + XAI_TILE4D_SET_DIM3(&tile_coeff, config->coeff_dim3_size); + XAI_TILE4D_SET_DIM3_EDGE1(&tile_coeff, 0); + XAI_TILE4D_SET_DIM3_EDGE2(&tile_coeff, 0); + XAI_TILE4D_SET_DIM4_COORD(&tile_coeff, 0); + XAI_TILE4D_SET_DIM4(&tile_coeff, config->n_tile_size); + + // ======================================================================== + // Configure Bias Array + // ======================================================================== + XAI_ARRAY_SET_BUFF_PTR(&tile_bias, p_bias); + XAI_ARRAY_SET_BUFF_SIZE(&tile_bias, config->bias_buffer_size); + XAI_ARRAY_SET_DATA_PTR(&tile_bias, p_bias); + XAI_ARRAY_SET_WIDTH(&tile_bias, config->n_tile_size); + XAI_ARRAY_SET_HEIGHT(&tile_bias, 1); + XAI_ARRAY_SET_TYPE(&tile_bias, XAI_ARRAY_S32); + XAI_ARRAY_SET_CAPACITY(&tile_bias, config->n_tile_size); + + // ======================================================================== + // Configure Output Tile Descriptor + // ======================================================================== + 
XAI_TILE3D_SET_BUFF_SIZE(&tile_output, config->output_buffer_size); + XAI_TILE3D_SET_DATA_ORDER(&tile_output, XAI_WHD); + XAI_TILE3D_SET_TYPE(&tile_output, XAI_TILE3D_S8); + XAI_TILE3D_SET_FRAME_PTR(&tile_output, 0); + XAI_TILE3D_SET_STATUS_FLAGS(&tile_output, 0); + XAI_TILE3D_SET_DIM1_PITCH(&tile_output, config->out_dim1_pitch); + XAI_TILE3D_SET_DIM2_PITCH(&tile_output, config->out_dim2_pitch); + XAI_TILE3D_SET_DIM1_COORD(&tile_output, 0); + XAI_TILE3D_SET_DIM1(&tile_output, config->dst_dim1_size); + XAI_TILE3D_SET_DIM1_EDGE1(&tile_output, 0); + XAI_TILE3D_SET_DIM1_EDGE2(&tile_output, 0); + XAI_TILE3D_SET_DIM2(&tile_output, config->output_rows); + XAI_TILE3D_SET_DIM2_EDGE1(&tile_output, 0); + XAI_TILE3D_SET_DIM2_EDGE2(&tile_output, 0); + XAI_TILE3D_SET_DIM3_COORD(&tile_output, 0); + XAI_TILE3D_SET_DIM3(&tile_output, config->n_tile_size); + XAI_TILE3D_SET_DIM3_EDGE1(&tile_output, 0); + XAI_TILE3D_SET_DIM3_EDGE2(&tile_output, 0); + + // ======================================================================== + // Configure Convolution Parameters + // ======================================================================== + XAI_CNN_CONV_SET_ACCUM_SHIFT(¶ms, config->accum_shift); + XAI_CNN_CONV_SET_DILATION(¶ms, config->dilation); + XAI_CNN_CONV_SET_FLAGS(¶ms, config->flags); + XAI_CNN_CONV_SET_OUTPUT_SCALE(¶ms, config->output_scale); + XAI_CNN_CONV_SET_OUTPUT_SHIFT(¶ms, config->output_shift); + XAI_CNN_CONV_SET_RELU_MAX(¶ms, config->relu_max); + XAI_CNN_CONV_SET_RELU_MIN(¶ms, config->relu_min); + XAI_CNN_CONV_SET_STRIDEX(¶ms, config->stride_x); + XAI_CNN_CONV_SET_STRIDEY(¶ms, config->stride_y); + + // ======================================================================== + // SECTION 3: Tiled Execution Loop + // ======================================================================== + int last_tile = 1; + + for (int idx_n = 0; idx_n < config->n_tiles; idx_n++) { + int last_n_tile = (last_tile) && (idx_n == config->n_tiles - 1); + int current_n_size = (idx_n < 
config->n_tiles - 1) ? + config->n_tile_size : config->n_tile_size_last; + + // Update coefficient/bias for N-tile (no outscale) + XAI_TILE4D_SET_DIM4_COORD(&tile_coeff, config->n_tile_size * idx_n); + XAI_TILE4D_SET_DIM4(&tile_coeff, current_n_size); + + XAI_ARRAY_SET_DATA_PTR(&tile_bias, &p_bias[config->n_tile_size * 4 * idx_n]); + XAI_ARRAY_SET_WIDTH(&tile_bias, current_n_size); + XAI_ARRAY_SET_CAPACITY(&tile_bias, current_n_size); + + XAI_TILE3D_SET_DIM3_COORD(&tile_output, config->n_tile_size * idx_n); + XAI_TILE3D_SET_DIM3(&tile_output, current_n_size); + + for (int idx_h = 0; idx_h < config->height_tiles; idx_h++) { + int last_h_tile = (last_n_tile) && (idx_h == config->height_tiles - 1); + + int current_output_rows = (idx_h < config->height_tiles - 1) ? + config->output_rows : + (config->dst_dim2_size - (config->output_rows * idx_h)); + + // ================================================================ + // Prefetch Next Input Tile + // ================================================================ + if (!last_h_tile) { + int temp_idx_h; + inc_iter_to_temp(&temp_idx_h, idx_h, config->height_tiles, 1); + _proto_FillBuffer_I8(p_input1, config->input_zero_point, config->input_buffer_size); + + dma_3dm(1, + &(src[(config->stride_y * config->output_rows * temp_idx_h) * config->src_dim1_size]), + &(p_input1[0]), + config->src_dim1_pitch, + config->in_dim1_pitch, + config->src_dim2_pitch, + config->in_dim2_pitch, + config->src_dim1_size, + config->input_rows, + config->src_dim3_size); + } + + // ================================================================ + // Update Tile Descriptors + // ================================================================ + XAI_TILE3D_SET_BUFF_PTR(&tile_input, p_input0); + XAI_TILE3D_SET_DATA_PTR(&tile_input, &(p_input0[0])); + XAI_TILE3D_SET_DIM2_COORD(&tile_input, (config->stride_y * config->output_rows)*(idx_h)); + XAI_TILE3D_SET_BUFF_PTR(&tile_output, p_output1); + XAI_TILE3D_SET_DATA_PTR(&tile_output, &(p_output1[0])); 
+ XAI_TILE3D_SET_DIM2_COORD(&tile_output, (config->output_rows)*(idx_h)); + XAI_TILE3D_SET_DIM2(&tile_output, current_output_rows); + + // ================================================================ + // Perform Convolution (non-VQ API) + // ================================================================ + XAI_ERR_TYPE status = xaiConvolved3D_S_1x1j2d1_S8S8IX_MOW_WHD( + &(tile_input), + &(tile_coeff), + &(tile_bias), + &(tile_output), + &(params)); + + if (status != XAI_ERR_OK) { + return status; + } + + // Prefetch next coefficient tile + if ((!(last_h_tile)) && ((idx_h) == (config->height_tiles - 1))) { + int temp_idx_n; + int temp_idx_h; + inc_iter_to_temp(&(temp_idx_n), idx_n, config->n_tiles, inc_iter_to_temp(&(temp_idx_h), idx_h, config->height_tiles, 1)); + dma_1dm(0, (coeff_ptr + (config->coeff_buffer_size * temp_idx_n)), &(p_coeff[0]), (((temp_idx_n) < (config->n_tiles-1))?(config->coeff_buffer_size):(config->coeff_dim1_size * config->coeff_dim2_size * config->coeff_dim3_size * config->n_tile_size_last))); + } + + // ================================================================ + // Write Output Tile to System Memory + // ================================================================ + int output_row_bytes = config->out_dim1_pitch * current_output_rows; + dma_2dm(0, + &(p_output1[0]), + &dst[((config->dst_dim2_pitch * config->n_tile_size)*(idx_n))+((config->out_dim1_pitch * config->output_rows)*(idx_h))], + config->out_dim2_pitch, + config->dst_dim2_pitch, + output_row_bytes, + current_n_size); + + swap_buffers(&(p_output0), &(p_output1)); + swap_buffers(&(p_input0), &(p_input1)); + } + } + + // Wait for final output DMA to complete before returning + idma_hw_wait_all(0); + + return XAI_ERR_OK; +} + +// Non-VQ cache version - per-tensor output scaling +XAI_ERR_TYPE conv_exec_1x1j2d1_cache( + int8_t* src, + int8_t* dst, + int8_t* coeff_ptr, + int8_t* bias_ptr, + const conv_layer_config_t* config) +{ + // 
======================================================================== + // Setup source raw tile descriptor (points to raw input without padding) + // ======================================================================== + xai_tile3D src_raw; + XAI_TILE3D_SET_BUFF_PTR(&src_raw, src); + XAI_TILE3D_SET_BUFF_SIZE(&src_raw, config->src_dim2_pitch * config->src_dim3_size); + XAI_TILE3D_SET_DATA_PTR(&src_raw, src); + XAI_TILE3D_SET_DATA_ORDER(&src_raw, XAI_WHD); + XAI_TILE3D_SET_TYPE(&src_raw, XAI_TILE3D_S8); + XAI_TILE3D_SET_FRAME_PTR(&src_raw, 0); + XAI_TILE3D_SET_STATUS_FLAGS(&src_raw, 0); + XAI_TILE3D_SET_DIM1_PITCH(&src_raw, config->src_dim1_pitch); + XAI_TILE3D_SET_DIM2_PITCH(&src_raw, config->src_dim2_pitch); + XAI_TILE3D_SET_DIM1_COORD(&src_raw, 0); + XAI_TILE3D_SET_DIM1(&src_raw, config->src_dim1_size); + XAI_TILE3D_SET_DIM1_EDGE1(&src_raw, 0); + XAI_TILE3D_SET_DIM1_EDGE2(&src_raw, 0); + XAI_TILE3D_SET_DIM2_COORD(&src_raw, 0); + XAI_TILE3D_SET_DIM2(&src_raw, config->src_dim2_size); + XAI_TILE3D_SET_DIM2_EDGE1(&src_raw, 0); + XAI_TILE3D_SET_DIM2_EDGE2(&src_raw, 0); + XAI_TILE3D_SET_DIM3_COORD(&src_raw, 0); + XAI_TILE3D_SET_DIM3(&src_raw, config->src_dim3_size); + XAI_TILE3D_SET_DIM3_EDGE1(&src_raw, 0); + XAI_TILE3D_SET_DIM3_EDGE2(&src_raw, 0); + + // ======================================================================== + // Get padded input buffer from allocator (SIMD-aligned dim1_pitch required) + // ======================================================================== + int padded_dim1 = config->src_dim1_size + config->in_dim1_edge1 + config->in_dim1_edge2; + int dim1_pitch = (padded_dim1 + 2*XCHAL_IVPN_SIMD_WIDTH - 1) & ~(2*XCHAL_IVPN_SIMD_WIDTH - 1); + int padded_dim2 = config->src_dim2_size + config->in_dim2_edge1 + config->in_dim2_edge2; + int dim2_pitch = dim1_pitch * padded_dim2; + int input_buffer_size = dim2_pitch * config->src_dim3_size; + + int8_t* padded_input = get_cache_padded_input(); + + if (input_buffer_size > 
(int)get_cache_padded_input_size()) { + return XAI_ERR_DATASIZE; + } + + memset(padded_input, config->input_zero_point, input_buffer_size); + + // ======================================================================== + // Setup padded input tile descriptor + // ======================================================================== + int data_offset = (config->in_dim2_edge1 * dim1_pitch) + config->in_dim1_edge1; + + xai_tile3D tile_input; + XAI_TILE3D_SET_BUFF_PTR(&tile_input, padded_input); + XAI_TILE3D_SET_BUFF_SIZE(&tile_input, input_buffer_size); + XAI_TILE3D_SET_DATA_PTR(&tile_input, &padded_input[data_offset]); + XAI_TILE3D_SET_DATA_ORDER(&tile_input, XAI_WHD); + XAI_TILE3D_SET_TYPE(&tile_input, XAI_TILE3D_S8); + XAI_TILE3D_SET_FRAME_PTR(&tile_input, 0); + XAI_TILE3D_SET_STATUS_FLAGS(&tile_input, 0); + XAI_TILE3D_SET_DIM1_PITCH(&tile_input, dim1_pitch); + XAI_TILE3D_SET_DIM2_PITCH(&tile_input, dim2_pitch); + XAI_TILE3D_SET_DIM1_COORD(&tile_input, 0); + XAI_TILE3D_SET_DIM1(&tile_input, config->src_dim1_size); + XAI_TILE3D_SET_DIM1_EDGE1(&tile_input, config->in_dim1_edge1); + XAI_TILE3D_SET_DIM1_EDGE2(&tile_input, config->in_dim1_edge2); + XAI_TILE3D_SET_DIM2_COORD(&tile_input, 0); + XAI_TILE3D_SET_DIM2(&tile_input, config->src_dim2_size); + XAI_TILE3D_SET_DIM2_EDGE1(&tile_input, config->in_dim2_edge1); + XAI_TILE3D_SET_DIM2_EDGE2(&tile_input, config->in_dim2_edge2); + XAI_TILE3D_SET_DIM3_COORD(&tile_input, 0); + XAI_TILE3D_SET_DIM3(&tile_input, config->src_dim3_size); + XAI_TILE3D_SET_DIM3_EDGE1(&tile_input, 0); + XAI_TILE3D_SET_DIM3_EDGE2(&tile_input, 0); + + // Copy raw input to padded buffer and extend edges + xaiCopyTile3D(&src_raw, &tile_input, true); + + xai_size3D frame_size; + frame_size.dim1Size = config->dst_dim1_size * config->stride_x; + frame_size.dim2Size = config->dst_dim2_size * config->stride_y; + frame_size.dim3Size = config->src_dim3_size; + + xaiExtendEdgesConst3D_I8(&tile_input, config->input_zero_point, frame_size); + + // 
======================================================================== + // Pre-subsample input when numInCh > 64 (gather instruction workaround) + // The XAI 1x1j2d1 kernel uses gather instructions for numInCh > 64. + // Gather only works on local DRAM, not system memory (cache mode). + // Fix: apply stride manually, then call with stride=1 → dispatches to + // 1x1j1d1 kernel which has an aligned path that uses regular loads. + // ======================================================================== + int need_prestride = (config->src_dim3_size > 2 * XCHAL_IVPN_SIMD_WIDTH) + && (config->stride_x > 1); + if (need_prestride) { + int pre_outW = config->dst_dim1_size; + int pre_outH = config->dst_dim2_size; + int pre_d1_pitch = (pre_outW + 2*XCHAL_IVPN_SIMD_WIDTH - 1) + & ~(2*XCHAL_IVPN_SIMD_WIDTH - 1); + int pre_d2_pitch = pre_d1_pitch * pre_outH; + int pre_buf_size = pre_d2_pitch * config->src_dim3_size; + int stride = config->stride_x; + + // Place pre-subsampled data after padded input (128-byte aligned) + int pre_offset = (input_buffer_size + 127) & ~127; + int8_t* pre_input = &padded_input[pre_offset]; + + if ((pre_offset + pre_buf_size) > (int)get_cache_padded_input_size()) { + return XAI_ERR_DATASIZE; + } + + memset(pre_input, config->input_zero_point, pre_buf_size); + + // Subsample: pick every stride-th pixel in W and H + int8_t* orig = &padded_input[data_offset]; + for (int d = 0; d < config->src_dim3_size; d++) { + for (int oy = 0; oy < pre_outH; oy++) { + for (int ox = 0; ox < pre_outW; ox++) { + pre_input[d * pre_d2_pitch + oy * pre_d1_pitch + ox] = + orig[d * dim2_pitch + (stride * oy) * dim1_pitch + stride * ox]; + } + } + } + + // Update tile_input to point to pre-subsampled data + XAI_TILE3D_SET_BUFF_PTR(&tile_input, pre_input); + XAI_TILE3D_SET_BUFF_SIZE(&tile_input, pre_buf_size); + XAI_TILE3D_SET_DATA_PTR(&tile_input, pre_input); + XAI_TILE3D_SET_DIM1_PITCH(&tile_input, pre_d1_pitch); + XAI_TILE3D_SET_DIM2_PITCH(&tile_input, pre_d2_pitch); + 
XAI_TILE3D_SET_DIM1(&tile_input, pre_outW); + XAI_TILE3D_SET_DIM2(&tile_input, pre_outH); + XAI_TILE3D_SET_DIM1_EDGE1(&tile_input, 0); + XAI_TILE3D_SET_DIM1_EDGE2(&tile_input, 0); + XAI_TILE3D_SET_DIM2_EDGE1(&tile_input, 0); + XAI_TILE3D_SET_DIM2_EDGE2(&tile_input, 0); + } + + // ======================================================================== + // Configure Coefficient Tile Descriptor + // ======================================================================== + xai_tile4D tile_coeff; + XAI_TILE4D_SET_BUFF_PTR(&tile_coeff, coeff_ptr); + XAI_TILE4D_SET_BUFF_SIZE(&tile_coeff, config->coeff_dim3_pitch * config->dst_dim3_size); + XAI_TILE4D_SET_DATA_PTR(&tile_coeff, coeff_ptr); + XAI_TILE4D_SET_DATA_ORDER(&tile_coeff, XAI_WHDN); + XAI_TILE4D_SET_TYPE(&tile_coeff, XAI_TILE4D_S8); + XAI_TILE4D_SET_FRAME_PTR(&tile_coeff, 0); + XAI_TILE4D_SET_STATUS_FLAGS(&tile_coeff, 0); + XAI_TILE4D_SET_DIM1_PITCH(&tile_coeff, config->coeff_dim1_pitch); + XAI_TILE4D_SET_DIM2_PITCH(&tile_coeff, config->coeff_dim2_pitch); + XAI_TILE4D_SET_DIM3_PITCH(&tile_coeff, config->coeff_dim3_pitch); + XAI_TILE4D_SET_DIM1_COORD(&tile_coeff, 0); + XAI_TILE4D_SET_DIM1(&tile_coeff, config->coeff_dim1_size); + XAI_TILE4D_SET_DIM1_EDGE1(&tile_coeff, 0); + XAI_TILE4D_SET_DIM1_EDGE2(&tile_coeff, 0); + XAI_TILE4D_SET_DIM2_COORD(&tile_coeff, 0); + XAI_TILE4D_SET_DIM2(&tile_coeff, config->coeff_dim2_size); + XAI_TILE4D_SET_DIM2_EDGE1(&tile_coeff, 0); + XAI_TILE4D_SET_DIM2_EDGE2(&tile_coeff, 0); + XAI_TILE4D_SET_DIM3_COORD(&tile_coeff, 0); + XAI_TILE4D_SET_DIM3(&tile_coeff, config->coeff_dim3_size); + XAI_TILE4D_SET_DIM3_EDGE1(&tile_coeff, 0); + XAI_TILE4D_SET_DIM3_EDGE2(&tile_coeff, 0); + XAI_TILE4D_SET_DIM4_COORD(&tile_coeff, 0); + XAI_TILE4D_SET_DIM4(&tile_coeff, config->dst_dim3_size); + + // ======================================================================== + // Configure Bias Array + // ======================================================================== + xai_array tile_bias; + 
XAI_ARRAY_SET_BUFF_PTR(&tile_bias, bias_ptr); + XAI_ARRAY_SET_BUFF_SIZE(&tile_bias, config->dst_dim3_size * 4); + XAI_ARRAY_SET_DATA_PTR(&tile_bias, bias_ptr); + XAI_ARRAY_SET_WIDTH(&tile_bias, config->dst_dim3_size); + XAI_ARRAY_SET_HEIGHT(&tile_bias, 1); + XAI_ARRAY_SET_TYPE(&tile_bias, XAI_ARRAY_S32); + XAI_ARRAY_SET_CAPACITY(&tile_bias, config->dst_dim3_size); + + // ======================================================================== + // Configure Output Tile Descriptor + // ======================================================================== + xai_tile3D tile_output; + XAI_TILE3D_SET_BUFF_PTR(&tile_output, dst); + XAI_TILE3D_SET_BUFF_SIZE(&tile_output, config->dst_dim2_pitch * config->dst_dim3_size); + XAI_TILE3D_SET_DATA_PTR(&tile_output, dst); + XAI_TILE3D_SET_DATA_ORDER(&tile_output, XAI_WHD); + XAI_TILE3D_SET_TYPE(&tile_output, XAI_TILE3D_S8); + XAI_TILE3D_SET_FRAME_PTR(&tile_output, 0); + XAI_TILE3D_SET_STATUS_FLAGS(&tile_output, 0); + XAI_TILE3D_SET_DIM1_PITCH(&tile_output, config->dst_dim1_pitch); + XAI_TILE3D_SET_DIM2_PITCH(&tile_output, config->dst_dim2_pitch); + XAI_TILE3D_SET_DIM1_COORD(&tile_output, 0); + XAI_TILE3D_SET_DIM1(&tile_output, config->dst_dim1_size); + XAI_TILE3D_SET_DIM1_EDGE1(&tile_output, 0); + XAI_TILE3D_SET_DIM1_EDGE2(&tile_output, 0); + XAI_TILE3D_SET_DIM2_COORD(&tile_output, 0); + XAI_TILE3D_SET_DIM2(&tile_output, config->dst_dim2_size); + XAI_TILE3D_SET_DIM2_EDGE1(&tile_output, 0); + XAI_TILE3D_SET_DIM2_EDGE2(&tile_output, 0); + XAI_TILE3D_SET_DIM3_COORD(&tile_output, 0); + XAI_TILE3D_SET_DIM3(&tile_output, config->dst_dim3_size); + XAI_TILE3D_SET_DIM3_EDGE1(&tile_output, 0); + XAI_TILE3D_SET_DIM3_EDGE2(&tile_output, 0); + + // ======================================================================== + // Configure Convolution Parameters + // ======================================================================== + xai_cnn_conv_params params; + memset(¶ms, 0, sizeof(params)); + XAI_CNN_CONV_SET_ACCUM_SHIFT(¶ms, 
config->accum_shift); + XAI_CNN_CONV_SET_DILATION(¶ms, config->dilation); + XAI_CNN_CONV_SET_FLAGS(¶ms, config->flags); + XAI_CNN_CONV_SET_OUTPUT_SCALE(¶ms, config->output_scale); + XAI_CNN_CONV_SET_OUTPUT_SHIFT(¶ms, config->output_shift); + XAI_CNN_CONV_SET_RELU_MAX(¶ms, config->relu_max); + XAI_CNN_CONV_SET_RELU_MIN(¶ms, config->relu_min); + // If pre-strided, override stride to 1 → dispatch to 1x1j1d1 (no gather) + if (need_prestride) { + XAI_CNN_CONV_SET_STRIDEX(¶ms, 1); + XAI_CNN_CONV_SET_STRIDEY(¶ms, 1); + } else { + XAI_CNN_CONV_SET_STRIDEX(¶ms, config->stride_x); + XAI_CNN_CONV_SET_STRIDEY(¶ms, config->stride_y); + } + + // ======================================================================== + // Execute convolution (non-VQ API) + // ======================================================================== + XAI_ERR_TYPE status = xaiConvolved3D(&tile_input, &tile_coeff, &tile_bias, + &tile_output, ¶ms); + + // Writeback output from cache to system memory for DMA coherency + xthal_dcache_region_writeback(dst, config->dst_dim2_pitch * config->dst_dim3_size); + + return status; +} \ No newline at end of file diff --git a/backends/cadence/vision/operators/conv/conv_exec_3x3j1d1.c b/backends/cadence/vision/operators/conv/conv_exec_3x3j1d1.c new file mode 100644 index 00000000000..381e7ead66c --- /dev/null +++ b/backends/cadence/vision/operators/conv/conv_exec_3x3j1d1.c @@ -0,0 +1,1030 @@ +#include "kernel_executors.h" +#include "memory_manager.h" +#include "dma.h" +#include "utils.h" +#include +#include +#include + +// VQ (per-channel output scaling) DMA version +XAI_ERR_TYPE conv_exec_3x3j1d1VQ( + int8_t* src, + int8_t* dst, + int8_t* coeff_ptr, + int8_t* bias_ptr, + int8_t* outScale_ptr, + const conv_layer_config_t* config) +{ + // ======================================================================== + // SECTION 1: DRAM Buffer Allocation + // ======================================================================== + int dram0_used = 0; + int dram1_used 
= 0; + + // FIX: Allocate coeff FIRST to avoid address-sensitive FOLD16 bug. + // See FUNCTIONALITY_FIXES.md §2 for details. + int8_t* p_coeff = allocate_dram_buffer(config->coeff_buffer_size, + config->coeff_dram, + &dram0_used, &dram1_used); + int8_t* p_input0 = allocate_dram_buffer(config->input_buffer_size, + config->input_ping_dram, + &dram0_used, &dram1_used); + int8_t* p_input1 = allocate_dram_buffer(config->input_buffer_size, + config->input_pong_dram, + &dram0_used, &dram1_used); + int8_t* p_output0 = allocate_dram_buffer(config->output_buffer_size, + config->output_ping_dram, + &dram0_used, &dram1_used); + int8_t* p_output1 = allocate_dram_buffer(config->output_buffer_size, + config->output_pong_dram, + &dram0_used, &dram1_used); + int8_t* p_bias = allocate_dram_buffer(config->bias_buffer_size, + config->bias_dram, + &dram0_used, &dram1_used); + int8_t* p_outscale = allocate_dram_buffer(config->outscale_buffer_size, + config->outscale_dram, + &dram0_used, &dram1_used); + + if (!p_input0 || !p_input1 || !p_coeff || + !p_output0 || !p_output1 || !p_bias || !p_outscale) { + return (-1); + } + + // ======================================================================== + // SECTION 2: Initialize XAI Tile Descriptors + // ======================================================================== + xai_tile3D tile_input; + xai_size3D frame_size_input; + xai_tile4D tile_coeff; + xai_array tile_bias; + xai_array tile_outscale; + xai_tile3D tile_output; + xai_cnn_conv_params params; + memset(¶ms, 0, sizeof(params)); + + // Transfer constant data (all buffers are 64-byte aligned by test harness) + dma_1dm(0, coeff_ptr, p_coeff, config->coeff_buffer_size); + dma_1dm(0, bias_ptr, p_bias, config->bias_buffer_size); + dma_1dm(0, outScale_ptr, p_outscale, config->outscale_buffer_size); + + // Initialize input buffer and load first tile + _proto_FillBuffer_I8(p_input0, config->input_zero_point, config->input_buffer_size); + + // First DMA: load IN_ROWS_FIRSTDMA rows at 
data offset + dma_3dm(1, + (void*)src, + (void*)&p_input0[config->in_data_offset], + config->src_dim1_pitch, + config->in_dim1_pitch, + config->src_dim2_pitch, + config->in_dim2_pitch, + config->src_dim1_size, + config->in_rows_firstdma, + config->src_dim3_size); + + // Wait for all initial DMA transfers to complete + idma_hw_wait_all(0); // coeff + bias + outscale on ch0 + idma_hw_wait_all(1); // input on ch1 + + // ======================================================================== + // Configure Input Tile Descriptor + // ======================================================================== + XAI_TILE3D_SET_BUFF_PTR(&tile_input, p_input0); + XAI_TILE3D_SET_BUFF_SIZE(&tile_input, config->input_buffer_size); + XAI_TILE3D_SET_DATA_PTR(&tile_input, &p_input0[config->in_data_offset]); + XAI_TILE3D_SET_DATA_ORDER(&tile_input, XAI_WHD); + XAI_TILE3D_SET_TYPE(&tile_input, XAI_TILE3D_S8); + XAI_TILE3D_SET_FRAME_PTR(&tile_input, 0); + XAI_TILE3D_SET_STATUS_FLAGS(&tile_input, 0); + XAI_TILE3D_SET_DIM1_PITCH(&tile_input, config->in_dim1_pitch); + XAI_TILE3D_SET_DIM2_PITCH(&tile_input, config->in_dim2_pitch); + XAI_TILE3D_SET_DIM1_COORD(&tile_input, 0); + XAI_TILE3D_SET_DIM1(&tile_input, config->src_dim1_size); + XAI_TILE3D_SET_DIM1_EDGE1(&tile_input, config->in_dim1_edge1); + XAI_TILE3D_SET_DIM1_EDGE2(&tile_input, config->in_dim1_edge2); + XAI_TILE3D_SET_DIM2(&tile_input, (config->in_rows_firstdma - config->in_dim2_edge1)); + XAI_TILE3D_SET_DIM2_EDGE1(&tile_input, config->in_dim2_edge1); + XAI_TILE3D_SET_DIM2_EDGE2(&tile_input, config->in_dim2_edge2); + XAI_TILE3D_SET_DIM3_COORD(&tile_input, 0); + XAI_TILE3D_SET_DIM3(&tile_input, config->src_dim3_size); + XAI_TILE3D_SET_DIM3_EDGE1(&tile_input, config->in_dim3_edge1); + XAI_TILE3D_SET_DIM3_EDGE2(&tile_input, config->in_dim3_edge2); + + // Frame size for edge extension + frame_size_input.dim1Size = config->src_dim1_size; + frame_size_input.dim2Size = config->src_dim2_size; + frame_size_input.dim3Size = 
config->src_dim3_size; + + // ======================================================================== + // Configure Coefficient Tile Descriptor (4D: W×H×C×N) + // ======================================================================== + XAI_TILE4D_SET_BUFF_PTR(&tile_coeff, p_coeff); + XAI_TILE4D_SET_BUFF_SIZE(&tile_coeff, config->coeff_buffer_size); + XAI_TILE4D_SET_DATA_PTR(&tile_coeff, p_coeff); + XAI_TILE4D_SET_DATA_ORDER(&tile_coeff, XAI_WHDN); + XAI_TILE4D_SET_TYPE(&tile_coeff, XAI_TILE4D_S8); + XAI_TILE4D_SET_FRAME_PTR(&tile_coeff, 0); + XAI_TILE4D_SET_STATUS_FLAGS(&tile_coeff, 0); + XAI_TILE4D_SET_DIM1_PITCH(&tile_coeff, config->coeff_dim1_pitch); + XAI_TILE4D_SET_DIM2_PITCH(&tile_coeff, config->coeff_dim2_pitch); + XAI_TILE4D_SET_DIM3_PITCH(&tile_coeff, config->coeff_dim3_pitch); + XAI_TILE4D_SET_DIM1_COORD(&tile_coeff, 0); + XAI_TILE4D_SET_DIM1(&tile_coeff, config->coeff_dim1_size); + XAI_TILE4D_SET_DIM1_EDGE1(&tile_coeff, 0); + XAI_TILE4D_SET_DIM1_EDGE2(&tile_coeff, 0); + XAI_TILE4D_SET_DIM2_COORD(&tile_coeff, 0); + XAI_TILE4D_SET_DIM2(&tile_coeff, config->coeff_dim2_size); + XAI_TILE4D_SET_DIM2_EDGE1(&tile_coeff, 0); + XAI_TILE4D_SET_DIM2_EDGE2(&tile_coeff, 0); + XAI_TILE4D_SET_DIM3_COORD(&tile_coeff, 0); + XAI_TILE4D_SET_DIM3(&tile_coeff, config->coeff_dim3_size); + XAI_TILE4D_SET_DIM3_EDGE1(&tile_coeff, 0); + XAI_TILE4D_SET_DIM3_EDGE2(&tile_coeff, 0); + XAI_TILE4D_SET_DIM4_COORD(&tile_coeff, 0); + XAI_TILE4D_SET_DIM4(&tile_coeff, config->n_tile_size); + + // ======================================================================== + // Configure Bias Array + // ======================================================================== + XAI_ARRAY_SET_BUFF_PTR(&tile_bias, p_bias); + XAI_ARRAY_SET_BUFF_SIZE(&tile_bias, config->bias_buffer_size); + XAI_ARRAY_SET_DATA_PTR(&tile_bias, p_bias); + XAI_ARRAY_SET_WIDTH(&tile_bias, config->n_tile_size); + XAI_ARRAY_SET_HEIGHT(&tile_bias, 1); + XAI_ARRAY_SET_TYPE(&tile_bias, XAI_ARRAY_S32); + 
XAI_ARRAY_SET_CAPACITY(&tile_bias, config->n_tile_size); + + // ======================================================================== + // Configure Output Scale Array + // ======================================================================== + XAI_ARRAY_SET_BUFF_PTR(&tile_outscale, p_outscale); + XAI_ARRAY_SET_BUFF_SIZE(&tile_outscale, config->outscale_buffer_size); + XAI_ARRAY_SET_DATA_PTR(&tile_outscale, p_outscale); + XAI_ARRAY_SET_WIDTH(&tile_outscale, config->n_tile_size); + XAI_ARRAY_SET_HEIGHT(&tile_outscale, 1); + XAI_ARRAY_SET_TYPE(&tile_outscale, XAI_ARRAY_U16); + XAI_ARRAY_SET_CAPACITY(&tile_outscale, config->n_tile_size); + + // ======================================================================== + // Configure Output Tile Descriptor + // ======================================================================== + XAI_TILE3D_SET_BUFF_SIZE(&tile_output, config->output_buffer_size); + XAI_TILE3D_SET_DATA_ORDER(&tile_output, XAI_WHD); + XAI_TILE3D_SET_TYPE(&tile_output, XAI_TILE3D_S8); + XAI_TILE3D_SET_FRAME_PTR(&tile_output, 0); + XAI_TILE3D_SET_STATUS_FLAGS(&tile_output, 0); + XAI_TILE3D_SET_DIM1_PITCH(&tile_output, config->out_dim1_pitch); + XAI_TILE3D_SET_DIM2_PITCH(&tile_output, config->out_dim2_pitch); + XAI_TILE3D_SET_DIM1_COORD(&tile_output, 0); + XAI_TILE3D_SET_DIM1(&tile_output, config->dst_dim1_size); + XAI_TILE3D_SET_DIM1_EDGE1(&tile_output, 0); + XAI_TILE3D_SET_DIM1_EDGE2(&tile_output, 0); + XAI_TILE3D_SET_DIM2(&tile_output, config->output_rows); + XAI_TILE3D_SET_DIM2_EDGE1(&tile_output, 0); + XAI_TILE3D_SET_DIM2_EDGE2(&tile_output, 0); + XAI_TILE3D_SET_DIM3_COORD(&tile_output, 0); + XAI_TILE3D_SET_DIM3(&tile_output, config->n_tile_size); + XAI_TILE3D_SET_DIM3_EDGE1(&tile_output, 0); + XAI_TILE3D_SET_DIM3_EDGE2(&tile_output, 0); + + // ======================================================================== + // Configure Convolution Parameters + // ======================================================================== + 
XAI_CNN_CONV_SET_ACCUM_SHIFT(¶ms, config->accum_shift); + XAI_CNN_CONV_SET_DILATION(¶ms, config->dilation); + XAI_CNN_CONV_SET_FLAGS(¶ms, config->flags); + XAI_CNN_CONV_SET_OUTPUT_SCALE(¶ms, config->output_scale); + XAI_CNN_CONV_SET_OUTPUT_SHIFT(¶ms, config->output_shift); + XAI_CNN_CONV_SET_RELU_MAX(¶ms, config->relu_max); + XAI_CNN_CONV_SET_RELU_MIN(¶ms, config->relu_min); + XAI_CNN_CONV_SET_STRIDEX(¶ms, config->stride_x); + XAI_CNN_CONV_SET_STRIDEY(¶ms, config->stride_y); + + // ======================================================================== + // SECTION 3: Tiled Execution Loop (N-tiles × H-tiles) + // ======================================================================== + int last_tile = 1; + + for (int idx_n = 0; idx_n < config->n_tiles; idx_n++) { + int last_n_tile = (last_tile) && (idx_n == config->n_tiles - 1); + int current_n_size = (idx_n < config->n_tiles - 1) ? + config->n_tile_size : config->n_tile_size_last; + + // Update coefficient/bias/outscale for N-tile (matching convIdma.c line 649) + XAI_TILE4D_SET_DIM4_COORD(&tile_coeff, config->n_tile_size * idx_n); + XAI_TILE4D_SET_DIM4(&tile_coeff, current_n_size); + + XAI_ARRAY_SET_DATA_PTR(&tile_bias, &p_bias[config->n_tile_size * 4 * idx_n]); + XAI_ARRAY_SET_WIDTH(&tile_bias, current_n_size); + XAI_ARRAY_SET_CAPACITY(&tile_bias, current_n_size); + + XAI_ARRAY_SET_DATA_PTR(&tile_outscale, &p_outscale[config->n_tile_size * 2 * idx_n]); + XAI_ARRAY_SET_WIDTH(&tile_outscale, current_n_size); + XAI_ARRAY_SET_CAPACITY(&tile_outscale, current_n_size); + + XAI_TILE3D_SET_DIM3_COORD(&tile_output, config->n_tile_size * idx_n); + XAI_TILE3D_SET_DIM3(&tile_output, current_n_size); + + // Process vertical tiles (matching convIdma.c lines 664-728) + for (int idx_h = 0; idx_h < config->height_tiles; idx_h++) { + int last_h_tile = (last_n_tile) && (idx_h == config->height_tiles - 1); + + // Calculate actual rows for this height tile (handle last tile edge case) + int current_output_rows = (idx_h < 
config->height_tiles - 1) ? + config->output_rows : + (config->dst_dim2_size - (config->output_rows * idx_h)); + + // ================================================================ + // Prefetch Next Input Tile (Ping-Pong Buffering) + // ================================================================ + if (!last_h_tile) { + int temp_idx_h; + inc_iter_to_temp(&temp_idx_h, idx_h, config->height_tiles, 1); + _proto_FillBuffer_I8(p_input1, config->input_zero_point, config->input_buffer_size); + + dma_3dm( /*ch */ 1, + /*src */ (void*)&(src[max(((config->stride_y * config->output_rows * temp_idx_h - config->in_dim2_edge1) * config->src_dim1_size),0)]), + /*dst */ (void*)&(p_input1[max((((-(config->output_rows)* config->in_dim1_pitch))*(temp_idx_h))+(config->in_data_offset),1)]), + /*src_row_pitch */ config->src_dim1_pitch, + /*dst_row_pitch */ config->in_dim1_pitch, + /*src_tile_pitch */ config->src_dim2_pitch, + /*dst_tile_pitch */ config->in_dim2_pitch, + /*row_sz */ config->src_dim1_size, + /*nrows */ min(((config->stride_y * config->output_rows)*(temp_idx_h))+(config->input_rows-config->in_dim2_edge1),min((-(config->stride_y * config->output_rows))*(temp_idx_h)+(config->src_dim2_size + config->in_dim2_edge2),config->input_rows)), + /*ntiles */ config->src_dim3_size); + } + + // ================================================================ + // Update Tile Descriptors for Current Iteration (matching convIdma.c lines 694-700) + // ================================================================ + XAI_TILE3D_SET_BUFF_PTR(&tile_input, p_input0); + XAI_TILE3D_SET_DATA_PTR(&tile_input, &p_input0[config->in_data_offset]); + XAI_TILE3D_SET_DIM2_COORD(&tile_input, (config->stride_y * config->output_rows)*(idx_h)); + // Note: DIM2 stays constant at (in_rows_firstdma - edge1) - set once above + XAI_TILE3D_SET_BUFF_PTR(&tile_output, p_output1); + XAI_TILE3D_SET_DATA_PTR(&tile_output, &(p_output1[0])); + XAI_TILE3D_SET_DIM2_COORD(&tile_output, 
(config->output_rows)*(idx_h)); + XAI_TILE3D_SET_DIM2(&tile_output, current_output_rows); + + // ================================================================ + // Perform Edge Extension and Convolution + // ================================================================ + xaiExtendEdgesConst3D_I8(&tile_input, config->input_zero_point, frame_size_input); + + XAI_ERR_TYPE status = xaiConvolvedVQ3D_S_3x3j1d1_S8S8IX_MOW_WHD(&(tile_input), + &(tile_coeff), + &(tile_bias), &(tile_outscale), + &(tile_output), &(params)); + + if (status != XAI_ERR_OK) { + return status; + } + + // ================================================================ + // Prefetch next coefficient tile (if needed) + // ================================================================ + if ((!(last_h_tile)) && ((idx_h) == (config->height_tiles - 1))) { + int temp_idx_n; + int temp_idx_h; + + inc_iter_to_temp(&(temp_idx_n),idx_n, config->n_tiles , inc_iter_to_temp(&(temp_idx_h), idx_h, config->height_tiles, 1)); + + dma_1dm(0,/* src */ (((coeff_ptr) + ((config->coeff_buffer_size)*(temp_idx_n)))), /* dst */ &(p_coeff[0]), /* row size */ (((temp_idx_n) < (config->n_tiles-1))?(config->coeff_buffer_size):(config->coeff_dim1_size * config->coeff_dim2_size * config->coeff_dim3_size * config->n_tile_size_last))); + } + + // ================================================================ + // Write Output Tile to System Memory (matching convIdma.c lines 718-724) + // ================================================================ + // Calculate actual output bytes for this height tile + // Last height tile may have fewer rows than output_rows + int output_row_bytes = config->out_dim1_pitch * current_output_rows; + + dma_2dm( 0, + /* src */ &(p_output1[0]), + /* dst */ &dst[((config->dst_dim2_pitch * config->n_tile_size)*(idx_n))+((config->out_dim1_pitch * config->output_rows)*(idx_h))], + /* src stride 2d */ config->out_dim2_pitch, + /* dst stride 2d */ config->dst_dim2_pitch, + /* row size */ 
output_row_bytes, + /* count 2d */ current_n_size); + + swap_buffers(&(p_output0), &(p_output1)); + swap_buffers(&(p_input0), &(p_input1)); + } + } + + // Wait for final output DMA to complete before returning + idma_hw_wait_all(0); + + return XAI_ERR_OK; +} + +// VQ (per-channel output scaling) cache version +// All data stays in system memory and is accessed through processor cache +XAI_ERR_TYPE conv_exec_3x3j1d1VQ_cache( + int8_t* src, + int8_t* dst, + int8_t* coeff_ptr, + int8_t* bias_ptr, + int8_t* outScale_ptr, + const conv_layer_config_t* config) +{ + // ======================================================================== + // Setup source raw tile descriptor (points to raw input without padding) + // ======================================================================== + xai_tile3D src_raw; + XAI_TILE3D_SET_BUFF_PTR(&src_raw, src); + XAI_TILE3D_SET_BUFF_SIZE(&src_raw, config->src_dim2_pitch * config->src_dim3_size); + XAI_TILE3D_SET_DATA_PTR(&src_raw, src); + XAI_TILE3D_SET_DATA_ORDER(&src_raw, XAI_WHD); + XAI_TILE3D_SET_TYPE(&src_raw, XAI_TILE3D_S8); + XAI_TILE3D_SET_FRAME_PTR(&src_raw, 0); + XAI_TILE3D_SET_STATUS_FLAGS(&src_raw, 0); + XAI_TILE3D_SET_DIM1_PITCH(&src_raw, config->src_dim1_pitch); + XAI_TILE3D_SET_DIM2_PITCH(&src_raw, config->src_dim2_pitch); + XAI_TILE3D_SET_DIM1_COORD(&src_raw, 0); + XAI_TILE3D_SET_DIM1(&src_raw, config->src_dim1_size); + XAI_TILE3D_SET_DIM1_EDGE1(&src_raw, 0); + XAI_TILE3D_SET_DIM1_EDGE2(&src_raw, 0); + XAI_TILE3D_SET_DIM2_COORD(&src_raw, 0); + XAI_TILE3D_SET_DIM2(&src_raw, config->src_dim2_size); + XAI_TILE3D_SET_DIM2_EDGE1(&src_raw, 0); + XAI_TILE3D_SET_DIM2_EDGE2(&src_raw, 0); + XAI_TILE3D_SET_DIM3_COORD(&src_raw, 0); + XAI_TILE3D_SET_DIM3(&src_raw, config->src_dim3_size); + XAI_TILE3D_SET_DIM3_EDGE1(&src_raw, 0); + XAI_TILE3D_SET_DIM3_EDGE2(&src_raw, 0); + + // ======================================================================== + // Get padded input buffer from allocator (shared across cache kernels) + // 
======================================================================== + int padded_dim1 = config->src_dim1_size + config->in_dim1_edge1 + config->in_dim1_edge2; + int dim1_pitch = (padded_dim1 + 2*XCHAL_IVPN_SIMD_WIDTH - 1) & ~(2*XCHAL_IVPN_SIMD_WIDTH - 1); + int padded_dim2 = config->src_dim2_size + config->in_dim2_edge1 + config->in_dim2_edge2; + int dim2_pitch = dim1_pitch * padded_dim2; + int input_buffer_size = dim2_pitch * config->src_dim3_size; + + // Get shared padded input buffer from allocator + int8_t* padded_input = get_cache_padded_input(); + + if (input_buffer_size > (int)get_cache_padded_input_size()) { + return XAI_ERR_DATASIZE; + } + + // Zero-fill the padded buffer + memset(padded_input, config->input_zero_point, input_buffer_size); + + // ======================================================================== + // Setup padded input tile descriptor + // ======================================================================== + int data_offset = (config->in_dim2_edge1 * dim1_pitch) + config->in_dim1_edge1; + + xai_tile3D tile_input; + XAI_TILE3D_SET_BUFF_PTR(&tile_input, padded_input); + XAI_TILE3D_SET_BUFF_SIZE(&tile_input, input_buffer_size); + XAI_TILE3D_SET_DATA_PTR(&tile_input, &padded_input[data_offset]); + XAI_TILE3D_SET_DATA_ORDER(&tile_input, XAI_WHD); + XAI_TILE3D_SET_TYPE(&tile_input, XAI_TILE3D_S8); + XAI_TILE3D_SET_FRAME_PTR(&tile_input, 0); + XAI_TILE3D_SET_STATUS_FLAGS(&tile_input, 0); + XAI_TILE3D_SET_DIM1_PITCH(&tile_input, dim1_pitch); + XAI_TILE3D_SET_DIM2_PITCH(&tile_input, dim2_pitch); + XAI_TILE3D_SET_DIM1_COORD(&tile_input, 0); + XAI_TILE3D_SET_DIM1(&tile_input, config->src_dim1_size); + XAI_TILE3D_SET_DIM1_EDGE1(&tile_input, config->in_dim1_edge1); + XAI_TILE3D_SET_DIM1_EDGE2(&tile_input, config->in_dim1_edge2); + XAI_TILE3D_SET_DIM2_COORD(&tile_input, 0); + XAI_TILE3D_SET_DIM2(&tile_input, config->src_dim2_size); + XAI_TILE3D_SET_DIM2_EDGE1(&tile_input, config->in_dim2_edge1); + XAI_TILE3D_SET_DIM2_EDGE2(&tile_input, 
config->in_dim2_edge2); + XAI_TILE3D_SET_DIM3_COORD(&tile_input, 0); + XAI_TILE3D_SET_DIM3(&tile_input, config->src_dim3_size); + XAI_TILE3D_SET_DIM3_EDGE1(&tile_input, 0); + XAI_TILE3D_SET_DIM3_EDGE2(&tile_input, 0); + + // ======================================================================== + // Copy raw input to padded buffer and extend edges + // ======================================================================== +#ifdef USE_DMA_FOR_CACHE_COPY + // Use DMA 3D transfer to copy input data into padded buffer at data_offset + dma_3dm(0, + /* src */ src, + /* dst */ &padded_input[data_offset], + /* src_row_pitch */ config->src_dim1_pitch, + /* dst_row_pitch */ dim1_pitch, + /* src_tile_pitch */ config->src_dim2_pitch, + /* dst_tile_pitch */ dim2_pitch, + /* row_sz */ config->src_dim1_size, + /* nrows */ config->src_dim2_size, + /* ntiles */ config->src_dim3_size); +#else + // Use library tile copy function (no DMA required) + // Safe manual copy: avoids SIMD overread near source buffer boundary + for (int d = 0; d < config->src_dim3_size; d++) { + for (int h = 0; h < config->src_dim2_size; h++) { + memcpy(&padded_input[data_offset + d * dim2_pitch + h * dim1_pitch], + &src[d * config->src_dim2_pitch + h * config->src_dim1_pitch], + config->src_dim1_size); + } + } + (void)src_raw; +#endif + + xai_size3D frame_size; + frame_size.dim1Size = config->dst_dim1_size * config->stride_x; + frame_size.dim2Size = config->dst_dim2_size * config->stride_y; + frame_size.dim3Size = config->src_dim3_size; + + xaiExtendEdgesConst3D_I8(&tile_input, config->input_zero_point, frame_size); + + // ======================================================================== + // Configure Coefficient Tile Descriptor (4D: W×H×C×N) + // ======================================================================== + xai_tile4D tile_coeff; + XAI_TILE4D_SET_BUFF_PTR(&tile_coeff, coeff_ptr); + XAI_TILE4D_SET_BUFF_SIZE(&tile_coeff, config->coeff_dim3_pitch * config->dst_dim3_size); + 
XAI_TILE4D_SET_DATA_PTR(&tile_coeff, coeff_ptr); + XAI_TILE4D_SET_DATA_ORDER(&tile_coeff, XAI_WHDN); + XAI_TILE4D_SET_TYPE(&tile_coeff, XAI_TILE4D_S8); + XAI_TILE4D_SET_FRAME_PTR(&tile_coeff, 0); + XAI_TILE4D_SET_STATUS_FLAGS(&tile_coeff, 0); + XAI_TILE4D_SET_DIM1_PITCH(&tile_coeff, config->coeff_dim1_pitch); + XAI_TILE4D_SET_DIM2_PITCH(&tile_coeff, config->coeff_dim2_pitch); + XAI_TILE4D_SET_DIM3_PITCH(&tile_coeff, config->coeff_dim3_pitch); + XAI_TILE4D_SET_DIM1_COORD(&tile_coeff, 0); + XAI_TILE4D_SET_DIM1(&tile_coeff, config->coeff_dim1_size); + XAI_TILE4D_SET_DIM1_EDGE1(&tile_coeff, 0); + XAI_TILE4D_SET_DIM1_EDGE2(&tile_coeff, 0); + XAI_TILE4D_SET_DIM2_COORD(&tile_coeff, 0); + XAI_TILE4D_SET_DIM2(&tile_coeff, config->coeff_dim2_size); + XAI_TILE4D_SET_DIM2_EDGE1(&tile_coeff, 0); + XAI_TILE4D_SET_DIM2_EDGE2(&tile_coeff, 0); + XAI_TILE4D_SET_DIM3_COORD(&tile_coeff, 0); + XAI_TILE4D_SET_DIM3(&tile_coeff, config->coeff_dim3_size); + XAI_TILE4D_SET_DIM3_EDGE1(&tile_coeff, 0); + XAI_TILE4D_SET_DIM3_EDGE2(&tile_coeff, 0); + XAI_TILE4D_SET_DIM4_COORD(&tile_coeff, 0); + XAI_TILE4D_SET_DIM4(&tile_coeff, config->dst_dim3_size); + + // ======================================================================== + // Configure Bias Array + // ======================================================================== + xai_array tile_bias; + XAI_ARRAY_SET_BUFF_PTR(&tile_bias, bias_ptr); + XAI_ARRAY_SET_BUFF_SIZE(&tile_bias, config->dst_dim3_size * 4); + XAI_ARRAY_SET_DATA_PTR(&tile_bias, bias_ptr); + XAI_ARRAY_SET_WIDTH(&tile_bias, config->dst_dim3_size); + XAI_ARRAY_SET_HEIGHT(&tile_bias, 1); + XAI_ARRAY_SET_TYPE(&tile_bias, XAI_ARRAY_S32); + XAI_ARRAY_SET_CAPACITY(&tile_bias, config->dst_dim3_size); + + // ======================================================================== + // Configure Output Scale Array + // ======================================================================== + xai_array tile_outscale; + XAI_ARRAY_SET_BUFF_PTR(&tile_outscale, outScale_ptr); + 
XAI_ARRAY_SET_BUFF_SIZE(&tile_outscale, config->dst_dim3_size * 2); + XAI_ARRAY_SET_DATA_PTR(&tile_outscale, outScale_ptr); + XAI_ARRAY_SET_WIDTH(&tile_outscale, config->dst_dim3_size); + XAI_ARRAY_SET_HEIGHT(&tile_outscale, 1); + XAI_ARRAY_SET_TYPE(&tile_outscale, XAI_ARRAY_U16); + XAI_ARRAY_SET_CAPACITY(&tile_outscale, config->dst_dim3_size); + + // ======================================================================== + // Configure Output Tile Descriptor (points to system memory) + // ======================================================================== + xai_tile3D tile_output; + XAI_TILE3D_SET_BUFF_PTR(&tile_output, dst); + XAI_TILE3D_SET_BUFF_SIZE(&tile_output, config->dst_dim2_pitch * config->dst_dim3_size); + XAI_TILE3D_SET_DATA_PTR(&tile_output, dst); + XAI_TILE3D_SET_DATA_ORDER(&tile_output, XAI_WHD); + XAI_TILE3D_SET_TYPE(&tile_output, XAI_TILE3D_S8); + XAI_TILE3D_SET_FRAME_PTR(&tile_output, 0); + XAI_TILE3D_SET_STATUS_FLAGS(&tile_output, 0); + XAI_TILE3D_SET_DIM1_PITCH(&tile_output, config->dst_dim1_pitch); + XAI_TILE3D_SET_DIM2_PITCH(&tile_output, config->dst_dim2_pitch); + XAI_TILE3D_SET_DIM1_COORD(&tile_output, 0); + XAI_TILE3D_SET_DIM1(&tile_output, config->dst_dim1_size); + XAI_TILE3D_SET_DIM1_EDGE1(&tile_output, 0); + XAI_TILE3D_SET_DIM1_EDGE2(&tile_output, 0); + XAI_TILE3D_SET_DIM2_COORD(&tile_output, 0); + XAI_TILE3D_SET_DIM2(&tile_output, config->dst_dim2_size); + XAI_TILE3D_SET_DIM2_EDGE1(&tile_output, 0); + XAI_TILE3D_SET_DIM2_EDGE2(&tile_output, 0); + XAI_TILE3D_SET_DIM3_COORD(&tile_output, 0); + XAI_TILE3D_SET_DIM3(&tile_output, config->dst_dim3_size); + XAI_TILE3D_SET_DIM3_EDGE1(&tile_output, 0); + XAI_TILE3D_SET_DIM3_EDGE2(&tile_output, 0); + + // ======================================================================== + // Configure Convolution Parameters + // ======================================================================== + xai_cnn_conv_params params; + memset(¶ms, 0, sizeof(params)); + XAI_CNN_CONV_SET_ACCUM_SHIFT(¶ms, 
config->accum_shift); + XAI_CNN_CONV_SET_DILATION(¶ms, config->dilation); + XAI_CNN_CONV_SET_FLAGS(¶ms, config->flags); + XAI_CNN_CONV_SET_OUTPUT_SCALE(¶ms, config->output_scale); + XAI_CNN_CONV_SET_OUTPUT_SHIFT(¶ms, config->output_shift); + XAI_CNN_CONV_SET_RELU_MAX(¶ms, config->relu_max); + XAI_CNN_CONV_SET_RELU_MIN(¶ms, config->relu_min); + XAI_CNN_CONV_SET_STRIDEX(¶ms, config->stride_x); + XAI_CNN_CONV_SET_STRIDEY(¶ms, config->stride_y); + + // ======================================================================== + // Execute convolution using generic system-memory API + // This version accesses data through the processor cache + // ======================================================================== + XAI_ERR_TYPE status = xaiConvolvedVQ3D(&tile_input, &tile_coeff, &tile_bias, + &tile_outscale, &tile_output, ¶ms); + + // Writeback output from cache to system memory for DMA coherency + xthal_dcache_region_writeback(dst, config->dst_dim2_pitch * config->dst_dim3_size); + + return status; +} + +// ============================================================================ +// Non-VQ (per-tensor output scaling) versions +// ============================================================================ + +// Non-VQ DMA version - per-tensor output scaling +XAI_ERR_TYPE conv_exec_3x3j1d1( + int8_t* src, + int8_t* dst, + int8_t* coeff_ptr, + int8_t* bias_ptr, + const conv_layer_config_t* config) +{ + // ======================================================================== + // SECTION 1: DRAM Buffer Allocation (no outscale buffer needed) + // ======================================================================== + int dram0_used = 0; + int dram1_used = 0; + + // FIX: Allocate coeff FIRST to avoid address-sensitive FOLD16 bug. + // See FUNCTIONALITY_FIXES.md §2 for details. 
+ int8_t* p_coeff = allocate_dram_buffer(config->coeff_buffer_size, + config->coeff_dram, + &dram0_used, &dram1_used); + int8_t* p_input0 = allocate_dram_buffer(config->input_buffer_size, + config->input_ping_dram, + &dram0_used, &dram1_used); + int8_t* p_input1 = allocate_dram_buffer(config->input_buffer_size, + config->input_pong_dram, + &dram0_used, &dram1_used); + int8_t* p_output0 = allocate_dram_buffer(config->output_buffer_size, + config->output_ping_dram, + &dram0_used, &dram1_used); + int8_t* p_output1 = allocate_dram_buffer(config->output_buffer_size, + config->output_pong_dram, + &dram0_used, &dram1_used); + int8_t* p_bias = allocate_dram_buffer(config->bias_buffer_size, + config->bias_dram, + &dram0_used, &dram1_used); + + if (!p_input0 || !p_input1 || !p_coeff || + !p_output0 || !p_output1 || !p_bias) { + return (-1); + } + + // ======================================================================== + // SECTION 2: Initialize XAI Tile Descriptors + // ======================================================================== + xai_tile3D tile_input; + xai_size3D frame_size_input; + xai_tile4D tile_coeff; + xai_array tile_bias; + xai_tile3D tile_output; + xai_cnn_conv_params params; + memset(¶ms, 0, sizeof(params)); + + /* Initialize DMA engines */ + dma_3dm_init(1); + dma_2dm_init(0); + + // Transfer constant data (no outscale) + dma_1dm(0, coeff_ptr, p_coeff, config->coeff_buffer_size); + dma_1dm(0, bias_ptr, p_bias, config->bias_buffer_size); + + // Initialize input buffer and load first tile + _proto_FillBuffer_I8(p_input0, config->input_zero_point, config->input_buffer_size); + + // First DMA: load IN_ROWS_FIRSTDMA rows at data offset + dma_3dm(1, + (void*)src, + (void*)&p_input0[config->in_data_offset], + config->src_dim1_pitch, + config->in_dim1_pitch, + config->src_dim2_pitch, + config->in_dim2_pitch, + config->src_dim1_size, + config->in_rows_firstdma, + config->src_dim3_size); + + // Wait for all initial DMA transfers to complete + 
idma_hw_wait_all(0); // coeff + bias on ch0 + idma_hw_wait_all(1); // input on ch1 + + // ======================================================================== + // Configure Input Tile Descriptor + // ======================================================================== + XAI_TILE3D_SET_BUFF_PTR(&tile_input, p_input0); + XAI_TILE3D_SET_BUFF_SIZE(&tile_input, config->input_buffer_size); + XAI_TILE3D_SET_DATA_PTR(&tile_input, &p_input0[config->in_data_offset]); + XAI_TILE3D_SET_DATA_ORDER(&tile_input, XAI_WHD); + XAI_TILE3D_SET_TYPE(&tile_input, XAI_TILE3D_S8); + XAI_TILE3D_SET_FRAME_PTR(&tile_input, 0); + XAI_TILE3D_SET_STATUS_FLAGS(&tile_input, 0); + XAI_TILE3D_SET_DIM1_PITCH(&tile_input, config->in_dim1_pitch); + XAI_TILE3D_SET_DIM2_PITCH(&tile_input, config->in_dim2_pitch); + XAI_TILE3D_SET_DIM1_COORD(&tile_input, 0); + XAI_TILE3D_SET_DIM1(&tile_input, config->src_dim1_size); + XAI_TILE3D_SET_DIM1_EDGE1(&tile_input, config->in_dim1_edge1); + XAI_TILE3D_SET_DIM1_EDGE2(&tile_input, config->in_dim1_edge2); + XAI_TILE3D_SET_DIM2(&tile_input, (config->in_rows_firstdma - config->in_dim2_edge1)); + XAI_TILE3D_SET_DIM2_EDGE1(&tile_input, config->in_dim2_edge1); + XAI_TILE3D_SET_DIM2_EDGE2(&tile_input, config->in_dim2_edge2); + XAI_TILE3D_SET_DIM3_COORD(&tile_input, 0); + XAI_TILE3D_SET_DIM3(&tile_input, config->src_dim3_size); + XAI_TILE3D_SET_DIM3_EDGE1(&tile_input, config->in_dim3_edge1); + XAI_TILE3D_SET_DIM3_EDGE2(&tile_input, config->in_dim3_edge2); + + // Frame size for edge extension + frame_size_input.dim1Size = config->src_dim1_size; + frame_size_input.dim2Size = config->src_dim2_size; + frame_size_input.dim3Size = config->src_dim3_size; + + // ======================================================================== + // Configure Coefficient Tile Descriptor (4D: W×H×C×N) + // ======================================================================== + XAI_TILE4D_SET_BUFF_PTR(&tile_coeff, p_coeff); + XAI_TILE4D_SET_BUFF_SIZE(&tile_coeff, 
config->coeff_buffer_size); + XAI_TILE4D_SET_DATA_PTR(&tile_coeff, p_coeff); + XAI_TILE4D_SET_DATA_ORDER(&tile_coeff, XAI_WHDN); + XAI_TILE4D_SET_TYPE(&tile_coeff, XAI_TILE4D_S8); + XAI_TILE4D_SET_FRAME_PTR(&tile_coeff, 0); + XAI_TILE4D_SET_STATUS_FLAGS(&tile_coeff, 0); + XAI_TILE4D_SET_DIM1_PITCH(&tile_coeff, config->coeff_dim1_pitch); + XAI_TILE4D_SET_DIM2_PITCH(&tile_coeff, config->coeff_dim2_pitch); + XAI_TILE4D_SET_DIM3_PITCH(&tile_coeff, config->coeff_dim3_pitch); + XAI_TILE4D_SET_DIM1_COORD(&tile_coeff, 0); + XAI_TILE4D_SET_DIM1(&tile_coeff, config->coeff_dim1_size); + XAI_TILE4D_SET_DIM1_EDGE1(&tile_coeff, 0); + XAI_TILE4D_SET_DIM1_EDGE2(&tile_coeff, 0); + XAI_TILE4D_SET_DIM2_COORD(&tile_coeff, 0); + XAI_TILE4D_SET_DIM2(&tile_coeff, config->coeff_dim2_size); + XAI_TILE4D_SET_DIM2_EDGE1(&tile_coeff, 0); + XAI_TILE4D_SET_DIM2_EDGE2(&tile_coeff, 0); + XAI_TILE4D_SET_DIM3_COORD(&tile_coeff, 0); + XAI_TILE4D_SET_DIM3(&tile_coeff, config->coeff_dim3_size); + XAI_TILE4D_SET_DIM3_EDGE1(&tile_coeff, 0); + XAI_TILE4D_SET_DIM3_EDGE2(&tile_coeff, 0); + XAI_TILE4D_SET_DIM4_COORD(&tile_coeff, 0); + XAI_TILE4D_SET_DIM4(&tile_coeff, config->n_tile_size); + + // ======================================================================== + // Configure Bias Array + // ======================================================================== + XAI_ARRAY_SET_BUFF_PTR(&tile_bias, p_bias); + XAI_ARRAY_SET_BUFF_SIZE(&tile_bias, config->bias_buffer_size); + XAI_ARRAY_SET_DATA_PTR(&tile_bias, p_bias); + XAI_ARRAY_SET_WIDTH(&tile_bias, config->n_tile_size); + XAI_ARRAY_SET_HEIGHT(&tile_bias, 1); + XAI_ARRAY_SET_TYPE(&tile_bias, XAI_ARRAY_S32); + XAI_ARRAY_SET_CAPACITY(&tile_bias, config->n_tile_size); + + // ======================================================================== + // Configure Output Tile Descriptor + // ======================================================================== + XAI_TILE3D_SET_BUFF_SIZE(&tile_output, config->output_buffer_size); + 
XAI_TILE3D_SET_DATA_ORDER(&tile_output, XAI_WHD); + XAI_TILE3D_SET_TYPE(&tile_output, XAI_TILE3D_S8); + XAI_TILE3D_SET_FRAME_PTR(&tile_output, 0); + XAI_TILE3D_SET_STATUS_FLAGS(&tile_output, 0); + XAI_TILE3D_SET_DIM1_PITCH(&tile_output, config->out_dim1_pitch); + XAI_TILE3D_SET_DIM2_PITCH(&tile_output, config->out_dim2_pitch); + XAI_TILE3D_SET_DIM1_COORD(&tile_output, 0); + XAI_TILE3D_SET_DIM1(&tile_output, config->dst_dim1_size); + XAI_TILE3D_SET_DIM1_EDGE1(&tile_output, 0); + XAI_TILE3D_SET_DIM1_EDGE2(&tile_output, 0); + XAI_TILE3D_SET_DIM2(&tile_output, config->output_rows); + XAI_TILE3D_SET_DIM2_EDGE1(&tile_output, 0); + XAI_TILE3D_SET_DIM2_EDGE2(&tile_output, 0); + XAI_TILE3D_SET_DIM3_COORD(&tile_output, 0); + XAI_TILE3D_SET_DIM3(&tile_output, config->n_tile_size); + XAI_TILE3D_SET_DIM3_EDGE1(&tile_output, 0); + XAI_TILE3D_SET_DIM3_EDGE2(&tile_output, 0); + + // ======================================================================== + // Configure Convolution Parameters + // ======================================================================== + XAI_CNN_CONV_SET_ACCUM_SHIFT(¶ms, config->accum_shift); + XAI_CNN_CONV_SET_DILATION(¶ms, config->dilation); + XAI_CNN_CONV_SET_FLAGS(¶ms, config->flags); + XAI_CNN_CONV_SET_OUTPUT_SCALE(¶ms, config->output_scale); + XAI_CNN_CONV_SET_OUTPUT_SHIFT(¶ms, config->output_shift); + XAI_CNN_CONV_SET_RELU_MAX(¶ms, config->relu_max); + XAI_CNN_CONV_SET_RELU_MIN(¶ms, config->relu_min); + XAI_CNN_CONV_SET_STRIDEX(¶ms, config->stride_x); + XAI_CNN_CONV_SET_STRIDEY(¶ms, config->stride_y); + + // ======================================================================== + // SECTION 3: Tiled Execution Loop (N-tiles × H-tiles) + // ======================================================================== + int last_tile = 1; + + for (int idx_n = 0; idx_n < config->n_tiles; idx_n++) { + int last_n_tile = (last_tile) && (idx_n == config->n_tiles - 1); + int current_n_size = (idx_n < config->n_tiles - 1) ? 
+ config->n_tile_size : config->n_tile_size_last; + + // Update coefficient/bias for N-tile (no outscale) + XAI_TILE4D_SET_DIM4_COORD(&tile_coeff, config->n_tile_size * idx_n); + XAI_TILE4D_SET_DIM4(&tile_coeff, current_n_size); + + XAI_ARRAY_SET_DATA_PTR(&tile_bias, &p_bias[config->n_tile_size * 4 * idx_n]); + XAI_ARRAY_SET_WIDTH(&tile_bias, current_n_size); + XAI_ARRAY_SET_CAPACITY(&tile_bias, current_n_size); + + XAI_TILE3D_SET_DIM3_COORD(&tile_output, config->n_tile_size * idx_n); + XAI_TILE3D_SET_DIM3(&tile_output, current_n_size); + + // Process vertical tiles + for (int idx_h = 0; idx_h < config->height_tiles; idx_h++) { + int last_h_tile = (last_n_tile) && (idx_h == config->height_tiles - 1); + + // Calculate actual rows for this height tile + int current_output_rows = (idx_h < config->height_tiles - 1) ? + config->output_rows : + (config->dst_dim2_size - (config->output_rows * idx_h)); + + // ================================================================ + // Prefetch Next Input Tile (Ping-Pong Buffering) + // ================================================================ + if (!last_h_tile) { + int temp_idx_h; + inc_iter_to_temp(&temp_idx_h, idx_h, config->height_tiles, 1); + _proto_FillBuffer_I8(p_input1, config->input_zero_point, config->input_buffer_size); + + dma_3dm(1, + (void*)&(src[max(((config->stride_y * config->output_rows * temp_idx_h - config->in_dim2_edge1) * config->src_dim1_size),0)]), + (void*)&(p_input1[max((((-(config->output_rows)* config->in_dim1_pitch))*(temp_idx_h))+(config->in_data_offset),1)]), + config->src_dim1_pitch, + config->in_dim1_pitch, + config->src_dim2_pitch, + config->in_dim2_pitch, + config->src_dim1_size, + min(((config->stride_y * config->output_rows)*(temp_idx_h))+(config->input_rows-config->in_dim2_edge1),min((-(config->stride_y * config->output_rows))*(temp_idx_h)+(config->src_dim2_size + config->in_dim2_edge2),config->input_rows)), + config->src_dim3_size); + } + + // 
================================================================ + // Update Tile Descriptors for Current Iteration + // ================================================================ + XAI_TILE3D_SET_BUFF_PTR(&tile_input, p_input0); + XAI_TILE3D_SET_DATA_PTR(&tile_input, &p_input0[config->in_data_offset]); + XAI_TILE3D_SET_DIM2_COORD(&tile_input, (config->stride_y * config->output_rows)*(idx_h)); + XAI_TILE3D_SET_BUFF_PTR(&tile_output, p_output1); + XAI_TILE3D_SET_DATA_PTR(&tile_output, &(p_output1[0])); + XAI_TILE3D_SET_DIM2_COORD(&tile_output, (config->output_rows)*(idx_h)); + XAI_TILE3D_SET_DIM2(&tile_output, current_output_rows); + + // ================================================================ + // Perform Edge Extension and Convolution (non-VQ API) + // ================================================================ + xaiExtendEdgesConst3D_I8(&tile_input, config->input_zero_point, frame_size_input); + + XAI_ERR_TYPE status = xaiConvolved3D_S_3x3j1d1_S8S8IX_MOW_WHD( + &(tile_input), + &(tile_coeff), + &(tile_bias), + &(tile_output), + &(params)); + + if (status != XAI_ERR_OK) { + return status; + } + + // ================================================================ + // Prefetch next coefficient tile (if needed) + // ================================================================ + if ((!(last_h_tile)) && ((idx_h) == (config->height_tiles - 1))) { + int temp_idx_n; + int temp_idx_h; + inc_iter_to_temp(&(temp_idx_n),idx_n, config->n_tiles, inc_iter_to_temp(&(temp_idx_h), idx_h, config->height_tiles, 1)); + dma_1dm(0, (coeff_ptr + (config->coeff_buffer_size * temp_idx_n)), &(p_coeff[0]), (((temp_idx_n) < (config->n_tiles-1))?(config->coeff_buffer_size):(config->coeff_dim1_size * config->coeff_dim2_size * config->coeff_dim3_size * config->n_tile_size_last))); + } + + // ================================================================ + // Write Output Tile to System Memory + // ================================================================ + int 
output_row_bytes = config->out_dim1_pitch * current_output_rows; + dma_2dm(0, + &(p_output1[0]), + &dst[((config->dst_dim2_pitch * config->n_tile_size)*(idx_n))+((config->out_dim1_pitch * config->output_rows)*(idx_h))], + config->out_dim2_pitch, + config->dst_dim2_pitch, + output_row_bytes, + current_n_size); + + swap_buffers(&(p_output0), &(p_output1)); + swap_buffers(&(p_input0), &(p_input1)); + } + } + + // Wait for final output DMA to complete before returning + idma_hw_wait_all(0); + + return XAI_ERR_OK; +} + +// Non-VQ cache version - per-tensor output scaling +XAI_ERR_TYPE conv_exec_3x3j1d1_cache( + int8_t* src, + int8_t* dst, + int8_t* coeff_ptr, + int8_t* bias_ptr, + const conv_layer_config_t* config) +{ + // ======================================================================== + // Setup source raw tile descriptor + // ======================================================================== + xai_tile3D src_raw; + XAI_TILE3D_SET_BUFF_PTR(&src_raw, src); + XAI_TILE3D_SET_BUFF_SIZE(&src_raw, config->src_dim2_pitch * config->src_dim3_size); + XAI_TILE3D_SET_DATA_PTR(&src_raw, src); + XAI_TILE3D_SET_DATA_ORDER(&src_raw, XAI_WHD); + XAI_TILE3D_SET_TYPE(&src_raw, XAI_TILE3D_S8); + XAI_TILE3D_SET_FRAME_PTR(&src_raw, 0); + XAI_TILE3D_SET_STATUS_FLAGS(&src_raw, 0); + XAI_TILE3D_SET_DIM1_PITCH(&src_raw, config->src_dim1_pitch); + XAI_TILE3D_SET_DIM2_PITCH(&src_raw, config->src_dim2_pitch); + XAI_TILE3D_SET_DIM1_COORD(&src_raw, 0); + XAI_TILE3D_SET_DIM1(&src_raw, config->src_dim1_size); + XAI_TILE3D_SET_DIM1_EDGE1(&src_raw, 0); + XAI_TILE3D_SET_DIM1_EDGE2(&src_raw, 0); + XAI_TILE3D_SET_DIM2_COORD(&src_raw, 0); + XAI_TILE3D_SET_DIM2(&src_raw, config->src_dim2_size); + XAI_TILE3D_SET_DIM2_EDGE1(&src_raw, 0); + XAI_TILE3D_SET_DIM2_EDGE2(&src_raw, 0); + XAI_TILE3D_SET_DIM3_COORD(&src_raw, 0); + XAI_TILE3D_SET_DIM3(&src_raw, config->src_dim3_size); + XAI_TILE3D_SET_DIM3_EDGE1(&src_raw, 0); + XAI_TILE3D_SET_DIM3_EDGE2(&src_raw, 0); + + // 
======================================================================== + // Get padded input buffer from allocator + // ======================================================================== + int padded_dim1 = config->src_dim1_size + config->in_dim1_edge1 + config->in_dim1_edge2; + int dim1_pitch = (padded_dim1 + 2*XCHAL_IVPN_SIMD_WIDTH - 1) & ~(2*XCHAL_IVPN_SIMD_WIDTH - 1); + int padded_dim2 = config->src_dim2_size + config->in_dim2_edge1 + config->in_dim2_edge2; + int dim2_pitch = dim1_pitch * padded_dim2; + int input_buffer_size = dim2_pitch * config->src_dim3_size; + + int8_t* padded_input = get_cache_padded_input(); + + if (input_buffer_size > (int)get_cache_padded_input_size()) { + return XAI_ERR_DATASIZE; + } + + memset(padded_input, config->input_zero_point, input_buffer_size); + + // ======================================================================== + // Setup padded input tile descriptor + // ======================================================================== + int data_offset = (config->in_dim2_edge1 * dim1_pitch) + config->in_dim1_edge1; + + xai_tile3D tile_input; + XAI_TILE3D_SET_BUFF_PTR(&tile_input, padded_input); + XAI_TILE3D_SET_BUFF_SIZE(&tile_input, input_buffer_size); + XAI_TILE3D_SET_DATA_PTR(&tile_input, &padded_input[data_offset]); + XAI_TILE3D_SET_DATA_ORDER(&tile_input, XAI_WHD); + XAI_TILE3D_SET_TYPE(&tile_input, XAI_TILE3D_S8); + XAI_TILE3D_SET_FRAME_PTR(&tile_input, 0); + XAI_TILE3D_SET_STATUS_FLAGS(&tile_input, 0); + XAI_TILE3D_SET_DIM1_PITCH(&tile_input, dim1_pitch); + XAI_TILE3D_SET_DIM2_PITCH(&tile_input, dim2_pitch); + XAI_TILE3D_SET_DIM1_COORD(&tile_input, 0); + XAI_TILE3D_SET_DIM1(&tile_input, config->src_dim1_size); + XAI_TILE3D_SET_DIM1_EDGE1(&tile_input, config->in_dim1_edge1); + XAI_TILE3D_SET_DIM1_EDGE2(&tile_input, config->in_dim1_edge2); + XAI_TILE3D_SET_DIM2_COORD(&tile_input, 0); + XAI_TILE3D_SET_DIM2(&tile_input, config->src_dim2_size); + XAI_TILE3D_SET_DIM2_EDGE1(&tile_input, config->in_dim2_edge1); + 
XAI_TILE3D_SET_DIM2_EDGE2(&tile_input, config->in_dim2_edge2); + XAI_TILE3D_SET_DIM3_COORD(&tile_input, 0); + XAI_TILE3D_SET_DIM3(&tile_input, config->src_dim3_size); + XAI_TILE3D_SET_DIM3_EDGE1(&tile_input, 0); + XAI_TILE3D_SET_DIM3_EDGE2(&tile_input, 0); + + // Copy raw input to padded buffer + // Safe manual copy: avoids SIMD overread near source buffer boundary + for (int d = 0; d < config->src_dim3_size; d++) { + for (int h = 0; h < config->src_dim2_size; h++) { + memcpy(&padded_input[data_offset + d * dim2_pitch + h * dim1_pitch], + &src[d * config->src_dim2_pitch + h * config->src_dim1_pitch], + config->src_dim1_size); + } + } + (void)src_raw; + + xai_size3D frame_size; + frame_size.dim1Size = config->dst_dim1_size * config->stride_x; + frame_size.dim2Size = config->dst_dim2_size * config->stride_y; + frame_size.dim3Size = config->src_dim3_size; + + xaiExtendEdgesConst3D_I8(&tile_input, config->input_zero_point, frame_size); + + // ======================================================================== + // Configure Coefficient Tile Descriptor + // ======================================================================== + xai_tile4D tile_coeff; + XAI_TILE4D_SET_BUFF_PTR(&tile_coeff, coeff_ptr); + XAI_TILE4D_SET_BUFF_SIZE(&tile_coeff, config->coeff_dim3_pitch * config->dst_dim3_size); + XAI_TILE4D_SET_DATA_PTR(&tile_coeff, coeff_ptr); + XAI_TILE4D_SET_DATA_ORDER(&tile_coeff, XAI_WHDN); + XAI_TILE4D_SET_TYPE(&tile_coeff, XAI_TILE4D_S8); + XAI_TILE4D_SET_FRAME_PTR(&tile_coeff, 0); + XAI_TILE4D_SET_STATUS_FLAGS(&tile_coeff, 0); + XAI_TILE4D_SET_DIM1_PITCH(&tile_coeff, config->coeff_dim1_pitch); + XAI_TILE4D_SET_DIM2_PITCH(&tile_coeff, config->coeff_dim2_pitch); + XAI_TILE4D_SET_DIM3_PITCH(&tile_coeff, config->coeff_dim3_pitch); + XAI_TILE4D_SET_DIM1_COORD(&tile_coeff, 0); + XAI_TILE4D_SET_DIM1(&tile_coeff, config->coeff_dim1_size); + XAI_TILE4D_SET_DIM1_EDGE1(&tile_coeff, 0); + XAI_TILE4D_SET_DIM1_EDGE2(&tile_coeff, 0); + XAI_TILE4D_SET_DIM2_COORD(&tile_coeff, 
0); + XAI_TILE4D_SET_DIM2(&tile_coeff, config->coeff_dim2_size); + XAI_TILE4D_SET_DIM2_EDGE1(&tile_coeff, 0); + XAI_TILE4D_SET_DIM2_EDGE2(&tile_coeff, 0); + XAI_TILE4D_SET_DIM3_COORD(&tile_coeff, 0); + XAI_TILE4D_SET_DIM3(&tile_coeff, config->coeff_dim3_size); + XAI_TILE4D_SET_DIM3_EDGE1(&tile_coeff, 0); + XAI_TILE4D_SET_DIM3_EDGE2(&tile_coeff, 0); + XAI_TILE4D_SET_DIM4_COORD(&tile_coeff, 0); + XAI_TILE4D_SET_DIM4(&tile_coeff, config->dst_dim3_size); + + // ======================================================================== + // Configure Bias Array + // ======================================================================== + xai_array tile_bias; + XAI_ARRAY_SET_BUFF_PTR(&tile_bias, bias_ptr); + XAI_ARRAY_SET_BUFF_SIZE(&tile_bias, config->dst_dim3_size * 4); + XAI_ARRAY_SET_DATA_PTR(&tile_bias, bias_ptr); + XAI_ARRAY_SET_WIDTH(&tile_bias, config->dst_dim3_size); + XAI_ARRAY_SET_HEIGHT(&tile_bias, 1); + XAI_ARRAY_SET_TYPE(&tile_bias, XAI_ARRAY_S32); + XAI_ARRAY_SET_CAPACITY(&tile_bias, config->dst_dim3_size); + + // ======================================================================== + // Configure Output Tile Descriptor + // ======================================================================== + xai_tile3D tile_output; + XAI_TILE3D_SET_BUFF_PTR(&tile_output, dst); + XAI_TILE3D_SET_BUFF_SIZE(&tile_output, config->dst_dim2_pitch * config->dst_dim3_size); + XAI_TILE3D_SET_DATA_PTR(&tile_output, dst); + XAI_TILE3D_SET_DATA_ORDER(&tile_output, XAI_WHD); + XAI_TILE3D_SET_TYPE(&tile_output, XAI_TILE3D_S8); + XAI_TILE3D_SET_FRAME_PTR(&tile_output, 0); + XAI_TILE3D_SET_STATUS_FLAGS(&tile_output, 0); + XAI_TILE3D_SET_DIM1_PITCH(&tile_output, config->dst_dim1_pitch); + XAI_TILE3D_SET_DIM2_PITCH(&tile_output, config->dst_dim2_pitch); + XAI_TILE3D_SET_DIM1_COORD(&tile_output, 0); + XAI_TILE3D_SET_DIM1(&tile_output, config->dst_dim1_size); + XAI_TILE3D_SET_DIM1_EDGE1(&tile_output, 0); + XAI_TILE3D_SET_DIM1_EDGE2(&tile_output, 0); + 
XAI_TILE3D_SET_DIM2_COORD(&tile_output, 0); + XAI_TILE3D_SET_DIM2(&tile_output, config->dst_dim2_size); + XAI_TILE3D_SET_DIM2_EDGE1(&tile_output, 0); + XAI_TILE3D_SET_DIM2_EDGE2(&tile_output, 0); + XAI_TILE3D_SET_DIM3_COORD(&tile_output, 0); + XAI_TILE3D_SET_DIM3(&tile_output, config->dst_dim3_size); + XAI_TILE3D_SET_DIM3_EDGE1(&tile_output, 0); + XAI_TILE3D_SET_DIM3_EDGE2(&tile_output, 0); + + // ======================================================================== + // Configure Convolution Parameters + // ======================================================================== + xai_cnn_conv_params params; + memset(¶ms, 0, sizeof(params)); + XAI_CNN_CONV_SET_ACCUM_SHIFT(¶ms, config->accum_shift); + XAI_CNN_CONV_SET_DILATION(¶ms, config->dilation); + XAI_CNN_CONV_SET_FLAGS(¶ms, config->flags); + XAI_CNN_CONV_SET_OUTPUT_SCALE(¶ms, config->output_scale); + XAI_CNN_CONV_SET_OUTPUT_SHIFT(¶ms, config->output_shift); + XAI_CNN_CONV_SET_RELU_MAX(¶ms, config->relu_max); + XAI_CNN_CONV_SET_RELU_MIN(¶ms, config->relu_min); + XAI_CNN_CONV_SET_STRIDEX(¶ms, config->stride_x); + XAI_CNN_CONV_SET_STRIDEY(¶ms, config->stride_y); + + // ======================================================================== + // Execute convolution (non-VQ API) + // ======================================================================== + XAI_ERR_TYPE status = xaiConvolved3D(&tile_input, &tile_coeff, &tile_bias, + &tile_output, ¶ms); + + // Writeback output from cache to system memory for DMA coherency + xthal_dcache_region_writeback(dst, config->dst_dim2_pitch * config->dst_dim3_size); + + return status; +} \ No newline at end of file diff --git a/backends/cadence/vision/operators/conv/conv_exec_3x3j2d1.c b/backends/cadence/vision/operators/conv/conv_exec_3x3j2d1.c new file mode 100644 index 00000000000..62f1fcb18a4 --- /dev/null +++ b/backends/cadence/vision/operators/conv/conv_exec_3x3j2d1.c @@ -0,0 +1,1028 @@ +#include "kernel_executors.h" +#include "memory_manager.h" +#include "dma.h" 
+#include "utils.h" +#include +#include +#include + +// VQ (per-channel output scaling) DMA version +XAI_ERR_TYPE conv_exec_3x3j2d1VQ( + int8_t* src, + int8_t* dst, + int8_t* coeff_ptr, + int8_t* bias_ptr, + int8_t* outScale_ptr, + const conv_layer_config_t* config) +{ + // ======================================================================== + // SECTION 1: DRAM Buffer Allocation + // ======================================================================== + int dram0_used = 0; + int dram1_used = 0; + + // FIX: Allocate coeff FIRST to avoid address-sensitive FOLD16 bug. + // See FUNCTIONALITY_FIXES.md §2 for details. + int8_t* p_coeff = allocate_dram_buffer(config->coeff_buffer_size, + config->coeff_dram, + &dram0_used, &dram1_used); + int8_t* p_input0 = allocate_dram_buffer(config->input_buffer_size, + config->input_ping_dram, + &dram0_used, &dram1_used); + int8_t* p_input1 = allocate_dram_buffer(config->input_buffer_size, + config->input_pong_dram, + &dram0_used, &dram1_used); + int8_t* p_output0 = allocate_dram_buffer(config->output_buffer_size, + config->output_ping_dram, + &dram0_used, &dram1_used); + int8_t* p_output1 = allocate_dram_buffer(config->output_buffer_size, + config->output_pong_dram, + &dram0_used, &dram1_used); + int8_t* p_bias = allocate_dram_buffer(config->bias_buffer_size, + config->bias_dram, + &dram0_used, &dram1_used); + int8_t* p_outscale = allocate_dram_buffer(config->outscale_buffer_size, + config->outscale_dram, + &dram0_used, &dram1_used); + + if (!p_input0 || !p_input1 || !p_coeff || + !p_output0 || !p_output1 || !p_bias || !p_outscale) { + return (-1); + } + + // ======================================================================== + // SECTION 2: Initialize XAI Tile Descriptors + // ======================================================================== + xai_tile3D tile_input; + xai_size3D frame_size_input; + xai_tile4D tile_coeff; + xai_array tile_bias; + xai_array tile_outscale; + xai_tile3D tile_output; + 
xai_cnn_conv_params params; + memset(¶ms, 0, sizeof(params)); + + /* Initialize DMA engines */ + dma_3dm_init(1); + dma_2dm_init(0); + + // Transfer constant data (all buffers are 64-byte aligned by test harness) + dma_1dm(0, coeff_ptr, p_coeff, config->coeff_buffer_size); + dma_1dm(0, bias_ptr, p_bias, config->bias_buffer_size); + dma_1dm(0, outScale_ptr, p_outscale, config->outscale_buffer_size); + + // Initialize input buffer and load first tile + _proto_FillBuffer_I8(p_input0, config->input_zero_point, config->input_buffer_size); + + // First DMA: load IN_ROWS_FIRSTDMA rows at data offset + dma_3dm(1, + (void*)src, + (void*)&p_input0[config->in_data_offset], + config->src_dim1_pitch, + config->in_dim1_pitch, + config->src_dim2_pitch, + config->in_dim2_pitch, + config->src_dim1_size, + config->in_rows_firstdma, + config->src_dim3_size); + + // Wait for all initial DMA transfers to complete + idma_hw_wait_all(0); // coeff + bias + outscale on ch0 + idma_hw_wait_all(1); // input on ch1 + + // ======================================================================== + // Configure Input Tile Descriptor + // ======================================================================== + XAI_TILE3D_SET_BUFF_PTR(&tile_input, p_input0); + XAI_TILE3D_SET_BUFF_SIZE(&tile_input, config->input_buffer_size); + XAI_TILE3D_SET_DATA_PTR(&tile_input, &p_input0[config->in_data_offset]); + XAI_TILE3D_SET_DATA_ORDER(&tile_input, XAI_WHD); + XAI_TILE3D_SET_TYPE(&tile_input, XAI_TILE3D_S8); + XAI_TILE3D_SET_FRAME_PTR(&tile_input, 0); + XAI_TILE3D_SET_STATUS_FLAGS(&tile_input, 0); + XAI_TILE3D_SET_DIM1_PITCH(&tile_input, config->in_dim1_pitch); + XAI_TILE3D_SET_DIM2_PITCH(&tile_input, config->in_dim2_pitch); + XAI_TILE3D_SET_DIM1_COORD(&tile_input, 0); + XAI_TILE3D_SET_DIM1(&tile_input, config->src_dim1_size); + XAI_TILE3D_SET_DIM1_EDGE1(&tile_input, config->in_dim1_edge1); + XAI_TILE3D_SET_DIM1_EDGE2(&tile_input, config->in_dim1_edge2); + XAI_TILE3D_SET_DIM2(&tile_input, 
(config->in_rows_firstdma - config->in_dim2_edge1)); + XAI_TILE3D_SET_DIM2_EDGE1(&tile_input, config->in_dim2_edge1); + XAI_TILE3D_SET_DIM2_EDGE2(&tile_input, config->in_dim2_edge2); + XAI_TILE3D_SET_DIM3_COORD(&tile_input, 0); + XAI_TILE3D_SET_DIM3(&tile_input, config->src_dim3_size); + XAI_TILE3D_SET_DIM3_EDGE1(&tile_input, config->in_dim3_edge1); + XAI_TILE3D_SET_DIM3_EDGE2(&tile_input, config->in_dim3_edge2); + + // Frame size for edge extension + frame_size_input.dim1Size = config->src_dim1_size; + frame_size_input.dim2Size = config->src_dim2_size; + frame_size_input.dim3Size = config->src_dim3_size; + + // ======================================================================== + // Configure Coefficient Tile Descriptor (4D: W×H×C×N) + // ======================================================================== + XAI_TILE4D_SET_BUFF_PTR(&tile_coeff, p_coeff); + XAI_TILE4D_SET_BUFF_SIZE(&tile_coeff, config->coeff_buffer_size); + XAI_TILE4D_SET_DATA_PTR(&tile_coeff, p_coeff); + XAI_TILE4D_SET_DATA_ORDER(&tile_coeff, XAI_WHDN); + XAI_TILE4D_SET_TYPE(&tile_coeff, XAI_TILE4D_S8); + XAI_TILE4D_SET_FRAME_PTR(&tile_coeff, 0); + XAI_TILE4D_SET_STATUS_FLAGS(&tile_coeff, 0); + XAI_TILE4D_SET_DIM1_PITCH(&tile_coeff, config->coeff_dim1_pitch); + XAI_TILE4D_SET_DIM2_PITCH(&tile_coeff, config->coeff_dim2_pitch); + XAI_TILE4D_SET_DIM3_PITCH(&tile_coeff, config->coeff_dim3_pitch); + XAI_TILE4D_SET_DIM1_COORD(&tile_coeff, 0); + XAI_TILE4D_SET_DIM1(&tile_coeff, config->coeff_dim1_size); + XAI_TILE4D_SET_DIM1_EDGE1(&tile_coeff, 0); + XAI_TILE4D_SET_DIM1_EDGE2(&tile_coeff, 0); + XAI_TILE4D_SET_DIM2_COORD(&tile_coeff, 0); + XAI_TILE4D_SET_DIM2(&tile_coeff, config->coeff_dim2_size); + XAI_TILE4D_SET_DIM2_EDGE1(&tile_coeff, 0); + XAI_TILE4D_SET_DIM2_EDGE2(&tile_coeff, 0); + XAI_TILE4D_SET_DIM3_COORD(&tile_coeff, 0); + XAI_TILE4D_SET_DIM3(&tile_coeff, config->coeff_dim3_size); + XAI_TILE4D_SET_DIM3_EDGE1(&tile_coeff, 0); + XAI_TILE4D_SET_DIM3_EDGE2(&tile_coeff, 0); + 
XAI_TILE4D_SET_DIM4_COORD(&tile_coeff, 0); + XAI_TILE4D_SET_DIM4(&tile_coeff, config->n_tile_size); + + // ======================================================================== + // Configure Bias Array + // ======================================================================== + XAI_ARRAY_SET_BUFF_PTR(&tile_bias, p_bias); + XAI_ARRAY_SET_BUFF_SIZE(&tile_bias, config->bias_buffer_size); + XAI_ARRAY_SET_DATA_PTR(&tile_bias, p_bias); + XAI_ARRAY_SET_WIDTH(&tile_bias, config->n_tile_size); + XAI_ARRAY_SET_HEIGHT(&tile_bias, 1); + XAI_ARRAY_SET_TYPE(&tile_bias, XAI_ARRAY_S32); + XAI_ARRAY_SET_CAPACITY(&tile_bias, config->n_tile_size); + + // ======================================================================== + // Configure Output Scale Array + // ======================================================================== + XAI_ARRAY_SET_BUFF_PTR(&tile_outscale, p_outscale); + XAI_ARRAY_SET_BUFF_SIZE(&tile_outscale, config->outscale_buffer_size); + XAI_ARRAY_SET_DATA_PTR(&tile_outscale, p_outscale); + XAI_ARRAY_SET_WIDTH(&tile_outscale, config->n_tile_size); + XAI_ARRAY_SET_HEIGHT(&tile_outscale, 1); + XAI_ARRAY_SET_TYPE(&tile_outscale, XAI_ARRAY_U16); + XAI_ARRAY_SET_CAPACITY(&tile_outscale, config->n_tile_size); + + // ======================================================================== + // Configure Output Tile Descriptor + // ======================================================================== + XAI_TILE3D_SET_BUFF_SIZE(&tile_output, config->output_buffer_size); + XAI_TILE3D_SET_DATA_ORDER(&tile_output, XAI_WHD); + XAI_TILE3D_SET_TYPE(&tile_output, XAI_TILE3D_S8); + XAI_TILE3D_SET_FRAME_PTR(&tile_output, 0); + XAI_TILE3D_SET_STATUS_FLAGS(&tile_output, 0); + XAI_TILE3D_SET_DIM1_PITCH(&tile_output, config->out_dim1_pitch); + XAI_TILE3D_SET_DIM2_PITCH(&tile_output, config->out_dim2_pitch); + XAI_TILE3D_SET_DIM1_COORD(&tile_output, 0); + XAI_TILE3D_SET_DIM1(&tile_output, config->dst_dim1_size); + XAI_TILE3D_SET_DIM1_EDGE1(&tile_output, 0); + 
XAI_TILE3D_SET_DIM1_EDGE2(&tile_output, 0); + XAI_TILE3D_SET_DIM2(&tile_output, config->output_rows); + XAI_TILE3D_SET_DIM2_EDGE1(&tile_output, 0); + XAI_TILE3D_SET_DIM2_EDGE2(&tile_output, 0); + XAI_TILE3D_SET_DIM3_COORD(&tile_output, 0); + XAI_TILE3D_SET_DIM3(&tile_output, config->n_tile_size); + XAI_TILE3D_SET_DIM3_EDGE1(&tile_output, 0); + XAI_TILE3D_SET_DIM3_EDGE2(&tile_output, 0); + + // ======================================================================== + // Configure Convolution Parameters + // ======================================================================== + XAI_CNN_CONV_SET_ACCUM_SHIFT(¶ms, config->accum_shift); + XAI_CNN_CONV_SET_DILATION(¶ms, config->dilation); + XAI_CNN_CONV_SET_FLAGS(¶ms, config->flags); + XAI_CNN_CONV_SET_OUTPUT_SCALE(¶ms, config->output_scale); + XAI_CNN_CONV_SET_OUTPUT_SHIFT(¶ms, config->output_shift); + XAI_CNN_CONV_SET_RELU_MAX(¶ms, config->relu_max); + XAI_CNN_CONV_SET_RELU_MIN(¶ms, config->relu_min); + XAI_CNN_CONV_SET_STRIDEX(¶ms, config->stride_x); + XAI_CNN_CONV_SET_STRIDEY(¶ms, config->stride_y); + + // ======================================================================== + // SECTION 3: Tiled Execution Loop (N-tiles × H-tiles) + // ======================================================================== + int last_tile = 1; + + for (int idx_n = 0; idx_n < config->n_tiles; idx_n++) { + int last_n_tile = (last_tile) && (idx_n == config->n_tiles - 1); + int current_n_size = (idx_n < config->n_tiles - 1) ? 
+ config->n_tile_size : config->n_tile_size_last; + + // Update coefficient/bias/outscale for N-tile + XAI_TILE4D_SET_DIM4_COORD(&tile_coeff, config->n_tile_size * idx_n); + XAI_TILE4D_SET_DIM4(&tile_coeff, current_n_size); + + XAI_ARRAY_SET_DATA_PTR(&tile_bias, &p_bias[config->n_tile_size * 4 * idx_n]); + XAI_ARRAY_SET_WIDTH(&tile_bias, current_n_size); + XAI_ARRAY_SET_CAPACITY(&tile_bias, current_n_size); + + XAI_ARRAY_SET_DATA_PTR(&tile_outscale, &p_outscale[config->n_tile_size * 2 * idx_n]); + XAI_ARRAY_SET_WIDTH(&tile_outscale, current_n_size); + XAI_ARRAY_SET_CAPACITY(&tile_outscale, current_n_size); + + XAI_TILE3D_SET_DIM3_COORD(&tile_output, config->n_tile_size * idx_n); + XAI_TILE3D_SET_DIM3(&tile_output, current_n_size); + + // Process vertical tiles + for (int idx_h = 0; idx_h < config->height_tiles; idx_h++) { + int last_h_tile = (last_n_tile) && (idx_h == config->height_tiles - 1); + + // Calculate actual rows for this height tile (handle last tile edge case) + int current_output_rows = (idx_h < config->height_tiles - 1) ? 
+ config->output_rows : + (config->dst_dim2_size - (config->output_rows * idx_h)); + int current_row_size = config->dst_dim1_size * current_output_rows; + + // ================================================================ + // Prefetch Next Input Tile (Ping-Pong Buffering) + // ================================================================ + if (!last_h_tile) { + int temp_idx_h; + inc_iter_to_temp(&temp_idx_h, idx_h, config->height_tiles, 1); + _proto_FillBuffer_I8(p_input1, config->input_zero_point, config->input_buffer_size); + + dma_3dm( /*ch */ 1, + /*src */ (void*)&(src[max((( config->stride_y * config->output_rows * temp_idx_h - config->in_dim2_edge1) * config->src_dim1_size),0)]), + /*dst */ (void*)&(p_input1[max((((-(config->output_rows)* config->in_dim1_pitch))*(temp_idx_h))+(config->in_data_offset),1)]), + /*src_row_pitch */ config->src_dim1_pitch, + /*dst_row_pitch */ config->in_dim1_pitch, + /*src_tile_pitch */ config->src_dim2_pitch, + /*dst_tile_pitch */ config->in_dim2_pitch, + /*row_sz */ config->src_dim1_size, + /*nrows */ min(((config->stride_y * config->output_rows )*(temp_idx_h))+(config->input_rows-config->in_dim2_edge1),min((((-(config->stride_y * config->output_rows )))*(temp_idx_h))+(config->src_dim2_size + config->in_dim2_edge2),config->input_rows)), + /*ntiles */ config->src_dim3_size); + } + + // ================================================================ + // Update Tile Descriptors for Current Iteration + // ================================================================ + XAI_TILE3D_SET_BUFF_PTR(&tile_input, p_input0); + XAI_TILE3D_SET_DATA_PTR(&tile_input, &p_input0[config->in_data_offset]); + XAI_TILE3D_SET_DIM2_COORD(&tile_input, (config->stride_y * config->output_rows)*(idx_h)); + XAI_TILE3D_SET_BUFF_PTR(&tile_output, p_output1); + XAI_TILE3D_SET_DATA_PTR(&tile_output, &(p_output1[0])); + XAI_TILE3D_SET_DIM2_COORD(&tile_output, (config->output_rows)*(idx_h)); + XAI_TILE3D_SET_DIM2(&tile_output, current_output_rows); + + 
// ================================================================ + // Perform Edge Extension and Convolution + // ================================================================ + xaiExtendEdgesConst3D_I8(&tile_input, config->input_zero_point, frame_size_input); + + XAI_ERR_TYPE status = xaiConvolvedVQ3D_S_3x3j2d1_S8S8IX_MOW_WHD(&(tile_input), + &(tile_coeff), + &(tile_bias), &(tile_outscale), + &(tile_output), &(params)); + + // ================================================================ + // Prefetch next coefficient tile (if needed) + // ================================================================ + if ((!(last_h_tile)) && ((idx_h) == (config->height_tiles - 1))) { + int temp_idx_n; + int temp_idx_h; + + inc_iter_to_temp(&(temp_idx_n),idx_n, config->n_tiles , inc_iter_to_temp(&(temp_idx_h), idx_h, config->height_tiles, 1)); + + dma_1dm(0,/* src */ (((coeff_ptr) + ((config->coeff_buffer_size)*(temp_idx_n)))), /* dst */ &(p_coeff[0]), /* row size */ (((temp_idx_n) < (config->n_tiles-1))?(config->coeff_buffer_size):(config->coeff_dim1_size * config->coeff_dim2_size * config->coeff_dim3_size * config->n_tile_size_last))); + } + + // ================================================================ + // Write Output Tile to System Memory + // ================================================================ + dma_2dm(0, + /* src */ &(p_output1[0]), + /* dst */ &dst[((config->dst_dim2_pitch *config->n_tile_size )*(idx_n))+((config->out_dim2_pitch)*(idx_h))], + /* src stride 2d */ config->out_dim2_pitch, + /* dst stride 2d */ config->dst_dim2_pitch, + /* row size */ current_row_size, + /* count 2d */ current_n_size); + + swap_buffers(&(p_output0), &(p_output1)); + swap_buffers(&(p_input0), &(p_input1)); + } + } + + // Wait for final output DMA to complete before returning + idma_hw_wait_all(0); + + return XAI_ERR_OK; +} + +// VQ (per-channel output scaling) cache version +// All data stays in system memory and is accessed through processor cache 
+//
+// src          - raw input, read with config->src_dim1_pitch / src_dim2_pitch
+// dst          - output, written with config->dst_dim1_pitch / dst_dim2_pitch
+// coeff_ptr    - S8 coefficients, described as a WHDN 4D tile
+// bias_ptr     - bias bytes, described as dst_dim3_size S32 entries
+// outScale_ptr - output-scale bytes, described as dst_dim3_size U16 entries
+// config       - layer geometry and quantization parameters
+//
+// Returns XAI_ERR_DATASIZE when the shared padded-input buffer is missing or
+// too small, otherwise the status reported by xaiConvolvedVQ3D().
+XAI_ERR_TYPE conv_exec_3x3j2d1VQ_cache(
+    int8_t* src,
+    int8_t* dst,
+    int8_t* coeff_ptr,
+    int8_t* bias_ptr,
+    int8_t* outScale_ptr,
+    const conv_layer_config_t* config)
+{
+    // ========================================================================
+    // Get padded input buffer from allocator (shared across cache kernels)
+    // ========================================================================
+    int padded_dim1 = config->src_dim1_size + config->in_dim1_edge1 + config->in_dim1_edge2;
+    // Round the row pitch up to a multiple of 2*XCHAL_IVPN_SIMD_WIDTH bytes.
+    int dim1_pitch = (padded_dim1 + 2*XCHAL_IVPN_SIMD_WIDTH - 1) & ~(2*XCHAL_IVPN_SIMD_WIDTH - 1);
+    int padded_dim2 = config->src_dim2_size + config->in_dim2_edge1 + config->in_dim2_edge2;
+    int dim2_pitch = dim1_pitch * padded_dim2;
+    int input_buffer_size = dim2_pitch * config->src_dim3_size;
+
+    int8_t* padded_input = get_cache_padded_input();
+
+    // Refuse to run when there is no buffer or the padded image does not fit.
+    if (!padded_input || input_buffer_size > (int)get_cache_padded_input_size()) {
+        return XAI_ERR_DATASIZE;
+    }
+
+    // Pre-fill the whole padded buffer with the input zero point so that
+    // padding bytes already hold the quantized "zero" value.
+    memset(padded_input, config->input_zero_point, input_buffer_size);
+
+    // ========================================================================
+    // Setup padded input tile descriptor
+    // ========================================================================
+    // First real pixel sits in_dim2_edge1 rows down and in_dim1_edge1 bytes in.
+    int data_offset = (config->in_dim2_edge1 * dim1_pitch) + config->in_dim1_edge1;
+
+    xai_tile3D tile_input;
+    XAI_TILE3D_SET_BUFF_PTR(&tile_input, padded_input);
+    XAI_TILE3D_SET_BUFF_SIZE(&tile_input, input_buffer_size);
+    XAI_TILE3D_SET_DATA_PTR(&tile_input, &padded_input[data_offset]);
+    XAI_TILE3D_SET_DATA_ORDER(&tile_input, XAI_WHD);
+    XAI_TILE3D_SET_TYPE(&tile_input, XAI_TILE3D_S8);
+    XAI_TILE3D_SET_FRAME_PTR(&tile_input, 0);
+    XAI_TILE3D_SET_STATUS_FLAGS(&tile_input, 0);
+    XAI_TILE3D_SET_DIM1_PITCH(&tile_input, dim1_pitch);
+    XAI_TILE3D_SET_DIM2_PITCH(&tile_input, dim2_pitch);
+    XAI_TILE3D_SET_DIM1_COORD(&tile_input, 0);
+    XAI_TILE3D_SET_DIM1(&tile_input, config->src_dim1_size);
+    XAI_TILE3D_SET_DIM1_EDGE1(&tile_input, config->in_dim1_edge1);
+    XAI_TILE3D_SET_DIM1_EDGE2(&tile_input, config->in_dim1_edge2);
+    XAI_TILE3D_SET_DIM2_COORD(&tile_input, 0);
+    XAI_TILE3D_SET_DIM2(&tile_input, config->src_dim2_size);
+    XAI_TILE3D_SET_DIM2_EDGE1(&tile_input, config->in_dim2_edge1);
+    XAI_TILE3D_SET_DIM2_EDGE2(&tile_input, config->in_dim2_edge2);
+    XAI_TILE3D_SET_DIM3_COORD(&tile_input, 0);
+    XAI_TILE3D_SET_DIM3(&tile_input, config->src_dim3_size);
+    XAI_TILE3D_SET_DIM3_EDGE1(&tile_input, 0);
+    XAI_TILE3D_SET_DIM3_EDGE2(&tile_input, 0);
+
+    // ========================================================================
+    // Copy raw input to padded buffer and extend edges
+    // ========================================================================
+#ifdef USE_DMA_FOR_CACHE_COPY
+    // Use DMA 3D transfer to copy input data into padded buffer at data_offset.
+    // NOTE(review): the buffer was just filled through the data cache; confirm
+    // dirty lines cannot later be written back over the DMA'd data, and that
+    // channel 0 is already initialised for 3D transfers on this path.
+    dma_3dm(0,
+        /* src */ src,
+        /* dst */ &padded_input[data_offset],
+        /* src_row_pitch */ config->src_dim1_pitch,
+        /* dst_row_pitch */ dim1_pitch,
+        /* src_tile_pitch */ config->src_dim2_pitch,
+        /* dst_tile_pitch */ dim2_pitch,
+        /* row_sz */ config->src_dim1_size,
+        /* nrows */ config->src_dim2_size,
+        /* ntiles */ config->src_dim3_size);
+
+    // The edge extension below reads the copied rows, so the transfer must
+    // have completed before continuing.
+    idma_hw_wait_all(0);
+#else
+    // Row-by-row memcpy (no DMA required).
+    // Safe manual copy: avoids SIMD overread near source buffer boundary
+    for (int d = 0; d < config->src_dim3_size; d++) {
+        for (int h = 0; h < config->src_dim2_size; h++) {
+            memcpy(&padded_input[data_offset + d * dim2_pitch + h * dim1_pitch],
+                   &src[d * config->src_dim2_pitch + h * config->src_dim1_pitch],
+                   config->src_dim1_size);
+        }
+    }
+#endif
+
+    xai_size3D frame_size;
+    frame_size.dim1Size = config->dst_dim1_size * config->stride_x;
+    frame_size.dim2Size = config->dst_dim2_size * config->stride_y;
+    frame_size.dim3Size = config->src_dim3_size;
+
+    xaiExtendEdgesConst3D_I8(&tile_input, config->input_zero_point, frame_size);
+
+    // ========================================================================
+    // Configure Coefficient Tile Descriptor (4D: W×H×C×N)
+    // ========================================================================
+    xai_tile4D tile_coeff;
+    XAI_TILE4D_SET_BUFF_PTR(&tile_coeff, coeff_ptr);
+    XAI_TILE4D_SET_BUFF_SIZE(&tile_coeff, config->coeff_dim3_pitch * config->dst_dim3_size);
+    XAI_TILE4D_SET_DATA_PTR(&tile_coeff, coeff_ptr);
+    XAI_TILE4D_SET_DATA_ORDER(&tile_coeff, XAI_WHDN);
+    XAI_TILE4D_SET_TYPE(&tile_coeff, XAI_TILE4D_S8);
+    XAI_TILE4D_SET_FRAME_PTR(&tile_coeff, 0);
+    XAI_TILE4D_SET_STATUS_FLAGS(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM1_PITCH(&tile_coeff, config->coeff_dim1_pitch);
+    XAI_TILE4D_SET_DIM2_PITCH(&tile_coeff, config->coeff_dim2_pitch);
+    XAI_TILE4D_SET_DIM3_PITCH(&tile_coeff, config->coeff_dim3_pitch);
+    XAI_TILE4D_SET_DIM1_COORD(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM1(&tile_coeff, config->coeff_dim1_size);
+    XAI_TILE4D_SET_DIM1_EDGE1(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM1_EDGE2(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM2_COORD(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM2(&tile_coeff, config->coeff_dim2_size);
+    XAI_TILE4D_SET_DIM2_EDGE1(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM2_EDGE2(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM3_COORD(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM3(&tile_coeff, config->coeff_dim3_size);
+    XAI_TILE4D_SET_DIM3_EDGE1(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM3_EDGE2(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM4_COORD(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM4(&tile_coeff, config->dst_dim3_size);
+
+    // ========================================================================
+    // Configure Bias Array
+    // ========================================================================
+    xai_array tile_bias;
+    XAI_ARRAY_SET_BUFF_PTR(&tile_bias, bias_ptr);
+    XAI_ARRAY_SET_BUFF_SIZE(&tile_bias, config->dst_dim3_size * 4);  // 4 bytes per S32 entry
+    XAI_ARRAY_SET_DATA_PTR(&tile_bias, bias_ptr);
+    XAI_ARRAY_SET_WIDTH(&tile_bias, config->dst_dim3_size);
+    XAI_ARRAY_SET_HEIGHT(&tile_bias, 1);
+    XAI_ARRAY_SET_TYPE(&tile_bias, XAI_ARRAY_S32);
+    XAI_ARRAY_SET_CAPACITY(&tile_bias, config->dst_dim3_size);
+
+    // ========================================================================
+    // Configure Output Scale Array
+    // ========================================================================
+    xai_array tile_outscale;
+    XAI_ARRAY_SET_BUFF_PTR(&tile_outscale, outScale_ptr);
+    XAI_ARRAY_SET_BUFF_SIZE(&tile_outscale, config->dst_dim3_size * 2);  // 2 bytes per U16 entry
+    XAI_ARRAY_SET_DATA_PTR(&tile_outscale, outScale_ptr);
+    XAI_ARRAY_SET_WIDTH(&tile_outscale, config->dst_dim3_size);
+    XAI_ARRAY_SET_HEIGHT(&tile_outscale, 1);
+    XAI_ARRAY_SET_TYPE(&tile_outscale, XAI_ARRAY_U16);
+    XAI_ARRAY_SET_CAPACITY(&tile_outscale, config->dst_dim3_size);
+
+    // ========================================================================
+    // Configure Output Tile Descriptor (points to system memory)
+    // ========================================================================
+    xai_tile3D tile_output;
+    XAI_TILE3D_SET_BUFF_PTR(&tile_output, dst);
+    XAI_TILE3D_SET_BUFF_SIZE(&tile_output, config->dst_dim2_pitch * config->dst_dim3_size);
+    XAI_TILE3D_SET_DATA_PTR(&tile_output, dst);
+    XAI_TILE3D_SET_DATA_ORDER(&tile_output, XAI_WHD);
+    XAI_TILE3D_SET_TYPE(&tile_output, XAI_TILE3D_S8);
+    XAI_TILE3D_SET_FRAME_PTR(&tile_output, 0);
+    XAI_TILE3D_SET_STATUS_FLAGS(&tile_output, 0);
+    XAI_TILE3D_SET_DIM1_PITCH(&tile_output, config->dst_dim1_pitch);
+    XAI_TILE3D_SET_DIM2_PITCH(&tile_output, config->dst_dim2_pitch);
+    XAI_TILE3D_SET_DIM1_COORD(&tile_output, 0);
+    XAI_TILE3D_SET_DIM1(&tile_output, config->dst_dim1_size);
+    XAI_TILE3D_SET_DIM1_EDGE1(&tile_output, 0);
+    XAI_TILE3D_SET_DIM1_EDGE2(&tile_output, 0);
+    XAI_TILE3D_SET_DIM2_COORD(&tile_output, 0);
+    XAI_TILE3D_SET_DIM2(&tile_output, config->dst_dim2_size);
+    XAI_TILE3D_SET_DIM2_EDGE1(&tile_output, 0);
+    XAI_TILE3D_SET_DIM2_EDGE2(&tile_output, 0);
+    XAI_TILE3D_SET_DIM3_COORD(&tile_output, 0);
+    XAI_TILE3D_SET_DIM3(&tile_output, config->dst_dim3_size);
+    XAI_TILE3D_SET_DIM3_EDGE1(&tile_output, 0);
+    XAI_TILE3D_SET_DIM3_EDGE2(&tile_output, 0);
+
+    // ========================================================================
+    // Configure Convolution Parameters
+    // ========================================================================
+    xai_cnn_conv_params params;
+    memset(&params, 0, sizeof(params));
+    XAI_CNN_CONV_SET_ACCUM_SHIFT(&params, config->accum_shift);
+    XAI_CNN_CONV_SET_DILATION(&params, config->dilation);
+    XAI_CNN_CONV_SET_FLAGS(&params, config->flags);
+    XAI_CNN_CONV_SET_OUTPUT_SCALE(&params, config->output_scale);
+    XAI_CNN_CONV_SET_OUTPUT_SHIFT(&params, config->output_shift);
+    XAI_CNN_CONV_SET_RELU_MAX(&params, config->relu_max);
+    XAI_CNN_CONV_SET_RELU_MIN(&params, config->relu_min);
+    XAI_CNN_CONV_SET_STRIDEX(&params, config->stride_x);
+    XAI_CNN_CONV_SET_STRIDEY(&params, config->stride_y);
+
+    // ========================================================================
+    // Execute convolution using generic system-memory API
+    // This version accesses data through the processor cache
+    // ========================================================================
+    XAI_ERR_TYPE status = xaiConvolvedVQ3D(&tile_input, &tile_coeff, &tile_bias,
+                                           &tile_outscale, &tile_output, &params);
+
+    // Writeback output from cache to system memory for DMA coherency
+    xthal_dcache_region_writeback(dst, config->dst_dim2_pitch * config->dst_dim3_size);
+
+    return status;
+}
+
+// ============================================================================
+// Non-VQ (per-tensor output scaling) versions
+// ============================================================================
+
+// Non-VQ DMA version - per-tensor output scaling
+//
+// Tiles the layer over output channels (n_tiles) and output rows
+// (height_tiles). Input and output tiles are ping-pong buffered in scratch
+// DRAM; input prefetch uses DMA channel 1, coefficient/bias loads and output
+// write-back use DMA channel 0.
+//
+// Returns (-1) when any scratch buffer cannot be allocated, the kernel's
+// error code if a tile fails, and XAI_ERR_OK otherwise.
+XAI_ERR_TYPE conv_exec_3x3j2d1(
+    int8_t* src,
+    int8_t* dst,
+    int8_t* coeff_ptr,
+    int8_t* bias_ptr,
+    const conv_layer_config_t* config)
+{
+    // ========================================================================
+    // SECTION 1: DRAM Buffer Allocation (no outscale buffer needed)
+    // ========================================================================
+    int dram0_used = 0;
+    int dram1_used = 0;
+
+    // FIX: Allocate coeff FIRST to avoid address-sensitive FOLD16 bug.
+    // See FUNCTIONALITY_FIXES.md §2 for details.
+    int8_t* p_coeff = allocate_dram_buffer(config->coeff_buffer_size,
+                                           config->coeff_dram,
+                                           &dram0_used, &dram1_used);
+    int8_t* p_input0 = allocate_dram_buffer(config->input_buffer_size,
+                                            config->input_ping_dram,
+                                            &dram0_used, &dram1_used);
+    int8_t* p_input1 = allocate_dram_buffer(config->input_buffer_size,
+                                            config->input_pong_dram,
+                                            &dram0_used, &dram1_used);
+    int8_t* p_output0 = allocate_dram_buffer(config->output_buffer_size,
+                                             config->output_ping_dram,
+                                             &dram0_used, &dram1_used);
+    int8_t* p_output1 = allocate_dram_buffer(config->output_buffer_size,
+                                             config->output_pong_dram,
+                                             &dram0_used, &dram1_used);
+    int8_t* p_bias = allocate_dram_buffer(config->bias_buffer_size,
+                                          config->bias_dram,
+                                          &dram0_used, &dram1_used);
+
+    if (!p_input0 || !p_input1 || !p_coeff ||
+        !p_output0 || !p_output1 || !p_bias) {
+        return (-1);
+    }
+
+    // ========================================================================
+    // SECTION 2: Initialize XAI Tile Descriptors
+    // ========================================================================
+    xai_tile3D tile_input;
+    xai_size3D frame_size_input;
+    xai_tile4D tile_coeff;
+    xai_array tile_bias;
+    xai_tile3D tile_output;
+    xai_cnn_conv_params params;
+    memset(&params, 0, sizeof(params));
+
+    /* Initialize DMA engines */
+    dma_3dm_init(1);
+    dma_2dm_init(0);
+
+    // Transfer constant data (no outscale)
+    dma_1dm(0, coeff_ptr, p_coeff, config->coeff_buffer_size);
+    dma_1dm(0, bias_ptr, p_bias, config->bias_buffer_size);
+
+    // Initialize input buffer and load first tile
+    _proto_FillBuffer_I8(p_input0, config->input_zero_point, config->input_buffer_size);
+
+    // First DMA: load IN_ROWS_FIRSTDMA rows at data offset
+    dma_3dm(1,
+        (void*)src,
+        (void*)&p_input0[config->in_data_offset],
+        config->src_dim1_pitch,
+        config->in_dim1_pitch,
+        config->src_dim2_pitch,
+        config->in_dim2_pitch,
+        config->src_dim1_size,
+        config->in_rows_firstdma,
+        config->src_dim3_size);
+
+    // Wait for all initial DMA transfers to complete
+    idma_hw_wait_all(0);  // coeff + bias on ch0
+    idma_hw_wait_all(1);  // input on ch1
+
+    // ========================================================================
+    // Configure Input Tile Descriptor
+    // ========================================================================
+    XAI_TILE3D_SET_BUFF_PTR(&tile_input, p_input0);
+    XAI_TILE3D_SET_BUFF_SIZE(&tile_input, config->input_buffer_size);
+    XAI_TILE3D_SET_DATA_PTR(&tile_input, &p_input0[config->in_data_offset]);
+    XAI_TILE3D_SET_DATA_ORDER(&tile_input, XAI_WHD);
+    XAI_TILE3D_SET_TYPE(&tile_input, XAI_TILE3D_S8);
+    XAI_TILE3D_SET_FRAME_PTR(&tile_input, 0);
+    XAI_TILE3D_SET_STATUS_FLAGS(&tile_input, 0);
+    XAI_TILE3D_SET_DIM1_PITCH(&tile_input, config->in_dim1_pitch);
+    XAI_TILE3D_SET_DIM2_PITCH(&tile_input, config->in_dim2_pitch);
+    XAI_TILE3D_SET_DIM1_COORD(&tile_input, 0);
+    XAI_TILE3D_SET_DIM1(&tile_input, config->src_dim1_size);
+    XAI_TILE3D_SET_DIM1_EDGE1(&tile_input, config->in_dim1_edge1);
+    XAI_TILE3D_SET_DIM1_EDGE2(&tile_input, config->in_dim1_edge2);
+    XAI_TILE3D_SET_DIM2(&tile_input, (config->in_rows_firstdma - config->in_dim2_edge1));
+    XAI_TILE3D_SET_DIM2_EDGE1(&tile_input, config->in_dim2_edge1);
+    XAI_TILE3D_SET_DIM2_EDGE2(&tile_input, config->in_dim2_edge2);
+    XAI_TILE3D_SET_DIM3_COORD(&tile_input, 0);
+    XAI_TILE3D_SET_DIM3(&tile_input, config->src_dim3_size);
+    XAI_TILE3D_SET_DIM3_EDGE1(&tile_input, config->in_dim3_edge1);
+    XAI_TILE3D_SET_DIM3_EDGE2(&tile_input, config->in_dim3_edge2);
+
+    // Frame size for edge extension
+    frame_size_input.dim1Size = config->src_dim1_size;
+    frame_size_input.dim2Size = config->src_dim2_size;
+    frame_size_input.dim3Size = config->src_dim3_size;
+
+    // ========================================================================
+    // Configure Coefficient Tile Descriptor (4D: W×H×C×N)
+    // ========================================================================
+    XAI_TILE4D_SET_BUFF_PTR(&tile_coeff, p_coeff);
+    XAI_TILE4D_SET_BUFF_SIZE(&tile_coeff, config->coeff_buffer_size);
+    XAI_TILE4D_SET_DATA_PTR(&tile_coeff, p_coeff);
+    XAI_TILE4D_SET_DATA_ORDER(&tile_coeff, XAI_WHDN);
+    XAI_TILE4D_SET_TYPE(&tile_coeff, XAI_TILE4D_S8);
+    XAI_TILE4D_SET_FRAME_PTR(&tile_coeff, 0);
+    XAI_TILE4D_SET_STATUS_FLAGS(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM1_PITCH(&tile_coeff, config->coeff_dim1_pitch);
+    XAI_TILE4D_SET_DIM2_PITCH(&tile_coeff, config->coeff_dim2_pitch);
+    XAI_TILE4D_SET_DIM3_PITCH(&tile_coeff, config->coeff_dim3_pitch);
+    XAI_TILE4D_SET_DIM1_COORD(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM1(&tile_coeff, config->coeff_dim1_size);
+    XAI_TILE4D_SET_DIM1_EDGE1(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM1_EDGE2(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM2_COORD(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM2(&tile_coeff, config->coeff_dim2_size);
+    XAI_TILE4D_SET_DIM2_EDGE1(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM2_EDGE2(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM3_COORD(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM3(&tile_coeff, config->coeff_dim3_size);
+    XAI_TILE4D_SET_DIM3_EDGE1(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM3_EDGE2(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM4_COORD(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM4(&tile_coeff, config->n_tile_size);
+
+    // ========================================================================
+    // Configure Bias Array
+    // ========================================================================
+    XAI_ARRAY_SET_BUFF_PTR(&tile_bias, p_bias);
+    XAI_ARRAY_SET_BUFF_SIZE(&tile_bias, config->bias_buffer_size);
+    XAI_ARRAY_SET_DATA_PTR(&tile_bias, p_bias);
+    XAI_ARRAY_SET_WIDTH(&tile_bias, config->n_tile_size);
+    XAI_ARRAY_SET_HEIGHT(&tile_bias, 1);
+    XAI_ARRAY_SET_TYPE(&tile_bias, XAI_ARRAY_S32);
+    XAI_ARRAY_SET_CAPACITY(&tile_bias, config->n_tile_size);
+
+    // ========================================================================
+    // Configure Output Tile Descriptor
+    // ========================================================================
+    // Buffer/data pointers and the DIM2 coordinate are set per tile in the loop.
+    XAI_TILE3D_SET_BUFF_SIZE(&tile_output, config->output_buffer_size);
+    XAI_TILE3D_SET_DATA_ORDER(&tile_output, XAI_WHD);
+    XAI_TILE3D_SET_TYPE(&tile_output, XAI_TILE3D_S8);
+    XAI_TILE3D_SET_FRAME_PTR(&tile_output, 0);
+    XAI_TILE3D_SET_STATUS_FLAGS(&tile_output, 0);
+    XAI_TILE3D_SET_DIM1_PITCH(&tile_output, config->out_dim1_pitch);
+    XAI_TILE3D_SET_DIM2_PITCH(&tile_output, config->out_dim2_pitch);
+    XAI_TILE3D_SET_DIM1_COORD(&tile_output, 0);
+    XAI_TILE3D_SET_DIM1(&tile_output, config->dst_dim1_size);
+    XAI_TILE3D_SET_DIM1_EDGE1(&tile_output, 0);
+    XAI_TILE3D_SET_DIM1_EDGE2(&tile_output, 0);
+    XAI_TILE3D_SET_DIM2(&tile_output, config->output_rows);
+    XAI_TILE3D_SET_DIM2_EDGE1(&tile_output, 0);
+    XAI_TILE3D_SET_DIM2_EDGE2(&tile_output, 0);
+    XAI_TILE3D_SET_DIM3_COORD(&tile_output, 0);
+    XAI_TILE3D_SET_DIM3(&tile_output, config->n_tile_size);
+    XAI_TILE3D_SET_DIM3_EDGE1(&tile_output, 0);
+    XAI_TILE3D_SET_DIM3_EDGE2(&tile_output, 0);
+
+    // ========================================================================
+    // Configure Convolution Parameters
+    // ========================================================================
+    XAI_CNN_CONV_SET_ACCUM_SHIFT(&params, config->accum_shift);
+    XAI_CNN_CONV_SET_DILATION(&params, config->dilation);
+    XAI_CNN_CONV_SET_FLAGS(&params, config->flags);
+    XAI_CNN_CONV_SET_OUTPUT_SCALE(&params, config->output_scale);
+    XAI_CNN_CONV_SET_OUTPUT_SHIFT(&params, config->output_shift);
+    XAI_CNN_CONV_SET_RELU_MAX(&params, config->relu_max);
+    XAI_CNN_CONV_SET_RELU_MIN(&params, config->relu_min);
+    XAI_CNN_CONV_SET_STRIDEX(&params, config->stride_x);
+    XAI_CNN_CONV_SET_STRIDEY(&params, config->stride_y);
+
+    // ========================================================================
+    // SECTION 3: Tiled Execution Loop (N-tiles × H-tiles)
+    // ========================================================================
+    int last_tile = 1;
+
+    for (int idx_n = 0; idx_n < config->n_tiles; idx_n++) {
+        int last_n_tile = (last_tile) && (idx_n == config->n_tiles - 1);
+        int current_n_size = (idx_n < config->n_tiles - 1) ?
+            config->n_tile_size : config->n_tile_size_last;
+
+        // Update coefficient/bias for N-tile (no outscale)
+        XAI_TILE4D_SET_DIM4_COORD(&tile_coeff, config->n_tile_size * idx_n);
+        XAI_TILE4D_SET_DIM4(&tile_coeff, current_n_size);
+
+        // Bias entries are 4 bytes each, hence the byte offset n_tile_size * 4.
+        XAI_ARRAY_SET_DATA_PTR(&tile_bias, &p_bias[config->n_tile_size * 4 * idx_n]);
+        XAI_ARRAY_SET_WIDTH(&tile_bias, current_n_size);
+        XAI_ARRAY_SET_CAPACITY(&tile_bias, current_n_size);
+
+        XAI_TILE3D_SET_DIM3_COORD(&tile_output, config->n_tile_size * idx_n);
+        XAI_TILE3D_SET_DIM3(&tile_output, current_n_size);
+
+        // Process vertical tiles
+        for (int idx_h = 0; idx_h < config->height_tiles; idx_h++) {
+            int last_h_tile = (last_n_tile) && (idx_h == config->height_tiles - 1);
+
+            // Calculate actual rows for this height tile
+            int current_output_rows = (idx_h < config->height_tiles - 1) ?
+                config->output_rows :
+                (config->dst_dim2_size - (config->output_rows * idx_h));
+            int current_row_size = config->dst_dim1_size * current_output_rows;
+
+            // ================================================================
+            // Prefetch Next Input Tile (Ping-Pong Buffering)
+            // ================================================================
+            if (!last_h_tile) {
+                int temp_idx_h;
+                inc_iter_to_temp(&temp_idx_h, idx_h, config->height_tiles, 1);
+                _proto_FillBuffer_I8(p_input1, config->input_zero_point, config->input_buffer_size);
+
+                // Input rows advanced per height tile.
+                int tile_row_step = config->stride_y * config->output_rows;
+
+                // NOTE(review): the source offset scales rows by src_dim1_size,
+                // not src_dim1_pitch; confirm the two are always equal here.
+                int src_offset = max(((tile_row_step * temp_idx_h - config->in_dim2_edge1) * config->src_dim1_size), 0);
+
+                // NOTE(review): the lower bound of 1 presumably equals
+                // in_dim1_edge1 for a 3x3 kernel (row 0, first data column);
+                // confirm against the config generator.
+                int dst_offset = max((((-(config->output_rows) * config->in_dim1_pitch)) * (temp_idx_h)) + (config->in_data_offset), 1);
+
+                // Rows to fetch: clipped at the top of the image, at the
+                // bottom of the image, and to the tile height.
+                int fetch_rows = min((tile_row_step * (temp_idx_h)) + (config->input_rows - config->in_dim2_edge1),
+                                     min(((-(tile_row_step)) * (temp_idx_h)) + (config->src_dim2_size + config->in_dim2_edge2),
+                                         config->input_rows));
+
+                dma_3dm(1,
+                    (void*)&(src[src_offset]),
+                    (void*)&(p_input1[dst_offset]),
+                    config->src_dim1_pitch,
+                    config->in_dim1_pitch,
+                    config->src_dim2_pitch,
+                    config->in_dim2_pitch,
+                    config->src_dim1_size,
+                    fetch_rows,
+                    config->src_dim3_size);
+            }
+
+            // ================================================================
+            // Update Tile Descriptors for Current Iteration
+            // ================================================================
+            XAI_TILE3D_SET_BUFF_PTR(&tile_input, p_input0);
+            XAI_TILE3D_SET_DATA_PTR(&tile_input, &p_input0[config->in_data_offset]);
+            XAI_TILE3D_SET_DIM2_COORD(&tile_input, (config->stride_y * config->output_rows)*(idx_h));
+            XAI_TILE3D_SET_BUFF_PTR(&tile_output, p_output1);
+            XAI_TILE3D_SET_DATA_PTR(&tile_output, &(p_output1[0]));
+            XAI_TILE3D_SET_DIM2_COORD(&tile_output, (config->output_rows)*(idx_h));
+            XAI_TILE3D_SET_DIM2(&tile_output, current_output_rows);
+
+            // ================================================================
+            // Perform Edge Extension and Convolution (non-VQ API)
+            // ================================================================
+            xaiExtendEdgesConst3D_I8(&tile_input, config->input_zero_point, frame_size_input);
+
+            XAI_ERR_TYPE status = xaiConvolved3D_S_3x3j2d1_S8S8IX_MOW_WHD(
+                &(tile_input),
+                &(tile_coeff),
+                &(tile_bias),
+                &(tile_output),
+                &(params));
+
+            if (status != XAI_ERR_OK) {
+                // Drain both channels before bailing out: the input prefetch
+                // (ch1) and an earlier output write (ch0) may still be using
+                // the scratch buffers.
+                idma_hw_wait_all(0);
+                idma_hw_wait_all(1);
+                return status;
+            }
+
+            // ================================================================
+            // Prefetch next coefficient tile (if needed)
+            // ================================================================
+            if ((!(last_h_tile)) && ((idx_h) == (config->height_tiles - 1))) {
+                int temp_idx_n;
+                int temp_idx_h;
+
+                // The result of stepping the height index is fed in as the
+                // increment for the N index.
+                inc_iter_to_temp(&(temp_idx_n), idx_n, config->n_tiles,
+                                 inc_iter_to_temp(&(temp_idx_h), idx_h, config->height_tiles, 1));
+
+                // The last N-tile may hold fewer output channels than a full tile.
+                dma_1dm(0,
+                    (coeff_ptr + (config->coeff_buffer_size * temp_idx_n)),
+                    &(p_coeff[0]),
+                    (((temp_idx_n) < (config->n_tiles-1)) ?
+                        (config->coeff_buffer_size) :
+                        (config->coeff_dim1_size * config->coeff_dim2_size * config->coeff_dim3_size * config->n_tile_size_last)));
+            }
+
+            // ================================================================
+            // Write Output Tile to System Memory
+            // ================================================================
+            dma_2dm(0,
+                &(p_output1[0]),
+                &dst[((config->dst_dim2_pitch * config->n_tile_size)*(idx_n))+((config->out_dim2_pitch)*(idx_h))],
+                config->out_dim2_pitch,
+                config->dst_dim2_pitch,
+                current_row_size,
+                current_n_size);
+
+            // NOTE(review): nothing in this loop waits on ch1 before the
+            // prefetched buffer is consumed next iteration; presumably the
+            // dma_* wrappers serialise per channel - confirm.
+            swap_buffers(&(p_output0), &(p_output1));
+            swap_buffers(&(p_input0), &(p_input1));
+        }
+    }
+
+    // Wait for final output DMA to complete before returning
+    idma_hw_wait_all(0);
+
+    return XAI_ERR_OK;
+}
+
+// Non-VQ cache version - per-tensor output scaling
+//
+// Copies the raw input into the shared padded buffer, extends the edges with
+// the input zero point, and runs the generic xaiConvolved3D() on system
+// memory through the cache.
+//
+// Returns XAI_ERR_DATASIZE when the shared padded-input buffer is missing or
+// too small, otherwise the status reported by xaiConvolved3D().
+XAI_ERR_TYPE conv_exec_3x3j2d1_cache(
+    int8_t* src,
+    int8_t* dst,
+    int8_t* coeff_ptr,
+    int8_t* bias_ptr,
+    const conv_layer_config_t* config)
+{
+    // ========================================================================
+    // Get padded input buffer from allocator
+    // ========================================================================
+    int padded_dim1 = config->src_dim1_size + config->in_dim1_edge1 + config->in_dim1_edge2;
+    // Round the row pitch up to a multiple of 2*XCHAL_IVPN_SIMD_WIDTH bytes.
+    int dim1_pitch = (padded_dim1 + 2*XCHAL_IVPN_SIMD_WIDTH - 1) & ~(2*XCHAL_IVPN_SIMD_WIDTH - 1);
+    int padded_dim2 = config->src_dim2_size + config->in_dim2_edge1 + config->in_dim2_edge2;
+    int dim2_pitch = dim1_pitch * padded_dim2;
+    int input_buffer_size = dim2_pitch * config->src_dim3_size;
+
+    int8_t* padded_input = get_cache_padded_input();
+
+    // Refuse to run when there is no buffer or the padded image does not fit.
+    if (!padded_input || input_buffer_size > (int)get_cache_padded_input_size()) {
+        return XAI_ERR_DATASIZE;
+    }
+
+    // Pre-fill the whole padded buffer with the input zero point.
+    memset(padded_input, config->input_zero_point, input_buffer_size);
+
+    // ========================================================================
+    // Setup padded input tile descriptor
+    // ========================================================================
+    // First real pixel sits in_dim2_edge1 rows down and in_dim1_edge1 bytes in.
+    int data_offset = (config->in_dim2_edge1 * dim1_pitch) + config->in_dim1_edge1;
+
+    xai_tile3D tile_input;
+    XAI_TILE3D_SET_BUFF_PTR(&tile_input, padded_input);
+    XAI_TILE3D_SET_BUFF_SIZE(&tile_input, input_buffer_size);
+    XAI_TILE3D_SET_DATA_PTR(&tile_input, &padded_input[data_offset]);
+    XAI_TILE3D_SET_DATA_ORDER(&tile_input, XAI_WHD);
+    XAI_TILE3D_SET_TYPE(&tile_input, XAI_TILE3D_S8);
+    XAI_TILE3D_SET_FRAME_PTR(&tile_input, 0);
+    XAI_TILE3D_SET_STATUS_FLAGS(&tile_input, 0);
+    XAI_TILE3D_SET_DIM1_PITCH(&tile_input, dim1_pitch);
+    XAI_TILE3D_SET_DIM2_PITCH(&tile_input, dim2_pitch);
+    XAI_TILE3D_SET_DIM1_COORD(&tile_input, 0);
+    XAI_TILE3D_SET_DIM1(&tile_input, config->src_dim1_size);
+    XAI_TILE3D_SET_DIM1_EDGE1(&tile_input, config->in_dim1_edge1);
+    XAI_TILE3D_SET_DIM1_EDGE2(&tile_input, config->in_dim1_edge2);
+    XAI_TILE3D_SET_DIM2_COORD(&tile_input, 0);
+    XAI_TILE3D_SET_DIM2(&tile_input, config->src_dim2_size);
+    XAI_TILE3D_SET_DIM2_EDGE1(&tile_input, config->in_dim2_edge1);
+    XAI_TILE3D_SET_DIM2_EDGE2(&tile_input, config->in_dim2_edge2);
+    XAI_TILE3D_SET_DIM3_COORD(&tile_input, 0);
+    XAI_TILE3D_SET_DIM3(&tile_input, config->src_dim3_size);
+    XAI_TILE3D_SET_DIM3_EDGE1(&tile_input, 0);
+    XAI_TILE3D_SET_DIM3_EDGE2(&tile_input, 0);
+
+    // Copy raw input to padded buffer
+    // Safe manual copy: avoids SIMD overread near source buffer boundary
+    for (int d = 0; d < config->src_dim3_size; d++) {
+        for (int h = 0; h < config->src_dim2_size; h++) {
+            memcpy(&padded_input[data_offset + d * dim2_pitch + h * dim1_pitch],
+                   &src[d * config->src_dim2_pitch + h * config->src_dim1_pitch],
+                   config->src_dim1_size);
+        }
+    }
+
+    xai_size3D frame_size;
+    frame_size.dim1Size = config->dst_dim1_size * config->stride_x;
+    frame_size.dim2Size = config->dst_dim2_size * config->stride_y;
+    frame_size.dim3Size = config->src_dim3_size;
+
+    xaiExtendEdgesConst3D_I8(&tile_input, config->input_zero_point, frame_size);
+
+    // ========================================================================
+    // Configure Coefficient Tile Descriptor
+    // ========================================================================
+    xai_tile4D tile_coeff;
+    XAI_TILE4D_SET_BUFF_PTR(&tile_coeff, coeff_ptr);
+    XAI_TILE4D_SET_BUFF_SIZE(&tile_coeff, config->coeff_dim3_pitch * config->dst_dim3_size);
+    XAI_TILE4D_SET_DATA_PTR(&tile_coeff, coeff_ptr);
+    XAI_TILE4D_SET_DATA_ORDER(&tile_coeff, XAI_WHDN);
+    XAI_TILE4D_SET_TYPE(&tile_coeff, XAI_TILE4D_S8);
+    XAI_TILE4D_SET_FRAME_PTR(&tile_coeff, 0);
+    XAI_TILE4D_SET_STATUS_FLAGS(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM1_PITCH(&tile_coeff, config->coeff_dim1_pitch);
+    XAI_TILE4D_SET_DIM2_PITCH(&tile_coeff, config->coeff_dim2_pitch);
+    XAI_TILE4D_SET_DIM3_PITCH(&tile_coeff, config->coeff_dim3_pitch);
+    XAI_TILE4D_SET_DIM1_COORD(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM1(&tile_coeff, config->coeff_dim1_size);
+    XAI_TILE4D_SET_DIM1_EDGE1(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM1_EDGE2(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM2_COORD(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM2(&tile_coeff, config->coeff_dim2_size);
+    XAI_TILE4D_SET_DIM2_EDGE1(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM2_EDGE2(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM3_COORD(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM3(&tile_coeff, config->coeff_dim3_size);
+    XAI_TILE4D_SET_DIM3_EDGE1(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM3_EDGE2(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM4_COORD(&tile_coeff, 0);
+    XAI_TILE4D_SET_DIM4(&tile_coeff, config->dst_dim3_size);
+
+    // ========================================================================
+    // Configure Bias Array
+    // ========================================================================
+    xai_array tile_bias;
+    XAI_ARRAY_SET_BUFF_PTR(&tile_bias, bias_ptr);
+    XAI_ARRAY_SET_BUFF_SIZE(&tile_bias, config->dst_dim3_size * 4);  // 4 bytes per S32 entry
+    XAI_ARRAY_SET_DATA_PTR(&tile_bias, bias_ptr);
+    XAI_ARRAY_SET_WIDTH(&tile_bias, config->dst_dim3_size);
+    XAI_ARRAY_SET_HEIGHT(&tile_bias, 1);
+    XAI_ARRAY_SET_TYPE(&tile_bias, XAI_ARRAY_S32);
+    XAI_ARRAY_SET_CAPACITY(&tile_bias, config->dst_dim3_size);
+
+    // ========================================================================
+    // Configure Output Tile Descriptor
+    // ========================================================================
+    xai_tile3D tile_output;
+    XAI_TILE3D_SET_BUFF_PTR(&tile_output, dst);
+    XAI_TILE3D_SET_BUFF_SIZE(&tile_output, config->dst_dim2_pitch * config->dst_dim3_size);
+    XAI_TILE3D_SET_DATA_PTR(&tile_output, dst);
+    XAI_TILE3D_SET_DATA_ORDER(&tile_output, XAI_WHD);
+    XAI_TILE3D_SET_TYPE(&tile_output, XAI_TILE3D_S8);
+    XAI_TILE3D_SET_FRAME_PTR(&tile_output, 0);
+    XAI_TILE3D_SET_STATUS_FLAGS(&tile_output, 0);
+    XAI_TILE3D_SET_DIM1_PITCH(&tile_output, config->dst_dim1_pitch);
+    XAI_TILE3D_SET_DIM2_PITCH(&tile_output, config->dst_dim2_pitch);
+    XAI_TILE3D_SET_DIM1_COORD(&tile_output, 0);
+    XAI_TILE3D_SET_DIM1(&tile_output, config->dst_dim1_size);
+    XAI_TILE3D_SET_DIM1_EDGE1(&tile_output, 0);
+    XAI_TILE3D_SET_DIM1_EDGE2(&tile_output, 0);
+    XAI_TILE3D_SET_DIM2_COORD(&tile_output, 0);
+    XAI_TILE3D_SET_DIM2(&tile_output, config->dst_dim2_size);
+    XAI_TILE3D_SET_DIM2_EDGE1(&tile_output, 0);
+    XAI_TILE3D_SET_DIM2_EDGE2(&tile_output, 0);
+    XAI_TILE3D_SET_DIM3_COORD(&tile_output, 0);
+    XAI_TILE3D_SET_DIM3(&tile_output, config->dst_dim3_size);
+    XAI_TILE3D_SET_DIM3_EDGE1(&tile_output, 0);
+    XAI_TILE3D_SET_DIM3_EDGE2(&tile_output, 0);
+
+    // ========================================================================
+    // Configure Convolution Parameters
+    // ========================================================================
+    xai_cnn_conv_params params;
+    memset(&params, 0, sizeof(params));
+    XAI_CNN_CONV_SET_ACCUM_SHIFT(&params, config->accum_shift);
+    XAI_CNN_CONV_SET_DILATION(&params, config->dilation);
+    XAI_CNN_CONV_SET_FLAGS(&params, config->flags);
+    XAI_CNN_CONV_SET_OUTPUT_SCALE(&params, config->output_scale);
+    XAI_CNN_CONV_SET_OUTPUT_SHIFT(&params, config->output_shift);
+    XAI_CNN_CONV_SET_RELU_MAX(&params, config->relu_max);
+    XAI_CNN_CONV_SET_RELU_MIN(&params, config->relu_min);
+    XAI_CNN_CONV_SET_STRIDEX(&params, config->stride_x);
+    XAI_CNN_CONV_SET_STRIDEY(&params, config->stride_y);
+
+    // ========================================================================
+    // Execute convolution (non-VQ API)
+    // ========================================================================
+    XAI_ERR_TYPE status = xaiConvolved3D(&tile_input, &tile_coeff, &tile_bias,
+                                         &tile_output, &params);
+
+    // Writeback output from cache to system memory for DMA coherency
+    xthal_dcache_region_writeback(dst, config->dst_dim2_pitch * config->dst_dim3_size);
+
+    return status;
+}
\ No newline at end of file
diff --git a/backends/cadence/vision/operators/conv/conv_exec_7x7j2d1.c b/backends/cadence/vision/operators/conv/conv_exec_7x7j2d1.c
new file mode 100644
index 00000000000..36709da0e78
--- /dev/null
+++ b/backends/cadence/vision/operators/conv/conv_exec_7x7j2d1.c
@@ -0,0 +1,1088 @@
+#include "kernel_executors.h"
+#include "memory_manager.h"
+#include "dma.h"
+#include "utils.h" +#include +#include +#include + +// conv 7x7j2d1 VQ executor with DMA (per-channel output scaling) +XAI_ERR_TYPE conv_exec_7x7j2d1VQ( + int8_t* src, + int8_t* dst, + int8_t* coeff_ptr, + int8_t* bias_ptr, + int8_t* outScale_ptr, + const conv_layer_config_t* config) +{ + // ======================================================================== + // SECTION 1: DRAM Buffer Allocation + // ======================================================================== + int dram0_used = 0; + int dram1_used = 0; + + int8_t* p_input0 = allocate_dram_buffer(config->input_buffer_size, + config->input_ping_dram, + &dram0_used, &dram1_used); + int8_t* p_input1 = allocate_dram_buffer(config->input_buffer_size, + config->input_pong_dram, + &dram0_used, &dram1_used); + int8_t* p_coeff = allocate_dram_buffer(config->coeff_buffer_size, + config->coeff_dram, + &dram0_used, &dram1_used); + int8_t* p_output0 = allocate_dram_buffer(config->output_buffer_size, + config->output_ping_dram, + &dram0_used, &dram1_used); + int8_t* p_output1 = allocate_dram_buffer(config->output_buffer_size, + config->output_pong_dram, + &dram0_used, &dram1_used); + int8_t* p_bias = allocate_dram_buffer(config->bias_buffer_size, + config->bias_dram, + &dram0_used, &dram1_used); + int8_t* p_outscale = allocate_dram_buffer(config->outscale_buffer_size, + config->outscale_dram, + &dram0_used, &dram1_used); + + if (!p_input0 || !p_input1 || !p_coeff || + !p_output0 || !p_output1 || !p_bias || !p_outscale) { + return (-1); + } + + // ======================================================================== + // SECTION 2: Initialize XAI Tile Descriptors + // ======================================================================== + xai_tile3D tile_input; + xai_size3D frame_size_input; + xai_tile4D tile_coeff; + xai_array tile_bias; + xai_array tile_outscale; + xai_tile3D tile_output; + xai_cnn_conv_params params; + memset(¶ms, 0, sizeof(params)); + + /* Initialize DMA engines */ + dma_3dm_init(1); 
+ dma_2dm_init(0); + + // Transfer constant data + dma_1dm(0, coeff_ptr, p_coeff, config->coeff_buffer_size); + dma_1dm(0, bias_ptr, p_bias, config->bias_buffer_size); + dma_1dm(0, outScale_ptr, p_outscale, config->outscale_buffer_size); + + // Initialize input buffer and load first tile + _proto_FillBuffer_I8(p_input0, config->input_zero_point, config->input_buffer_size); + + // First DMA: load IN_ROWS_FIRSTDMA rows at data offset + dma_3dm(1, + (void*)src, + (void*)&p_input0[config->in_data_offset], + config->src_dim1_pitch, + config->in_dim1_pitch, + config->src_dim2_pitch, + config->in_dim2_pitch, + config->src_dim1_size, + config->in_rows_firstdma, + config->src_dim3_size); + + // Wait for all initial DMA transfers to complete + idma_hw_wait_all(0); // coeff + bias + outscale on ch0 + idma_hw_wait_all(1); // input on ch1 + + // ======================================================================== + // Configure Input Tile Descriptor + // ======================================================================== + XAI_TILE3D_SET_BUFF_PTR(&tile_input, p_input0); + XAI_TILE3D_SET_BUFF_SIZE(&tile_input, config->input_buffer_size); + XAI_TILE3D_SET_DATA_PTR(&tile_input, &p_input0[config->in_data_offset]); + XAI_TILE3D_SET_DATA_ORDER(&tile_input, XAI_WHD); + XAI_TILE3D_SET_TYPE(&tile_input, XAI_TILE3D_S8); + XAI_TILE3D_SET_FRAME_PTR(&tile_input, 0); + XAI_TILE3D_SET_STATUS_FLAGS(&tile_input, 0); + XAI_TILE3D_SET_DIM1_PITCH(&tile_input, config->in_dim1_pitch); + XAI_TILE3D_SET_DIM2_PITCH(&tile_input, config->in_dim2_pitch); + XAI_TILE3D_SET_DIM1_COORD(&tile_input, 0); + XAI_TILE3D_SET_DIM1(&tile_input, config->src_dim1_size); + XAI_TILE3D_SET_DIM1_EDGE1(&tile_input, config->in_dim1_edge1); + XAI_TILE3D_SET_DIM1_EDGE2(&tile_input, config->in_dim1_edge2); + XAI_TILE3D_SET_DIM2(&tile_input, (config->in_rows_firstdma - config->in_dim2_edge1)); + XAI_TILE3D_SET_DIM2_EDGE1(&tile_input, config->in_dim2_edge1); + XAI_TILE3D_SET_DIM2_EDGE2(&tile_input, 
config->in_dim2_edge2); + XAI_TILE3D_SET_DIM3_COORD(&tile_input, 0); + XAI_TILE3D_SET_DIM3(&tile_input, config->src_dim3_size); + XAI_TILE3D_SET_DIM3_EDGE1(&tile_input, config->in_dim3_edge1); + XAI_TILE3D_SET_DIM3_EDGE2(&tile_input, config->in_dim3_edge2); + + // Frame size for edge extension + frame_size_input.dim1Size = config->src_dim1_size; + frame_size_input.dim2Size = config->src_dim2_size; + frame_size_input.dim3Size = config->src_dim3_size; + + // ======================================================================== + // Configure Coefficient Tile Descriptor (4D: W×H×C×N) + // ======================================================================== + XAI_TILE4D_SET_BUFF_PTR(&tile_coeff, p_coeff); + XAI_TILE4D_SET_BUFF_SIZE(&tile_coeff, config->coeff_buffer_size); + XAI_TILE4D_SET_DATA_PTR(&tile_coeff, p_coeff); + XAI_TILE4D_SET_DATA_ORDER(&tile_coeff, XAI_WHDN); + XAI_TILE4D_SET_TYPE(&tile_coeff, XAI_TILE4D_S8); + XAI_TILE4D_SET_FRAME_PTR(&tile_coeff, 0); + XAI_TILE4D_SET_STATUS_FLAGS(&tile_coeff, 0); + XAI_TILE4D_SET_DIM1_PITCH(&tile_coeff, config->coeff_dim1_pitch); + XAI_TILE4D_SET_DIM2_PITCH(&tile_coeff, config->coeff_dim2_pitch); + XAI_TILE4D_SET_DIM3_PITCH(&tile_coeff, config->coeff_dim3_pitch); + XAI_TILE4D_SET_DIM1_COORD(&tile_coeff, 0); + XAI_TILE4D_SET_DIM1(&tile_coeff, config->coeff_dim1_size); + XAI_TILE4D_SET_DIM1_EDGE1(&tile_coeff, 0); + XAI_TILE4D_SET_DIM1_EDGE2(&tile_coeff, 0); + XAI_TILE4D_SET_DIM2_COORD(&tile_coeff, 0); + XAI_TILE4D_SET_DIM2(&tile_coeff, config->coeff_dim2_size); + XAI_TILE4D_SET_DIM2_EDGE1(&tile_coeff, 0); + XAI_TILE4D_SET_DIM2_EDGE2(&tile_coeff, 0); + XAI_TILE4D_SET_DIM3_COORD(&tile_coeff, 0); + XAI_TILE4D_SET_DIM3(&tile_coeff, config->coeff_dim3_size); + XAI_TILE4D_SET_DIM3_EDGE1(&tile_coeff, 0); + XAI_TILE4D_SET_DIM3_EDGE2(&tile_coeff, 0); + XAI_TILE4D_SET_DIM4_COORD(&tile_coeff, 0); + XAI_TILE4D_SET_DIM4(&tile_coeff, config->n_tile_size); + + // 
======================================================================== + // Configure Bias Array + // ======================================================================== + XAI_ARRAY_SET_BUFF_PTR(&tile_bias, p_bias); + XAI_ARRAY_SET_BUFF_SIZE(&tile_bias, config->bias_buffer_size); + XAI_ARRAY_SET_DATA_PTR(&tile_bias, p_bias); + XAI_ARRAY_SET_WIDTH(&tile_bias, config->n_tile_size); + XAI_ARRAY_SET_HEIGHT(&tile_bias, 1); + XAI_ARRAY_SET_TYPE(&tile_bias, XAI_ARRAY_S32); + XAI_ARRAY_SET_CAPACITY(&tile_bias, config->n_tile_size); + + // ======================================================================== + // Configure Output Scale Array + // ======================================================================== + XAI_ARRAY_SET_BUFF_PTR(&tile_outscale, p_outscale); + XAI_ARRAY_SET_BUFF_SIZE(&tile_outscale, config->outscale_buffer_size); + XAI_ARRAY_SET_DATA_PTR(&tile_outscale, p_outscale); + XAI_ARRAY_SET_WIDTH(&tile_outscale, config->n_tile_size); + XAI_ARRAY_SET_HEIGHT(&tile_outscale, 1); + XAI_ARRAY_SET_TYPE(&tile_outscale, XAI_ARRAY_U16); + XAI_ARRAY_SET_CAPACITY(&tile_outscale, config->n_tile_size); + + // ======================================================================== + // Configure Output Tile Descriptor (matches convIdma.c) + // ======================================================================== + XAI_TILE3D_SET_BUFF_SIZE(&tile_output, config->output_buffer_size); + XAI_TILE3D_SET_DATA_ORDER(&tile_output, XAI_WHD); + XAI_TILE3D_SET_TYPE(&tile_output, XAI_TILE3D_S8); + XAI_TILE3D_SET_FRAME_PTR(&tile_output, 0); + XAI_TILE3D_SET_STATUS_FLAGS(&tile_output, 0); + XAI_TILE3D_SET_DIM1_PITCH(&tile_output, config->out_dim1_pitch); + XAI_TILE3D_SET_DIM2_PITCH(&tile_output, config->out_dim2_pitch); + XAI_TILE3D_SET_DIM1_COORD(&tile_output, 0); + XAI_TILE3D_SET_DIM1(&tile_output, config->out_dim1_size); + XAI_TILE3D_SET_DIM1_EDGE1(&tile_output, 0); + XAI_TILE3D_SET_DIM1_EDGE2(&tile_output, 0); + XAI_TILE3D_SET_DIM2(&tile_output, 
config->out_dim2_size); + XAI_TILE3D_SET_DIM2_EDGE1(&tile_output, 0); + XAI_TILE3D_SET_DIM2_EDGE2(&tile_output, 0); + XAI_TILE3D_SET_DIM3_COORD(&tile_output, 0); + XAI_TILE3D_SET_DIM3(&tile_output, config->out_dim3_size); + XAI_TILE3D_SET_DIM3_EDGE1(&tile_output, 0); + XAI_TILE3D_SET_DIM3_EDGE2(&tile_output, 0); + + // ======================================================================== + // Configure Convolution Parameters + // ======================================================================== + XAI_CNN_CONV_SET_ACCUM_SHIFT(¶ms, config->accum_shift); + XAI_CNN_CONV_SET_DILATION(¶ms, config->dilation); + XAI_CNN_CONV_SET_FLAGS(¶ms, config->flags); + XAI_CNN_CONV_SET_OUTPUT_SCALE(¶ms, config->output_scale); + XAI_CNN_CONV_SET_OUTPUT_SHIFT(¶ms, config->output_shift); + XAI_CNN_CONV_SET_RELU_MAX(¶ms, config->relu_max); + XAI_CNN_CONV_SET_RELU_MIN(¶ms, config->relu_min); + XAI_CNN_CONV_SET_STRIDEX(¶ms, config->stride_x); + XAI_CNN_CONV_SET_STRIDEY(¶ms, config->stride_y); + + // ======================================================================== + // SECTION 3: Tiled Execution Loop (N-tiles × H-tiles) + // ======================================================================== + int last_tile = 1; + + for (int idx_n = 0; idx_n < config->n_tiles; idx_n++) { + int last_n_tile = (last_tile) && (idx_n == config->n_tiles - 1); + int current_n_size = (idx_n < config->n_tiles - 1) ? 
+ config->n_tile_size : config->n_tile_size_last; + + // Update coefficient/bias/outscale for N-tile + XAI_TILE4D_SET_DIM4_COORD(&tile_coeff, config->n_tile_size * idx_n); + XAI_TILE4D_SET_DIM4(&tile_coeff, current_n_size); + + XAI_ARRAY_SET_DATA_PTR(&tile_bias, &p_bias[config->n_tile_size * 4 * idx_n]); + XAI_ARRAY_SET_WIDTH(&tile_bias, current_n_size); + XAI_ARRAY_SET_CAPACITY(&tile_bias, current_n_size); + + XAI_ARRAY_SET_DATA_PTR(&tile_outscale, &p_outscale[config->n_tile_size * 2 * idx_n]); + XAI_ARRAY_SET_WIDTH(&tile_outscale, current_n_size); + XAI_ARRAY_SET_CAPACITY(&tile_outscale, current_n_size); + + XAI_TILE3D_SET_DIM3_COORD(&tile_output, config->n_tile_size * idx_n); + XAI_TILE3D_SET_DIM3(&tile_output, current_n_size); + + // Process vertical tiles + for (int idx_h = 0; idx_h < config->height_tiles; idx_h++) { + int last_h_tile = (last_n_tile) && (idx_h == config->height_tiles - 1); + + // ================================================================ + // Prefetch Next Input Tile (Ping-Pong Buffering) + // ================================================================ + if (!last_h_tile) { + int temp_idx_h; + inc_iter_to_temp(&temp_idx_h, idx_h, config->height_tiles, 1); + _proto_FillBuffer_I8(p_input1, config->input_zero_point, config->input_buffer_size); + + // Transfer next input tile from system memory to DRAM + // Generalized formulas for variable output_rows + // Key insight: (EDGE+1) in original = stride_y * output_rows + int stride_output = config->stride_y * config->output_rows; + + // Calculate row count for this tile + int row_count; + if (temp_idx_h < (config->height_tiles - 1)) { + // Non-last tiles: min of (progress + buffer_size - edge, remaining_from_end, buffer_size) + int prog_rows = (stride_output * temp_idx_h) + (config->in_dim2_size - config->in_dim2_edge1); + int rem_rows = (config->src_dim2_size + config->in_dim1_edge2) - (stride_output * temp_idx_h); + row_count = min(prog_rows, min(rem_rows, config->in_dim2_size)); + } else 
{ + // Last tile: transfer remaining rows from source (accounting for source starting offset) + int src_start_row = max(stride_output * temp_idx_h - config->in_dim2_edge1, 0); + row_count = config->src_dim2_size - src_start_row; + } + + dma_3dm(1, + (uint64_t *)&(src[max(((stride_output * temp_idx_h - config->in_dim2_edge1) * config->src_dim1_size),0)]), + (uint64_t *)&(p_input1[config->in_data_offset - min(stride_output * temp_idx_h * config->in_dim1_pitch, config->in_data_offset - config->in_dim2_edge1)]), + config->src_dim1_pitch, + config->in_dim1_pitch, + config->src_dim2_pitch, + config->in_dim2_pitch, + config->src_dim1_size, + row_count, + config->src_dim3_size); + } + + // ================================================================ + // Update Tile Descriptors for Current Iteration + // ================================================================ + XAI_TILE3D_SET_BUFF_PTR(&tile_input, p_input0); + XAI_TILE3D_SET_DATA_PTR(&tile_input, &p_input0[config->in_data_offset]); + // Input vertical coordinate: stride * tile_size * tile_index (matches convIdma.c) + XAI_TILE3D_SET_DIM2_COORD(&tile_input, (config->stride_y * config->out_dim2_size)*(idx_h)); + XAI_TILE3D_SET_BUFF_PTR(&tile_output, p_output1); + XAI_TILE3D_SET_DATA_PTR(&tile_output, &(p_output1[0])); + XAI_TILE3D_SET_DIM2_COORD(&tile_output, (config->out_dim2_size)*(idx_h)); + + // ================================================================ + // Perform Edge Extension and Convolution + // ================================================================ + xaiExtendEdgesConst3D_I8(&tile_input, config->input_zero_point, frame_size_input); + + XAI_ERR_TYPE status = xaiConvolvedVQ3D_S_7x7j2d1_S8S8IX_MOW_WHD( + &(tile_input), + &(tile_coeff), + &(tile_bias), + &(tile_outscale), + &(tile_output), + &(params)); + + if (status != XAI_ERR_OK) { + return status; + } + + // ================================================================ + // Prefetch next coefficient tile (if needed) + // 
================================================================ + if ((!(last_h_tile)) && ((idx_h) == (config->height_tiles - 1))) { + int temp_idx_n; + int temp_idx_h; + inc_iter_to_temp(&(temp_idx_n), idx_n, config->n_tiles, inc_iter_to_temp(&(temp_idx_h), idx_h, config->height_tiles, 1)); + dma_1dm(0, (coeff_ptr + (config->coeff_buffer_size * temp_idx_n)), &(p_coeff[0]), (((temp_idx_n) < (config->n_tiles - 1))?(config->coeff_buffer_size):(config->coeff_dim1_size * config->coeff_dim2_size * config->coeff_dim3_size * config->n_tile_size_last))); + } + + // ================================================================ + // Write Output Tile to System Memory (matches convIdma.c formula) + // ================================================================ + // Fix: For the last height tile, only write the valid output rows + // to avoid spilling into the next channel's memory. + { + int current_output_rows = (idx_h < config->height_tiles - 1) + ? config->output_rows + : (config->dst_dim2_size - (config->output_rows * idx_h)); + int output_row_bytes = config->out_dim1_pitch * current_output_rows; + dma_2dm(0, + &(p_output1[0]), + &dst[((config->dst_dim2_pitch * config->n_tile_size)*(idx_n)) + ((config->out_dim1_pitch * config->output_rows)*(idx_h))], + config->out_dim2_pitch, + config->dst_dim2_pitch, + output_row_bytes, + current_n_size); + } + + // Swap ping-pong buffers for next iteration + swap_buffers(&(p_output0), &(p_output1)); + swap_buffers(&(p_input0), &(p_input1)); + } + } + + // Wait for final output DMA to complete before returning + idma_hw_wait_all(0); + + return XAI_ERR_OK; +} + +// conv 7x7j2d1 executor with DMA (per-tensor output scaling) +XAI_ERR_TYPE conv_exec_7x7j2d1( + int8_t* src, + int8_t* dst, + int8_t* coeff_ptr, + int8_t* bias_ptr, + const conv_layer_config_t* config) +{ + // ======================================================================== + // SECTION 1: DRAM Buffer Allocation + // 
======================================================================== + int dram0_used = 0; + int dram1_used = 0; + + int8_t* p_input0 = allocate_dram_buffer(config->input_buffer_size, + config->input_ping_dram, + &dram0_used, &dram1_used); + int8_t* p_input1 = allocate_dram_buffer(config->input_buffer_size, + config->input_pong_dram, + &dram0_used, &dram1_used); + int8_t* p_coeff = allocate_dram_buffer(config->coeff_buffer_size, + config->coeff_dram, + &dram0_used, &dram1_used); + int8_t* p_output0 = allocate_dram_buffer(config->output_buffer_size, + config->output_ping_dram, + &dram0_used, &dram1_used); + int8_t* p_output1 = allocate_dram_buffer(config->output_buffer_size, + config->output_pong_dram, + &dram0_used, &dram1_used); + int8_t* p_bias = allocate_dram_buffer(config->bias_buffer_size, + config->bias_dram, + &dram0_used, &dram1_used); + + if (!p_input0 || !p_input1 || !p_coeff || + !p_output0 || !p_output1 || !p_bias) { + return (-1); + } + + // ======================================================================== + // SECTION 2: Initialize XAI Tile Descriptors + // ======================================================================== + xai_tile3D tile_input; + xai_size3D frame_size_input; + xai_tile4D tile_coeff; + xai_array tile_bias; + xai_tile3D tile_output; + xai_cnn_conv_params params; + memset(¶ms, 0, sizeof(params)); + + /* Initialize DMA engines */ + dma_3dm_init(1); + dma_2dm_init(0); + + // Transfer constant data + dma_1dm(0, coeff_ptr, p_coeff, config->coeff_buffer_size); + dma_1dm(0, bias_ptr, p_bias, config->bias_buffer_size); + + // Initialize input buffer and load first tile + _proto_FillBuffer_I8(p_input0, config->input_zero_point, config->input_buffer_size); + + // First DMA: load IN_ROWS_FIRSTDMA rows at data offset + dma_3dm(1, + (void*)src, + (void*)&p_input0[config->in_data_offset], + config->src_dim1_pitch, + config->in_dim1_pitch, + config->src_dim2_pitch, + config->in_dim2_pitch, + config->src_dim1_size, + 
config->in_rows_firstdma, + config->src_dim3_size); + + // Wait for all initial DMA transfers to complete + idma_hw_wait_all(0); // coeff + bias on ch0 + idma_hw_wait_all(1); // input on ch1 + + // ======================================================================== + // Configure Input Tile Descriptor + // ======================================================================== + XAI_TILE3D_SET_BUFF_PTR(&tile_input, p_input0); + XAI_TILE3D_SET_BUFF_SIZE(&tile_input, config->input_buffer_size); + XAI_TILE3D_SET_DATA_PTR(&tile_input, &p_input0[config->in_data_offset]); + XAI_TILE3D_SET_DATA_ORDER(&tile_input, XAI_WHD); + XAI_TILE3D_SET_TYPE(&tile_input, XAI_TILE3D_S8); + XAI_TILE3D_SET_FRAME_PTR(&tile_input, 0); + XAI_TILE3D_SET_STATUS_FLAGS(&tile_input, 0); + XAI_TILE3D_SET_DIM1_PITCH(&tile_input, config->in_dim1_pitch); + XAI_TILE3D_SET_DIM2_PITCH(&tile_input, config->in_dim2_pitch); + XAI_TILE3D_SET_DIM1_COORD(&tile_input, 0); + XAI_TILE3D_SET_DIM1(&tile_input, config->src_dim1_size); + XAI_TILE3D_SET_DIM1_EDGE1(&tile_input, config->in_dim1_edge1); + XAI_TILE3D_SET_DIM1_EDGE2(&tile_input, config->in_dim1_edge2); + XAI_TILE3D_SET_DIM2(&tile_input, (config->in_rows_firstdma - config->in_dim2_edge1)); + XAI_TILE3D_SET_DIM2_EDGE1(&tile_input, config->in_dim2_edge1); + XAI_TILE3D_SET_DIM2_EDGE2(&tile_input, config->in_dim2_edge2); + XAI_TILE3D_SET_DIM3_COORD(&tile_input, 0); + XAI_TILE3D_SET_DIM3(&tile_input, config->src_dim3_size); + XAI_TILE3D_SET_DIM3_EDGE1(&tile_input, config->in_dim3_edge1); + XAI_TILE3D_SET_DIM3_EDGE2(&tile_input, config->in_dim3_edge2); + + // Frame size for edge extension + frame_size_input.dim1Size = config->src_dim1_size; + frame_size_input.dim2Size = config->src_dim2_size; + frame_size_input.dim3Size = config->src_dim3_size; + + // ======================================================================== + // Configure Coefficient Tile Descriptor (4D: W×H×C×N) + // ======================================================================== 
+ XAI_TILE4D_SET_BUFF_PTR(&tile_coeff, p_coeff); + XAI_TILE4D_SET_BUFF_SIZE(&tile_coeff, config->coeff_buffer_size); + XAI_TILE4D_SET_DATA_PTR(&tile_coeff, p_coeff); + XAI_TILE4D_SET_DATA_ORDER(&tile_coeff, XAI_WHDN); + XAI_TILE4D_SET_TYPE(&tile_coeff, XAI_TILE4D_S8); + XAI_TILE4D_SET_FRAME_PTR(&tile_coeff, 0); + XAI_TILE4D_SET_STATUS_FLAGS(&tile_coeff, 0); + XAI_TILE4D_SET_DIM1_PITCH(&tile_coeff, config->coeff_dim1_pitch); + XAI_TILE4D_SET_DIM2_PITCH(&tile_coeff, config->coeff_dim2_pitch); + XAI_TILE4D_SET_DIM3_PITCH(&tile_coeff, config->coeff_dim3_pitch); + XAI_TILE4D_SET_DIM1_COORD(&tile_coeff, 0); + XAI_TILE4D_SET_DIM1(&tile_coeff, config->coeff_dim1_size); + XAI_TILE4D_SET_DIM1_EDGE1(&tile_coeff, 0); + XAI_TILE4D_SET_DIM1_EDGE2(&tile_coeff, 0); + XAI_TILE4D_SET_DIM2_COORD(&tile_coeff, 0); + XAI_TILE4D_SET_DIM2(&tile_coeff, config->coeff_dim2_size); + XAI_TILE4D_SET_DIM2_EDGE1(&tile_coeff, 0); + XAI_TILE4D_SET_DIM2_EDGE2(&tile_coeff, 0); + XAI_TILE4D_SET_DIM3_COORD(&tile_coeff, 0); + XAI_TILE4D_SET_DIM3(&tile_coeff, config->coeff_dim3_size); + XAI_TILE4D_SET_DIM3_EDGE1(&tile_coeff, 0); + XAI_TILE4D_SET_DIM3_EDGE2(&tile_coeff, 0); + XAI_TILE4D_SET_DIM4_COORD(&tile_coeff, 0); + XAI_TILE4D_SET_DIM4(&tile_coeff, config->n_tile_size); + + // ======================================================================== + // Configure Bias Array + // ======================================================================== + XAI_ARRAY_SET_BUFF_PTR(&tile_bias, p_bias); + XAI_ARRAY_SET_BUFF_SIZE(&tile_bias, config->bias_buffer_size); + XAI_ARRAY_SET_DATA_PTR(&tile_bias, p_bias); + XAI_ARRAY_SET_WIDTH(&tile_bias, config->n_tile_size); + XAI_ARRAY_SET_HEIGHT(&tile_bias, 1); + XAI_ARRAY_SET_TYPE(&tile_bias, XAI_ARRAY_S32); + XAI_ARRAY_SET_CAPACITY(&tile_bias, config->n_tile_size); + + // ======================================================================== + // Configure Output Tile Descriptor (matches convIdma.c) + // 
======================================================================== + XAI_TILE3D_SET_BUFF_SIZE(&tile_output, config->output_buffer_size); + XAI_TILE3D_SET_DATA_ORDER(&tile_output, XAI_WHD); + XAI_TILE3D_SET_TYPE(&tile_output, XAI_TILE3D_S8); + XAI_TILE3D_SET_FRAME_PTR(&tile_output, 0); + XAI_TILE3D_SET_STATUS_FLAGS(&tile_output, 0); + XAI_TILE3D_SET_DIM1_PITCH(&tile_output, config->out_dim1_pitch); + XAI_TILE3D_SET_DIM2_PITCH(&tile_output, config->out_dim2_pitch); + XAI_TILE3D_SET_DIM1_COORD(&tile_output, 0); + XAI_TILE3D_SET_DIM1(&tile_output, config->out_dim1_size); + XAI_TILE3D_SET_DIM1_EDGE1(&tile_output, 0); + XAI_TILE3D_SET_DIM1_EDGE2(&tile_output, 0); + XAI_TILE3D_SET_DIM2(&tile_output, config->out_dim2_size); + XAI_TILE3D_SET_DIM2_EDGE1(&tile_output, 0); + XAI_TILE3D_SET_DIM2_EDGE2(&tile_output, 0); + XAI_TILE3D_SET_DIM3_COORD(&tile_output, 0); + XAI_TILE3D_SET_DIM3(&tile_output, config->out_dim3_size); + XAI_TILE3D_SET_DIM3_EDGE1(&tile_output, 0); + XAI_TILE3D_SET_DIM3_EDGE2(&tile_output, 0); + + // ======================================================================== + // Configure Convolution Parameters + // ======================================================================== + XAI_CNN_CONV_SET_ACCUM_SHIFT(¶ms, config->accum_shift); + XAI_CNN_CONV_SET_DILATION(¶ms, config->dilation); + XAI_CNN_CONV_SET_FLAGS(¶ms, config->flags); + XAI_CNN_CONV_SET_OUTPUT_SCALE(¶ms, config->output_scale); + XAI_CNN_CONV_SET_OUTPUT_SHIFT(¶ms, config->output_shift); + XAI_CNN_CONV_SET_RELU_MAX(¶ms, config->relu_max); + XAI_CNN_CONV_SET_RELU_MIN(¶ms, config->relu_min); + XAI_CNN_CONV_SET_STRIDEX(¶ms, config->stride_x); + XAI_CNN_CONV_SET_STRIDEY(¶ms, config->stride_y); + + + //print config eg . 
config->accum_shift, config->dilation, config->flags, config->output_scale, config->output_shift, config->relu_max, config->relu_min, config->stride_x, config->stride_y + // ======================================================================== + // SECTION 3: Tiled Execution Loop (N-tiles × H-tiles) + // ======================================================================== + int last_tile = 1; + + for (int idx_n = 0; idx_n < config->n_tiles; idx_n++) { + int last_n_tile = (last_tile) && (idx_n == config->n_tiles - 1); + int current_n_size = (idx_n < config->n_tiles - 1) ? + config->n_tile_size : config->n_tile_size_last; + + // Update coefficient/bias for N-tile + XAI_TILE4D_SET_DIM4_COORD(&tile_coeff, config->n_tile_size * idx_n); + XAI_TILE4D_SET_DIM4(&tile_coeff, current_n_size); + + XAI_ARRAY_SET_DATA_PTR(&tile_bias, &p_bias[config->n_tile_size * 4 * idx_n]); + XAI_ARRAY_SET_WIDTH(&tile_bias, current_n_size); + XAI_ARRAY_SET_CAPACITY(&tile_bias, current_n_size); + + XAI_TILE3D_SET_DIM3_COORD(&tile_output, config->n_tile_size * idx_n); + XAI_TILE3D_SET_DIM3(&tile_output, current_n_size); + + // Process vertical tiles + for (int idx_h = 0; idx_h < config->height_tiles; idx_h++) { + int last_h_tile = (last_n_tile) && (idx_h == config->height_tiles - 1); + + // ================================================================ + // Prefetch Next Input Tile (Ping-Pong Buffering) + // ================================================================ + if (!last_h_tile) { + int temp_idx_h; + inc_iter_to_temp(&temp_idx_h, idx_h, config->height_tiles, 1); + _proto_FillBuffer_I8(p_input1, config->input_zero_point, config->input_buffer_size); + + // Transfer next input tile from system memory to DRAM + // Generalized formulas for variable output_rows + // Key insight: (EDGE+1) in original = stride_y * output_rows + int stride_output = config->stride_y * config->output_rows; + + // Calculate row count for this tile + int row_count; + if (temp_idx_h < 
(config->height_tiles - 1)) { + // Non-last tiles: min of (progress + buffer_size - edge, remaining_from_end, buffer_size) + int prog_rows = (stride_output * temp_idx_h) + (config->in_dim2_size - config->in_dim2_edge1); + int rem_rows = (config->src_dim2_size + config->in_dim1_edge2) - (stride_output * temp_idx_h); + row_count = min(prog_rows, min(rem_rows, config->in_dim2_size)); + } else { + // Last tile: transfer remaining rows from source (accounting for source starting offset) + int src_start_row = max(stride_output * temp_idx_h - config->in_dim2_edge1, 0); + row_count = config->src_dim2_size - src_start_row; + } + + dma_3dm(1, + (uint64_t *)&(src[max(((stride_output * temp_idx_h - config->in_dim2_edge1) * config->src_dim1_size),0)]), + (uint64_t *)&(p_input1[config->in_data_offset - min(stride_output * temp_idx_h * config->in_dim1_pitch, config->in_data_offset - config->in_dim2_edge1)]), + config->src_dim1_pitch, + config->in_dim1_pitch, + config->src_dim2_pitch, + config->in_dim2_pitch, + config->src_dim1_size, + row_count, + config->src_dim3_size); + } + + // ================================================================ + // Update Tile Descriptors for Current Iteration + // ================================================================ + XAI_TILE3D_SET_BUFF_PTR(&tile_input, p_input0); + XAI_TILE3D_SET_DATA_PTR(&tile_input, &p_input0[config->in_data_offset]); + // Input vertical coordinate: stride * tile_size * tile_index (matches convIdma.c) + XAI_TILE3D_SET_DIM2_COORD(&tile_input, (config->stride_y * config->out_dim2_size)*(idx_h)); + XAI_TILE3D_SET_BUFF_PTR(&tile_output, p_output1); + XAI_TILE3D_SET_DATA_PTR(&tile_output, &(p_output1[0])); + XAI_TILE3D_SET_DIM2_COORD(&tile_output, (config->out_dim2_size)*(idx_h)); + + // ================================================================ + // Perform Edge Extension and Convolution + // ================================================================ + xaiExtendEdgesConst3D_I8(&tile_input, 
config->input_zero_point, frame_size_input); + + XAI_ERR_TYPE status = xaiConvolved3D_S_7x7j2d1_S8S8IX_MOW_WHD( + &(tile_input), + &(tile_coeff), + &(tile_bias), + &(tile_output), + &(params)); + + if (status != XAI_ERR_OK) { + return status; + } + + // ================================================================ + // Prefetch next coefficient tile (if needed) + // ================================================================ + if ((!(last_h_tile)) && ((idx_h) == (config->height_tiles - 1))) { + int temp_idx_n; + int temp_idx_h; + inc_iter_to_temp(&(temp_idx_n), idx_n, config->n_tiles, inc_iter_to_temp(&(temp_idx_h), idx_h, config->height_tiles, 1)); + dma_1dm(0, (coeff_ptr + (config->coeff_buffer_size * temp_idx_n)), &(p_coeff[0]), (((temp_idx_n) < (config->n_tiles - 1))?(config->coeff_buffer_size):(config->coeff_dim1_size * config->coeff_dim2_size * config->coeff_dim3_size * config->n_tile_size_last))); + } + + // ================================================================ + // Write Output Tile to System Memory (matches convIdma.c formula) + // ================================================================ + // Fix: For the last height tile, only write the valid output rows + // to avoid spilling into the next channel's memory. + { + int current_output_rows = (idx_h < config->height_tiles - 1) + ? 
config->output_rows + : (config->dst_dim2_size - (config->output_rows * idx_h)); + int output_row_bytes = config->out_dim1_pitch * current_output_rows; + dma_2dm(0, + &(p_output1[0]), + &dst[((config->dst_dim2_pitch * config->n_tile_size)*(idx_n)) + ((config->out_dim1_pitch * config->output_rows)*(idx_h))], + config->out_dim2_pitch, + config->dst_dim2_pitch, + output_row_bytes, + current_n_size); + } + + // Swap ping-pong buffers for next iteration + swap_buffers(&(p_output0), &(p_output1)); + swap_buffers(&(p_input0), &(p_input1)); + } + } + + // Wait for final output DMA to complete before returning + idma_hw_wait_all(0); + return XAI_ERR_OK; +} + +// conv 7x7j2d1 executor with caching (no DMA) +// All data stays in system memory and is accessed through processor cache +XAI_ERR_TYPE conv_exec_7x7j2d1_cache( + int8_t* src, + int8_t* dst, + int8_t* coeff_ptr, + int8_t* bias_ptr, + const conv_layer_config_t* config) +{ + // ======================================================================== + // Setup source raw tile descriptor (points to raw input without padding) + // ======================================================================== + xai_tile3D src_raw; + XAI_TILE3D_SET_BUFF_PTR(&src_raw, src); + XAI_TILE3D_SET_BUFF_SIZE(&src_raw, config->src_dim2_pitch * config->src_dim3_size); + XAI_TILE3D_SET_DATA_PTR(&src_raw, src); + XAI_TILE3D_SET_DATA_ORDER(&src_raw, XAI_WHD); + XAI_TILE3D_SET_TYPE(&src_raw, XAI_TILE3D_S8); + XAI_TILE3D_SET_FRAME_PTR(&src_raw, 0); + XAI_TILE3D_SET_STATUS_FLAGS(&src_raw, 0); + XAI_TILE3D_SET_DIM1_PITCH(&src_raw, config->src_dim1_pitch); + XAI_TILE3D_SET_DIM2_PITCH(&src_raw, config->src_dim2_pitch); + XAI_TILE3D_SET_DIM1_COORD(&src_raw, 0); + XAI_TILE3D_SET_DIM1(&src_raw, config->src_dim1_size); + XAI_TILE3D_SET_DIM1_EDGE1(&src_raw, 0); + XAI_TILE3D_SET_DIM1_EDGE2(&src_raw, 0); + XAI_TILE3D_SET_DIM2_COORD(&src_raw, 0); + XAI_TILE3D_SET_DIM2(&src_raw, config->src_dim2_size); + XAI_TILE3D_SET_DIM2_EDGE1(&src_raw, 0); + 
XAI_TILE3D_SET_DIM2_EDGE2(&src_raw, 0); + XAI_TILE3D_SET_DIM3_COORD(&src_raw, 0); + XAI_TILE3D_SET_DIM3(&src_raw, config->src_dim3_size); + XAI_TILE3D_SET_DIM3_EDGE1(&src_raw, 0); + XAI_TILE3D_SET_DIM3_EDGE2(&src_raw, 0); + + // ======================================================================== + // Get padded input buffer from allocator (shared across cache kernels) + // ======================================================================== + int padded_dim1 = config->src_dim1_size + config->in_dim1_edge1 + config->in_dim1_edge2; + int dim1_pitch = (padded_dim1 + 2*XCHAL_IVPN_SIMD_WIDTH - 1) & ~(2*XCHAL_IVPN_SIMD_WIDTH - 1); + int padded_dim2 = config->src_dim2_size + config->in_dim2_edge1 + config->in_dim2_edge2; + int dim2_pitch = dim1_pitch * padded_dim2; + int input_buffer_size = dim2_pitch * config->src_dim3_size; + + // Get shared padded input buffer from allocator + int8_t* padded_input = get_cache_padded_input(); + + if (input_buffer_size > (int)get_cache_padded_input_size()) { + return XAI_ERR_DATASIZE; + } + + // Zero-fill the padded buffer + memset(padded_input, config->input_zero_point, input_buffer_size); + + // ======================================================================== + // Setup padded input tile descriptor + // ======================================================================== + int data_offset = (config->in_dim2_edge1 * dim1_pitch) + config->in_dim1_edge1; + + xai_tile3D tile_input; + XAI_TILE3D_SET_BUFF_PTR(&tile_input, padded_input); + XAI_TILE3D_SET_BUFF_SIZE(&tile_input, input_buffer_size); + XAI_TILE3D_SET_DATA_PTR(&tile_input, &padded_input[data_offset]); + XAI_TILE3D_SET_DATA_ORDER(&tile_input, XAI_WHD); + XAI_TILE3D_SET_TYPE(&tile_input, XAI_TILE3D_S8); + XAI_TILE3D_SET_FRAME_PTR(&tile_input, 0); + XAI_TILE3D_SET_STATUS_FLAGS(&tile_input, 0); + XAI_TILE3D_SET_DIM1_PITCH(&tile_input, dim1_pitch); + XAI_TILE3D_SET_DIM2_PITCH(&tile_input, dim2_pitch); + XAI_TILE3D_SET_DIM1_COORD(&tile_input, 0); + 
XAI_TILE3D_SET_DIM1(&tile_input, config->src_dim1_size); + XAI_TILE3D_SET_DIM1_EDGE1(&tile_input, config->in_dim1_edge1); + XAI_TILE3D_SET_DIM1_EDGE2(&tile_input, config->in_dim1_edge2); + XAI_TILE3D_SET_DIM2_COORD(&tile_input, 0); + XAI_TILE3D_SET_DIM2(&tile_input, config->src_dim2_size); + XAI_TILE3D_SET_DIM2_EDGE1(&tile_input, config->in_dim2_edge1); + XAI_TILE3D_SET_DIM2_EDGE2(&tile_input, config->in_dim2_edge2); + XAI_TILE3D_SET_DIM3_COORD(&tile_input, 0); + XAI_TILE3D_SET_DIM3(&tile_input, config->src_dim3_size); + XAI_TILE3D_SET_DIM3_EDGE1(&tile_input, 0); + XAI_TILE3D_SET_DIM3_EDGE2(&tile_input, 0); + + // ======================================================================== + // Copy raw input to padded buffer and extend edges + // ======================================================================== +#ifdef USE_DMA_FOR_CACHE_COPY + // Use DMA 3D transfer to copy input data into padded buffer at data_offset + dma_3dm(0, + /* src */ src, + /* dst */ &padded_input[data_offset], + /* src_row_pitch */ config->src_dim1_pitch, + /* dst_row_pitch */ dim1_pitch, + /* src_tile_pitch */ config->src_dim2_pitch, + /* dst_tile_pitch */ dim2_pitch, + /* row_sz */ config->src_dim1_size, + /* nrows */ config->src_dim2_size, + /* ntiles */ config->src_dim3_size); +#else + // Use library tile copy function (no DMA required) + // Safe manual copy: avoids SIMD overread near source buffer boundary + for (int d = 0; d < config->src_dim3_size; d++) { + for (int h = 0; h < config->src_dim2_size; h++) { + memcpy(&padded_input[data_offset + d * dim2_pitch + h * dim1_pitch], + &src[d * config->src_dim2_pitch + h * config->src_dim1_pitch], + config->src_dim1_size); + } + } + (void)src_raw; +#endif + + xai_size3D frame_size; + frame_size.dim1Size = config->dst_dim1_size * config->stride_x; + frame_size.dim2Size = config->dst_dim2_size * config->stride_y; + frame_size.dim3Size = config->src_dim3_size; + + xaiExtendEdgesConst3D_I8(&tile_input, config->input_zero_point, 
frame_size); + + // ======================================================================== + // Configure Coefficient Tile Descriptor (4D: W×H×C×N) + // ======================================================================== + xai_tile4D tile_coeff; + XAI_TILE4D_SET_BUFF_PTR(&tile_coeff, coeff_ptr); + XAI_TILE4D_SET_BUFF_SIZE(&tile_coeff, config->coeff_dim3_pitch * config->dst_dim3_size); + XAI_TILE4D_SET_DATA_PTR(&tile_coeff, coeff_ptr); + XAI_TILE4D_SET_DATA_ORDER(&tile_coeff, XAI_WHDN); + XAI_TILE4D_SET_TYPE(&tile_coeff, XAI_TILE4D_S8); + XAI_TILE4D_SET_FRAME_PTR(&tile_coeff, 0); + XAI_TILE4D_SET_STATUS_FLAGS(&tile_coeff, 0); + XAI_TILE4D_SET_DIM1_PITCH(&tile_coeff, config->coeff_dim1_pitch); + XAI_TILE4D_SET_DIM2_PITCH(&tile_coeff, config->coeff_dim2_pitch); + XAI_TILE4D_SET_DIM3_PITCH(&tile_coeff, config->coeff_dim3_pitch); + XAI_TILE4D_SET_DIM1_COORD(&tile_coeff, 0); + XAI_TILE4D_SET_DIM1(&tile_coeff, config->coeff_dim1_size); + XAI_TILE4D_SET_DIM1_EDGE1(&tile_coeff, 0); + XAI_TILE4D_SET_DIM1_EDGE2(&tile_coeff, 0); + XAI_TILE4D_SET_DIM2_COORD(&tile_coeff, 0); + XAI_TILE4D_SET_DIM2(&tile_coeff, config->coeff_dim2_size); + XAI_TILE4D_SET_DIM2_EDGE1(&tile_coeff, 0); + XAI_TILE4D_SET_DIM2_EDGE2(&tile_coeff, 0); + XAI_TILE4D_SET_DIM3_COORD(&tile_coeff, 0); + XAI_TILE4D_SET_DIM3(&tile_coeff, config->coeff_dim3_size); + XAI_TILE4D_SET_DIM3_EDGE1(&tile_coeff, 0); + XAI_TILE4D_SET_DIM3_EDGE2(&tile_coeff, 0); + XAI_TILE4D_SET_DIM4_COORD(&tile_coeff, 0); + XAI_TILE4D_SET_DIM4(&tile_coeff, config->dst_dim3_size); + + // ======================================================================== + // Configure Bias Array + // ======================================================================== + xai_array tile_bias; + XAI_ARRAY_SET_BUFF_PTR(&tile_bias, bias_ptr); + XAI_ARRAY_SET_BUFF_SIZE(&tile_bias, config->dst_dim3_size * 4); + XAI_ARRAY_SET_DATA_PTR(&tile_bias, bias_ptr); + XAI_ARRAY_SET_WIDTH(&tile_bias, config->dst_dim3_size); + XAI_ARRAY_SET_HEIGHT(&tile_bias, 
1); + XAI_ARRAY_SET_TYPE(&tile_bias, XAI_ARRAY_S32); + XAI_ARRAY_SET_CAPACITY(&tile_bias, config->dst_dim3_size); + + + // ======================================================================== + // Configure Output Tile Descriptor (points to system memory) + // ======================================================================== + xai_tile3D tile_output; + XAI_TILE3D_SET_BUFF_PTR(&tile_output, dst); + XAI_TILE3D_SET_BUFF_SIZE(&tile_output, config->dst_dim2_pitch * config->dst_dim3_size); + XAI_TILE3D_SET_DATA_PTR(&tile_output, dst); + XAI_TILE3D_SET_DATA_ORDER(&tile_output, XAI_WHD); + XAI_TILE3D_SET_TYPE(&tile_output, XAI_TILE3D_S8); + XAI_TILE3D_SET_FRAME_PTR(&tile_output, 0); + XAI_TILE3D_SET_STATUS_FLAGS(&tile_output, 0); + XAI_TILE3D_SET_DIM1_PITCH(&tile_output, config->dst_dim1_pitch); + XAI_TILE3D_SET_DIM2_PITCH(&tile_output, config->dst_dim2_pitch); + XAI_TILE3D_SET_DIM1_COORD(&tile_output, 0); + XAI_TILE3D_SET_DIM1(&tile_output, config->dst_dim1_size); + XAI_TILE3D_SET_DIM1_EDGE1(&tile_output, 0); + XAI_TILE3D_SET_DIM1_EDGE2(&tile_output, 0); + XAI_TILE3D_SET_DIM2_COORD(&tile_output, 0); + XAI_TILE3D_SET_DIM2(&tile_output, config->dst_dim2_size); + XAI_TILE3D_SET_DIM2_EDGE1(&tile_output, 0); + XAI_TILE3D_SET_DIM2_EDGE2(&tile_output, 0); + XAI_TILE3D_SET_DIM3_COORD(&tile_output, 0); + XAI_TILE3D_SET_DIM3(&tile_output, config->dst_dim3_size); + XAI_TILE3D_SET_DIM3_EDGE1(&tile_output, 0); + XAI_TILE3D_SET_DIM3_EDGE2(&tile_output, 0); + + // ======================================================================== + // Configure Convolution Parameters + // ======================================================================== + xai_cnn_conv_params params; + memset(¶ms, 0, sizeof(params)); + XAI_CNN_CONV_SET_ACCUM_SHIFT(¶ms, config->accum_shift); + XAI_CNN_CONV_SET_DILATION(¶ms, config->dilation); + XAI_CNN_CONV_SET_FLAGS(¶ms, config->flags); + XAI_CNN_CONV_SET_OUTPUT_SCALE(¶ms, config->output_scale); + XAI_CNN_CONV_SET_OUTPUT_SHIFT(¶ms, 
config->output_shift); + XAI_CNN_CONV_SET_RELU_MAX(¶ms, config->relu_max); + XAI_CNN_CONV_SET_RELU_MIN(¶ms, config->relu_min); + XAI_CNN_CONV_SET_STRIDEX(¶ms, config->stride_x); + XAI_CNN_CONV_SET_STRIDEY(¶ms, config->stride_y); + + // ======================================================================== + // Execute convolution using specific optimized kernel directly + // (bypasses xaiConvolved3D dispatcher for deterministic variant selection) + // ======================================================================== + XAI_ERR_TYPE status = xaiConvolved3D_S_7x7j2d1_S8S8IX_MOW_WHD( + &tile_input, &tile_coeff, &tile_bias, + &tile_output, ¶ms); + + // Writeback output from cache to system memory for DMA coherency + xthal_dcache_region_writeback(dst, config->dst_dim2_pitch * config->dst_dim3_size); + + return status; +} + + +XAI_ERR_TYPE conv_exec_7x7j2d1VQ_cache( + int8_t* src, + int8_t* dst, + int8_t* coeff_ptr, + int8_t* bias_ptr, + int8_t* outScale_ptr, + const conv_layer_config_t* config) +{ + // ======================================================================== + // Setup source raw tile descriptor (points to raw input without padding) + // ======================================================================== + xai_tile3D src_raw; + XAI_TILE3D_SET_BUFF_PTR(&src_raw, src); + XAI_TILE3D_SET_BUFF_SIZE(&src_raw, config->src_dim2_pitch * config->src_dim3_size); + XAI_TILE3D_SET_DATA_PTR(&src_raw, src); + XAI_TILE3D_SET_DATA_ORDER(&src_raw, XAI_WHD); + XAI_TILE3D_SET_TYPE(&src_raw, XAI_TILE3D_S8); + XAI_TILE3D_SET_FRAME_PTR(&src_raw, 0); + XAI_TILE3D_SET_STATUS_FLAGS(&src_raw, 0); + XAI_TILE3D_SET_DIM1_PITCH(&src_raw, config->src_dim1_pitch); + XAI_TILE3D_SET_DIM2_PITCH(&src_raw, config->src_dim2_pitch); + XAI_TILE3D_SET_DIM1_COORD(&src_raw, 0); + XAI_TILE3D_SET_DIM1(&src_raw, config->src_dim1_size); + XAI_TILE3D_SET_DIM1_EDGE1(&src_raw, 0); + XAI_TILE3D_SET_DIM1_EDGE2(&src_raw, 0); + XAI_TILE3D_SET_DIM2_COORD(&src_raw, 0); + 
XAI_TILE3D_SET_DIM2(&src_raw, config->src_dim2_size); + XAI_TILE3D_SET_DIM2_EDGE1(&src_raw, 0); + XAI_TILE3D_SET_DIM2_EDGE2(&src_raw, 0); + XAI_TILE3D_SET_DIM3_COORD(&src_raw, 0); + XAI_TILE3D_SET_DIM3(&src_raw, config->src_dim3_size); + XAI_TILE3D_SET_DIM3_EDGE1(&src_raw, 0); + XAI_TILE3D_SET_DIM3_EDGE2(&src_raw, 0); + + // ======================================================================== + // Get padded input buffer from allocator (shared across cache kernels) + // ======================================================================== + int padded_dim1 = config->src_dim1_size + config->in_dim1_edge1 + config->in_dim1_edge2; + int dim1_pitch = (padded_dim1 + 2*XCHAL_IVPN_SIMD_WIDTH - 1) & ~(2*XCHAL_IVPN_SIMD_WIDTH - 1); + int padded_dim2 = config->src_dim2_size + config->in_dim2_edge1 + config->in_dim2_edge2; + int dim2_pitch = dim1_pitch * padded_dim2; + int input_buffer_size = dim2_pitch * config->src_dim3_size; + + // Get shared padded input buffer from allocator + int8_t* padded_input = get_cache_padded_input(); + + if (input_buffer_size > (int)get_cache_padded_input_size()) { + return XAI_ERR_DATASIZE; + } + + // Zero-fill the padded buffer + memset(padded_input, config->input_zero_point, input_buffer_size); + + // ======================================================================== + // Setup padded input tile descriptor + // ======================================================================== + int data_offset = (config->in_dim2_edge1 * dim1_pitch) + config->in_dim1_edge1; + + xai_tile3D tile_input; + XAI_TILE3D_SET_BUFF_PTR(&tile_input, padded_input); + XAI_TILE3D_SET_BUFF_SIZE(&tile_input, input_buffer_size); + XAI_TILE3D_SET_DATA_PTR(&tile_input, &padded_input[data_offset]); + XAI_TILE3D_SET_DATA_ORDER(&tile_input, XAI_WHD); + XAI_TILE3D_SET_TYPE(&tile_input, XAI_TILE3D_S8); + XAI_TILE3D_SET_FRAME_PTR(&tile_input, 0); + XAI_TILE3D_SET_STATUS_FLAGS(&tile_input, 0); + XAI_TILE3D_SET_DIM1_PITCH(&tile_input, dim1_pitch); + 
XAI_TILE3D_SET_DIM2_PITCH(&tile_input, dim2_pitch); + XAI_TILE3D_SET_DIM1_COORD(&tile_input, 0); + XAI_TILE3D_SET_DIM1(&tile_input, config->src_dim1_size); + XAI_TILE3D_SET_DIM1_EDGE1(&tile_input, config->in_dim1_edge1); + XAI_TILE3D_SET_DIM1_EDGE2(&tile_input, config->in_dim1_edge2); + XAI_TILE3D_SET_DIM2_COORD(&tile_input, 0); + XAI_TILE3D_SET_DIM2(&tile_input, config->src_dim2_size); + XAI_TILE3D_SET_DIM2_EDGE1(&tile_input, config->in_dim2_edge1); + XAI_TILE3D_SET_DIM2_EDGE2(&tile_input, config->in_dim2_edge2); + XAI_TILE3D_SET_DIM3_COORD(&tile_input, 0); + XAI_TILE3D_SET_DIM3(&tile_input, config->src_dim3_size); + XAI_TILE3D_SET_DIM3_EDGE1(&tile_input, 0); + XAI_TILE3D_SET_DIM3_EDGE2(&tile_input, 0); + + // ======================================================================== + // Copy raw input to padded buffer and extend edges + // ======================================================================== +#ifdef USE_DMA_FOR_CACHE_COPY + // Use DMA 3D transfer to copy input data into padded buffer at data_offset + dma_3dm(0, + /* src */ src, + /* dst */ &padded_input[data_offset], + /* src_row_pitch */ config->src_dim1_pitch, + /* dst_row_pitch */ dim1_pitch, + /* src_tile_pitch */ config->src_dim2_pitch, + /* dst_tile_pitch */ dim2_pitch, + /* row_sz */ config->src_dim1_size, + /* nrows */ config->src_dim2_size, + /* ntiles */ config->src_dim3_size); +#else + // Use library tile copy function (no DMA required) + // Safe manual copy: avoids SIMD overread near source buffer boundary + for (int d = 0; d < config->src_dim3_size; d++) { + for (int h = 0; h < config->src_dim2_size; h++) { + memcpy(&padded_input[data_offset + d * dim2_pitch + h * dim1_pitch], + &src[d * config->src_dim2_pitch + h * config->src_dim1_pitch], + config->src_dim1_size); + } + } + (void)src_raw; +#endif + + xai_size3D frame_size; + frame_size.dim1Size = config->dst_dim1_size * config->stride_x; + frame_size.dim2Size = config->dst_dim2_size * config->stride_y; + frame_size.dim3Size = 
config->src_dim3_size; + + xaiExtendEdgesConst3D_I8(&tile_input, config->input_zero_point, frame_size); + + // ======================================================================== + // Configure Coefficient Tile Descriptor (4D: W×H×C×N) + // ======================================================================== + xai_tile4D tile_coeff; + XAI_TILE4D_SET_BUFF_PTR(&tile_coeff, coeff_ptr); + XAI_TILE4D_SET_BUFF_SIZE(&tile_coeff, config->coeff_dim3_pitch * config->dst_dim3_size); + XAI_TILE4D_SET_DATA_PTR(&tile_coeff, coeff_ptr); + XAI_TILE4D_SET_DATA_ORDER(&tile_coeff, XAI_WHDN); + XAI_TILE4D_SET_TYPE(&tile_coeff, XAI_TILE4D_S8); + XAI_TILE4D_SET_FRAME_PTR(&tile_coeff, 0); + XAI_TILE4D_SET_STATUS_FLAGS(&tile_coeff, 0); + XAI_TILE4D_SET_DIM1_PITCH(&tile_coeff, config->coeff_dim1_pitch); + XAI_TILE4D_SET_DIM2_PITCH(&tile_coeff, config->coeff_dim2_pitch); + XAI_TILE4D_SET_DIM3_PITCH(&tile_coeff, config->coeff_dim3_pitch); + XAI_TILE4D_SET_DIM1_COORD(&tile_coeff, 0); + XAI_TILE4D_SET_DIM1(&tile_coeff, config->coeff_dim1_size); + XAI_TILE4D_SET_DIM1_EDGE1(&tile_coeff, 0); + XAI_TILE4D_SET_DIM1_EDGE2(&tile_coeff, 0); + XAI_TILE4D_SET_DIM2_COORD(&tile_coeff, 0); + XAI_TILE4D_SET_DIM2(&tile_coeff, config->coeff_dim2_size); + XAI_TILE4D_SET_DIM2_EDGE1(&tile_coeff, 0); + XAI_TILE4D_SET_DIM2_EDGE2(&tile_coeff, 0); + XAI_TILE4D_SET_DIM3_COORD(&tile_coeff, 0); + XAI_TILE4D_SET_DIM3(&tile_coeff, config->coeff_dim3_size); + XAI_TILE4D_SET_DIM3_EDGE1(&tile_coeff, 0); + XAI_TILE4D_SET_DIM3_EDGE2(&tile_coeff, 0); + XAI_TILE4D_SET_DIM4_COORD(&tile_coeff, 0); + XAI_TILE4D_SET_DIM4(&tile_coeff, config->dst_dim3_size); + + // ======================================================================== + // Configure Bias Array + // ======================================================================== + xai_array tile_bias; + XAI_ARRAY_SET_BUFF_PTR(&tile_bias, bias_ptr); + XAI_ARRAY_SET_BUFF_SIZE(&tile_bias, config->dst_dim3_size * 4); + XAI_ARRAY_SET_DATA_PTR(&tile_bias, bias_ptr); + 
XAI_ARRAY_SET_WIDTH(&tile_bias, config->dst_dim3_size); + XAI_ARRAY_SET_HEIGHT(&tile_bias, 1); + XAI_ARRAY_SET_TYPE(&tile_bias, XAI_ARRAY_S32); + XAI_ARRAY_SET_CAPACITY(&tile_bias, config->dst_dim3_size); + + // ======================================================================== + // Configure Output Scale Array + // ======================================================================== + xai_array tile_outscale; + XAI_ARRAY_SET_BUFF_PTR(&tile_outscale, outScale_ptr); + XAI_ARRAY_SET_BUFF_SIZE(&tile_outscale, config->dst_dim3_size * 2); + XAI_ARRAY_SET_DATA_PTR(&tile_outscale, outScale_ptr); + XAI_ARRAY_SET_WIDTH(&tile_outscale, config->dst_dim3_size); + XAI_ARRAY_SET_HEIGHT(&tile_outscale, 1); + XAI_ARRAY_SET_TYPE(&tile_outscale, XAI_ARRAY_U16); + XAI_ARRAY_SET_CAPACITY(&tile_outscale, config->dst_dim3_size); + + // ======================================================================== + // Configure Output Tile Descriptor (points to system memory) + // ======================================================================== + xai_tile3D tile_output; + XAI_TILE3D_SET_BUFF_PTR(&tile_output, dst); + XAI_TILE3D_SET_BUFF_SIZE(&tile_output, config->dst_dim2_pitch * config->dst_dim3_size); + XAI_TILE3D_SET_DATA_PTR(&tile_output, dst); + XAI_TILE3D_SET_DATA_ORDER(&tile_output, XAI_WHD); + XAI_TILE3D_SET_TYPE(&tile_output, XAI_TILE3D_S8); + XAI_TILE3D_SET_FRAME_PTR(&tile_output, 0); + XAI_TILE3D_SET_STATUS_FLAGS(&tile_output, 0); + XAI_TILE3D_SET_DIM1_PITCH(&tile_output, config->dst_dim1_pitch); + XAI_TILE3D_SET_DIM2_PITCH(&tile_output, config->dst_dim2_pitch); + XAI_TILE3D_SET_DIM1_COORD(&tile_output, 0); + XAI_TILE3D_SET_DIM1(&tile_output, config->dst_dim1_size); + XAI_TILE3D_SET_DIM1_EDGE1(&tile_output, 0); + XAI_TILE3D_SET_DIM1_EDGE2(&tile_output, 0); + XAI_TILE3D_SET_DIM2_COORD(&tile_output, 0); + XAI_TILE3D_SET_DIM2(&tile_output, config->dst_dim2_size); + XAI_TILE3D_SET_DIM2_EDGE1(&tile_output, 0); + XAI_TILE3D_SET_DIM2_EDGE2(&tile_output, 0); + 
XAI_TILE3D_SET_DIM3_COORD(&tile_output, 0); + XAI_TILE3D_SET_DIM3(&tile_output, config->dst_dim3_size); + XAI_TILE3D_SET_DIM3_EDGE1(&tile_output, 0); + XAI_TILE3D_SET_DIM3_EDGE2(&tile_output, 0); + + // ======================================================================== + // Configure Convolution Parameters + // ======================================================================== + xai_cnn_conv_params params; + memset(¶ms, 0, sizeof(params)); + XAI_CNN_CONV_SET_ACCUM_SHIFT(¶ms, config->accum_shift); + XAI_CNN_CONV_SET_DILATION(¶ms, config->dilation); + XAI_CNN_CONV_SET_FLAGS(¶ms, config->flags); + XAI_CNN_CONV_SET_OUTPUT_SCALE(¶ms, config->output_scale); + XAI_CNN_CONV_SET_OUTPUT_SHIFT(¶ms, config->output_shift); + XAI_CNN_CONV_SET_RELU_MAX(¶ms, config->relu_max); + XAI_CNN_CONV_SET_RELU_MIN(¶ms, config->relu_min); + XAI_CNN_CONV_SET_STRIDEX(¶ms, config->stride_x); + XAI_CNN_CONV_SET_STRIDEY(¶ms, config->stride_y); + + // ======================================================================== + // Execute convolution using generic system-memory API + // This version accesses data through the processor cache + // ======================================================================== + XAI_ERR_TYPE status = xaiConvolvedVQ3D(&tile_input, &tile_coeff, &tile_bias, + &tile_outscale, &tile_output, ¶ms); + + // Writeback output from cache to system memory for DMA coherency + xthal_dcache_region_writeback(dst, config->dst_dim2_pitch * config->dst_dim3_size); + + return status; +} diff --git a/backends/cadence/vision/operators/conv/conv_kernel_dispatcher.c b/backends/cadence/vision/operators/conv/conv_kernel_dispatcher.c new file mode 100644 index 00000000000..b9c5c326326 --- /dev/null +++ b/backends/cadence/vision/operators/conv/conv_kernel_dispatcher.c @@ -0,0 +1,50 @@ +/* + * conv_kernel_dispatcher.c + * + * Created on: Dec 8, 2025 + * Author: Suraj Raut + * + * Description: + * Dispatcher that routes convolution execution to kernel-specific executors. 
+ * Each kernel type has its own source file with exact DMA formulas from convIdma.c. + */ + +#include "kernel_executors.h" +#include + +/** + * Dispatch to appropriate kernel executor based on config->kernel_name + */ +XAI_ERR_TYPE conv_execute_kernel( + int8_t* src, + int8_t* dst, + int8_t* coeff_ptr, + int8_t* bias_ptr, + const conv_layer_config_t* config) +{ + // Dispatch to kernel-specific executor + if (strcmp(config->kernel_name, "7x7j2d1_dma") == 0) { + return conv_exec_7x7j2d1(src, dst, coeff_ptr, bias_ptr, config); + } else if (strcmp(config->kernel_name, "7x7j2d1_no_dma") == 0) { + return conv_exec_7x7j2d1_cache(src, dst, coeff_ptr, bias_ptr, config); + } else if (strcmp(config->kernel_name, "3x3j1d1_dma") == 0) { + return conv_exec_3x3j1d1(src, dst, coeff_ptr, bias_ptr, config); + } else if (strcmp(config->kernel_name, "3x3j1d1_no_dma") == 0) { + return conv_exec_3x3j1d1_cache(src, dst, coeff_ptr, bias_ptr, config); + } else if (strcmp(config->kernel_name, "3x3j2d1_dma") == 0) { + return conv_exec_3x3j2d1(src, dst, coeff_ptr, bias_ptr, config); + } else if (strcmp(config->kernel_name, "3x3j2d1_no_dma") == 0) { + return conv_exec_3x3j2d1_cache(src, dst, coeff_ptr, bias_ptr, config); + } else if (strcmp(config->kernel_name, "1x1j2d1_dma") == 0) { + return conv_exec_1x1j2d1(src, dst, coeff_ptr, bias_ptr, config); + } else if (strcmp(config->kernel_name, "1x1j2d1_no_dma") == 0) { + return conv_exec_1x1j2d1_cache(src, dst, coeff_ptr, bias_ptr, config); + } else if (strcmp(config->kernel_name, "1x1j1d1_dma") == 0) { + return conv_exec_1x1j1d1(src, dst, coeff_ptr, bias_ptr, config); + } else if (strcmp(config->kernel_name, "1x1j1d1_no_dma") == 0) { + return conv_exec_1x1j1d1_cache(src, dst, coeff_ptr, bias_ptr, config); + } else { + return XAI_ERR_BADARG; + } +} + diff --git a/backends/cadence/vision/operators/conv/kernel_executors.h b/backends/cadence/vision/operators/conv/kernel_executors.h new file mode 100644 index 00000000000..5cd9c27d818 --- /dev/null 
+++ b/backends/cadence/vision/operators/conv/kernel_executors.h @@ -0,0 +1,137 @@ +/* + * kernel_executors.h + * + * Created on: Dec 8, 2025 + * Author: Suraj Raut + * + * Description: + * Header file declaring kernel-specific executor functions. + * Each kernel (7x7j2d1, 3x3j1d1, 3x3j2d1, 1x1j2d1, 1x1j1d1) has its own + * executor with exact DMA formulas matching convIdma.c reference. + * + * Non-VQ versions use per-tensor quantization (no outScale_ptr parameter). + */ + +#ifndef KERNEL_EXECUTORS_H_ +#define KERNEL_EXECUTORS_H_ + +#include "../layer_configs.h" + +/* + * XAI error type: Use actual library type if available, otherwise define locally. + * The actual xai_cnn_api.h should be included by implementation files. + */ +#ifndef XAI_ERR_TYPE +typedef int XAI_ERR_TYPE; +#define XAI_ERR_OK 0 +#define XAI_ERR_BADARG 4 +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * Execute 7x7 stride-2 convolution with DMA (per-tensor output scaling) + */ +XAI_ERR_TYPE conv_exec_7x7j2d1( + int8_t* src, + int8_t* dst, + int8_t* coeff_ptr, + int8_t* bias_ptr, + const conv_layer_config_t* config); + +/** + * Execute 3x3 stride-1 convolution (standard ResNet 3x3 layers) + */ +XAI_ERR_TYPE conv_exec_3x3j1d1( + int8_t* src, + int8_t* dst, + int8_t* coeff_ptr, + int8_t* bias_ptr, + const conv_layer_config_t* config); + +/** + * Execute 3x3 stride-2 convolution (downsampling layers) + */ +XAI_ERR_TYPE conv_exec_3x3j2d1( + int8_t* src, + int8_t* dst, + int8_t* coeff_ptr, + int8_t* bias_ptr, + const conv_layer_config_t* config); + +/** + * Execute 1x1 stride-2 convolution (projection layers for downsampling) + */ +XAI_ERR_TYPE conv_exec_1x1j2d1( + int8_t* src, + int8_t* dst, + int8_t* coeff_ptr, + int8_t* bias_ptr, + const conv_layer_config_t* config); + +/** + * Execute 1x1 stride-1 convolution (bottleneck layers) + */ +XAI_ERR_TYPE conv_exec_1x1j1d1( + int8_t* src, + int8_t* dst, + int8_t* coeff_ptr, + int8_t* bias_ptr, + const conv_layer_config_t* config); + 
+/*============================================================================ + * Cache-based executors (no DMA, uses processor cache) + *============================================================================*/ + +XAI_ERR_TYPE conv_exec_7x7j2d1_cache( + int8_t* src, + int8_t* dst, + int8_t* coeff_ptr, + int8_t* bias_ptr, + const conv_layer_config_t* config); + +XAI_ERR_TYPE conv_exec_3x3j1d1_cache( + int8_t* src, + int8_t* dst, + int8_t* coeff_ptr, + int8_t* bias_ptr, + const conv_layer_config_t* config); + +XAI_ERR_TYPE conv_exec_3x3j2d1_cache( + int8_t* src, + int8_t* dst, + int8_t* coeff_ptr, + int8_t* bias_ptr, + const conv_layer_config_t* config); + +XAI_ERR_TYPE conv_exec_1x1j2d1_cache( + int8_t* src, + int8_t* dst, + int8_t* coeff_ptr, + int8_t* bias_ptr, + const conv_layer_config_t* config); + +XAI_ERR_TYPE conv_exec_1x1j1d1_cache( + int8_t* src, + int8_t* dst, + int8_t* coeff_ptr, + int8_t* bias_ptr, + const conv_layer_config_t* config); + +/** + * Dispatch to appropriate kernel executor based on config->kernel_name + */ +XAI_ERR_TYPE conv_execute_kernel( + int8_t* src, + int8_t* dst, + int8_t* coeff_ptr, + int8_t* bias_ptr, + const conv_layer_config_t* config); + +#ifdef __cplusplus +} +#endif + +#endif /* KERNEL_EXECUTORS_H_ */ diff --git a/backends/cadence/vision/operators/layer_configs.h b/backends/cadence/vision/operators/layer_configs.h new file mode 100644 index 00000000000..3f47d4533eb --- /dev/null +++ b/backends/cadence/vision/operators/layer_configs.h @@ -0,0 +1,2403 @@ +/* + * layer_configs.h + * + * Auto-generated conv2d + maxpool layer configurations + * Generated from PTE extraction by generate_combined_configs.py + * + * DO NOT EDIT MANUALLY + */ + +#ifndef LAYER_CONFIGS_H +#define LAYER_CONFIGS_H + +#include +#include /* for NULL */ + +#define IDMA_BUFFER_SIZE_DRAM0 (62976) /* 61 KB */ +#define IDMA_BUFFER_SIZE_DRAM1 (62976) /* 61 KB */ + +/* ====================================================================== */ +/* Conv2d 
/*
 * Per-layer convolution description consumed by the conv executors.
 * One entry per layer lives in the generated CONV_LAYER_CONFIGS table.
 * Field order is part of the ABI of that table - do not reorder.
 */
typedef struct {
    /* ---- identification ------------------------------------------------ */
    int layer_id;                 /* index of the layer in the table */
    const char* layer_name;       /* human-readable name, e.g. "conv_7x7_s2_ic3_oc64" */
    const char* kernel_name;      /* executor selector, e.g. "7x7j2d1_dma" / "..._no_dma" */
    const char* config_key;       /* generator lookup key (underscore-joined layer parameters) */

    /* ---- full input tensor in system memory (dim1, dim2, dim3) --------- */
    int src_dim1_size;
    int src_dim2_size;
    int src_dim3_size;
    int src_dim1_pitch;           /* bytes between consecutive dim2 rows */
    int src_dim2_pitch;           /* bytes between consecutive dim3 planes */

    /* ---- full output tensor in system memory --------------------------- */
    int dst_dim1_size;
    int dst_dim2_size;
    int dst_dim3_size;            /* number of output channels */
    int dst_dim1_pitch;
    int dst_dim2_pitch;

    /* ---- input tile (local buffer) geometry ----------------------------- */
    int in_dim1_size;
    int in_dim1_pitch;
    int in_dim2_size;
    int in_dim2_pitch;
    int in_dim1_edge1;            /* padding before the data along dim1 */
    int in_dim1_edge2;            /* padding after the data along dim1 */
    int in_dim2_edge1;            /* padding before the data along dim2 */
    int in_dim2_edge2;            /* padding after the data along dim2 */
    int in_dim3_edge1;
    int in_dim3_edge2;
    int in_data_offset;           /* offset of first valid sample inside the tile buffer */
    int in_rows_firstdma;         /* presumably rows fetched by the first input DMA - confirm */

    /* ---- output tile (local buffer) geometry ---------------------------- */
    int out_dim1_size;
    int out_dim1_pitch;
    int out_dim2_size;
    int out_dim2_pitch;
    int out_dim3_size;

    /* ---- coefficient tensor (dim1 x dim2 x dim3 x dim4) ----------------- */
    int coeff_dim1_size;
    int coeff_dim2_size;
    int coeff_dim3_size;
    int coeff_dim4_size;
    int coeff_dim1_pitch;
    int coeff_dim2_pitch;
    int coeff_dim3_pitch;

    /* ---- bias / per-channel output-scale arrays ------------------------- */
    int bias_dim1_size;
    int bias_dim2_size;
    int outscale_dim1_size;
    int outscale_dim2_size;

    /* ---- local buffer sizes, in bytes ----------------------------------- */
    int input_buffer_size;
    int coeff_buffer_size;
    int output_buffer_size;
    int bias_buffer_size;
    int outscale_buffer_size;

    /* ---- buffer placement (values 0 / 1 in the generated table;
     *      presumably the local DRAM bank index - confirm) ---------------- */
    int input_ping_dram;
    int input_pong_dram;
    int coeff_dram;
    int output_ping_dram;
    int output_pong_dram;
    int bias_dram;
    int outscale_dram;

    /* ---- tiling ---------------------------------------------------------- */
    int n_tile_size;              /* output channels per channel tile */
    int n_tiles;                  /* number of channel tiles */
    int n_tile_size_last;         /* output channels in the final channel tile */
    int height_tiles;             /* number of tiles along the output height */
    int output_rows;              /* output rows produced per height tile */
    int input_rows;               /* input rows per height tile */

    /* ---- convolution / quantization parameters --------------------------- */
    int kernel_w;
    int kernel_h;
    int stride_x;
    int stride_y;
    int padding;
    int dilation;
    int accum_shift;
    int relu_max;
    int relu_min;
    int output_shift;
    int output_scale;
    int flags;
    int input_zero_point;         /* value used to fill input padding */
} conv_layer_config_t;
32, + .dst_dim2_size = 32, + .dst_dim3_size = 64, + .dst_dim1_pitch = 32, + .dst_dim2_pitch = 1024, + .in_dim1_size = 64, + .in_dim1_pitch = 70, + .in_dim2_size = 35, + .in_dim2_pitch = 2450, + .in_dim1_edge1 = 3, + .in_dim1_edge2 = 3, + .in_dim2_edge1 = 3, + .in_dim2_edge2 = 3, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 213, + .in_rows_firstdma = 32, + .out_dim1_size = 32, + .out_dim1_pitch = 32, + .out_dim2_size = 15, + .out_dim2_pitch = 480, + .out_dim3_size = 64, + .coeff_dim1_size = 7, + .coeff_dim2_size = 7, + .coeff_dim3_size = 3, + .coeff_dim4_size = 64, + .coeff_dim1_pitch = 7, + .coeff_dim2_pitch = 49, + .coeff_dim3_pitch = 147, + .bias_dim1_size = 64, + .bias_dim2_size = 1, + .outscale_dim1_size = 64, + .outscale_dim2_size = 1, + .input_buffer_size = 7350, + .coeff_buffer_size = 9408, + .output_buffer_size = 30720, + .bias_buffer_size = 256, + .outscale_buffer_size = 128, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 64, + .n_tiles = 1, + .n_tile_size_last = 64, + .height_tiles = 3, + .output_rows = 15, + .input_rows = 35, + .kernel_w = 7, + .kernel_h = 7, + .stride_x = 2, + .stride_y = 2, + .padding = 3, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 1, + .layer_name = "conv_3x3_s1_ic64_oc64", + .kernel_name = "3x3j1d1_dma", + .config_key = "64_16_16_64_3_3_16_16_1_1_1_1", + .src_dim1_size = 16, + .src_dim2_size = 16, + .src_dim3_size = 64, + .src_dim1_pitch = 16, + .src_dim2_pitch = 256, + .dst_dim1_size = 16, + .dst_dim2_size = 16, + .dst_dim3_size = 64, + .dst_dim1_pitch = 16, + .dst_dim2_pitch = 256, + .in_dim1_size = 16, + .in_dim1_pitch = 18, + .in_dim2_size = 18, + .in_dim2_pitch = 324, + .in_dim1_edge1 = 1, + .in_dim1_edge2 = 1, + .in_dim2_edge1 = 1, + .in_dim2_edge2 = 
1, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 19, + .in_rows_firstdma = 17, + .out_dim1_size = 16, + .out_dim1_pitch = 16, + .out_dim2_size = 16, + .out_dim2_pitch = 256, + .out_dim3_size = 64, + .coeff_dim1_size = 3, + .coeff_dim2_size = 3, + .coeff_dim3_size = 64, + .coeff_dim4_size = 64, + .coeff_dim1_pitch = 3, + .coeff_dim2_pitch = 9, + .coeff_dim3_pitch = 576, + .bias_dim1_size = 64, + .bias_dim2_size = 1, + .outscale_dim1_size = 64, + .outscale_dim2_size = 1, + .input_buffer_size = 20736, + .coeff_buffer_size = 36864, + .output_buffer_size = 16384, + .bias_buffer_size = 256, + .outscale_buffer_size = 128, + .input_ping_dram = 0, + .input_pong_dram = 1, + .coeff_dram = 0, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 64, + .n_tiles = 1, + .n_tile_size_last = 64, + .height_tiles = 1, + .output_rows = 16, + .input_rows = 18, + .kernel_w = 3, + .kernel_h = 3, + .stride_x = 1, + .stride_y = 1, + .padding = 1, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 2, + .layer_name = "conv_1x1_s2_ic64_oc128", + .kernel_name = "1x1j2d1_dma", + .config_key = "64_16_16_128_1_1_8_8_2_2_0_1", + .src_dim1_size = 16, + .src_dim2_size = 16, + .src_dim3_size = 64, + .src_dim1_pitch = 16, + .src_dim2_pitch = 256, + .dst_dim1_size = 8, + .dst_dim2_size = 8, + .dst_dim3_size = 128, + .dst_dim1_pitch = 8, + .dst_dim2_pitch = 64, + .in_dim1_size = 16, + .in_dim1_pitch = 16, + .in_dim2_size = 15, + .in_dim2_pitch = 240, + .in_dim1_edge1 = 0, + .in_dim1_edge2 = 0, + .in_dim2_edge1 = 0, + .in_dim2_edge2 = 0, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 15, + .out_dim1_size = 8, + .out_dim1_pitch = 8, + .out_dim2_size = 8, + .out_dim2_pitch = 64, + .out_dim3_size = 128, + .coeff_dim1_size = 1, + .coeff_dim2_size = 1, + .coeff_dim3_size = 
64, + .coeff_dim4_size = 128, + .coeff_dim1_pitch = 1, + .coeff_dim2_pitch = 1, + .coeff_dim3_pitch = 64, + .bias_dim1_size = 128, + .bias_dim2_size = 1, + .outscale_dim1_size = 128, + .outscale_dim2_size = 1, + .input_buffer_size = 15360, + .coeff_buffer_size = 8192, + .output_buffer_size = 8192, + .bias_buffer_size = 512, + .outscale_buffer_size = 256, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 128, + .n_tiles = 1, + .n_tile_size_last = 128, + .height_tiles = 1, + .output_rows = 8, + .input_rows = 15, + .kernel_w = 1, + .kernel_h = 1, + .stride_x = 2, + .stride_y = 2, + .padding = 0, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 3, + .layer_name = "conv_3x3_s2_ic64_oc128", + .kernel_name = "3x3j2d1_dma", + .config_key = "64_16_16_128_3_3_8_8_2_2_1_1", + .src_dim1_size = 16, + .src_dim2_size = 16, + .src_dim3_size = 64, + .src_dim1_pitch = 16, + .src_dim2_pitch = 256, + .dst_dim1_size = 8, + .dst_dim2_size = 8, + .dst_dim3_size = 128, + .dst_dim1_pitch = 8, + .dst_dim2_pitch = 64, + .in_dim1_size = 16, + .in_dim1_pitch = 18, + .in_dim2_size = 5, + .in_dim2_pitch = 90, + .in_dim1_edge1 = 1, + .in_dim1_edge2 = 1, + .in_dim2_edge1 = 1, + .in_dim2_edge2 = 1, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 19, + .in_rows_firstdma = 4, + .out_dim1_size = 8, + .out_dim1_pitch = 8, + .out_dim2_size = 2, + .out_dim2_pitch = 16, + .out_dim3_size = 64, + .coeff_dim1_size = 3, + .coeff_dim2_size = 3, + .coeff_dim3_size = 64, + .coeff_dim4_size = 128, + .coeff_dim1_pitch = 3, + .coeff_dim2_pitch = 9, + .coeff_dim3_pitch = 576, + .bias_dim1_size = 128, + .bias_dim2_size = 1, + .outscale_dim1_size = 128, + .outscale_dim2_size = 1, + .input_buffer_size = 5760, + .coeff_buffer_size = 36864, + 
.output_buffer_size = 1024, + .bias_buffer_size = 512, + .outscale_buffer_size = 256, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 64, + .n_tiles = 2, + .n_tile_size_last = 64, + .height_tiles = 4, + .output_rows = 2, + .input_rows = 5, + .kernel_w = 3, + .kernel_h = 3, + .stride_x = 2, + .stride_y = 2, + .padding = 1, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 4, + .layer_name = "conv_3x3_s1_ic128_oc128", + .kernel_name = "3x3j1d1_dma", + .config_key = "128_8_8_128_3_3_8_8_1_1_1_1", + .src_dim1_size = 8, + .src_dim2_size = 8, + .src_dim3_size = 128, + .src_dim1_pitch = 8, + .src_dim2_pitch = 64, + .dst_dim1_size = 8, + .dst_dim2_size = 8, + .dst_dim3_size = 128, + .dst_dim1_pitch = 8, + .dst_dim2_pitch = 64, + .in_dim1_size = 8, + .in_dim1_pitch = 10, + .in_dim2_size = 4, + .in_dim2_pitch = 40, + .in_dim1_edge1 = 1, + .in_dim1_edge2 = 1, + .in_dim2_edge1 = 1, + .in_dim2_edge2 = 1, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 11, + .in_rows_firstdma = 3, + .out_dim1_size = 8, + .out_dim1_pitch = 8, + .out_dim2_size = 2, + .out_dim2_pitch = 16, + .out_dim3_size = 32, + .coeff_dim1_size = 3, + .coeff_dim2_size = 3, + .coeff_dim3_size = 128, + .coeff_dim4_size = 128, + .coeff_dim1_pitch = 3, + .coeff_dim2_pitch = 9, + .coeff_dim3_pitch = 1152, + .bias_dim1_size = 128, + .bias_dim2_size = 1, + .outscale_dim1_size = 128, + .outscale_dim2_size = 1, + .input_buffer_size = 5120, + .coeff_buffer_size = 36864, + .output_buffer_size = 512, + .bias_buffer_size = 512, + .outscale_buffer_size = 256, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 32, + .n_tiles = 4, + 
.n_tile_size_last = 32, + .height_tiles = 4, + .output_rows = 2, + .input_rows = 4, + .kernel_w = 3, + .kernel_h = 3, + .stride_x = 1, + .stride_y = 1, + .padding = 1, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 5, + .layer_name = "conv_1x1_s2_ic128_oc256", + .kernel_name = "1x1j2d1_dma", + .config_key = "128_8_8_256_1_1_4_4_2_2_0_1", + .src_dim1_size = 8, + .src_dim2_size = 8, + .src_dim3_size = 128, + .src_dim1_pitch = 8, + .src_dim2_pitch = 64, + .dst_dim1_size = 4, + .dst_dim2_size = 4, + .dst_dim3_size = 256, + .dst_dim1_pitch = 4, + .dst_dim2_pitch = 16, + .in_dim1_size = 8, + .in_dim1_pitch = 8, + .in_dim2_size = 7, + .in_dim2_pitch = 56, + .in_dim1_edge1 = 0, + .in_dim1_edge2 = 0, + .in_dim2_edge1 = 0, + .in_dim2_edge2 = 0, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 7, + .out_dim1_size = 4, + .out_dim1_pitch = 4, + .out_dim2_size = 4, + .out_dim2_pitch = 16, + .out_dim3_size = 256, + .coeff_dim1_size = 1, + .coeff_dim2_size = 1, + .coeff_dim3_size = 128, + .coeff_dim4_size = 256, + .coeff_dim1_pitch = 1, + .coeff_dim2_pitch = 1, + .coeff_dim3_pitch = 128, + .bias_dim1_size = 256, + .bias_dim2_size = 1, + .outscale_dim1_size = 256, + .outscale_dim2_size = 1, + .input_buffer_size = 7168, + .coeff_buffer_size = 32768, + .output_buffer_size = 4096, + .bias_buffer_size = 1024, + .outscale_buffer_size = 512, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 256, + .n_tiles = 1, + .n_tile_size_last = 256, + .height_tiles = 1, + .output_rows = 4, + .input_rows = 7, + .kernel_w = 1, + .kernel_h = 1, + .stride_x = 2, + .stride_y = 2, + .padding = 0, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + 
.flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 6, + .layer_name = "conv_3x3_s2_ic128_oc256", + .kernel_name = "3x3j2d1_dma", + .config_key = "128_8_8_256_3_3_4_4_2_2_1_1", + .src_dim1_size = 8, + .src_dim2_size = 8, + .src_dim3_size = 128, + .src_dim1_pitch = 8, + .src_dim2_pitch = 64, + .dst_dim1_size = 4, + .dst_dim2_size = 4, + .dst_dim3_size = 256, + .dst_dim1_pitch = 4, + .dst_dim2_pitch = 16, + .in_dim1_size = 8, + .in_dim1_pitch = 10, + .in_dim2_size = 5, + .in_dim2_pitch = 50, + .in_dim1_edge1 = 1, + .in_dim1_edge2 = 1, + .in_dim2_edge1 = 1, + .in_dim2_edge2 = 1, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 11, + .in_rows_firstdma = 4, + .out_dim1_size = 4, + .out_dim1_pitch = 4, + .out_dim2_size = 2, + .out_dim2_pitch = 8, + .out_dim3_size = 32, + .coeff_dim1_size = 3, + .coeff_dim2_size = 3, + .coeff_dim3_size = 128, + .coeff_dim4_size = 256, + .coeff_dim1_pitch = 3, + .coeff_dim2_pitch = 9, + .coeff_dim3_pitch = 1152, + .bias_dim1_size = 256, + .bias_dim2_size = 1, + .outscale_dim1_size = 256, + .outscale_dim2_size = 1, + .input_buffer_size = 6400, + .coeff_buffer_size = 36864, + .output_buffer_size = 256, + .bias_buffer_size = 1024, + .outscale_buffer_size = 512, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 32, + .n_tiles = 8, + .n_tile_size_last = 32, + .height_tiles = 2, + .output_rows = 2, + .input_rows = 5, + .kernel_w = 3, + .kernel_h = 3, + .stride_x = 2, + .stride_y = 2, + .padding = 1, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 7, + .layer_name = "conv_3x3_s1_ic256_oc256", + .kernel_name = "3x3j1d1_dma", + .config_key = "256_4_4_256_3_3_4_4_1_1_1_1", + .src_dim1_size = 4, + .src_dim2_size = 4, + .src_dim3_size = 256, + .src_dim1_pitch = 4, + 
.src_dim2_pitch = 16, + .dst_dim1_size = 4, + .dst_dim2_size = 4, + .dst_dim3_size = 256, + .dst_dim1_pitch = 4, + .dst_dim2_pitch = 16, + .in_dim1_size = 4, + .in_dim1_pitch = 6, + .in_dim2_size = 4, + .in_dim2_pitch = 24, + .in_dim1_edge1 = 1, + .in_dim1_edge2 = 1, + .in_dim2_edge1 = 1, + .in_dim2_edge2 = 1, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 7, + .in_rows_firstdma = 3, + .out_dim1_size = 4, + .out_dim1_pitch = 4, + .out_dim2_size = 2, + .out_dim2_pitch = 8, + .out_dim3_size = 16, + .coeff_dim1_size = 3, + .coeff_dim2_size = 3, + .coeff_dim3_size = 256, + .coeff_dim4_size = 256, + .coeff_dim1_pitch = 3, + .coeff_dim2_pitch = 9, + .coeff_dim3_pitch = 2304, + .bias_dim1_size = 256, + .bias_dim2_size = 1, + .outscale_dim1_size = 256, + .outscale_dim2_size = 1, + .input_buffer_size = 6144, + .coeff_buffer_size = 36864, + .output_buffer_size = 128, + .bias_buffer_size = 1024, + .outscale_buffer_size = 512, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 16, + .n_tiles = 16, + .n_tile_size_last = 16, + .height_tiles = 2, + .output_rows = 2, + .input_rows = 4, + .kernel_w = 3, + .kernel_h = 3, + .stride_x = 1, + .stride_y = 1, + .padding = 1, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 8, + .layer_name = "conv_1x1_s2_ic256_oc512", + .kernel_name = "1x1j2d1_dma", + .config_key = "256_4_4_512_1_1_2_2_2_2_0_1", + .src_dim1_size = 4, + .src_dim2_size = 4, + .src_dim3_size = 256, + .src_dim1_pitch = 4, + .src_dim2_pitch = 16, + .dst_dim1_size = 2, + .dst_dim2_size = 2, + .dst_dim3_size = 512, + .dst_dim1_pitch = 2, + .dst_dim2_pitch = 4, + .in_dim1_size = 4, + .in_dim1_pitch = 4, + .in_dim2_size = 3, + .in_dim2_pitch = 12, + .in_dim1_edge1 = 0, + .in_dim1_edge2 = 0, + .in_dim2_edge1 = 0, + 
.in_dim2_edge2 = 0, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 3, + .out_dim1_size = 2, + .out_dim1_pitch = 2, + .out_dim2_size = 2, + .out_dim2_pitch = 4, + .out_dim3_size = 128, + .coeff_dim1_size = 1, + .coeff_dim2_size = 1, + .coeff_dim3_size = 256, + .coeff_dim4_size = 512, + .coeff_dim1_pitch = 1, + .coeff_dim2_pitch = 1, + .coeff_dim3_pitch = 256, + .bias_dim1_size = 512, + .bias_dim2_size = 1, + .outscale_dim1_size = 512, + .outscale_dim2_size = 1, + .input_buffer_size = 3072, + .coeff_buffer_size = 32768, + .output_buffer_size = 512, + .bias_buffer_size = 2048, + .outscale_buffer_size = 1024, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 128, + .n_tiles = 4, + .n_tile_size_last = 128, + .height_tiles = 1, + .output_rows = 2, + .input_rows = 3, + .kernel_w = 1, + .kernel_h = 1, + .stride_x = 2, + .stride_y = 2, + .padding = 0, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 9, + .layer_name = "conv_3x3_s2_ic256_oc512", + .kernel_name = "3x3j2d1_dma", + .config_key = "256_4_4_512_3_3_2_2_2_2_1_1", + .src_dim1_size = 4, + .src_dim2_size = 4, + .src_dim3_size = 256, + .src_dim1_pitch = 4, + .src_dim2_pitch = 16, + .dst_dim1_size = 2, + .dst_dim2_size = 2, + .dst_dim3_size = 512, + .dst_dim1_pitch = 2, + .dst_dim2_pitch = 4, + .in_dim1_size = 4, + .in_dim1_pitch = 6, + .in_dim2_size = 5, + .in_dim2_pitch = 30, + .in_dim1_edge1 = 1, + .in_dim1_edge2 = 1, + .in_dim2_edge1 = 1, + .in_dim2_edge2 = 1, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 7, + .in_rows_firstdma = 4, + .out_dim1_size = 2, + .out_dim1_pitch = 2, + .out_dim2_size = 2, + .out_dim2_pitch = 4, + .out_dim3_size = 16, + .coeff_dim1_size = 3, + .coeff_dim2_size = 3, + 
.coeff_dim3_size = 256, + .coeff_dim4_size = 512, + .coeff_dim1_pitch = 3, + .coeff_dim2_pitch = 9, + .coeff_dim3_pitch = 2304, + .bias_dim1_size = 512, + .bias_dim2_size = 1, + .outscale_dim1_size = 512, + .outscale_dim2_size = 1, + .input_buffer_size = 7680, + .coeff_buffer_size = 36864, + .output_buffer_size = 64, + .bias_buffer_size = 2048, + .outscale_buffer_size = 1024, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 16, + .n_tiles = 32, + .n_tile_size_last = 16, + .height_tiles = 1, + .output_rows = 2, + .input_rows = 5, + .kernel_w = 3, + .kernel_h = 3, + .stride_x = 2, + .stride_y = 2, + .padding = 1, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 10, + .layer_name = "conv_3x3_s1_ic512_oc512", + .kernel_name = "3x3j1d1_dma", + .config_key = "512_2_2_512_3_3_2_2_1_1_1_1", + .src_dim1_size = 2, + .src_dim2_size = 2, + .src_dim3_size = 512, + .src_dim1_pitch = 2, + .src_dim2_pitch = 4, + .dst_dim1_size = 2, + .dst_dim2_size = 2, + .dst_dim3_size = 512, + .dst_dim1_pitch = 2, + .dst_dim2_pitch = 4, + .in_dim1_size = 2, + .in_dim1_pitch = 4, + .in_dim2_size = 4, + .in_dim2_pitch = 16, + .in_dim1_edge1 = 1, + .in_dim1_edge2 = 1, + .in_dim2_edge1 = 1, + .in_dim2_edge2 = 1, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 5, + .in_rows_firstdma = 3, + .out_dim1_size = 2, + .out_dim1_pitch = 2, + .out_dim2_size = 2, + .out_dim2_pitch = 4, + .out_dim3_size = 8, + .coeff_dim1_size = 3, + .coeff_dim2_size = 3, + .coeff_dim3_size = 512, + .coeff_dim4_size = 512, + .coeff_dim1_pitch = 3, + .coeff_dim2_pitch = 9, + .coeff_dim3_pitch = 4608, + .bias_dim1_size = 512, + .bias_dim2_size = 1, + .outscale_dim1_size = 512, + .outscale_dim2_size = 1, + .input_buffer_size = 8192, + .coeff_buffer_size = 36864, + 
.output_buffer_size = 32, + .bias_buffer_size = 2048, + .outscale_buffer_size = 1024, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 8, + .n_tiles = 64, + .n_tile_size_last = 8, + .height_tiles = 1, + .output_rows = 2, + .input_rows = 4, + .kernel_w = 3, + .kernel_h = 3, + .stride_x = 1, + .stride_y = 1, + .padding = 1, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 11, + .layer_name = "conv_1x1_s1_ic64_oc256", + .kernel_name = "1x1j1d1_dma", + .config_key = "64_16_16_256_1_1_16_16_1_1_0_1", + .src_dim1_size = 16, + .src_dim2_size = 16, + .src_dim3_size = 64, + .src_dim1_pitch = 16, + .src_dim2_pitch = 256, + .dst_dim1_size = 16, + .dst_dim2_size = 16, + .dst_dim3_size = 256, + .dst_dim1_pitch = 16, + .dst_dim2_pitch = 256, + .in_dim1_size = 16, + .in_dim1_pitch = 16, + .in_dim2_size = 7, + .in_dim2_pitch = 112, + .in_dim1_edge1 = 0, + .in_dim1_edge2 = 0, + .in_dim2_edge1 = 0, + .in_dim2_edge2 = 0, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 7, + .out_dim1_size = 16, + .out_dim1_pitch = 16, + .out_dim2_size = 7, + .out_dim2_pitch = 112, + .out_dim3_size = 256, + .coeff_dim1_size = 1, + .coeff_dim2_size = 1, + .coeff_dim3_size = 64, + .coeff_dim4_size = 256, + .coeff_dim1_pitch = 1, + .coeff_dim2_pitch = 1, + .coeff_dim3_pitch = 64, + .bias_dim1_size = 256, + .bias_dim2_size = 1, + .outscale_dim1_size = 256, + .outscale_dim2_size = 1, + .input_buffer_size = 7168, + .coeff_buffer_size = 16384, + .output_buffer_size = 28672, + .bias_buffer_size = 1024, + .outscale_buffer_size = 512, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 256, + .n_tiles 
= 1, + .n_tile_size_last = 256, + .height_tiles = 3, + .output_rows = 7, + .input_rows = 7, + .kernel_w = 1, + .kernel_h = 1, + .stride_x = 1, + .stride_y = 1, + .padding = 0, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 12, + .layer_name = "conv_1x1_s1_ic64_oc64", + .kernel_name = "1x1j1d1_dma", + .config_key = "64_16_16_64_1_1_16_16_1_1_0_1", + .src_dim1_size = 16, + .src_dim2_size = 16, + .src_dim3_size = 64, + .src_dim1_pitch = 16, + .src_dim2_pitch = 256, + .dst_dim1_size = 16, + .dst_dim2_size = 16, + .dst_dim3_size = 64, + .dst_dim1_pitch = 16, + .dst_dim2_pitch = 256, + .in_dim1_size = 16, + .in_dim1_pitch = 16, + .in_dim2_size = 16, + .in_dim2_pitch = 256, + .in_dim1_edge1 = 0, + .in_dim1_edge2 = 0, + .in_dim2_edge1 = 0, + .in_dim2_edge2 = 0, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 16, + .out_dim1_size = 16, + .out_dim1_pitch = 16, + .out_dim2_size = 16, + .out_dim2_pitch = 256, + .out_dim3_size = 64, + .coeff_dim1_size = 1, + .coeff_dim2_size = 1, + .coeff_dim3_size = 64, + .coeff_dim4_size = 64, + .coeff_dim1_pitch = 1, + .coeff_dim2_pitch = 1, + .coeff_dim3_pitch = 64, + .bias_dim1_size = 64, + .bias_dim2_size = 1, + .outscale_dim1_size = 64, + .outscale_dim2_size = 1, + .input_buffer_size = 16384, + .coeff_buffer_size = 4096, + .output_buffer_size = 16384, + .bias_buffer_size = 256, + .outscale_buffer_size = 128, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 64, + .n_tiles = 1, + .n_tile_size_last = 64, + .height_tiles = 1, + .output_rows = 16, + .input_rows = 16, + .kernel_w = 1, + .kernel_h = 1, + .stride_x = 1, + .stride_y = 1, + .padding = 0, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + 
.output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 13, + .layer_name = "conv_1x1_s1_ic256_oc64", + .kernel_name = "1x1j1d1_dma", + .config_key = "256_16_16_64_1_1_16_16_1_1_0_1", + .src_dim1_size = 16, + .src_dim2_size = 16, + .src_dim3_size = 256, + .src_dim1_pitch = 16, + .src_dim2_pitch = 256, + .dst_dim1_size = 16, + .dst_dim2_size = 16, + .dst_dim3_size = 64, + .dst_dim1_pitch = 16, + .dst_dim2_pitch = 256, + .in_dim1_size = 16, + .in_dim1_pitch = 16, + .in_dim2_size = 10, + .in_dim2_pitch = 160, + .in_dim1_edge1 = 0, + .in_dim1_edge2 = 0, + .in_dim2_edge1 = 0, + .in_dim2_edge2 = 0, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 10, + .out_dim1_size = 16, + .out_dim1_pitch = 16, + .out_dim2_size = 10, + .out_dim2_pitch = 160, + .out_dim3_size = 64, + .coeff_dim1_size = 1, + .coeff_dim2_size = 1, + .coeff_dim3_size = 256, + .coeff_dim4_size = 64, + .coeff_dim1_pitch = 1, + .coeff_dim2_pitch = 1, + .coeff_dim3_pitch = 256, + .bias_dim1_size = 64, + .bias_dim2_size = 1, + .outscale_dim1_size = 64, + .outscale_dim2_size = 1, + .input_buffer_size = 40960, + .coeff_buffer_size = 16384, + .output_buffer_size = 10240, + .bias_buffer_size = 256, + .outscale_buffer_size = 128, + .input_ping_dram = 0, + .input_pong_dram = 1, + .coeff_dram = 0, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 64, + .n_tiles = 1, + .n_tile_size_last = 64, + .height_tiles = 2, + .output_rows = 10, + .input_rows = 10, + .kernel_w = 1, + .kernel_h = 1, + .stride_x = 1, + .stride_y = 1, + .padding = 0, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 14, + .layer_name = "conv_1x1_s2_ic256_oc512", + .kernel_name = "1x1j2d1_dma", + .config_key = "256_16_16_512_1_1_8_8_2_2_0_1", + .src_dim1_size = 16, + .src_dim2_size = 16, + .src_dim3_size 
= 256, + .src_dim1_pitch = 16, + .src_dim2_pitch = 256, + .dst_dim1_size = 8, + .dst_dim2_size = 8, + .dst_dim3_size = 512, + .dst_dim1_pitch = 8, + .dst_dim2_pitch = 64, + .in_dim1_size = 16, + .in_dim1_pitch = 16, + .in_dim2_size = 3, + .in_dim2_pitch = 48, + .in_dim1_edge1 = 0, + .in_dim1_edge2 = 0, + .in_dim2_edge1 = 0, + .in_dim2_edge2 = 0, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 3, + .out_dim1_size = 8, + .out_dim1_pitch = 8, + .out_dim2_size = 2, + .out_dim2_pitch = 16, + .out_dim3_size = 128, + .coeff_dim1_size = 1, + .coeff_dim2_size = 1, + .coeff_dim3_size = 256, + .coeff_dim4_size = 512, + .coeff_dim1_pitch = 1, + .coeff_dim2_pitch = 1, + .coeff_dim3_pitch = 256, + .bias_dim1_size = 512, + .bias_dim2_size = 1, + .outscale_dim1_size = 512, + .outscale_dim2_size = 1, + .input_buffer_size = 12288, + .coeff_buffer_size = 32768, + .output_buffer_size = 2048, + .bias_buffer_size = 2048, + .outscale_buffer_size = 1024, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 128, + .n_tiles = 4, + .n_tile_size_last = 128, + .height_tiles = 4, + .output_rows = 2, + .input_rows = 3, + .kernel_w = 1, + .kernel_h = 1, + .stride_x = 2, + .stride_y = 2, + .padding = 0, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 15, + .layer_name = "conv_1x1_s1_ic256_oc128", + .kernel_name = "1x1j1d1_dma", + .config_key = "256_16_16_128_1_1_16_16_1_1_0_1", + .src_dim1_size = 16, + .src_dim2_size = 16, + .src_dim3_size = 256, + .src_dim1_pitch = 16, + .src_dim2_pitch = 256, + .dst_dim1_size = 16, + .dst_dim2_size = 16, + .dst_dim3_size = 128, + .dst_dim1_pitch = 16, + .dst_dim2_pitch = 256, + .in_dim1_size = 16, + .in_dim1_pitch = 16, + .in_dim2_size = 7, + .in_dim2_pitch = 112, + 
.in_dim1_edge1 = 0, + .in_dim1_edge2 = 0, + .in_dim2_edge1 = 0, + .in_dim2_edge2 = 0, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 7, + .out_dim1_size = 16, + .out_dim1_pitch = 16, + .out_dim2_size = 7, + .out_dim2_pitch = 112, + .out_dim3_size = 128, + .coeff_dim1_size = 1, + .coeff_dim2_size = 1, + .coeff_dim3_size = 256, + .coeff_dim4_size = 128, + .coeff_dim1_pitch = 1, + .coeff_dim2_pitch = 1, + .coeff_dim3_pitch = 256, + .bias_dim1_size = 128, + .bias_dim2_size = 1, + .outscale_dim1_size = 128, + .outscale_dim2_size = 1, + .input_buffer_size = 28672, + .coeff_buffer_size = 32768, + .output_buffer_size = 14336, + .bias_buffer_size = 512, + .outscale_buffer_size = 256, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 1, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 128, + .n_tiles = 1, + .n_tile_size_last = 128, + .height_tiles = 3, + .output_rows = 7, + .input_rows = 7, + .kernel_w = 1, + .kernel_h = 1, + .stride_x = 1, + .stride_y = 1, + .padding = 0, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 16, + .layer_name = "conv_3x3_s2_ic128_oc128", + .kernel_name = "3x3j2d1_dma", + .config_key = "128_16_16_128_3_3_8_8_2_2_1_1", + .src_dim1_size = 16, + .src_dim2_size = 16, + .src_dim3_size = 128, + .src_dim1_pitch = 16, + .src_dim2_pitch = 256, + .dst_dim1_size = 8, + .dst_dim2_size = 8, + .dst_dim3_size = 128, + .dst_dim1_pitch = 8, + .dst_dim2_pitch = 64, + .in_dim1_size = 16, + .in_dim1_pitch = 18, + .in_dim2_size = 5, + .in_dim2_pitch = 90, + .in_dim1_edge1 = 1, + .in_dim1_edge2 = 1, + .in_dim2_edge1 = 1, + .in_dim2_edge2 = 1, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 19, + .in_rows_firstdma = 4, + .out_dim1_size = 8, + .out_dim1_pitch = 8, + .out_dim2_size = 2, + .out_dim2_pitch = 16, + 
.out_dim3_size = 32, + .coeff_dim1_size = 3, + .coeff_dim2_size = 3, + .coeff_dim3_size = 128, + .coeff_dim4_size = 128, + .coeff_dim1_pitch = 3, + .coeff_dim2_pitch = 9, + .coeff_dim3_pitch = 1152, + .bias_dim1_size = 128, + .bias_dim2_size = 1, + .outscale_dim1_size = 128, + .outscale_dim2_size = 1, + .input_buffer_size = 11520, + .coeff_buffer_size = 36864, + .output_buffer_size = 512, + .bias_buffer_size = 512, + .outscale_buffer_size = 256, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 32, + .n_tiles = 4, + .n_tile_size_last = 32, + .height_tiles = 4, + .output_rows = 2, + .input_rows = 5, + .kernel_w = 3, + .kernel_h = 3, + .stride_x = 2, + .stride_y = 2, + .padding = 1, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 17, + .layer_name = "conv_1x1_s1_ic128_oc512", + .kernel_name = "1x1j1d1_dma", + .config_key = "128_8_8_512_1_1_8_8_1_1_0_1", + .src_dim1_size = 8, + .src_dim2_size = 8, + .src_dim3_size = 128, + .src_dim1_pitch = 8, + .src_dim2_pitch = 64, + .dst_dim1_size = 8, + .dst_dim2_size = 8, + .dst_dim3_size = 512, + .dst_dim1_pitch = 8, + .dst_dim2_pitch = 64, + .in_dim1_size = 8, + .in_dim1_pitch = 8, + .in_dim2_size = 2, + .in_dim2_pitch = 16, + .in_dim1_edge1 = 0, + .in_dim1_edge2 = 0, + .in_dim2_edge1 = 0, + .in_dim2_edge2 = 0, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 2, + .out_dim1_size = 8, + .out_dim1_pitch = 8, + .out_dim2_size = 2, + .out_dim2_pitch = 16, + .out_dim3_size = 256, + .coeff_dim1_size = 1, + .coeff_dim2_size = 1, + .coeff_dim3_size = 128, + .coeff_dim4_size = 512, + .coeff_dim1_pitch = 1, + .coeff_dim2_pitch = 1, + .coeff_dim3_pitch = 128, + .bias_dim1_size = 512, + .bias_dim2_size = 1, + .outscale_dim1_size = 512, + 
.outscale_dim2_size = 1, + .input_buffer_size = 2048, + .coeff_buffer_size = 32768, + .output_buffer_size = 4096, + .bias_buffer_size = 2048, + .outscale_buffer_size = 1024, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 256, + .n_tiles = 2, + .n_tile_size_last = 256, + .height_tiles = 4, + .output_rows = 2, + .input_rows = 2, + .kernel_w = 1, + .kernel_h = 1, + .stride_x = 1, + .stride_y = 1, + .padding = 0, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 18, + .layer_name = "conv_1x1_s1_ic512_oc128", + .kernel_name = "1x1j1d1_dma", + .config_key = "512_8_8_128_1_1_8_8_1_1_0_1", + .src_dim1_size = 8, + .src_dim2_size = 8, + .src_dim3_size = 512, + .src_dim1_pitch = 8, + .src_dim2_pitch = 64, + .dst_dim1_size = 8, + .dst_dim2_size = 8, + .dst_dim3_size = 128, + .dst_dim1_pitch = 8, + .dst_dim2_pitch = 64, + .in_dim1_size = 8, + .in_dim1_pitch = 8, + .in_dim2_size = 2, + .in_dim2_pitch = 16, + .in_dim1_edge1 = 0, + .in_dim1_edge2 = 0, + .in_dim2_edge1 = 0, + .in_dim2_edge2 = 0, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 2, + .out_dim1_size = 8, + .out_dim1_pitch = 8, + .out_dim2_size = 2, + .out_dim2_pitch = 16, + .out_dim3_size = 64, + .coeff_dim1_size = 1, + .coeff_dim2_size = 1, + .coeff_dim3_size = 512, + .coeff_dim4_size = 128, + .coeff_dim1_pitch = 1, + .coeff_dim2_pitch = 1, + .coeff_dim3_pitch = 512, + .bias_dim1_size = 128, + .bias_dim2_size = 1, + .outscale_dim1_size = 128, + .outscale_dim2_size = 1, + .input_buffer_size = 8192, + .coeff_buffer_size = 32768, + .output_buffer_size = 1024, + .bias_buffer_size = 512, + .outscale_buffer_size = 256, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 1, + .output_pong_dram = 
1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 64, + .n_tiles = 2, + .n_tile_size_last = 64, + .height_tiles = 4, + .output_rows = 2, + .input_rows = 2, + .kernel_w = 1, + .kernel_h = 1, + .stride_x = 1, + .stride_y = 1, + .padding = 0, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 19, + .layer_name = "conv_1x1_s2_ic512_oc1024", + .kernel_name = "1x1j2d1_dma", + .config_key = "512_8_8_1024_1_1_4_4_2_2_0_1", + .src_dim1_size = 8, + .src_dim2_size = 8, + .src_dim3_size = 512, + .src_dim1_pitch = 8, + .src_dim2_pitch = 64, + .dst_dim1_size = 4, + .dst_dim2_size = 4, + .dst_dim3_size = 1024, + .dst_dim1_pitch = 4, + .dst_dim2_pitch = 16, + .in_dim1_size = 8, + .in_dim1_pitch = 8, + .in_dim2_size = 3, + .in_dim2_pitch = 24, + .in_dim1_edge1 = 0, + .in_dim1_edge2 = 0, + .in_dim2_edge1 = 0, + .in_dim2_edge2 = 0, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 3, + .out_dim1_size = 4, + .out_dim1_pitch = 4, + .out_dim2_size = 2, + .out_dim2_pitch = 8, + .out_dim3_size = 64, + .coeff_dim1_size = 1, + .coeff_dim2_size = 1, + .coeff_dim3_size = 512, + .coeff_dim4_size = 1024, + .coeff_dim1_pitch = 1, + .coeff_dim2_pitch = 1, + .coeff_dim3_pitch = 512, + .bias_dim1_size = 1024, + .bias_dim2_size = 1, + .outscale_dim1_size = 1024, + .outscale_dim2_size = 1, + .input_buffer_size = 12288, + .coeff_buffer_size = 32768, + .output_buffer_size = 512, + .bias_buffer_size = 4096, + .outscale_buffer_size = 2048, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 64, + .n_tiles = 16, + .n_tile_size_last = 64, + .height_tiles = 2, + .output_rows = 2, + .input_rows = 3, + .kernel_w = 1, + .kernel_h = 1, + .stride_x = 2, + .stride_y = 2, + .padding = 0, + .dilation = 1, + .accum_shift = 
8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 20, + .layer_name = "conv_1x1_s1_ic512_oc256", + .kernel_name = "1x1j1d1_dma", + .config_key = "512_8_8_256_1_1_8_8_1_1_0_1", + .src_dim1_size = 8, + .src_dim2_size = 8, + .src_dim3_size = 512, + .src_dim1_pitch = 8, + .src_dim2_pitch = 64, + .dst_dim1_size = 8, + .dst_dim2_size = 8, + .dst_dim3_size = 256, + .dst_dim1_pitch = 8, + .dst_dim2_pitch = 64, + .in_dim1_size = 8, + .in_dim1_pitch = 8, + .in_dim2_size = 2, + .in_dim2_pitch = 16, + .in_dim1_edge1 = 0, + .in_dim1_edge2 = 0, + .in_dim2_edge1 = 0, + .in_dim2_edge2 = 0, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 2, + .out_dim1_size = 8, + .out_dim1_pitch = 8, + .out_dim2_size = 2, + .out_dim2_pitch = 16, + .out_dim3_size = 64, + .coeff_dim1_size = 1, + .coeff_dim2_size = 1, + .coeff_dim3_size = 512, + .coeff_dim4_size = 256, + .coeff_dim1_pitch = 1, + .coeff_dim2_pitch = 1, + .coeff_dim3_pitch = 512, + .bias_dim1_size = 256, + .bias_dim2_size = 1, + .outscale_dim1_size = 256, + .outscale_dim2_size = 1, + .input_buffer_size = 8192, + .coeff_buffer_size = 32768, + .output_buffer_size = 1024, + .bias_buffer_size = 1024, + .outscale_buffer_size = 512, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 64, + .n_tiles = 4, + .n_tile_size_last = 64, + .height_tiles = 4, + .output_rows = 2, + .input_rows = 2, + .kernel_w = 1, + .kernel_h = 1, + .stride_x = 1, + .stride_y = 1, + .padding = 0, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 21, + .layer_name = "conv_3x3_s2_ic256_oc256", + .kernel_name = "3x3j2d1_dma", + .config_key = "256_8_8_256_3_3_4_4_2_2_1_1", + .src_dim1_size = 
8, + .src_dim2_size = 8, + .src_dim3_size = 256, + .src_dim1_pitch = 8, + .src_dim2_pitch = 64, + .dst_dim1_size = 4, + .dst_dim2_size = 4, + .dst_dim3_size = 256, + .dst_dim1_pitch = 4, + .dst_dim2_pitch = 16, + .in_dim1_size = 8, + .in_dim1_pitch = 10, + .in_dim2_size = 5, + .in_dim2_pitch = 50, + .in_dim1_edge1 = 1, + .in_dim1_edge2 = 1, + .in_dim2_edge1 = 1, + .in_dim2_edge2 = 1, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 11, + .in_rows_firstdma = 4, + .out_dim1_size = 4, + .out_dim1_pitch = 4, + .out_dim2_size = 2, + .out_dim2_pitch = 8, + .out_dim3_size = 16, + .coeff_dim1_size = 3, + .coeff_dim2_size = 3, + .coeff_dim3_size = 256, + .coeff_dim4_size = 256, + .coeff_dim1_pitch = 3, + .coeff_dim2_pitch = 9, + .coeff_dim3_pitch = 2304, + .bias_dim1_size = 256, + .bias_dim2_size = 1, + .outscale_dim1_size = 256, + .outscale_dim2_size = 1, + .input_buffer_size = 12800, + .coeff_buffer_size = 36864, + .output_buffer_size = 128, + .bias_buffer_size = 1024, + .outscale_buffer_size = 512, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 16, + .n_tiles = 16, + .n_tile_size_last = 16, + .height_tiles = 2, + .output_rows = 2, + .input_rows = 5, + .kernel_w = 3, + .kernel_h = 3, + .stride_x = 2, + .stride_y = 2, + .padding = 1, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 22, + .layer_name = "conv_1x1_s1_ic256_oc1024", + .kernel_name = "1x1j1d1_dma", + .config_key = "256_4_4_1024_1_1_4_4_1_1_0_1", + .src_dim1_size = 4, + .src_dim2_size = 4, + .src_dim3_size = 256, + .src_dim1_pitch = 4, + .src_dim2_pitch = 16, + .dst_dim1_size = 4, + .dst_dim2_size = 4, + .dst_dim3_size = 1024, + .dst_dim1_pitch = 4, + .dst_dim2_pitch = 16, + .in_dim1_size = 4, + .in_dim1_pitch = 4, + .in_dim2_size = 2, + 
.in_dim2_pitch = 8, + .in_dim1_edge1 = 0, + .in_dim1_edge2 = 0, + .in_dim2_edge1 = 0, + .in_dim2_edge2 = 0, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 2, + .out_dim1_size = 4, + .out_dim1_pitch = 4, + .out_dim2_size = 2, + .out_dim2_pitch = 8, + .out_dim3_size = 128, + .coeff_dim1_size = 1, + .coeff_dim2_size = 1, + .coeff_dim3_size = 256, + .coeff_dim4_size = 1024, + .coeff_dim1_pitch = 1, + .coeff_dim2_pitch = 1, + .coeff_dim3_pitch = 256, + .bias_dim1_size = 1024, + .bias_dim2_size = 1, + .outscale_dim1_size = 1024, + .outscale_dim2_size = 1, + .input_buffer_size = 2048, + .coeff_buffer_size = 32768, + .output_buffer_size = 1024, + .bias_buffer_size = 4096, + .outscale_buffer_size = 2048, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 128, + .n_tiles = 8, + .n_tile_size_last = 128, + .height_tiles = 2, + .output_rows = 2, + .input_rows = 2, + .kernel_w = 1, + .kernel_h = 1, + .stride_x = 1, + .stride_y = 1, + .padding = 0, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 23, + .layer_name = "conv_1x1_s1_ic1024_oc256", + .kernel_name = "1x1j1d1_dma", + .config_key = "1024_4_4_256_1_1_4_4_1_1_0_1", + .src_dim1_size = 4, + .src_dim2_size = 4, + .src_dim3_size = 1024, + .src_dim1_pitch = 4, + .src_dim2_pitch = 16, + .dst_dim1_size = 4, + .dst_dim2_size = 4, + .dst_dim3_size = 256, + .dst_dim1_pitch = 4, + .dst_dim2_pitch = 16, + .in_dim1_size = 4, + .in_dim1_pitch = 4, + .in_dim2_size = 2, + .in_dim2_pitch = 8, + .in_dim1_edge1 = 0, + .in_dim1_edge2 = 0, + .in_dim2_edge1 = 0, + .in_dim2_edge2 = 0, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 2, + .out_dim1_size = 4, + .out_dim1_pitch = 4, + .out_dim2_size = 2, + 
.out_dim2_pitch = 8, + .out_dim3_size = 32, + .coeff_dim1_size = 1, + .coeff_dim2_size = 1, + .coeff_dim3_size = 1024, + .coeff_dim4_size = 256, + .coeff_dim1_pitch = 1, + .coeff_dim2_pitch = 1, + .coeff_dim3_pitch = 1024, + .bias_dim1_size = 256, + .bias_dim2_size = 1, + .outscale_dim1_size = 256, + .outscale_dim2_size = 1, + .input_buffer_size = 8192, + .coeff_buffer_size = 32768, + .output_buffer_size = 256, + .bias_buffer_size = 1024, + .outscale_buffer_size = 512, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 32, + .n_tiles = 8, + .n_tile_size_last = 32, + .height_tiles = 2, + .output_rows = 2, + .input_rows = 2, + .kernel_w = 1, + .kernel_h = 1, + .stride_x = 1, + .stride_y = 1, + .padding = 0, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 24, + .layer_name = "conv_1x1_s2_ic1024_oc2048", + .kernel_name = "1x1j2d1_dma", + .config_key = "1024_4_4_2048_1_1_2_2_2_2_0_1", + .src_dim1_size = 4, + .src_dim2_size = 4, + .src_dim3_size = 1024, + .src_dim1_pitch = 4, + .src_dim2_pitch = 16, + .dst_dim1_size = 2, + .dst_dim2_size = 2, + .dst_dim3_size = 2048, + .dst_dim1_pitch = 2, + .dst_dim2_pitch = 4, + .in_dim1_size = 4, + .in_dim1_pitch = 4, + .in_dim2_size = 3, + .in_dim2_pitch = 12, + .in_dim1_edge1 = 0, + .in_dim1_edge2 = 0, + .in_dim2_edge1 = 0, + .in_dim2_edge2 = 0, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 3, + .out_dim1_size = 2, + .out_dim1_pitch = 2, + .out_dim2_size = 2, + .out_dim2_pitch = 4, + .out_dim3_size = 32, + .coeff_dim1_size = 1, + .coeff_dim2_size = 1, + .coeff_dim3_size = 1024, + .coeff_dim4_size = 2048, + .coeff_dim1_pitch = 1, + .coeff_dim2_pitch = 1, + .coeff_dim3_pitch = 1024, + .bias_dim1_size = 2048, + .bias_dim2_size = 1, + 
.outscale_dim1_size = 2048, + .outscale_dim2_size = 1, + .input_buffer_size = 12288, + .coeff_buffer_size = 32768, + .output_buffer_size = 128, + .bias_buffer_size = 8192, + .outscale_buffer_size = 4096, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 32, + .n_tiles = 64, + .n_tile_size_last = 32, + .height_tiles = 1, + .output_rows = 2, + .input_rows = 3, + .kernel_w = 1, + .kernel_h = 1, + .stride_x = 2, + .stride_y = 2, + .padding = 0, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 25, + .layer_name = "conv_1x1_s1_ic1024_oc512", + .kernel_name = "1x1j1d1_dma", + .config_key = "1024_4_4_512_1_1_4_4_1_1_0_1", + .src_dim1_size = 4, + .src_dim2_size = 4, + .src_dim3_size = 1024, + .src_dim1_pitch = 4, + .src_dim2_pitch = 16, + .dst_dim1_size = 4, + .dst_dim2_size = 4, + .dst_dim3_size = 512, + .dst_dim1_pitch = 4, + .dst_dim2_pitch = 16, + .in_dim1_size = 4, + .in_dim1_pitch = 4, + .in_dim2_size = 2, + .in_dim2_pitch = 8, + .in_dim1_edge1 = 0, + .in_dim1_edge2 = 0, + .in_dim2_edge1 = 0, + .in_dim2_edge2 = 0, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 2, + .out_dim1_size = 4, + .out_dim1_pitch = 4, + .out_dim2_size = 2, + .out_dim2_pitch = 8, + .out_dim3_size = 32, + .coeff_dim1_size = 1, + .coeff_dim2_size = 1, + .coeff_dim3_size = 1024, + .coeff_dim4_size = 512, + .coeff_dim1_pitch = 1, + .coeff_dim2_pitch = 1, + .coeff_dim3_pitch = 1024, + .bias_dim1_size = 512, + .bias_dim2_size = 1, + .outscale_dim1_size = 512, + .outscale_dim2_size = 1, + .input_buffer_size = 8192, + .coeff_buffer_size = 32768, + .output_buffer_size = 256, + .bias_buffer_size = 2048, + .outscale_buffer_size = 1024, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + 
.output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 32, + .n_tiles = 16, + .n_tile_size_last = 32, + .height_tiles = 2, + .output_rows = 2, + .input_rows = 2, + .kernel_w = 1, + .kernel_h = 1, + .stride_x = 1, + .stride_y = 1, + .padding = 0, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 26, + .layer_name = "conv_3x3_s2_ic512_oc512", + .kernel_name = "3x3j2d1_dma", + .config_key = "512_4_4_512_3_3_2_2_2_2_1_1", + .src_dim1_size = 4, + .src_dim2_size = 4, + .src_dim3_size = 512, + .src_dim1_pitch = 4, + .src_dim2_pitch = 16, + .dst_dim1_size = 2, + .dst_dim2_size = 2, + .dst_dim3_size = 512, + .dst_dim1_pitch = 2, + .dst_dim2_pitch = 4, + .in_dim1_size = 4, + .in_dim1_pitch = 6, + .in_dim2_size = 5, + .in_dim2_pitch = 30, + .in_dim1_edge1 = 1, + .in_dim1_edge2 = 1, + .in_dim2_edge1 = 1, + .in_dim2_edge2 = 1, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 7, + .in_rows_firstdma = 4, + .out_dim1_size = 2, + .out_dim1_pitch = 2, + .out_dim2_size = 2, + .out_dim2_pitch = 4, + .out_dim3_size = 8, + .coeff_dim1_size = 3, + .coeff_dim2_size = 3, + .coeff_dim3_size = 512, + .coeff_dim4_size = 512, + .coeff_dim1_pitch = 3, + .coeff_dim2_pitch = 9, + .coeff_dim3_pitch = 4608, + .bias_dim1_size = 512, + .bias_dim2_size = 1, + .outscale_dim1_size = 512, + .outscale_dim2_size = 1, + .input_buffer_size = 15360, + .coeff_buffer_size = 36864, + .output_buffer_size = 32, + .bias_buffer_size = 2048, + .outscale_buffer_size = 1024, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 1, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 8, + .n_tiles = 64, + .n_tile_size_last = 8, + .height_tiles = 1, + .output_rows = 2, + .input_rows = 5, + .kernel_w = 3, + .kernel_h = 3, + .stride_x = 2, + .stride_y = 2, + .padding = 
1, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 27, + .layer_name = "conv_1x1_s1_ic512_oc2048", + .kernel_name = "1x1j1d1_dma", + .config_key = "512_2_2_2048_1_1_2_2_1_1_0_1", + .src_dim1_size = 2, + .src_dim2_size = 2, + .src_dim3_size = 512, + .src_dim1_pitch = 2, + .src_dim2_pitch = 4, + .dst_dim1_size = 2, + .dst_dim2_size = 2, + .dst_dim3_size = 2048, + .dst_dim1_pitch = 2, + .dst_dim2_pitch = 4, + .in_dim1_size = 2, + .in_dim1_pitch = 2, + .in_dim2_size = 2, + .in_dim2_pitch = 4, + .in_dim1_edge1 = 0, + .in_dim1_edge2 = 0, + .in_dim2_edge1 = 0, + .in_dim2_edge2 = 0, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 2, + .out_dim1_size = 2, + .out_dim1_pitch = 2, + .out_dim2_size = 2, + .out_dim2_pitch = 4, + .out_dim3_size = 64, + .coeff_dim1_size = 1, + .coeff_dim2_size = 1, + .coeff_dim3_size = 512, + .coeff_dim4_size = 2048, + .coeff_dim1_pitch = 1, + .coeff_dim2_pitch = 1, + .coeff_dim3_pitch = 512, + .bias_dim1_size = 2048, + .bias_dim2_size = 1, + .outscale_dim1_size = 2048, + .outscale_dim2_size = 1, + .input_buffer_size = 2048, + .coeff_buffer_size = 32768, + .output_buffer_size = 256, + .bias_buffer_size = 8192, + .outscale_buffer_size = 4096, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 64, + .n_tiles = 32, + .n_tile_size_last = 64, + .height_tiles = 1, + .output_rows = 2, + .input_rows = 2, + .kernel_w = 1, + .kernel_h = 1, + .stride_x = 1, + .stride_y = 1, + .padding = 0, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, + { + .layer_id = 28, + .layer_name = "conv_1x1_s1_ic2048_oc512", + .kernel_name = "1x1j1d1_dma", + .config_key = 
"2048_2_2_512_1_1_2_2_1_1_0_1", + .src_dim1_size = 2, + .src_dim2_size = 2, + .src_dim3_size = 2048, + .src_dim1_pitch = 2, + .src_dim2_pitch = 4, + .dst_dim1_size = 2, + .dst_dim2_size = 2, + .dst_dim3_size = 512, + .dst_dim1_pitch = 2, + .dst_dim2_pitch = 4, + .in_dim1_size = 2, + .in_dim1_pitch = 2, + .in_dim2_size = 2, + .in_dim2_pitch = 4, + .in_dim1_edge1 = 0, + .in_dim1_edge2 = 0, + .in_dim2_edge1 = 0, + .in_dim2_edge2 = 0, + .in_dim3_edge1 = 0, + .in_dim3_edge2 = 0, + .in_data_offset = 0, + .in_rows_firstdma = 2, + .out_dim1_size = 2, + .out_dim1_pitch = 2, + .out_dim2_size = 2, + .out_dim2_pitch = 4, + .out_dim3_size = 16, + .coeff_dim1_size = 1, + .coeff_dim2_size = 1, + .coeff_dim3_size = 2048, + .coeff_dim4_size = 512, + .coeff_dim1_pitch = 1, + .coeff_dim2_pitch = 1, + .coeff_dim3_pitch = 2048, + .bias_dim1_size = 512, + .bias_dim2_size = 1, + .outscale_dim1_size = 512, + .outscale_dim2_size = 1, + .input_buffer_size = 8192, + .coeff_buffer_size = 32768, + .output_buffer_size = 64, + .bias_buffer_size = 2048, + .outscale_buffer_size = 1024, + .input_ping_dram = 0, + .input_pong_dram = 0, + .coeff_dram = 0, + .output_ping_dram = 1, + .output_pong_dram = 1, + .bias_dram = 1, + .outscale_dram = 1, + .n_tile_size = 16, + .n_tiles = 32, + .n_tile_size_last = 16, + .height_tiles = 1, + .output_rows = 2, + .input_rows = 2, + .kernel_w = 1, + .kernel_h = 1, + .stride_x = 1, + .stride_y = 1, + .padding = 0, + .dilation = 1, + .accum_shift = 8, + .relu_max = 4000, + .relu_min = 0, + .output_shift = 11, + .output_scale = 0, + .flags = 0, + .input_zero_point = 0, + }, +}; + +static inline int get_num_conv_layers(void) { return NUM_CONV_LAYERS; } + +static inline const conv_layer_config_t* get_conv_config(int layer_id) { + if (layer_id < 0 || layer_id >= NUM_CONV_LAYERS) return NULL; + return &CONV_LAYER_CONFIGS[layer_id]; +} + +static inline const conv_layer_config_t* get_layer_config_by_params( + int ic, int ih, int iw, + int oc, int kh, int kw, + int oh, int ow, 
+ int sy, int sx, + int pad, int dil) +{ + for (int i = 0; i < NUM_CONV_LAYERS; i++) { + const conv_layer_config_t* cfg = &CONV_LAYER_CONFIGS[i]; + if (cfg->src_dim3_size == ic && + cfg->src_dim2_size == ih && + cfg->src_dim1_size == iw && + cfg->dst_dim3_size == oc && + cfg->coeff_dim2_size == kh && + cfg->coeff_dim1_size == kw && + cfg->dst_dim2_size == oh && + cfg->dst_dim1_size == ow && + cfg->stride_y == sy && + cfg->stride_x == sx && + cfg->padding == pad && + cfg->dilation == dil) + return cfg; + } + return NULL; +} + +static inline const conv_layer_config_t* get_layer_config_by_key(const char* config_key) { + if (config_key == NULL) return NULL; + for (int i = 0; i < NUM_CONV_LAYERS; i++) { + const conv_layer_config_t* cfg = &CONV_LAYER_CONFIGS[i]; + if (cfg->config_key != NULL) { + const char* a = config_key; + const char* b = cfg->config_key; + while (*a && *b && *a == *b) { a++; b++; } + if (*a == '\0' && *b == '\0') return cfg; + } + } + return NULL; +} + +/* ====================================================================== */ +/* MaxPool configurations */ +/* ====================================================================== */ + +typedef struct { + int layer_id; + const char* layer_name; + const char* config_key; + + int src_width; int src_height; int channels; + int dst_width; int dst_height; + + int src_row_pitch; int src_plane_pitch; + int dst_row_pitch; int dst_plane_pitch; + + int kernel_h; int kernel_w; + int stride_h; int stride_w; + int pad_h; int pad_w; + + int in_tile_w; int in_tile_rows; int in_tile_plane; + int in_data_offset; + int out_tile_w; int out_tile_rows; int out_tile_plane; + + int c_tile_size; int c_tiles; int c_tile_size_last; + int height_tiles; int output_rows; int input_rows; + + int input_buffer_size; int output_buffer_size; + + int input_ping_dram; int input_pong_dram; + int output_ping_dram; int output_pong_dram; +} maxpool_layer_config_t; + +#define NUM_MAXPOOL_LAYERS 1 + +static const maxpool_layer_config_t 
MAXPOOL_LAYER_CONFIGS[] = { + { + .layer_id = 0, + .layer_name = "maxpool_3x3s2_c64_32x32", + .config_key = "64_32_32_3_3_2_2_1_1", + .src_width = 32, + .src_height = 32, + .channels = 64, + .dst_width = 16, + .dst_height = 16, + .src_row_pitch = 32, + .src_plane_pitch = 1024, + .dst_row_pitch = 16, + .dst_plane_pitch = 256, + .kernel_h = 3, + .kernel_w = 3, + .stride_h = 2, + .stride_w = 2, + .pad_h = 1, + .pad_w = 1, + .in_tile_w = 34, + .in_tile_rows = 5, + .in_tile_plane = 170, + .in_data_offset = 35, + .out_tile_w = 16, + .out_tile_rows = 1, + .out_tile_plane = 16, + .c_tile_size = 64, + .c_tiles = 1, + .c_tile_size_last = 64, + .height_tiles = 16, + .output_rows = 1, + .input_rows = 3, + .input_buffer_size = 43520, + .output_buffer_size = 4096, + .input_ping_dram = 0, + .input_pong_dram = 1, + .output_ping_dram = 1, + .output_pong_dram = 0, + }, +}; + +static inline int get_num_maxpool_layers(void) { return NUM_MAXPOOL_LAYERS; } + +static inline const maxpool_layer_config_t* get_maxpool_config(int layer_id) { + if (layer_id < 0 || layer_id >= NUM_MAXPOOL_LAYERS) return NULL; + return &MAXPOOL_LAYER_CONFIGS[layer_id]; +} + +static inline const maxpool_layer_config_t* get_maxpool_config_by_params( + int channels, int src_height, int src_width, + int kernel_h, int kernel_w, + int stride_h, int stride_w, + int pad_h, int pad_w) +{ + for (int i = 0; i < NUM_MAXPOOL_LAYERS; i++) { + const maxpool_layer_config_t* c = &MAXPOOL_LAYER_CONFIGS[i]; + if (c->channels == channels && + c->src_height == src_height && + c->src_width == src_width && + c->kernel_h == kernel_h && + c->kernel_w == kernel_w && + c->stride_h == stride_h && + c->stride_w == stride_w && + c->pad_h == pad_h && + c->pad_w == pad_w) + return c; + } + return NULL; +} + +#endif /* LAYER_CONFIGS_H */ diff --git a/backends/cadence/vision/operators/maxpool/maxpool_exec_mxnj2.c b/backends/cadence/vision/operators/maxpool/maxpool_exec_mxnj2.c new file mode 100644 index 00000000000..3d841517606 --- /dev/null 
+++ b/backends/cadence/vision/operators/maxpool/maxpool_exec_mxnj2.c
@@ -0,0 +1,342 @@
+#include "maxpool_executors.h"
+#include "memory_manager.h"
+#include "dma.h"
+#include <stdint.h>
+#include <string.h>
+#include <xtensa/hal.h>
+
+/* Minimal float definitions to avoid pulling in full math.h */
+#ifndef MIN_FLT32
+#define MIN_FLT32 (-3.402823466e+38F)
+#endif
+
+/* HW-optimised maxpool kernel (in library) */
+extern void maxpool2d_j2x2_f32(
+    float* restrict ptr_out,
+    const float* restrict ptr_inp,
+    int inp_height, int inp_width,
+    int out_height, int out_width,
+    int in_pitch_width, int in_pitch_height,
+    int out_pitch_width, int out_pitch_height,
+    unsigned char kernel_height,
+    unsigned char kernel_width);
+
+/* ---------------------------------------------------------------------- */
+/* Helper: fill a float buffer with a constant value (e.g. MIN_FLT32)     */
+/* ---------------------------------------------------------------------- */
+static void fill_buffer_f32(float* buf, float val, int count)
+{
+    for (int i = 0; i < count; i++) {
+        buf[i] = val;
+    }
+}
+
+/* ---------------------------------------------------------------------- */
+/* Helper: swap two pointers                                              */
+/* ---------------------------------------------------------------------- */
+static inline void swap_f32_ptrs(float** a, float** b)
+{
+    float* t = *a;
+    *a = *b;
+    *b = t;
+}
+
+/* ---------------------------------------------------------------------- */
+/* Helper: number of channel planes in channel tile c_idx                 */
+/* ---------------------------------------------------------------------- */
+static inline int maxpool_tile_channels(const maxpool_layer_config_t* config,
+                                        int c_idx)
+{
+    return (c_idx < config->c_tiles - 1) ? config->c_tile_size
+                                         : config->c_tile_size_last;
+}
+
+/* ---------------------------------------------------------------------- */
+/* Helper: output rows produced by height tile h_idx (last may be short)  */
+/* ---------------------------------------------------------------------- */
+static inline int maxpool_tile_out_rows(const maxpool_layer_config_t* config,
+                                        int h_idx)
+{
+    return (h_idx < config->height_tiles - 1)
+               ? config->output_rows
+               : (config->dst_height - config->output_rows * h_idx);
+}
+
+/* ---------------------------------------------------------------------- */
+/* Helper: pre-fill `buf` with MIN_FLT32 and start the 3D input DMA for   */
+/* tile (c_idx, h_idx) on channel 1. Does NOT wait for completion.        */
+/*                                                                        */
+/* For kernel_h > stride_h (e.g. 3x3/s2) consecutive tiles overlap in     */
+/* the source by (kernel_h - stride_h) rows, so the tile stride in source */
+/* space is output_rows * stride_h, NOT input_rows.                       */
+/* ---------------------------------------------------------------------- */
+static void maxpool_start_input_dma(const maxpool_layer_config_t* config,
+                                    float* src, float* buf,
+                                    int c_idx, int h_idx)
+{
+    int c_count   = maxpool_tile_channels(config, c_idx);
+    int c_start   = config->c_tile_size * c_idx;
+    int out_start = config->output_rows * h_idx;
+    int out_rows  = maxpool_tile_out_rows(config, h_idx);
+
+    /* Conceptual (padded) source row range covered by this tile */
+    int in_first = out_start * config->stride_h - config->pad_h;
+    int in_last  = (out_start + out_rows - 1) * config->stride_h
+                 - config->pad_h + config->kernel_h - 1;
+
+    /* Rows above the image stay MIN_FLT32; rows below it are clamped off */
+    int top_pad = (in_first < 0) ? -in_first : 0;
+    int src_row = in_first + top_pad;
+    if (in_last >= config->src_height) {
+        in_last = config->src_height - 1;
+    }
+    int load_rows = in_last - src_row + 1;
+
+    /* DMA destination: skip top_pad padded rows plus the left pad column */
+    int dst_offset = top_pad * config->in_tile_w + config->pad_w;
+
+    /* MIN_FLT32 is the identity for max, so untouched cells act as padding */
+    fill_buffer_f32(buf, MIN_FLT32, c_count * config->in_tile_plane);
+
+    dma_3dm(1,
+        /* src            */ (void*)&src[c_start * config->src_plane_pitch +
+                                        src_row * config->src_width],
+        /* dst            */ (void*)&buf[dst_offset],
+        /* src_row_pitch  */ config->src_width * (int)sizeof(float),
+        /* dst_row_pitch  */ config->in_tile_w * (int)sizeof(float),
+        /* src_tile_pitch */ config->src_plane_pitch * (int)sizeof(float),
+        /* dst_tile_pitch */ config->in_tile_plane * (int)sizeof(float),
+        /* row_sz         */ config->src_width * (int)sizeof(float),
+        /* nrows          */ load_rows,
+        /* ntiles         */ c_count);
+}
+
+/* ====================================================================== */
+/* DMA-tiled executor                                                     */
+/* ====================================================================== */
+XAI_ERR_TYPE maxpool_exec_mxnj2(
+    float* src,
+    float* dst,
+    const maxpool_layer_config_t* config)
+{
+    if (!src || !dst || !config) {
+        return (-1);
+    }
+
+    /* ================================================================== */
+    /* SECTION 1: DRAM Buffer Allocation                                  */
+    /* ================================================================== */
+    int dram0_used = 0;
+    int dram1_used = 0;
+
+    int8_t* raw_in0 = allocate_dram_buffer(config->input_buffer_size,
+                                           config->input_ping_dram,
+                                           &dram0_used, &dram1_used);
+    int8_t* raw_in1 = allocate_dram_buffer(config->input_buffer_size,
+                                           config->input_pong_dram,
+                                           &dram0_used, &dram1_used);
+    int8_t* raw_out0 = allocate_dram_buffer(config->output_buffer_size,
+                                            config->output_ping_dram,
+                                            &dram0_used, &dram1_used);
+    int8_t* raw_out1 = allocate_dram_buffer(config->output_buffer_size,
+                                            config->output_pong_dram,
+                                            &dram0_used, &dram1_used);
+
+    if (!raw_in0 || !raw_in1 || !raw_out0 || !raw_out1) {
+        return (-1);
+    }
+
+    /* Cast to float pointers for kernel calls */
+    float* p_input0  = (float*)raw_in0;
+    float* p_input1  = (float*)raw_in1;
+    float* p_output0 = (float*)raw_out0;
+    float* p_output1 = (float*)raw_out1;
+
+    /* ================================================================== */
+    /* SECTION 2: Initialise DMA engines                                  */
+    /* ================================================================== */
+    dma_3dm_init(1); /* ch1: 3D input prefetch */
+    dma_2dm_init(0); /* ch0: 2D output writeback */
+
+    /* ================================================================== */
+    /* SECTION 3: Load first input tile (serial - nothing to overlap yet) */
+    /* ================================================================== */
+    maxpool_start_input_dma(config, src, p_input0, 0, 0);
+    idma_hw_wait_all(1); /* input ready */
+
+    /* ================================================================== */
+    /* SECTION 4: Tiled Execution Loop (C-tiles x H-tiles)                */
+    /* ================================================================== */
+    for (int idx_c = 0; idx_c < config->c_tiles; idx_c++) {
+        int current_c = maxpool_tile_channels(config, idx_c);
+
+        for (int idx_h = 0; idx_h < config->height_tiles; idx_h++) {
+            int is_last_tile = (idx_c == config->c_tiles - 1) &&
+                               (idx_h == config->height_tiles - 1);
+
+            /* Output rows for this tile (last tile may be shorter) */
+            int cur_out_rows = maxpool_tile_out_rows(config, idx_h);
+            int cur_in_rows  = cur_out_rows * config->stride_h;
+
+            /* ========================================================== */
+            /* Prefetch next input tile into pong buffer (async)          */
+            /* ========================================================== */
+            if (!is_last_tile) {
+                int next_c = idx_c;
+                int next_h = idx_h + 1;
+                if (next_h >= config->height_tiles) {
+                    next_h = 0;
+                    next_c = idx_c + 1;
+                }
+                maxpool_start_input_dma(config, src, p_input1,
+                                        next_c, next_h);
+            }
+
+            /* ========================================================== */
+            /* Execute maxpool on current input tile                      */
+            /* ========================================================== */
+            for (int c = 0; c < current_c; c++) {
+                /*
+                 * Pass the kernel a pointer to the START of the padded
+                 * tile (row 0, col 0 of the buffer). The MIN_FLT32 fill
+                 * provides the padding; adding in_data_offset here would
+                 * skip past it and give wrong results when pad > 0.
+                 */
+                float* in_plane  = &p_input0[c * config->in_tile_plane];
+                float* out_plane = &p_output1[c * config->out_tile_plane];
+
+                maxpool2d_j2x2_f32(
+                    out_plane,
+                    in_plane,
+                    cur_in_rows,            /* inp_height       */
+                    config->src_width,      /* inp_width        */
+                    cur_out_rows,           /* out_height       */
+                    config->dst_width,      /* out_width        */
+                    config->in_tile_w,      /* in_pitch_width   */
+                    config->in_tile_plane,  /* in_pitch_height  */
+                    config->dst_width,      /* out_pitch_width  */
+                    config->out_tile_plane, /* out_pitch_height */
+                    (unsigned char)config->kernel_h,
+                    (unsigned char)config->kernel_w);
+            }
+
+            /* ========================================================== */
+            /* Write output tile back to system memory via 2D DMA         */
+            /* ========================================================== */
+            {
+                int c_start     = config->c_tile_size * idx_c;
+                int h_out_start = config->output_rows * idx_h;
+                int row_bytes   = config->dst_width * cur_out_rows
+                                * (int)sizeof(float);
+
+                /*
+                 * Drain the previous tile's writeback first: after the
+                 * swap below its source buffer becomes the next compute
+                 * target and must not be overwritten while in flight.
+                 */
+                idma_hw_wait_all(0);
+
+                dma_2dm(0,
+                    /* src        */ (void*)p_output1,
+                    /* dst        */ (void*)&dst[c_start *
+                                          config->dst_plane_pitch +
+                                          h_out_start * config->dst_width],
+                    /* src_stride */ config->out_tile_plane *
+                                     (int)sizeof(float),
+                    /* dst_stride */ config->dst_plane_pitch *
+                                     (int)sizeof(float),
+                    /* row_size   */ row_bytes,
+                    /* num_lines  */ (short)current_c);
+            }
+
+            /*
+             * The prefetched tile is consumed right after the swap, so
+             * its DMA must have landed. In a well-tuned pipeline it has
+             * already finished during the compute above.
+             */
+            if (!is_last_tile) {
+                idma_hw_wait_all(1);
+            }
+
+            /* Swap ping-pong buffers */
+            swap_f32_ptrs(&p_output0, &p_output1);
+            swap_f32_ptrs(&p_input0, &p_input1);
+        }
+    }
+
+    /* Wait for last output DMA before returning */
+    idma_hw_wait_all(0);
+
+    return XAI_ERR_OK;
+}
+
+/* ====================================================================== */
+/* Cache-mode fallback (no DMA, data accessed via processor cache)        */
+/* ====================================================================== */
+XAI_ERR_TYPE maxpool_exec_mxnj2_no_dma(
+    float* src,
+    float* dst,
+    const maxpool_layer_config_t* config)
+{
+    if (!src || !dst || !config) {
+        return (-1);
+    }
+
+    int padded_w   = config->src_width + 2 * config->pad_w;
+    int padded_h   = config->src_height + 2 * config->pad_h;
+    int plane_size = padded_w * padded_h;
+    int total_size = plane_size * config->channels;
+
+    /* Use shared padded-input scratch buffer from memory manager */
+    int8_t* raw_buf = get_cache_padded_input();
+    if (!raw_buf ||
+        (size_t)total_size * sizeof(float) >
+            (size_t)get_cache_padded_input_size()) {
+        return (-1); /* buffer missing or too small */
+    }
+    float* padded = (float*)raw_buf;
+
+    /* Fill with MIN_FLT32 (identity for max) */
+    fill_buffer_f32(padded, MIN_FLT32, total_size);
+
+    /* Copy source data into padded buffer at correct offset */
+    int data_off = config->pad_h * padded_w + config->pad_w;
+
+    for (int c = 0; c < config->channels; c++) {
+        for (int h = 0; h < config->src_height; h++) {
+            memcpy(&padded[c * plane_size + data_off + h * padded_w],
+                   &src[c * config->src_plane_pitch + h * config->src_width],
+                   config->src_width * sizeof(float));
+        }
+    }
+
+    /*
+     * Run maxpool per channel plane.
+     * Pass the pointer at the START of the padded buffer (row 0, col 0)
+     * so the kernel's ky/kx loops read through the MIN_FLT32 padding.
+     */
+    for (int c = 0; c < config->channels; c++) {
+        float* in_plane  = &padded[c * plane_size];
+        float* out_plane = &dst[c * config->dst_plane_pitch];
+
+        maxpool2d_j2x2_f32(
+            out_plane,
+            in_plane,
+            config->src_height,
+            config->src_width,
+            config->dst_height,
+            config->dst_width,
+            padded_w,
+            plane_size,
+            config->dst_width,
+            config->dst_plane_pitch,
+            (unsigned char)config->kernel_h,
+            (unsigned char)config->kernel_w);
+    }
+
+    /* Writeback output from cache */
+    xthal_dcache_region_writeback(dst,
+        config->dst_plane_pitch * config->channels * (int)sizeof(float));
+
+    return XAI_ERR_OK;
+}
diff --git a/backends/cadence/vision/operators/maxpool/maxpool_executors.h b/backends/cadence/vision/operators/maxpool/maxpool_executors.h
new file mode 100644
index 00000000000..90c44258bbb
--- /dev/null
+++ b/backends/cadence/vision/operators/maxpool/maxpool_executors.h
@@ -0,0 +1,61 @@
+/*
+ * maxpool_executors.h
+ *
+ * Created on: Apr 21, 2026
+ * Author: Suraj Raut
+ *
+ * Description:
+ *   Function declarations for DMA-tiled maxpool executors.
+ *   Parallels conv/kernel_executors.h for the maxpool operator.
+ */
+
+#ifndef MAXPOOL_EXECUTORS_H_
+#define MAXPOOL_EXECUTORS_H_
+
+#include "../layer_configs.h"
+
+#ifndef XAI_ERR_TYPE
+typedef int XAI_ERR_TYPE;
+#define XAI_ERR_OK 0
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * Execute MxN stride-2 maxpool with DMA tiling.
+ *
+ * Operates on float32 data in NCHW layout (one batch at a time).
+ * Uses ping-pong DMA transfers on local DRAM for overlap of
+ * DMA and computation.
+ *
+ * @param src     System-memory pointer to input  [C x H x W] float32
+ * @param dst     System-memory pointer to output [C x OH x OW] float32
+ * @param config  Pre-computed layer configuration (buffer sizes, tiling, etc.)
+ * @return XAI_ERR_OK on success
+ */
+XAI_ERR_TYPE maxpool_exec_mxnj2(
+    float* src,
+    float* dst,
+    const maxpool_layer_config_t* config);
+
+/**
+ * Execute MxN stride-2 maxpool without DMA (data accessed via processor cache).
+ * Fallback path when DRAM buffers are not available.
+ *
+ * @param src     System-memory pointer to input  [C x H x W] float32
+ * @param dst     System-memory pointer to output [C x OH x OW] float32
+ * @param config  Layer configuration (only dimension fields used)
+ * @return XAI_ERR_OK on success
+ */
+XAI_ERR_TYPE maxpool_exec_mxnj2_no_dma(
+    float* src,
+    float* dst,
+    const maxpool_layer_config_t* config);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* MAXPOOL_EXECUTORS_H_ */
diff --git a/backends/cadence/vision/operators/mean/mean_exec_dma.c b/backends/cadence/vision/operators/mean/mean_exec_dma.c
new file mode 100644
index 00000000000..e732fede559
--- /dev/null
+++ b/backends/cadence/vision/operators/mean/mean_exec_dma.c
@@ -0,0 +1,162 @@
+#include "mean_executors.h"
+#include "memory_manager.h"
+#include "dma.h"
+#include <stdint.h>
+
+/* SIMD mean kernel (in library) */
+extern void simd_mean_pool_2x2_to_1x1_float32(
+    float* restrict output,
+    const float* restrict input,
+    int N);
+
+/* ---------------------------------------------------------------------- */
+/* Helper: swap two float pointers                                        */
+/* ---------------------------------------------------------------------- */
+static inline void swap_ptrs(float** a, float** b)
+{
+    float* t = *a; *a = *b; *b = t;
+}
+
+/* ====================================================================== */
+/* DMA-tiled mean executor with ping-pong                                 */
+/* ====================================================================== */
+XAI_ERR_TYPE mean_exec_dma(
+    const float* src,
+    float* dst,
+    int channels,
+    int spatial_h,
+    int spatial_w)
+{
+    if (!src || !dst || channels < 0) {
+        return (-1);
+    }
+
+    /*
+     * The only kernel invoked below is the 2x2 -> 1x1 one and it is not
+     * told the spatial size, so any other shape cannot be reduced here.
+     */
+    if (spatial_h != 2 || spatial_w != 2) {
+        return (-1);
+    }
+
+    if (channels == 0) {
+        return XAI_ERR_OK; /* nothing to reduce */
+    }
+
+    int spatial = spatial_h * spatial_w; /* 4 for 2x2 */
+
+    /* ================================================================== */
+    /* Compute tiling: how many channels per chunk?                       */
+    /*                                                                    */
+    /* Each DRAM bank holds one ping or pong set:                         */
+    /*   input_chunk  = chunk_ch * spatial * sizeof(float)                */
+    /*   output_chunk = chunk_ch * sizeof(float)                          */
+    /*   total        = chunk_ch * (spatial + 1) * 4                      */
+    /*                                                                    */
+    /* chunk_ch must be a multiple of 16 (SIMD processes 16 ch/iteration). */
+    /* ================================================================== */
+    int bytes_per_ch = (spatial + 1) * (int)sizeof(float);
+    int chunk_ch = IDMA_BUFFER_SIZE_DRAM0 / bytes_per_ch;
+    chunk_ch = (chunk_ch / 16) * 16; /* round down to SIMD multiple */
+
+    if (chunk_ch < 16) {
+        return (-1); /* DRAM too small */
+    }
+
+    /* Cap to actual channel count (round up to multiple of 16 for last tile) */
+    if (chunk_ch > channels) {
+        chunk_ch = ((channels + 15) / 16) * 16;
+    }
+
+    int inp_chunk_bytes = chunk_ch * spatial * (int)sizeof(float);
+
+    /* ================================================================== */
+    /* Buffer allocation: ping in DRAM0, pong in DRAM1                    */
+    /* Each bank: [ input_chunk | output_chunk ]                          */
+    /* ================================================================== */
+    float* inp_ping = (float*)dram0_pool;
+    float* out_ping = (float*)(dram0_pool + inp_chunk_bytes);
+    float* inp_pong = (float*)dram1_pool;
+    float* out_pong = (float*)(dram1_pool + inp_chunk_bytes);
+
+    /* ================================================================== */
+    /* Initialise DMA engines                                             */
+    /* ================================================================== */
+    dma_2dm_init(0); /* ch0: output writeback */
+    dma_2dm_init(1); /* ch1: input prefetch */
+
+    /* ================================================================== */
+    /* Load first input chunk (serial - no overlap possible)              */
+    /* ================================================================== */
+    int ch_done = 0;
+    int cur_ch = (channels > chunk_ch) ? chunk_ch : channels;
+    int cur_inp_bytes = cur_ch * spatial * (int)sizeof(float);
+
+    dma_1dm(1, (void*)&src[ch_done * spatial], (void*)inp_ping, cur_inp_bytes);
+    idma_hw_wait_all(1);
+
+    /* ================================================================== */
+    /* Tiled execution loop with ping-pong                                */
+    /* ================================================================== */
+    float* p_inp_cur = inp_ping;
+    float* p_out_cur = out_ping;
+    float* p_inp_next = inp_pong;
+    float* p_out_next = out_pong;
+
+    while (ch_done < channels) {
+        int this_ch = cur_ch;
+        int next_ch_start = ch_done + this_ch;
+        int have_next = (next_ch_start < channels);
+
+        /* ============================================================== */
+        /* Prefetch next input chunk into pong buffer (async)             */
+        /* ============================================================== */
+        int next_ch = 0;
+        if (have_next) {
+            next_ch = (channels - next_ch_start > chunk_ch)
+                    ? chunk_ch
+                    : channels - next_ch_start;
+            int next_inp_bytes = next_ch * spatial * (int)sizeof(float);
+
+            dma_1dm(1, (void*)&src[next_ch_start * spatial],
+                    (void*)p_inp_next, next_inp_bytes);
+            /* DMA runs in background while we compute below */
+        }
+
+        /* ============================================================== */
+        /* Execute SIMD mean on current chunk                             */
+        /* ============================================================== */
+        simd_mean_pool_2x2_to_1x1_float32(
+            p_out_cur,
+            p_inp_cur,
+            this_ch * spatial);
+
+        /* ============================================================== */
+        /* Write output chunk to system memory (async)                    */
+        /* ============================================================== */
+        int cur_out_bytes = this_ch * (int)sizeof(float);
+        dma_1dm(0, (void*)p_out_cur,
+                (void*)&dst[ch_done], cur_out_bytes);
+
+        /* ============================================================== */
+        /* Wait for input DMA of next tile to finish                      */
+        /* (In a well-tuned pipeline, DMA finishes during compute above)  */
+        /* ============================================================== */
+        if (have_next) {
+            idma_hw_wait_all(1);
+        }
+
+        /* Wait for output DMA before reusing this buffer as next pong */
+        idma_hw_wait_all(0);
+
+        /* Advance */
+        ch_done = next_ch_start;
+        cur_ch = next_ch;
+
+        /* Swap ping-pong: current pong becomes next ping */
+        swap_ptrs(&p_inp_cur, &p_inp_next);
+        swap_ptrs(&p_out_cur, &p_out_next);
+    }
+
+    return XAI_ERR_OK;
+}
diff --git a/backends/cadence/vision/operators/mean/mean_executors.h b/backends/cadence/vision/operators/mean/mean_executors.h
new file mode 100644
index 00000000000..d56b45b4dc7
--- /dev/null
+++ b/backends/cadence/vision/operators/mean/mean_executors.h
@@ -0,0 +1,51 @@
+/*
+ * mean_executors.h
+ *
+ * Created on: Apr 22, 2026
+ * Author: Suraj Raut
+ *
+ * Description:
+ *   Function declarations for DMA-tiled mean (adaptive_avg_pool2d) executors.
+ *   Parallels maxpool/maxpool_executors.h.
+ */
+
+#ifndef MEAN_EXECUTORS_H_
+#define MEAN_EXECUTORS_H_
+
+#ifndef XAI_ERR_TYPE /* NOTE(review): #ifndef only sees a macro; a typedef/enum named XAI_ERR_TYPE from another header is not detected - confirm how libxai declares it */
+typedef int XAI_ERR_TYPE;
+#define XAI_ERR_OK 0
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * Execute mean pooling (adaptive_avg_pool2d) with DMA ping-pong tiling.
+ *
+ * Reduces [C x H x W] float32 to [C] by averaging all spatial elements.
+ * Currently optimized for H=2, W=2 (calls simd_mean_pool_2x2_to_1x1_float32).
+ *
+ * Uses ping-pong DMA: prefetches next input chunk (DMA ch1) while computing on current; results are written back on ch0.
+ * Channel tiles are rounded to 16 for SIMD alignment.
+ *
+ * @param src        System-memory pointer to input [C x H x W] float32
+ * @param dst        System-memory pointer to output [C] float32
+ * @param channels   Number of channels
+ * @param spatial_h  Spatial height (must be 2 for optimized path)
+ * @param spatial_w  Spatial width (must be 2 for optimized path)
+ * @return XAI_ERR_OK on success, -1 if buffers unavailable or DRAM0 cannot hold a 16-channel tile
+ */
+XAI_ERR_TYPE mean_exec_dma(
+    const float* src,
+    float* dst,
+    int channels,
+    int spatial_h,
+    int spatial_w);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* MEAN_EXECUTORS_H_ */
diff --git a/backends/cadence/vision/operators/op_add.cpp b/backends/cadence/vision/operators/op_add.cpp
index 81014143275..8c76378618c 100644
--- a/backends/cadence/vision/operators/op_add.cpp
+++ b/backends/cadence/vision/operators/op_add.cpp
@@ -6,67 +6,327 @@
  * LICENSE file in the root directory of this source tree.
*/ +#include #include #include +#include +#include #include #include using executorch::aten::Scalar; using executorch::aten::ScalarType; using executorch::aten::Tensor; -using executorch::runtime::canCast; +using executorch::runtime::can_cast; using executorch::runtime::KernelRuntimeContext; -using executorch::runtime::promoteTypes; -using torch::executor::apply_binary_elementwise_fn; using torch::executor::Error; -using torch::executor::native::utils::extract_scalar; namespace impl { namespace vision { namespace native { +// Forward declaration of hardware-optimized vector addition function +extern "C" void rvaddf( + float32_t* restrict z, + const float32_t* restrict x, + const float32_t* restrict y, + int N); + Tensor& add_out( KernelRuntimeContext& ctx, const Tensor& a, const Tensor& b, const Scalar& alpha, Tensor& out) { - (void)ctx; - - using namespace torch::executor::native::utils; - - ScalarType a_type = a.scalar_type(); - ScalarType b_type = b.scalar_type(); - ScalarType common_type = promoteTypes(a_type, b_type); - ScalarType out_type = out.scalar_type(); - - ET_CHECK_MSG(a_type == ScalarType::Float, "Input tensor not a float.\n"); - ET_CHECK_MSG(b_type == ScalarType::Float, "Input tensor not a float.\n"); - ET_CHECK_MSG(out_type == ScalarType::Float, "Output tensor not a float.\n"); - - ET_CHECK(canCast(common_type, out_type)); - - using CTYPE_A = float; - using CTYPE_B = float; - using CTYPE_IN = float; - using CTYPE_OUT = float; - CTYPE_IN alpha_val; - ET_CHECK_MSG( - extract_scalar(alpha, &alpha_val), - "Could not be extracted: wrong type or out of range"); - - apply_binary_elementwise_fn( - [alpha_val](const CTYPE_A val_a, const CTYPE_B val_b) { - CTYPE_IN a_casted = static_cast(val_a); - CTYPE_IN b_casted = static_cast(val_b); - CTYPE_IN value = a_casted + alpha_val * b_casted; - - return static_cast(value); - }, - a, - b, - out); + + // Check if we can use optimized path: same shape, float32, alpha=1.0 + bool same_shape = 
executorch::runtime::tensors_have_same_shape(a, b) && + executorch::runtime::tensors_have_same_shape(a, out); + bool is_float = (a.scalar_type() == ScalarType::Float) && + (b.scalar_type() == ScalarType::Float) && + (out.scalar_type() == ScalarType::Float); + + // Extract alpha value to check if it's 1.0 + float alpha_val = 1.0f; + bool alpha_is_one = false; + if (is_float && torch::executor::native::utils::extract_scalar(alpha, &alpha_val)) { + alpha_is_one = (alpha_val == 1.0f); + } + + size_t numel = out.numel(); + + // Use optimized path if: float32, same shape, alpha=1.0, sufficient size, aligned + // Require numel to be even (2 floats = 8 bytes) for 8-byte aligned DMA + bool use_optimized = same_shape && is_float && alpha_is_one && + (numel >= 8) && ((numel % 2) == 0); + + if (use_optimized) { + + const float* a_data = a.const_data_ptr(); + const float* b_data = b.const_data_ptr(); + float* out_data = out.mutable_data_ptr(); + + // Check if source data is 8-byte aligned (required for DMA) + bool src_aligned = (((uintptr_t)a_data & 0x7) == 0) && + (((uintptr_t)b_data & 0x7) == 0) && + (((uintptr_t)out_data & 0x7) == 0); + + // DMA setup for two inputs + one output + bool ping_pong_process = false; + bool ping_process_pong = false; + size_t chunk_size = 0; + + float32_t* inp_a_buff[2]; + float32_t* inp_b_buff[2]; + float32_t* out_buff[2]; + + // Check if DRAM buffers are available + bool dram0_available = (ptr_dram0 != nullptr) && (IDMA_BUFFER_SIZE_DRAM0 > 0); + bool dram1_available = (ptr_dram1 != nullptr) && (IDMA_BUFFER_SIZE_DRAM1 > 0); + + // DMA threshold - beneficial for larger tensors + const size_t DMA_THRESHOLD = 1024; + bool use_dma = (numel >= DMA_THRESHOLD) && src_aligned; + + // Strategy 1: Ping-pong processing (2 sets of buffers) + // Need to fit: 2 inputs + 1 output per buffer (3 float32 arrays total) + // Split: 33% input_a, 33% input_b, 33% output per DRAM + if (use_dma && dram0_available && dram1_available && (numel >= 2)) { + // Try 128-byte 
alignment first (optimal for rvaddf SIMD) + size_t per_array_128 = (IDMA_BUFFER_SIZE_DRAM0 / 3) & ~0x7F; // 128-byte alignment + size_t chunk_elements_128 = per_array_128 / FLT32_SIZE; + + // If 128-byte alignment gives us 0 chunks, try 8-byte alignment (minimum for float32) + size_t per_array = per_array_128; + size_t chunk_elements = chunk_elements_128; + + if (chunk_elements == 0) { + per_array = (IDMA_BUFFER_SIZE_DRAM0 / 3) & ~0x7; // Fallback to 8-byte alignment + chunk_elements = per_array / FLT32_SIZE; + } + + if (chunk_elements == 0) { + // Verify all buffers are 8-byte aligned + if (((uintptr_t)ptr_dram0 & 0x7) != 0 || ((uintptr_t)ptr_dram1 & 0x7) != 0) { + // Buffer base addresses not aligned, fall back to non-DMA + use_dma = false; + } + } else { + // DRAM0: input_a[0] | input_b[0] | output[0] (all 128-byte aligned) + inp_a_buff[0] = (float32_t*)ptr_dram0; + inp_b_buff[0] = (float32_t*)((uint8_t*)ptr_dram0 + per_array); + out_buff[0] = (float32_t*)((uint8_t*)ptr_dram0 + 2 * per_array); + + // DRAM1: input_a[1] | input_b[1] | output[1] (all 8-byte aligned) + inp_a_buff[1] = (float32_t*)ptr_dram1; + inp_b_buff[1] = (float32_t*)((uint8_t*)ptr_dram1 + per_array); + out_buff[1] = (float32_t*)((uint8_t*)ptr_dram1 + 2 * per_array); + + chunk_size = chunk_elements; + ping_pong_process = true; + } + } + + // Strategy 2: Ping-process-pong (1 set of buffers) + // Use DRAM0 entirely for inputs (50% a, 50% b), DRAM1 for output + if (use_dma && !ping_pong_process && dram0_available && dram1_available) { + size_t inp_per_array = (IDMA_BUFFER_SIZE_DRAM0 / 2) & ~0x7; // Round down to 8-byte boundary + size_t inp_capacity = inp_per_array / FLT32_SIZE; + size_t out_capacity = IDMA_BUFFER_SIZE_DRAM1 / FLT32_SIZE; + + if ((inp_capacity > 0) && (out_capacity >= inp_capacity)) { + inp_a_buff[0] = (float32_t*)ptr_dram0; + inp_b_buff[0] = (float32_t*)((uint8_t*)ptr_dram0 + inp_per_array); + out_buff[0] = (float32_t*)ptr_dram1; + + chunk_size = (inp_capacity < out_capacity) ? 
inp_capacity : out_capacity; + ping_process_pong = true; + } + } + + if (ping_pong_process || ping_process_pong) { + // Writeback inputs from cache to system memory before DMA reads + xthal_dcache_region_writeback((void*)a_data, FLT32_SIZE * numel); + xthal_dcache_region_writeback((void*)b_data, FLT32_SIZE * numel); + + /* Initialize DMA Channel 0 (loads) and Channel 1 (stores) */ + dma_2dm_init(0); + dma_2dm_init(1); + + if (ping_pong_process) { + // Ping-pong processing for better throughput + size_t num_chunks = (numel + chunk_size - 1) / chunk_size; + if (num_chunks == 0) num_chunks = 1; + + int32_t pp_swap = 0; + + const float* ptr_a = a_data; + const float* ptr_b = b_data; + float* ptr_out = out_data; + + // Load first chunk (both inputs) into buffer 0 via ch0 + size_t current_chunk = (numel < chunk_size) ? numel : chunk_size; + + dma_1dm(0, (void*)ptr_a, inp_a_buff[pp_swap], FLT32_SIZE * current_chunk); + dma_1dm(0, (void*)ptr_b, inp_b_buff[pp_swap], FLT32_SIZE * current_chunk); + + size_t remaining = numel - current_chunk; + ptr_a += current_chunk; + ptr_b += current_chunk; + + // Pipeline: load (ch0) and store (ch1) overlap with processing + for (size_t i = 0; i < num_chunks - 1; i++) { + size_t next_chunk = (remaining < chunk_size) ? 
remaining : chunk_size; + + // Wait for current loads to complete + idma_hw_wait_all(0); + + // Start loading next chunk into alternate buffer via ch0 + dma_1dm(0, (void*)ptr_a, inp_a_buff[pp_swap ^ 1], FLT32_SIZE * next_chunk); + dma_1dm(0, (void*)ptr_b, inp_b_buff[pp_swap ^ 1], FLT32_SIZE * next_chunk); + + // Process current buffer (ch0 loads next in parallel) + rvaddf(out_buff[pp_swap], inp_a_buff[pp_swap], inp_b_buff[pp_swap], (int)current_chunk); + + // Wait for previous store to complete before reusing out_buff + idma_hw_wait_all(1); + + // Store result via ch1 + dma_1dm(1, out_buff[pp_swap], (void*)ptr_out, FLT32_SIZE * current_chunk); + + ptr_a += next_chunk; + ptr_b += next_chunk; + ptr_out += current_chunk; + remaining -= next_chunk; + current_chunk = next_chunk; + pp_swap ^= 1; + } + + // Process last chunk + idma_hw_wait_all(0); + rvaddf(out_buff[pp_swap], inp_a_buff[pp_swap], inp_b_buff[pp_swap], (int)current_chunk); + + idma_hw_wait_all(1); + dma_1dm(1, out_buff[pp_swap], (void*)ptr_out, FLT32_SIZE * current_chunk); + idma_hw_wait_all(1); + + // Invalidate output cache: DMA wrote to system memory, cache may have stale data + xthal_dcache_region_invalidate(out_data, FLT32_SIZE * numel); + + } + else if (ping_process_pong) { + // Sequential processing + size_t remaining = numel; + const float* ptr_a = a_data; + const float* ptr_b = b_data; + float* ptr_out = out_data; + + while (remaining > 0) { + size_t current_chunk = (remaining < chunk_size) ? 
remaining : chunk_size; + + // Load both input chunks via ch0 (overlaps with any pending ch1 store) + dma_1dm(0, (void*)ptr_a, inp_a_buff[0], FLT32_SIZE * current_chunk); + dma_1dm(0, (void*)ptr_b, inp_b_buff[0], FLT32_SIZE * current_chunk); + // Wait for previous store to complete + idma_hw_wait_all(1); + // Wait for loads to complete + idma_hw_wait_all(0); + + // Process: out = a + b + rvaddf(out_buff[0], inp_a_buff[0], inp_b_buff[0], (int)current_chunk); + + // Store result via ch1 + dma_1dm(1, out_buff[0], (void*)ptr_out, FLT32_SIZE * current_chunk); + + ptr_a += current_chunk; + ptr_b += current_chunk; + ptr_out += current_chunk; + remaining -= current_chunk; + } + idma_hw_wait_all(1); + + // Invalidate output cache: DMA wrote to system memory, cache may have stale data + xthal_dcache_region_invalidate(out_data, FLT32_SIZE * numel); + + } + } else { + // Fallback: use hardware-optimized vector addition directly without DMA + // Writeback+invalidate inputs: ensures CPU-dirty data reaches system memory, + // then invalidate forces re-read from system memory (fresh data) + xthal_dcache_region_writeback((void*)a_data, FLT32_SIZE * numel); + xthal_dcache_region_invalidate((void*)a_data, FLT32_SIZE * numel); + xthal_dcache_region_writeback((void*)b_data, FLT32_SIZE * numel); + xthal_dcache_region_invalidate((void*)b_data, FLT32_SIZE * numel); + rvaddf(out_data, a_data, b_data, (int)numel); + + // Writeback output from cache to system memory for DMA coherency + xthal_dcache_region_writeback(out_data, FLT32_SIZE * numel); + + } + } else { + // Fallback: Use full generic portable implementation + // This handles: broadcasting, non-float dtypes, alpha!=1.0, small tensors, all corner cases + + + namespace utils = torch::executor::native::utils; + using torch::executor::check_alpha_type; + using torch::executor::promoteTypes; + using torch::executor::canCast; + using torch::executor::resize_to_broadcast_target_size; + using torch::executor::tensors_have_same_dim_order; + 
using torch::executor::Error; + + // Common Dtype + ScalarType common_type = promoteTypes(a.scalar_type(), b.scalar_type()); + + // Check Common Dtype + ET_KERNEL_CHECK( + ctx, + (canCast(common_type, out.scalar_type()) && + check_alpha_type(utils::get_scalar_dtype(alpha), common_type)), + InvalidArgument, + out); + + // Check Dim Order + ET_KERNEL_CHECK( + ctx, + tensors_have_same_dim_order(a, b, out), + InvalidArgument, + out); + + // Resize + ET_KERNEL_CHECK( + ctx, + resize_to_broadcast_target_size(a, b, out) == Error::Ok, + InvalidArgument, + out); + + // Compute Dtype + ScalarType compute_type = utils::get_compute_type(common_type); + + // @lint-ignore CLANGTIDY facebook-hte-CArray + static constexpr const char op_name[] = "add.out"; + ET_SWITCH_REALB_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { + CTYPE_COMPUTE val_alpha; + ET_KERNEL_CHECK( + ctx, utils::extract_scalar(alpha, &val_alpha), InvalidArgument, ); + utils::apply_bitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::REALHBBF16>( + [val_alpha](const auto& val_a, const auto& val_b) { + return val_a + val_alpha * val_b; + }, + ctx, + a, + utils::SupportedTensorDtypes::REALHBBF16, + b, + utils::SupportedTensorDtypes::REALHBBF16, + out); + }); + + } return out; } diff --git a/backends/cadence/vision/operators/op_dequantize_per_tensor.cpp b/backends/cadence/vision/operators/op_dequantize_per_tensor.cpp index daffecda1bf..41aeb5c20d6 100644 --- a/backends/cadence/vision/operators/op_dequantize_per_tensor.cpp +++ b/backends/cadence/vision/operators/op_dequantize_per_tensor.cpp @@ -6,18 +6,28 @@ * LICENSE file in the root directory of this source tree. 
*/ -#include +#include +#include #include +using ::executorch::aten::ScalarType; +using ::executorch::aten::Tensor; +using ::executorch::runtime::KernelRuntimeContext; +using ::impl::generic::kernels::dequantize; + namespace impl { namespace vision { namespace native { -using executorch::aten::ScalarType; -using executorch::aten::Tensor; -using executorch::runtime::KernelRuntimeContext; +// Forward declaration of hardware-optimized dequantize function +extern "C" void dequantize_asym8s_f32( + float32_t* restrict ptr_out, + const int8_t* restrict ptr_inp, + float32_t scale, + int zero_bias, + int N); -void dequantize_per_tensor_out( +Tensor& dequantize_per_tensor_out( KernelRuntimeContext& context, const Tensor& input, double scale, @@ -31,33 +41,284 @@ void dequantize_per_tensor_out( if (input.scalar_type() == ScalarType::Byte) { const uint8_t* input_data = input.const_data_ptr(); - kernels::dequantize( + dequantize( out_data, input_data, scale, zero_point, numel); } else if (input.scalar_type() == ScalarType::Char) { const int8_t* input_data = input.const_data_ptr(); - kernels::dequantize(out_data, input_data, scale, zero_point, numel); + + // Hardware-optimized int8 dequantization with DMA support + bool ping_pong_process = false; + bool ping_process_pong = false; + size_t chunk_size = 0; + + int8_t* inp_buff[2]; + float32_t* out_buff[2]; + + // Check if DRAM buffers are available + bool dram0_available = (ptr_dram0 != nullptr) && (IDMA_BUFFER_SIZE_DRAM0 > 0); + bool dram1_available = (ptr_dram1 != nullptr) && (IDMA_BUFFER_SIZE_DRAM1 > 0); + + // DMA has overhead - only beneficial for larger tensors + // Threshold: 1024 elements (~1KB for int8, ~4KB for float32) + const size_t DMA_THRESHOLD = 1024; + bool use_dma = (numel >= DMA_THRESHOLD); + + // Strategy 1: Try ping-pong processing (2 input + 2 output buffers) + // Using 20/80 split: 20% for int8 input, 80% for float32 output in each DRAM + if (use_dma && dram0_available && dram1_available && (numel >= 2)) { + 
size_t inp_per_buffer = (IDMA_BUFFER_SIZE_DRAM0 * 1) / 5; // 20% for int8 input (in bytes) + size_t out_per_buffer = (IDMA_BUFFER_SIZE_DRAM0 * 4) / (5 * FLT32_SIZE); // 80% for float32 output + + // Check if 20/80 split fits in both DRAMs + if ((inp_per_buffer > 0) && + (out_per_buffer >= inp_per_buffer) && + ((IDMA_BUFFER_SIZE_DRAM0 * 1) / 5 + (IDMA_BUFFER_SIZE_DRAM0 * 4) / 5 <= IDMA_BUFFER_SIZE_DRAM0) && + ((IDMA_BUFFER_SIZE_DRAM1 * 1) / 5 + (IDMA_BUFFER_SIZE_DRAM1 * 4) / 5 <= IDMA_BUFFER_SIZE_DRAM1)) { + + // Allocate buffers with 20/80 split + inp_buff[0] = (int8_t*)ptr_dram0; + out_buff[0] = (float32_t*)((uint8_t*)ptr_dram0 + (IDMA_BUFFER_SIZE_DRAM0 * 1) / 5); + + inp_buff[1] = (int8_t*)ptr_dram1; + out_buff[1] = (float32_t*)((uint8_t*)ptr_dram1 + (IDMA_BUFFER_SIZE_DRAM1 * 1) / 5); + + chunk_size = inp_per_buffer; + ping_pong_process = true; + } + } + + // Strategy 2: Fallback to ping-process-pong (1 input + 1 output buffer) + // Use full DRAM0 for input, full DRAM1 for output (no split needed) + if (use_dma && !ping_pong_process && dram0_available && dram1_available) { + size_t inp_capacity = IDMA_BUFFER_SIZE_DRAM0; // Full DRAM0 for int8 input (in bytes) + size_t out_capacity = IDMA_BUFFER_SIZE_DRAM1 / FLT32_SIZE; // Full DRAM1 for float32 output + + if ((inp_capacity > 0) && (out_capacity >= inp_capacity)) { + inp_buff[0] = (int8_t*)ptr_dram0; + out_buff[0] = (float32_t*)ptr_dram1; + + chunk_size = (inp_capacity < out_capacity) ? 
inp_capacity : out_capacity; + ping_process_pong = true; + } + } + + if (ping_pong_process || ping_process_pong) { + const int8_t* ptr_inp = input_data; + + // Writeback input from cache to system memory before DMA reads + xthal_dcache_region_writeback((void*)input_data, sizeof(int8_t) * numel); + + /* Initialize DMA Channel 0 (loads) and Channel 1 (stores) */ + dma_2dm_init(0); + dma_2dm_init(1); + + if (ping_pong_process) { + // Ping-pong processing for better throughput + size_t num_chunks = (numel + chunk_size - 1) / chunk_size; + + if (num_chunks == 0) num_chunks = 1; + + int32_t pp_swap = 0; + + int8_t* ptr_in = (int8_t*)ptr_inp; + float32_t* ptr_out = out_data; + + // Load first chunk via ch0 + size_t current_chunk = (numel < chunk_size) ? numel : chunk_size; + + dma_1dm(0, ptr_in, inp_buff[pp_swap], sizeof(int8_t) * current_chunk); + + size_t remaining = numel - current_chunk; + ptr_in += current_chunk; + + // Pipeline: load (ch0) and store (ch1) overlap with processing + for (size_t i = 0; i < (num_chunks - 1); i++) { + size_t next_chunk = (remaining < chunk_size) ? 
remaining : chunk_size; + + // Wait for current load to complete + idma_hw_wait_all(0); + + // Start loading next chunk into alternate buffer via ch0 + dma_1dm(0, ptr_in, inp_buff[pp_swap ^ 1], sizeof(int8_t) * next_chunk); + + // Process current chunk (ch0 loads next in parallel) + dequantize_asym8s_f32(out_buff[pp_swap], inp_buff[pp_swap], (float)scale, (int)zero_point, (int)current_chunk); + + // Wait for previous store to complete before reusing out_buff + idma_hw_wait_all(1); + + // Store result via ch1 + dma_1dm(1, out_buff[pp_swap], ptr_out, FLT32_SIZE * current_chunk); + + ptr_in += next_chunk; + ptr_out += current_chunk; + remaining -= next_chunk; + current_chunk = next_chunk; + pp_swap ^= 1; + } + + // Process last chunk + idma_hw_wait_all(0); + dequantize_asym8s_f32(out_buff[pp_swap], inp_buff[pp_swap], (float)scale, (int)zero_point, (int)current_chunk); + + idma_hw_wait_all(1); + dma_1dm(1, out_buff[pp_swap], ptr_out, FLT32_SIZE * current_chunk); + idma_hw_wait_all(1); + + // Invalidate output cache: DMA wrote to system memory, cache may have stale data + xthal_dcache_region_invalidate(out_data, FLT32_SIZE * numel); + + } + else if (ping_process_pong) { + // Simple sequential processing + size_t remaining = numel; + int8_t* ptr_in = (int8_t*)ptr_inp; + float32_t* ptr_out = out_data; + + while (remaining > 0) { + size_t current_chunk = (remaining < chunk_size) ? 
remaining : chunk_size; + + // Start load via ch0 (overlaps with any pending ch1 store) + dma_1dm(0, ptr_in, inp_buff[0], sizeof(int8_t) * current_chunk); + // Wait for previous store to complete (out_buff[0] safe to write) + idma_hw_wait_all(1); + // Wait for load to complete + idma_hw_wait_all(0); + + // Process + dequantize_asym8s_f32(out_buff[0], inp_buff[0], (float)scale, (int)zero_point, (int)current_chunk); + + // Store via ch1 + dma_1dm(1, out_buff[0], ptr_out, FLT32_SIZE * current_chunk); + + ptr_in += current_chunk; + ptr_out += current_chunk; + remaining -= current_chunk; + } + idma_hw_wait_all(1); + + // Invalidate output cache: DMA wrote to system memory, cache may have stale data + xthal_dcache_region_invalidate(out_data, FLT32_SIZE * numel); + + } + + } else { + // No DMA: use hardware function on full tensor at once + // Writeback+invalidate input: ensures CPU-dirty data reaches system memory, + // then invalidate forces re-read from system memory (fresh data) + xthal_dcache_region_writeback((void*)input_data, sizeof(int8_t) * numel); + xthal_dcache_region_invalidate((void*)input_data, sizeof(int8_t) * numel); + dequantize_asym8s_f32(out_data, input_data, (float)scale, (int)zero_point, (int)numel); + + // Writeback output from cache to system memory for DMA coherency + xthal_dcache_region_writeback(out_data, sizeof(float) * numel); + + } } else if ( input.scalar_type() == ScalarType::Bits16 || input.scalar_type() == ScalarType::UInt16) { const uint16_t* input_data = input.const_data_ptr(); - kernels::dequantize( - out_data, input_data, scale, zero_point, numel); + dequantize(out_data, input_data, scale, zero_point, numel); } else if (input.scalar_type() == ScalarType::Short) { const int16_t* input_data = input.const_data_ptr(); - kernels::dequantize( - out_data, input_data, scale, zero_point, numel); + dequantize(out_data, input_data, scale, zero_point, numel); } else if (input.scalar_type() == ScalarType::Int) { const int32_t* input_data = 
input.const_data_ptr(); - kernels::dequantize( - out_data, input_data, scale, zero_point, numel); + dequantize(out_data, input_data, scale, zero_point, numel); } else { ET_CHECK_MSG( false, "Unhandled input dtype %hhd", static_cast(input.scalar_type())); } + return out; +} + +// int8 dequantization - uses generic template +Tensor& dequantize_per_tensor_asym8s_out( + KernelRuntimeContext& context, + const Tensor& input, + double scale, + int64_t zero_point, + int64_t quant_min, + int64_t quant_max, + ScalarType dtype, + Tensor& out) { + float* out_data = out.mutable_data_ptr(); + size_t numel = out.numel(); + const int8_t* input_data = input.const_data_ptr(); + dequantize(out_data, input_data, scale, zero_point, numel); + return out; +} + +// uint8 dequantization - uses generic template +Tensor& dequantize_per_tensor_asym8u_out( + KernelRuntimeContext& context, + const Tensor& input, + double scale, + int64_t zero_point, + int64_t quant_min, + int64_t quant_max, + ScalarType dtype, + Tensor& out) { + float* out_data = out.mutable_data_ptr(); + size_t numel = out.numel(); + const uint8_t* input_data = input.const_data_ptr(); + dequantize(out_data, input_data, scale, zero_point, numel); + return out; +} + +// int16 dequantization - uses generic template +Tensor& dequantize_per_tensor_asym16s_out( + KernelRuntimeContext& context, + const Tensor& input, + double scale, + int64_t zero_point, + int64_t quant_min, + int64_t quant_max, + ScalarType dtype, + Tensor& out) { + float* out_data = out.mutable_data_ptr(); + size_t numel = out.numel(); + const int16_t* input_data = input.const_data_ptr(); + dequantize(out_data, input_data, scale, zero_point, numel); + return out; +} + +// uint16 dequantization - uses generic template +Tensor& dequantize_per_tensor_asym16u_out( + KernelRuntimeContext& context, + const Tensor& input, + double scale, + int64_t zero_point, + int64_t quant_min, + int64_t quant_max, + ScalarType dtype, + Tensor& out) { + float* out_data = 
out.mutable_data_ptr(); + size_t numel = out.numel(); + const uint16_t* input_data = input.const_data_ptr(); + dequantize(out_data, input_data, scale, zero_point, numel); + return out; +} + +// int32 dequantization - uses generic template +Tensor& dequantize_per_tensor_asym32s_out( + KernelRuntimeContext& context, + const Tensor& input, + double scale, + int64_t zero_point, + int64_t quant_min, + int64_t quant_max, + ScalarType dtype, + Tensor& out) { + float* out_data = out.mutable_data_ptr(); + size_t numel = out.numel(); + const int32_t* input_data = input.const_data_ptr(); + dequantize(out_data, input_data, scale, zero_point, numel); + return out; } -}; // namespace native -}; // namespace vision -}; // namespace impl +} // namespace native +} // namespace vision +} // namespace impl diff --git a/backends/cadence/vision/operators/op_max_pool2d_with_indices.cpp b/backends/cadence/vision/operators/op_max_pool2d_with_indices.cpp new file mode 100644 index 00000000000..e1e468e1e33 --- /dev/null +++ b/backends/cadence/vision/operators/op_max_pool2d_with_indices.cpp @@ -0,0 +1,165 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include +#include +#include + +#include +#include +#include + +/* DMA-tiled and no-DMA maxpool executors (defined in maxpool_exec_mxnj2.c) */ +extern "C" { +typedef int XAI_ERR_TYPE; +XAI_ERR_TYPE maxpool_exec_mxnj2( + float* src, float* dst, const maxpool_layer_config_t* config); +XAI_ERR_TYPE maxpool_exec_mxnj2_no_dma( + float* src, float* dst, const maxpool_layer_config_t* config); +} + +using executorch::aten::Tensor; +using executorch::aten::ScalarType; +using executorch::aten::IntArrayRef; +using executorch::runtime::KernelRuntimeContext; +using torch::executor::Error; + +namespace impl { +namespace vision { +namespace native { + +std::tuple max_pool2d_with_indices_out( + KernelRuntimeContext& ctx, + const Tensor& in, + IntArrayRef kernel_size, + IntArrayRef stride, + IntArrayRef padding, + IntArrayRef dilation, + bool ceil_mode, + Tensor& out, + Tensor& indices) { + + std::tuple ret_val(out, indices); + + ET_KERNEL_CHECK( + ctx, + torch::executor::check_max_pool2d_with_indices_args( + in, kernel_size, stride, padding, dilation, ceil_mode, out, indices), + InvalidArgument, + ret_val); + + size_t output_ndim = 0; + executorch::aten::SizesType output_sizes[executorch::runtime::kTensorDimensionLimit]; + torch::executor::get_max_pool2d_with_indices_out_target_size( + in, + kernel_size, + stride, + padding, + dilation, + ceil_mode, + output_sizes, + &output_ndim); + + ET_KERNEL_CHECK( + ctx, + torch::executor::output_size_is_valid({output_sizes, output_ndim}, 2), + InvalidArgument, + ret_val); + + ET_KERNEL_CHECK( + ctx, + executorch::runtime::resize_tensor(out, {output_sizes, output_ndim}) == Error::Ok, + InvalidArgument, + ret_val); + + ET_KERNEL_CHECK( + ctx, + executorch::runtime::resize_tensor(indices, {output_sizes, output_ndim}) == Error::Ok, + InvalidArgument, + ret_val); + + // ── HW-optimized path: stride == 2 ────────────────────────────── + if (stride[0] == 2 && stride[1] == 2) { + float32_t *ptr_out = (float32_t *) out.const_data_ptr(); + 
const float32_t *ptr_inp = (float32_t *) in.const_data_ptr(); + int batch = in.size(0); + int channels = in.size(1); + int inp_height = in.size(2); int inp_width = in.size(3); + int out_height = out.size(2); int out_width = out.size(3); + uint8_t kernel_height = kernel_size[0]; + uint8_t kernel_width = kernel_size[1]; + + // Writeback input from cache to system memory: previous op may have written + // via CPU/cache, and maxpool's DMA kernel reads from system memory. + xthal_dcache_region_writeback((void*)ptr_inp, sizeof(float) * in.numel()); + + // Look up pre-computed config for this layer + const maxpool_layer_config_t* mp_cfg = get_maxpool_config_by_params( + channels, inp_height, inp_width, + kernel_height, kernel_width, + stride[0], stride[1], + padding[0], padding[1]); + + if (mp_cfg != NULL) { + // Check if DRAM buffers are available for DMA tiling + bool dram_available = (ptr_dram0 != nullptr) && (IDMA_BUFFER_SIZE_DRAM0 > 0) + && (ptr_dram1 != nullptr) && (IDMA_BUFFER_SIZE_DRAM1 > 0); + + for (int b = 0; b < batch; b++) { + float* batch_inp = (float*)ptr_inp + b * channels * inp_height * inp_width; + float* batch_out = (float*)ptr_out + b * channels * out_height * out_width; + + XAI_ERR_TYPE status; + if (dram_available) { + status = maxpool_exec_mxnj2(batch_inp, batch_out, mp_cfg); + } else { + status = maxpool_exec_mxnj2_no_dma(batch_inp, batch_out, mp_cfg); + } + ET_KERNEL_CHECK(ctx, status == 0, InvalidArgument, ret_val); + } + + // Invalidate output cache: executor wrote to system memory + xthal_dcache_region_invalidate(ptr_out, sizeof(float) * out.numel()); + + return ret_val; + } + } + + // ── Generic fallback: stride != 2 or no config found ────────────── + ScalarType in_type = in.scalar_type(); + ET_SWITCH_REALHBF16_TYPES( + in_type, ctx, "max_pool2d_with_indices.out", CTYPE, [&]() { + torch::executor::apply_kernel_2d_reduce_then_map_fn( + [](const CTYPE in_val, + const int64_t in_idx, + const CTYPE accum, + const int64_t accum_idx) { + if (in_val 
> accum) { + return std::tuple(in_val, in_idx); + } + return std::tuple(accum, accum_idx); + }, + // Max pooling does not need to post-process the accumulated output + [](const int64_t count, const CTYPE accum) { return accum; }, + /*include_pad=*/false, + in, + kernel_size, + stride, + padding, + dilation, + out, + {indices}); + }); + + + return ret_val; +} + +} // namespace native +} // namespace vision +} // namespace impl diff --git a/backends/cadence/vision/operators/op_mean.cpp b/backends/cadence/vision/operators/op_mean.cpp new file mode 100644 index 00000000000..13f946109ba --- /dev/null +++ b/backends/cadence/vision/operators/op_mean.cpp @@ -0,0 +1,183 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include +#include +#include + +/* DMA-tiled mean executor (defined in mean/mean_exec_dma.c) */ +extern "C" { +typedef int XAI_ERR_TYPE; +XAI_ERR_TYPE mean_exec_dma( + const float* src, float* dst, + int channels, int spatial_h, int spatial_w); +} + +using executorch::aten::RuntimeContext; +using executorch::aten::ScalarType; +using executorch::aten::Tensor; +using executorch::runtime::ArrayRef; +using executorch::runtime::KernelRuntimeContext; +using torch::executor::Error; +using torch::executor::optional; + +namespace impl { +namespace vision { +namespace native { + +// Forward declaration of hardware-optimized mean function +extern "C" void simd_mean_pool_2x2_to_1x1_float32( + float32_t* restrict output, + const float32_t* restrict input, + int N); + +Tensor& mean_out( + RuntimeContext& ctx, + const Tensor& in, + optional> dim_list, + bool keepdim, + optional dtype, + Tensor& out) { + ET_KERNEL_CHECK( + ctx, + torch::executor::check_mean_dim_args(in, dim_list, keepdim, dtype, out), + InvalidArgument, + out); + + ET_KERNEL_CHECK( + ctx, + 
torch::executor::resize_reduction_out(in, dim_list, keepdim, out) == + Error::Ok, + InvalidArgument, + out); + + constexpr auto name = "mean.out"; + + // Check if we can use hardware-optimized path + // Requires: float32, specific reduction pattern (2x2 spatial to 1x1) + bool optimized = false; + + if (in.scalar_type() == ScalarType::Float && + out.scalar_type() == ScalarType::Float && + dim_list.has_value()) { + + auto dims = dim_list.value(); + int num_inp_dims = in.dim(); + + // Check for 4D tensor with reduction on last 2 dimensions (H, W) + // Input: [N, C, H, W], reduce [H, W] -> [N, C, 1, 1] + if (num_inp_dims == 4 && dims.size() == 2) { + // Normalize negative dimensions + int64_t dim0 = dims[0] < 0 ? dims[0] + num_inp_dims : dims[0]; + int64_t dim1 = dims[1] < 0 ? dims[1] + num_inp_dims : dims[1]; + + // Check if reducing dimensions 2 and 3 (H and W in NCHW format) + if ((dim0 == 2 && dim1 == 3) || (dim0 == 3 && dim1 == 2)) { + // Check if spatial dimensions are 2x2 + if (in.size(2) == 2 && in.size(3) == 2) { + optimized = true; + } + } + } + } + + if (optimized) { + + const float* input_data = in.const_data_ptr(); + float* out_data = out.mutable_data_ptr(); + + int batch_size = in.size(0); + int channels = in.size(1); + int total_channels = batch_size * channels; + + // Check if DRAM buffers are available for DMA + bool dram0_available = (ptr_dram0 != nullptr) && (IDMA_BUFFER_SIZE_DRAM0 > 0); + bool dram1_available = (ptr_dram1 != nullptr) && (IDMA_BUFFER_SIZE_DRAM1 > 0); + + size_t inp_bytes = total_channels * 4 * FLT32_SIZE; // 4 floats per channel + size_t out_bytes = total_channels * FLT32_SIZE; // 1 float per channel + + // Use DMA ping-pong tiled executor when both DRAM banks are available + bool use_dma = dram0_available && dram1_available; + + if (use_dma) { + // Writeback input from cache to system memory before DMA reads + xthal_dcache_region_writeback((void*)input_data, sizeof(float) * in.numel()); + + // Process each batch independently 
through the DMA executor + for (int b = 0; b < batch_size; b++) { + const float* batch_inp = input_data + b * channels * 4; + float* batch_out = out_data + b * channels; + + XAI_ERR_TYPE status = mean_exec_dma(batch_inp, batch_out, channels, 2, 2); + if (status != 0) { + // DMA executor failed (buffer too small?), fall through to SIMD path + use_dma = false; + break; + } + } + + if (use_dma) { + // Invalidate output cache: DMA wrote to system memory + xthal_dcache_region_invalidate(out_data, sizeof(float) * out.numel()); + + return out; + } + } + + // Fallback: Direct SIMD without DMA (data fits or no DRAM) + // Writeback+invalidate input: ensures CPU-dirty data reaches system memory, + // then invalidate forces re-read from system memory (fresh data) + xthal_dcache_region_writeback((void*)input_data, sizeof(float) * in.numel()); + xthal_dcache_region_invalidate((void*)input_data, sizeof(float) * in.numel()); + simd_mean_pool_2x2_to_1x1_float32(out_data, input_data, total_channels * 4); + xthal_dcache_region_writeback(out_data, sizeof(float) * out.numel()); + + return out; + } + + // Fallback to portable implementation + ET_SWITCH_REALHB_TYPES(in.scalar_type(), ctx, name, CTYPE_IN, [&] { + ET_SWITCH_FLOATH_TYPES(out.scalar_type(), ctx, name, CTYPE_OUT, [&] { + CTYPE_OUT* out_data = out.mutable_data_ptr(); + const size_t num = torch::executor::get_reduced_dim_product(in, dim_list); + + for (size_t out_ix = 0; out_ix < out.numel(); ++out_ix) { + CTYPE_OUT sum = 0; + if (in.numel() > 0) { + sum = torch::executor::map_reduce_over_dim_list( + [](CTYPE_IN v) { return static_cast(v); }, + [](CTYPE_OUT outv, CTYPE_OUT acc) { return acc + outv; }, + in, + dim_list, + out_ix); + } + out_data[out_ix] = sum / static_cast(num); + } + }); + }); + + + return out; +} + +Tensor& mean_dim_out( + RuntimeContext& ctx, + const Tensor& in, + optional> dim_list, + bool keepdim, + optional dtype, + Tensor& out) { + return mean_out(ctx, in, dim_list, keepdim, dtype, out); +} + +} // 
namespace native +} // namespace vision +} // namespace impl diff --git a/backends/cadence/vision/operators/op_quantize_per_tensor.cpp b/backends/cadence/vision/operators/op_quantize_per_tensor.cpp index cd72d2de2b5..ceeafb98c70 100644 --- a/backends/cadence/vision/operators/op_quantize_per_tensor.cpp +++ b/backends/cadence/vision/operators/op_quantize_per_tensor.cpp @@ -1,66 +1,330 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include -#include - -namespace impl { -namespace vision { -namespace native { - -using executorch::aten::ScalarType; -using executorch::aten::Tensor; -using executorch::runtime::KernelRuntimeContext; - -// Quantize the input tensor (PT2 version). Note that quant_ are not -// used in any computation. -void quantize_per_tensor_out( - KernelRuntimeContext& context, - const Tensor& input, - double scale, - int64_t zero_point, - int64_t quant_min, - int64_t quant_max, - ScalarType dtype, - Tensor& out) { - const float* input_data = input.const_data_ptr(); - size_t numel = out.numel(); - - if (out.scalar_type() == ScalarType::Byte) { - uint8_t* out_data = out.mutable_data_ptr(); - kernels::quantize( - out_data, input_data, 1. / scale, zero_point, numel); - } else if (out.scalar_type() == ScalarType::Char) { - int8_t* out_data = out.mutable_data_ptr(); - kernels::quantize( - out_data, input_data, 1. / scale, zero_point, numel); - } else if ( - out.scalar_type() == ScalarType::Bits16 || - out.scalar_type() == ScalarType::UInt16) { - uint16_t* out_data = out.mutable_data_ptr(); - kernels::quantize( - out_data, input_data, 1. / scale, zero_point, numel); - } else if (out.scalar_type() == ScalarType::Short) { - int16_t* out_data = out.mutable_data_ptr(); - kernels::quantize( - out_data, input_data, 1. 
/ scale, zero_point, numel); - } else if (out.scalar_type() == ScalarType::Int) { - int32_t* out_data = out.mutable_data_ptr(); - kernels::quantize( - out_data, input_data, 1. / scale, zero_point, numel); - } else { - ET_CHECK_MSG( - false, - "Unhandled input dtype %hhd", - static_cast(out.scalar_type())); - } -} - -}; // namespace native -}; // namespace vision -}; // namespace impl +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include +#include + +using executorch::aten::ScalarType; +using executorch::aten::Tensor; +using executorch::runtime::KernelRuntimeContext; +using ::impl::generic::kernels::quantize; + +namespace impl { +namespace vision { +namespace native { + +// Forward declaration of hardware-optimized quantize function +extern "C" void quantize_f32_asym8s( + int8_t* restrict ptr_out, + const float32_t* restrict ptr_inp, + float32_t scale, + int zero_bias, + int N); + +// Quantize the input tensor (PT2 version). Note that quant_ are not +// used in any computation. +Tensor& quantize_per_tensor_out( + KernelRuntimeContext& context, + const Tensor& input, + double scale, + int64_t zero_point, + int64_t quant_min, + int64_t quant_max, + ScalarType dtype, + Tensor& out) { + const float* input_data = input.const_data_ptr(); + size_t numel = out.numel(); + + if (out.scalar_type() == ScalarType::Byte) { + uint8_t* out_data = out.mutable_data_ptr(); + quantize(out_data, input_data, 1. 
/ scale, zero_point, numel); + } else if (out.scalar_type() == ScalarType::Char) { + + int8_t* out_data = out.mutable_data_ptr(); + + // Hardware-optimized int8 quantization with DMA support + bool ping_pong_process = false; + bool ping_process_pong = false; + size_t chunk_size = 0; + + float32_t* inp_buff[2]; + int8_t* out_buff[2]; + + // Check if DRAM buffers are available + bool dram0_available = (ptr_dram0 != nullptr) && (IDMA_BUFFER_SIZE_DRAM0 > 0); + bool dram1_available = (ptr_dram1 != nullptr) && (IDMA_BUFFER_SIZE_DRAM1 > 0); + + // DMA has overhead - only beneficial for larger tensors + // Threshold: 1024 elements (~4KB for float32, ~1KB for int8) + const size_t DMA_THRESHOLD = 1024; + bool use_dma = (numel >= DMA_THRESHOLD); + + // Strategy 1: Try ping-pong processing (2 input + 2 output buffers) + // Using 80/20 split: 80% for input, 20% for output in each DRAM + if (use_dma && dram0_available && dram1_available && (numel >= 2)) { + size_t inp_per_buffer = (IDMA_BUFFER_SIZE_DRAM0 * 4) / (5 * FLT32_SIZE); // 80% for float32 input + size_t out_per_buffer_dram0 = (IDMA_BUFFER_SIZE_DRAM0 * 1) / 5; // 20% for int8 output + size_t out_per_buffer_dram1 = (IDMA_BUFFER_SIZE_DRAM1 * 1) / 5; // 20% for int8 output + + // Check if 80/20 split fits in both DRAMs + if ((inp_per_buffer > 0) && + (out_per_buffer_dram0 >= inp_per_buffer) && + (out_per_buffer_dram1 >= inp_per_buffer) && + ((IDMA_BUFFER_SIZE_DRAM0 * 4) / 5 + IDMA_BUFFER_SIZE_DRAM0 / 5 <= IDMA_BUFFER_SIZE_DRAM0) && + ((IDMA_BUFFER_SIZE_DRAM1 * 4) / 5 + IDMA_BUFFER_SIZE_DRAM1 / 5 <= IDMA_BUFFER_SIZE_DRAM1)) { + + // Allocate buffers with 80/20 split + inp_buff[0] = (float32_t*)ptr_dram0; + out_buff[0] = (int8_t*)((uint8_t*)ptr_dram0 + (IDMA_BUFFER_SIZE_DRAM0 * 4) / 5); + + inp_buff[1] = (float32_t*)ptr_dram1; + out_buff[1] = (int8_t*)((uint8_t*)ptr_dram1 + (IDMA_BUFFER_SIZE_DRAM1 * 4) / 5); + + chunk_size = inp_per_buffer; + ping_pong_process = true; + } + } + + // Strategy 2: Fallback to ping-process-pong 
(1 input + 1 output buffer) + // Use full DRAM0 for input, full DRAM1 for output (no split needed) + if (use_dma && !ping_pong_process && dram0_available && dram1_available) { + size_t inp_capacity = IDMA_BUFFER_SIZE_DRAM0 / FLT32_SIZE; // Full DRAM0 for input + size_t out_capacity = IDMA_BUFFER_SIZE_DRAM1; // Full DRAM1 for output + + if ((inp_capacity > 0) && (out_capacity >= inp_capacity)) { + inp_buff[0] = (float32_t*)ptr_dram0; + out_buff[0] = (int8_t*)ptr_dram1; + + chunk_size = (inp_capacity < out_capacity) ? inp_capacity : out_capacity; + ping_process_pong = true; + } + } + + if (ping_pong_process || ping_process_pong) { + const float32_t* ptr_inp = (float32_t*)input_data; + + // Writeback input from cache to system memory before DMA reads + xthal_dcache_region_writeback((void*)input_data, FLT32_SIZE * numel); + + /* Initialize DMA Channel 0 (loads) and Channel 1 (stores) */ + dma_2dm_init(0); + dma_2dm_init(1); + + if (ping_pong_process) { + // Ping-pong processing for better throughput + size_t num_chunks = (numel + chunk_size - 1) / chunk_size; + + if (num_chunks == 0) num_chunks = 1; + + int32_t pp_swap = 0; + + float32_t* ptr_in = (float32_t*)ptr_inp; + int8_t* ptr_out = out_data; + + // Load first chunk via ch0 + size_t current_chunk = (numel < chunk_size) ? numel : chunk_size; + + dma_1dm(0, ptr_in, inp_buff[pp_swap], FLT32_SIZE * current_chunk); + + size_t remaining = numel - current_chunk; + ptr_in += current_chunk; + + // Pipeline: load (ch0) and store (ch1) overlap with processing + for (size_t i = 0; i < (num_chunks - 1); i++) { + size_t next_chunk = (remaining < chunk_size) ? 
remaining : chunk_size; + + // Wait for current load to complete + idma_hw_wait_all(0); + + // Start loading next chunk into alternate buffer via ch0 + dma_1dm(0, ptr_in, inp_buff[pp_swap ^ 1], FLT32_SIZE * next_chunk); + + // Process current chunk (ch0 loads next in parallel) + quantize_f32_asym8s(out_buff[pp_swap], inp_buff[pp_swap], (float)scale, (int)zero_point, (int)current_chunk); + + // Wait for previous store to complete before reusing out_buff + idma_hw_wait_all(1); + + // Store result via ch1 + dma_1dm(1, out_buff[pp_swap], ptr_out, sizeof(int8_t) * current_chunk); + + ptr_in += next_chunk; + ptr_out += current_chunk; + remaining -= next_chunk; + current_chunk = next_chunk; + pp_swap ^= 1; + } + + // Process last chunk + idma_hw_wait_all(0); + quantize_f32_asym8s(out_buff[pp_swap], inp_buff[pp_swap], (float)scale, (int)zero_point, (int)current_chunk); + + idma_hw_wait_all(1); + dma_1dm(1, out_buff[pp_swap], ptr_out, sizeof(int8_t) * current_chunk); + idma_hw_wait_all(1); + + // Invalidate output cache: DMA wrote to system memory, cache may have stale data + xthal_dcache_region_invalidate(out_data, sizeof(int8_t) * numel); + + } + else if (ping_process_pong) { + // Simple sequential processing + size_t remaining = numel; + float32_t* ptr_in = (float32_t*)ptr_inp; + int8_t* ptr_out = out_data; + + while (remaining > 0) { + size_t current_chunk = (remaining < chunk_size) ? 
remaining : chunk_size; + + // Start load via ch0 (overlaps with any pending ch1 store) + dma_1dm(0, ptr_in, inp_buff[0], FLT32_SIZE * current_chunk); + // Wait for previous store to complete (out_buff[0] safe to write) + idma_hw_wait_all(1); + // Wait for load to complete + idma_hw_wait_all(0); + + // Process + quantize_f32_asym8s(out_buff[0], inp_buff[0], (float)scale, (int)zero_point, (int)current_chunk); + + // Store via ch1 + dma_1dm(1, out_buff[0], ptr_out, sizeof(int8_t) * current_chunk); + + ptr_in += current_chunk; + ptr_out += current_chunk; + remaining -= current_chunk; + } + idma_hw_wait_all(1); + + // Invalidate output cache: DMA wrote to system memory, cache may have stale data + xthal_dcache_region_invalidate(out_data, sizeof(int8_t) * numel); + + } + + } else { + // No DMA: use hardware function on full tensor at once + // Writeback+invalidate input: ensures CPU-dirty data reaches system memory, + // then invalidate forces re-read from system memory (fresh data) + xthal_dcache_region_writeback((void*)input_data, FLT32_SIZE * numel); + xthal_dcache_region_invalidate((void*)input_data, FLT32_SIZE * numel); + quantize_f32_asym8s(out_data, input_data, (float)scale, (int)zero_point, (int)numel); + + // Writeback output from cache to system memory for DMA coherency + xthal_dcache_region_writeback(out_data, sizeof(int8_t) * numel); + + } + + } else if ( + out.scalar_type() == ScalarType::Bits16 || + out.scalar_type() == ScalarType::UInt16) { + uint16_t* out_data = out.mutable_data_ptr(); + quantize(out_data, input_data, 1. / scale, zero_point, numel); + } else if (out.scalar_type() == ScalarType::Short) { + int16_t* out_data = out.mutable_data_ptr(); + quantize(out_data, input_data, 1. / scale, zero_point, numel); + } else if (out.scalar_type() == ScalarType::Int) { + int32_t* out_data = out.mutable_data_ptr(); + quantize(out_data, input_data, 1. 
/ scale, zero_point, numel); + } else { + ET_CHECK_MSG( + false, + "Unhandled input dtype %hhd", + static_cast(out.scalar_type())); + } + return out; +} + +// int8 quantization - uses generic template +Tensor& quantize_per_tensor_asym8s_out( + KernelRuntimeContext& context, + const Tensor& input, + double scale, + int64_t zero_point, + int64_t quant_min, + int64_t quant_max, + ScalarType dtype, + Tensor& out) { + const float* input_data = input.const_data_ptr(); + size_t numel = out.numel(); + int8_t* out_data = out.mutable_data_ptr(); + quantize(out_data, input_data, 1. / scale, zero_point, numel); + return out; +} + +// uint8 quantization - uses generic template +Tensor& quantize_per_tensor_asym8u_out( + KernelRuntimeContext& context, + const Tensor& input, + double scale, + int64_t zero_point, + int64_t quant_min, + int64_t quant_max, + ScalarType dtype, + Tensor& out) { + const float* input_data = input.const_data_ptr(); + size_t numel = out.numel(); + uint8_t* out_data = out.mutable_data_ptr(); + quantize(out_data, input_data, 1. / scale, zero_point, numel); + return out; +} + +// int16 quantization - uses generic template +Tensor& quantize_per_tensor_asym16s_out( + KernelRuntimeContext& context, + const Tensor& input, + double scale, + int64_t zero_point, + int64_t quant_min, + int64_t quant_max, + ScalarType dtype, + Tensor& out) { + const float* input_data = input.const_data_ptr(); + size_t numel = out.numel(); + int16_t* out_data = out.mutable_data_ptr(); + quantize(out_data, input_data, 1. 
/ scale, zero_point, numel); + return out; +} + +// uint16 quantization - uses generic template +Tensor& quantize_per_tensor_asym16u_out( + KernelRuntimeContext& context, + const Tensor& input, + double scale, + int64_t zero_point, + int64_t quant_min, + int64_t quant_max, + ScalarType dtype, + Tensor& out) { + const float* input_data = input.const_data_ptr(); + size_t numel = out.numel(); + uint16_t* out_data = out.mutable_data_ptr(); + quantize(out_data, input_data, 1. / scale, zero_point, numel); + return out; +} + +// int32 quantization - uses generic template +Tensor& quantize_per_tensor_asym32s_out( + KernelRuntimeContext& context, + const Tensor& input, + double scale, + int64_t zero_point, + int64_t quant_min, + int64_t quant_max, + ScalarType dtype, + Tensor& out) { + const float* input_data = input.const_data_ptr(); + size_t numel = out.numel(); + int32_t* out_data = out.mutable_data_ptr(); + quantize(out_data, input_data, 1. / scale, zero_point, numel); + return out; +} + +} // namespace native +} // namespace vision +} // namespace impl diff --git a/backends/cadence/vision/operators/op_quantized_conv_out.cpp b/backends/cadence/vision/operators/op_quantized_conv_out.cpp index be4b34bff03..0d47331c367 100644 --- a/backends/cadence/vision/operators/op_quantized_conv_out.cpp +++ b/backends/cadence/vision/operators/op_quantized_conv_out.cpp @@ -6,8 +6,25 @@ * LICENSE file in the root directory of this source tree. 
*/ -#include +#include +#include +#include +#include +#include +#include #include +#include + +// Forward declaration of conv_execute_kernel (defined in conv_kernel_dispatcher.c) +extern "C" { +typedef int XAI_ERR_TYPE; +XAI_ERR_TYPE conv_execute_kernel( + int8_t* src, + int8_t* dst, + int8_t* coeff_ptr, + int8_t* bias_ptr, + const conv_layer_config_t* config); +} namespace impl { namespace vision { @@ -141,7 +158,7 @@ __attribute__((noinline)) void conv2d_nchw_core_generic( if (quantized) { float val = bias_scale * acc; out_plane[_oh * ow + _ow] = - kernels::quantize(val, inv_out_scale, out_zero_point); + ::impl::generic::kernels::quantize(val, inv_out_scale, out_zero_point); } else { out_plane[_oh * ow + _ow] = acc; } @@ -267,7 +284,7 @@ __attribute__((noinline)) void conv2d_nhwc_core_generic( if (quantized) { float val = bias_scale * acc; out_line[_oc] = - kernels::quantize(val, inv_out_scale, out_zero_point); + ::impl::generic::kernels::quantize(val, inv_out_scale, out_zero_point); } else { out_line[_oc] = acc; } @@ -296,6 +313,7 @@ void quantized_conv_nchw( float output_scale, int32_t output_zero_point, Tensor& out) { + bool conv1d = input.dim() == 3; // input = [n, c, h, w] const int n = input.size(0); @@ -344,13 +362,232 @@ void quantized_conv_nchw( } ScalarType dtype = out.scalar_type(); switch (dtype) { - ET_FORALL_CADENCE_QUANTIZED_TYPES(typed_quantized_conv2d_nchw); + case ScalarType::Char: { + const conv_layer_config_t* config_const = get_layer_config_by_params( + c, h, w, // ic, ih, iw + oc, wh, ww, // oc, kh, kw + oh, ow, // oh, ow + stride[0], stride[1], // sy, sx + padding[0], dilation[0]); // pad, dil + + // Make a mutable local copy — the static const table may reside in + // read-only memory (.rodata), so writing through const_cast is undefined + // behavior and silently fails on Xtensa targets. 
+ conv_layer_config_t config_local; + conv_layer_config_t* config = NULL; + float effective_scale = 0.0f; + if (config_const != NULL) { + config_local = *config_const; // shallow copy all fields + config = &config_local; + + // DMA path for all layers ≥ 4×4 spatial; generic C fallback for ≤ 2×2. + // + // XAI kernel pipeline: out = (acc >> accumShift) * outputScale >> outputShift + // The kernel saturates the shifted accumulator to int16 [-32768, 32767] + // after accumShift, so accumShift must be chosen to keep accumulators in range. + effective_scale = bias_scale / output_scale; + } + + if(config != NULL) { + config->input_zero_point = static_cast(in_zero_point); + + // Disable in-kernel ReLU — ExecuTorch applies ReLU as a separate op. + config->relu_min = -128; + config->relu_max = 127; + + // Bias correction: absorb input_zero_point and output_zero_point + // into the kernel bias to avoid the double-clamp problem. + // Also clamp to 24-bit range (ACC_INIT_BIAS takes lower 24 bits); + // any residual beyond 24-bit is applied as post-kernel correction. + const int32_t* bias_orig = bias.const_data_ptr(); + const int8_t* wt_data = weight.const_data_ptr(); + const int wt_per_oc = weight.numel() / oc; + + static const int32_t BIAS_24BIT_MAX = 8388607; // (1 << 23) - 1 + static const int32_t BIAS_24BIT_MIN = -8388608; // -(1 << 23) + + // output_zero_point expressed in accumulator domain + int64_t zp_acc_corr = 0; + if (output_zero_point != 0 && effective_scale > 0.0f) { + double zp_d = static_cast(output_zero_point) / effective_scale; + zp_acc_corr = static_cast(zp_d >= 0.0 ? 
zp_d + 0.5 : zp_d - 0.5); + } + + // Per-channel split bias: kernel_bias (24-bit safe) + post_correction + int32_t kernel_bias[2048]; + int32_t post_correction[2048]; + int64_t max_abs_kernel_bias = 0; + for (int o = 0; o < oc; o++) { + int32_t w_sum = 0; + const int8_t* wt_oc = wt_data + o * wt_per_oc; + for (int i = 0; i < wt_per_oc; i++) { + w_sum += wt_oc[i]; + } + int64_t bias_corr_64 = static_cast(bias_orig[o]) + - static_cast(in_zero_point) * w_sum; + + int64_t target_bias = bias_corr_64 + zp_acc_corr; + + int32_t kb; + if (target_bias > BIAS_24BIT_MAX) { + kb = BIAS_24BIT_MAX; + } else if (target_bias < BIAS_24BIT_MIN) { + kb = BIAS_24BIT_MIN; + } else { + kb = static_cast(target_bias); + } + kernel_bias[o] = kb; + + int64_t abs_kb = kb >= 0 ? kb : -static_cast(kb); + if (abs_kb > max_abs_kernel_bias) max_abs_kernel_bias = abs_kb; + + int64_t bias_residual = target_bias - kb; + float resid_float = static_cast(bias_residual) * effective_scale; + int32_t resid_int = static_cast(resid_float >= 0.0f + ? resid_float + 0.5f : resid_float - 0.5f); + post_correction[o] = resid_int; + } + + // accumShift: ensure (acc >> accSh) fits in int16 after PACK. + // Tight bound from actual weight L1 norms instead of worst-case 128*128*P. + // max_acc = |bias| + sum(|weight_i|) * 128 since inputs are int8 (magnitude ≤ 128). + // Compute max sum(|weights|) across all output channels + int64_t max_weight_l1 = 0; + for (int o = 0; o < oc; o++) { + const int8_t* wt_oc = wt_data + o * wt_per_oc; + int64_t w_l1 = 0; + for (int i = 0; i < wt_per_oc; i++) { + w_l1 += (wt_oc[i] >= 0) ? 
wt_oc[i] : -wt_oc[i]; + } + if (w_l1 > max_weight_l1) max_weight_l1 = w_l1; + } + + // Tight max accumulator bound: bias + L1(weights) * max_input_magnitude + float max_acc = static_cast(max_abs_kernel_bias) + + static_cast(max_weight_l1) * 128.0f; + + int accum_shift = 0; + while (max_acc / static_cast(1LL << accum_shift) > 32767.0f + && accum_shift < 31) { + accum_shift++; + } + + config->accum_shift = accum_shift; + + // outputShift & outputScale: maximize precision within uint16 range. + int best_shift = 15; + int64_t total_shift = static_cast(accum_shift) + best_shift; + int32_t raw_scale = static_cast( + effective_scale * static_cast(1LL << total_shift)); + if (raw_scale > 65535) { + // Scale too large for uint16_t, reduce outputShift until it fits + while (best_shift > 0 && raw_scale > 65535) { + best_shift--; + total_shift = static_cast(accum_shift) + best_shift; + raw_scale = static_cast( + effective_scale * static_cast(1LL << total_shift)); + } + } else if (raw_scale < 16384 && best_shift < 31) { + // Scale too small, increase outputShift for better precision + while (best_shift < 31) { + int64_t trial_total = static_cast(accum_shift) + best_shift + 1; + if (trial_total > 62) break; // avoid 1LL << overflow + int32_t trial = static_cast( + effective_scale * static_cast(1LL << trial_total)); + if (trial > 65535) break; + best_shift++; + raw_scale = trial; + } + } + if (raw_scale <= 0) raw_scale = 1; + if (raw_scale > 65535) raw_scale = 65535; + + config->output_shift = best_shift; + config->output_scale = raw_scale; + + // CPU-computed kernel_bias resides only in cache; + // DMA bypasses cache and reads system memory, so writeback is needed. 
+ xthal_dcache_region_writeback( + reinterpret_cast(kernel_bias), + oc * sizeof(int32_t)); + + // Writeback input and weight from cache to system memory before DMA reads + xthal_dcache_region_writeback( + const_cast(input.const_data_ptr()), + n * c * h * w * sizeof(int8_t)); + xthal_dcache_region_writeback( + const_cast(weight.const_data_ptr()), + weight.numel() * sizeof(int8_t)); + + XAI_ERR_TYPE kern_status = conv_execute_kernel( + const_cast(input.const_data_ptr()), + out.mutable_data_ptr(), + const_cast(weight.const_data_ptr()), + reinterpret_cast(kernel_bias), + config); + (void)kern_status; + + // Invalidate cache for DMA-written output so post-correction + // and next operator see fresh data instead of stale cache lines + xthal_dcache_region_invalidate( + out.mutable_data_ptr(), + n * oc * oh * ow * sizeof(int8_t)); + + // Apply post-kernel residual correction + bool has_correction = false; + for (int _n = 0; _n < n; _n++) { + for (int _oc = 0; _oc < oc; _oc++) { + int32_t corr = post_correction[_oc]; + if (corr == 0) continue; + has_correction = true; + int8_t* ch_out = out.mutable_data_ptr() + (_n * oc * oh * ow + _oc * oh * ow); + for (int _s = 0; _s < oh * ow; _s++) { + int32_t val = static_cast(ch_out[_s]) + corr; + val = val < -128 ? -128 : (val > 127 ? 
127 : val); + ch_out[_s] = static_cast(val); + } + } + } + + // If post_correction modified output via CPU/cache, writeback to system memory + if (has_correction) { + xthal_dcache_region_writeback( + out.mutable_data_ptr(), + n * oc * oh * ow * sizeof(int8_t)); + } + + break; + } + // Fall through to generic implementation + conv2d_nchw_core_generic( + input.const_data_ptr(), + weight.const_data_ptr(), + bias.const_data_ptr(), + out.mutable_data_ptr(), + n, c, h, w, + oc, wc, wh, ww, + oh, ow, + stride[0], stride[1], + padding[0], padding[1], + dilation[0], dilation[1], + groups, + in_zero_point, + weight_zero_point, + bias_scale, + output_scale, + (int8_t)output_zero_point); + break; + } + // Handle uint8_t (Byte) case - previously covered by ET_FORALL_CADENCE_QUANTIZED_TYPES + // Note: Char (int8_t) is handled explicitly above with optimized kernel + typed_quantized_conv2d_nchw(uint8_t, Byte); default: ET_DCHECK_MSG( false, "Unhandled dtype %s", torch::executor::toString(dtype)); } #undef typed_quantized_conv2d_nchw + } void quantized_conv_nhwc( @@ -582,7 +819,6 @@ void quantized_conv2d_nhwc_per_tensor_out( int64_t output_zero_point, int64_t out_multiplier, int64_t out_shift, - ET_UNUSED const ::executorch::aten::optional& offset, Tensor& out) { quantized_conv_per_tensor_out( ctx, @@ -678,6 +914,158 @@ void quantized_conv2d_nhwc_out( out); } +void quantized_conv2d_nchw_asym8sxsym8s_asym8s_per_tensor_out( + __ET_UNUSED KernelRuntimeContext& ctx, + const Tensor& input, + const Tensor& weight, + const Tensor& bias, + IntArrayRef stride, + IntArrayRef padding, + IntArrayRef dilation, + int64_t groups, + int64_t in_zero_point, + int64_t weight_zero_point, + double bias_scale, + double output_scale, + int64_t output_zero_point, + __ET_UNUSED int64_t out_multiplier, + __ET_UNUSED int64_t out_shift, + Tensor& out) { + quantized_conv_nchw( + input, weight, bias, stride, padding, dilation, groups, + in_zero_point, weight_zero_point, bias_scale, output_scale, + 
output_zero_point, out); +} + } // namespace native } // namespace vision + +// The codegen dispatches to impl::generic::native:: namespace. +// Forward to the vision::native implementation. +namespace generic { +namespace native { + +using ::executorch::aten::IntArrayRef; +using ::executorch::aten::Tensor; +using ::executorch::runtime::KernelRuntimeContext; + +void quantized_conv2d_nhwc_per_tensor_out( + KernelRuntimeContext& ctx, + const Tensor& input, + const Tensor& weight, + const Tensor& bias, + IntArrayRef stride, + IntArrayRef padding, + IntArrayRef dilation, + int64_t groups, + int64_t in_zero_point, + int64_t weight_zero_point, + double bias_scale, + double output_scale, + int64_t output_zero_point, + int64_t out_multiplier, + int64_t out_shift, + Tensor& out) { + ::impl::vision::native::quantized_conv_per_tensor_out( + ctx, input, weight, bias, stride, padding, dilation, groups, + in_zero_point, weight_zero_point, bias_scale, output_scale, + output_zero_point, out_multiplier, out_shift, + true, // channel_last = true for NHWC + out); +} + +void quantized_conv1d_ncl_asym8sxsym8s_asym8s_per_tensor_out( + __ET_UNUSED KernelRuntimeContext& ctx, + const Tensor& input, + const Tensor& weight, + const Tensor& bias, + IntArrayRef stride, + IntArrayRef padding, + IntArrayRef dilation, + int64_t groups, + int64_t in_zero_point, + int64_t weight_zero_point, + double bias_scale, + double output_scale, + int64_t output_zero_point, + __ET_UNUSED int64_t out_multiplier, + __ET_UNUSED int64_t out_shift, + Tensor& out) { + ::impl::vision::native::quantized_conv_nchw( + input, weight, bias, stride, padding, dilation, groups, + in_zero_point, weight_zero_point, bias_scale, output_scale, + output_zero_point, out); +} + +void quantized_conv1d_ncl_asym8uxsym8u_asym8u_per_tensor_out( + __ET_UNUSED KernelRuntimeContext& ctx, + const Tensor& input, + const Tensor& weight, + const Tensor& bias, + IntArrayRef stride, + IntArrayRef padding, + IntArrayRef dilation, + int64_t groups, 
+    int64_t in_zero_point,
+    int64_t weight_zero_point,
+    double bias_scale,
+    double output_scale,
+    int64_t output_zero_point,
+    __ET_UNUSED int64_t out_multiplier,
+    __ET_UNUSED int64_t out_shift,
+    Tensor& out) {
+  ::impl::vision::native::quantized_conv_nchw(
+      input, weight, bias, stride, padding, dilation, groups,
+      in_zero_point, weight_zero_point, bias_scale, output_scale,
+      output_zero_point, out);
+}
+// NOTE(review): NLC is a channels-last layout, yet this forwards to quantized_conv_nchw exactly like the NCL variants; confirm the callee handles it (int8 variant).
+void quantized_conv1d_nlc_asym8sxsym8s_asym8s_per_tensor_out(
+    __ET_UNUSED KernelRuntimeContext& ctx,
+    const Tensor& input,
+    const Tensor& weight,
+    const Tensor& bias,
+    IntArrayRef stride,
+    IntArrayRef padding,
+    IntArrayRef dilation,
+    int64_t groups,
+    int64_t in_zero_point,
+    int64_t weight_zero_point,
+    double bias_scale,
+    double output_scale,
+    int64_t output_zero_point,
+    __ET_UNUSED int64_t out_multiplier,
+    __ET_UNUSED int64_t out_shift,
+    Tensor& out) {
+  ::impl::vision::native::quantized_conv_nchw(
+      input, weight, bias, stride, padding, dilation, groups,
+      in_zero_point, weight_zero_point, bias_scale, output_scale,
+      output_zero_point, out);
+}
+// NOTE(review): same concern as the int8 NLC wrapper: channels-last input is handed to quantized_conv_nchw; verify the layout (uint8 variant).
+void quantized_conv1d_nlc_asym8uxsym8u_asym8u_per_tensor_out(
+    __ET_UNUSED KernelRuntimeContext& ctx,
+    const Tensor& input,
+    const Tensor& weight,
+    const Tensor& bias,
+    IntArrayRef stride,
+    IntArrayRef padding,
+    IntArrayRef dilation,
+    int64_t groups,
+    int64_t in_zero_point,
+    int64_t weight_zero_point,
+    double bias_scale,
+    double output_scale,
+    int64_t output_zero_point,
+    __ET_UNUSED int64_t out_multiplier,
+    __ET_UNUSED int64_t out_shift,
+    Tensor& out) {
+  ::impl::vision::native::quantized_conv_nchw(
+      input, weight, bias, stride, padding, dilation, groups,
+      in_zero_point, weight_zero_point, bias_scale, output_scale,
+      output_zero_point, out);
+}
+
+} // namespace native
+} // namespace generic
 } // namespace impl
diff --git a/backends/cadence/vision/operators/op_quantized_linear_out.cpp b/backends/cadence/vision/operators/op_quantized_linear_out.cpp
index b6b7cdd17bc..7b579fb8d0d
100644 --- a/backends/cadence/vision/operators/op_quantized_linear_out.cpp +++ b/backends/cadence/vision/operators/op_quantized_linear_out.cpp @@ -6,18 +6,65 @@ * LICENSE file in the root directory of this source tree. */ -#include -#include +#include +#include +#include +#include +#include #include +#include namespace impl { namespace vision { namespace native { +using executorch::aten::ScalarType; using executorch::aten::Tensor; using executorch::runtime::getLeadingDims; using executorch::runtime::KernelRuntimeContext; +// Generic fallback implementation +template +void quantized_linear_per_tensor_generic_( + const Tensor& src, + const Tensor& weight, + const Tensor& bias, + const int64_t src_zero_point, + const int64_t weight_zero_point, + const int64_t out_multiplier, + const int64_t out_shift, + const int64_t out_zero_point, + Tensor& out) { + + const int64_t leading_dims = getLeadingDims(src, src.dim() - 1); + const int64_t out_dim = weight.size(0); + const int64_t in_dim = weight.size(1); + + const T* __restrict__ in_data = src.const_data_ptr(); + const T* __restrict__ weight_data = weight.const_data_ptr(); + const int32_t* __restrict__ bias_data = bias.const_data_ptr(); + T* __restrict__ out_data = out.mutable_data_ptr(); + + // Compute the requant_scale from out_multiplier and out_shift + const float requant_scale = + -out_multiplier * 1.0 / (1 << 31) * pow(2, out_shift); + + for (size_t i = 0; i < leading_dims; ++i) { + for (size_t j = 0; j < out_dim; ++j) { + int32_t sum = bias_data[j]; + for (size_t k = 0; k < in_dim; ++k) { + int32_t x = (int32_t)in_data[i * in_dim + k] - src_zero_point; + int32_t w = (int32_t)weight_data[j * in_dim + k] - (int32_t)weight_zero_point; + sum += x * w; + } + + out_data[i * out_dim + j] = + ::impl::generic::kernels::quantize(sum, requant_scale, out_zero_point); + } + } +} + +// Upstream-style quantized_linear_out with tensor-based zero points template void inline _typed_quantized_linear( const Tensor& src, @@ -36,15 +83,9 
@@ void inline _typed_quantized_linear( int32_t weight_zero_point = weight_zero_point_t.const_data_ptr()[0]; - // input comes in shape [batch_size, in_dim] - // weight comes in shape [out_dim, in_dim] - // output comes in empty with shape [batch_size, out_dim] - // Perform matrix multiply (M x N) x (N x P) => M x P const auto M = weight.size(0); // = out_dim const auto N = weight.size(1); // = in_dim - // Given an N-dimensional input [d0, d1, d2, ..., d_{N-2}, d_{N-1}], the - // leading dimensions is d0 * d1 * ... * d_{N-2} const auto leading_dims = getLeadingDims(src, src.dim() - 1); ET_CHECK_MSG( @@ -69,7 +110,7 @@ void inline _typed_quantized_linear( (weight_data[j * N + k] - weight_zero_point); } out_data[i * M + j] = - kernels::quantize(sum, out_scale, out_zero_point); + impl::generic::kernels::quantize(sum, out_scale, out_zero_point); } } } @@ -86,29 +127,14 @@ void quantized_linear_out( int64_t out_zero_point, __ET_UNUSED const executorch::aten::optional& offset, Tensor& out) { - // TODO: refactor to use switch case as quantized_linear_per_tensor_out if (out.scalar_type() == executorch::aten::ScalarType::Byte) { _typed_quantized_linear( - src, - weight, - bias, - src_zero_point, - weight_zero_point_t, - out_multiplier, - out_shift, - out_zero_point, - out); + src, weight, bias, src_zero_point, weight_zero_point_t, + out_multiplier, out_shift, out_zero_point, out); } else if (out.scalar_type() == executorch::aten::ScalarType::Char) { _typed_quantized_linear( - src, - weight, - bias, - src_zero_point, - weight_zero_point_t, - out_multiplier, - out_shift, - out_zero_point, - out); + src, weight, bias, src_zero_point, weight_zero_point_t, + out_multiplier, out_shift, out_zero_point, out); } else { ET_CHECK_MSG( false, @@ -117,6 +143,7 @@ void quantized_linear_out( } } +// Optimized quantized_linear_per_tensor_out with DMA and SIMD support void quantized_linear_per_tensor_out( __ET_UNUSED KernelRuntimeContext& ctx, const Tensor& src, @@ -127,33 +154,168 @@ void 
quantized_linear_per_tensor_out( const int64_t out_multiplier, const int64_t out_shift, const int64_t out_zero_point, - __ET_UNUSED const executorch::aten::optional& offset, + __ET_UNUSED const std::optional& offset, Tensor& out) { -#define typed_quantized_linear_per_tensor(ctype, dtype) \ - case executorch::aten::ScalarType::dtype: { \ - quantized_linear_per_tensor_( \ - src, \ - weight, \ - bias, \ - src_zero_point, \ - weight_zero_point, \ - out_multiplier, \ - out_shift, \ - out_zero_point, \ - out); \ - break; \ + + + const int64_t leading_dims = getLeadingDims(src, src.dim() - 1); + const int64_t out_dim = weight.size(0); + const int64_t in_dim = weight.size(1); + const size_t numel = leading_dims * out_dim; + + bool use_optimized = false; + if (src.scalar_type() == ScalarType::Char && + weight.scalar_type() == ScalarType::Char && + out.scalar_type() == ScalarType::Char && + in_dim >= 16) { + use_optimized = true; } - executorch::aten::ScalarType dtype = out.scalar_type(); - switch (dtype) { - ET_FORALL_CADENCE_QUANTIZED_TYPES(typed_quantized_linear_per_tensor); - default: - ET_DCHECK_MSG( - false, "Unhandled dtype %s", executorch::runtime::toString(dtype)); + if (use_optimized) { + const int8_t* in_data = src.const_data_ptr(); + const int8_t* weight_data = weight.const_data_ptr(); + const int32_t* bias_data = bias.const_data_ptr(); + int8_t* out_data = out.mutable_data_ptr(); + + const int32_t in_zp = static_cast(src_zero_point); + const int32_t weight_zp = static_cast(weight_zero_point); + const int32_t out_zp = static_cast(out_zero_point); + + // Compute requant scale + const float requant_scale = + -out_multiplier * 1.0f / (1 << 31) * std::pow(2.0f, (float)out_shift); + + // Check if DRAM buffers are available for DMA + bool dram0_available = (ptr_dram0 != nullptr) && (IDMA_BUFFER_SIZE_DRAM0 > 0); + bool dram1_available = (ptr_dram1 != nullptr) && (IDMA_BUFFER_SIZE_DRAM1 > 0); + + // DMA threshold: only beneficial for larger problems + const size_t 
DMA_THRESHOLD = 512;
+    bool use_dma = (in_dim >= DMA_THRESHOLD) && dram0_available && dram1_available &&
+        ((size_t)in_dim <= (size_t)IDMA_BUFFER_SIZE_DRAM0) && ((size_t)in_dim <= (size_t)IDMA_BUFFER_SIZE_DRAM1);
+    if (use_dma && leading_dims == 1) {
+      // Single sample: stage the input row in DRAM0, then stream weight rows through DRAM1 tile by tile
+      size_t input_buffer_size = in_dim;
+      size_t max_tile_rows = IDMA_BUFFER_SIZE_DRAM1 / in_dim;
+      // use_dma already requires in_dim <= IDMA_BUFFER_SIZE_DRAM1, so max_tile_rows >= 1 and a tile never overruns DRAM1
+      size_t tile_rows = (max_tile_rows < out_dim) ? max_tile_rows : out_dim;
+
+      int8_t* input_cache = (int8_t*)ptr_dram0;
+      int8_t* weight_tile = (int8_t*)ptr_dram1;
+
+      xthal_dcache_region_writeback((void*)in_data, sizeof(int8_t) * src.numel());
+      xthal_dcache_region_writeback((void*)weight_data, sizeof(int8_t) * weight.numel());
+      // Every copy is followed by idma_hw_wait_all(0) so the CPU never reads a buffer the DMA is still filling
+      dma_2dm_init(0);
+      idma_copy_2d_desc(0, input_cache, (void*)in_data,
+          input_buffer_size, DESC_IDMA_PRIOR_H, 1, 0, 0);
+      idma_hw_wait_all(0);
+
+      for (size_t j_tile = 0; j_tile < out_dim; j_tile += tile_rows) {
+        size_t curr_tile = ((j_tile + tile_rows) <= out_dim) ? tile_rows : (out_dim - j_tile);
+        idma_copy_2d_desc(0, weight_tile, (void*)(weight_data + j_tile * in_dim),
+            curr_tile * in_dim, DESC_IDMA_PRIOR_H, 1, 0, 0);
+        idma_hw_wait_all(0);
+
+        for (size_t j = 0; j < curr_tile; ++j) {
+          int32_t acc = bias_data[j_tile + j];
+          acc = rvdot_zeropt(
+              acc, input_cache, weight_tile + j * in_dim,
+              in_zp, weight_zp, (int)in_dim);
+          out_data[j_tile + j] = ::impl::generic::kernels::quantize(acc, requant_scale, out_zp);
+        }
+      }
+
+      xthal_dcache_region_writeback(out_data, sizeof(int8_t) * numel);
+
+      return;
+    }
+
+    // Fallback: No DMA or multi-sample - use direct SIMD
+    xthal_dcache_region_writeback((void*)in_data, sizeof(int8_t) * src.numel());
+    xthal_dcache_region_invalidate((void*)in_data, sizeof(int8_t) * src.numel());
+    xthal_dcache_region_writeback((void*)weight_data, sizeof(int8_t) * weight.numel());
+    xthal_dcache_region_invalidate((void*)weight_data, sizeof(int8_t) * weight.numel());
+    xthal_dcache_region_writeback((void*)bias_data,
sizeof(int32_t) * bias.numel()); + xthal_dcache_region_invalidate((void*)bias_data, sizeof(int32_t) * bias.numel()); + + for (size_t i = 0; i < leading_dims; ++i) { + const int8_t* in_row = &in_data[i * in_dim]; + for (size_t j = 0; j < out_dim; ++j) { + const int8_t* weight_row = &weight_data[j * in_dim]; + int32_t acc = bias_data[j]; + acc = rvdot_zeropt( + acc, in_row, weight_row, + in_zp, weight_zp, (int)in_dim); + out_data[i * out_dim + j] = + ::impl::generic::kernels::quantize(acc, requant_scale, out_zp); + } + } + + xthal_dcache_region_writeback(out_data, sizeof(int8_t) * numel); + + + } else { + // Fallback: use generic implementation + if (out.scalar_type() == ScalarType::Char) { + quantized_linear_per_tensor_generic_( + src, weight, bias, + src_zero_point, weight_zero_point, + out_multiplier, out_shift, out_zero_point, out); + } else if (out.scalar_type() == ScalarType::Byte) { + quantized_linear_per_tensor_generic_( + src, weight, bias, + src_zero_point, weight_zero_point, + out_multiplier, out_shift, out_zero_point, out); + } else { + ET_CHECK_MSG( + false, + "Unhandled output dtype %hhd", + static_cast(out.scalar_type())); + } + } -#undef typed_quantized_linear_per_tensor } -}; // namespace native -}; // namespace vision -}; // namespace impl +// Wrapper functions for different quantization schemes +void quantized_linear_asym8sxasym8s_asym8s_per_tensor_out( + KernelRuntimeContext& ctx, + const Tensor& src, + const Tensor& weight, + const Tensor& bias, + const int64_t src_zero_point, + const int64_t weight_zero_point, + const int64_t out_multiplier, + const int64_t out_shift, + const int64_t out_zero_point, + const std::optional& offset, + Tensor& out) { + quantized_linear_per_tensor_out( + ctx, src, weight, bias, + src_zero_point, weight_zero_point, + out_multiplier, out_shift, out_zero_point, + offset, out); +} + +void quantized_linear_asym8uxasym8u_asym8u_per_tensor_out( + KernelRuntimeContext& ctx, + const Tensor& src, + const Tensor& weight, + 
const Tensor& bias, + const int64_t src_zero_point, + const int64_t weight_zero_point, + const int64_t out_multiplier, + const int64_t out_shift, + const int64_t out_zero_point, + const std::optional& offset, + Tensor& out) { + quantized_linear_per_tensor_out( + ctx, src, weight, bias, + src_zero_point, weight_zero_point, + out_multiplier, out_shift, out_zero_point, + offset, out); +} + +} // namespace native +} // namespace vision +} // namespace impl diff --git a/backends/cadence/vision/operators/op_quantized_relu_out.cpp b/backends/cadence/vision/operators/op_quantized_relu_out.cpp index 45b9e09b1dd..812c33873ed 100644 --- a/backends/cadence/vision/operators/op_quantized_relu_out.cpp +++ b/backends/cadence/vision/operators/op_quantized_relu_out.cpp @@ -6,109 +6,348 @@ * LICENSE file in the root directory of this source tree. */ -#include -#include +#include +#include +#include +#include +#include #include +#include + +using executorch::aten::ScalarType; +using executorch::aten::Tensor; +using executorch::runtime::KernelRuntimeContext; + +// Forward declaration of Vision SIMD quantized ReLU +extern "C" void vrelU( + uint8_t* y, + const int8_t* x, + const uint8_t minVal, + uint8_t maxVal, + int N); + +#define ET_FORALL_CADENCE_QUANTIZED_TYPES(_) \ + _(uint8_t, Byte) \ + _(int8_t, Char) namespace impl { namespace vision { namespace native { -using executorch::aten::Tensor; -using executorch::runtime::KernelRuntimeContext; - +// Generic fallback implementation (from generic/operators/quantized_relu_out.cpp) template -void quantized_relu_( +void quantized_relu_per_tensor_out_( + __ET_UNUSED KernelRuntimeContext& ctx, const Tensor& input, - const Tensor& in_zero_point, - const int64_t out_zero_point, - const Tensor& out_multiplier, - const Tensor& out_shift, + int64_t in_zero_point, + int64_t out_zero_point, + int64_t out_multiplier, + int64_t out_shift, Tensor& output) { - T q_zero_point = in_zero_point.const_data_ptr()[0]; const T* __restrict__ in = 
input.const_data_ptr(); T* __restrict__ out = output.mutable_data_ptr(); - const int32_t* __restrict__ out_multiplier_data = - out_multiplier.const_data_ptr(); - const int32_t* __restrict__ out_shift_data = - out_shift.const_data_ptr(); - // Compute the out_scale from out_multiplier and out_shift - const float out_scale = - -out_multiplier_data[0] * 1.0 / (1 << 31) * pow(2, out_shift_data[0]); + const float out_scale = -out_multiplier * 1.0 / (1 << 31) * pow(2, out_shift); for (size_t i = 0, e = input.numel(); i < e; ++i) { - const T temp = in[i] > q_zero_point ? (in[i] - q_zero_point) : 0; - out[i] = kernels::quantize(temp, out_scale, out_zero_point); + const float temp = in[i] > in_zero_point ? (in[i] - in_zero_point) : 0; + out[i] = generic::kernels::quantize(temp, out_scale, out_zero_point); } } -void quantized_relu_out( + +void quantized_relu_per_tensor_out( KernelRuntimeContext& ctx, const Tensor& input, - const Tensor& in_zero_point, + const int64_t in_zero_point, const int64_t out_zero_point, - const Tensor& out_multiplier, - const Tensor& out_shift, + const int64_t out_multiplier, + const int64_t out_shift, Tensor& output) { - if (input.scalar_type() == executorch::aten::ScalarType::Byte) { - quantized_relu_( - input, - in_zero_point, - out_zero_point, - out_multiplier, - out_shift, - output); - } else if (input.scalar_type() == executorch::aten::ScalarType::Char) { - quantized_relu_( - input, - in_zero_point, - out_zero_point, - out_multiplier, - out_shift, - output); + + + size_t numel = input.numel(); + + // Check if we can use Vision SIMD path for quantized data + // vrelU supports int8/uint8 input and output (with appropriate casting) + bool use_optimized = (input.scalar_type() == ScalarType::Char || input.scalar_type() == ScalarType::Byte) && + (output.scalar_type() == ScalarType::Char || output.scalar_type() == ScalarType::Byte) && + (numel >= 16); + + if (use_optimized) { + // Vision-optimized SIMD path using vrelU with iDMA support + // vrelU 
requires int8_t* input and uint8_t* output, cast appropriately + const int8_t* in_data; + if (input.scalar_type() == ScalarType::Char) { + in_data = input.const_data_ptr(); + } else { + in_data = reinterpret_cast(input.const_data_ptr()); + } + + uint8_t* out_data; + if (output.scalar_type() == ScalarType::Byte) { + out_data = output.mutable_data_ptr(); + } else { + out_data = reinterpret_cast(output.mutable_data_ptr()); + } + + // For quantized operations and dumps, we need int8_t view of output + int8_t* out_data_int8 = reinterpret_cast(out_data); + + // vrelU clamps: max(max(x, 0), minVal) and min(result, maxVal) + uint8_t minVal = 0; // ReLU minimum is 0 + uint8_t maxVal = 255; // uint8 max + + // Common quantization parameters (used by both DMA and non-DMA paths) + const float out_scale = -out_multiplier * 1.0f / (1 << 31) * std::pow(2.0f, (float)out_shift); + const int32_t in_zp = static_cast(in_zero_point); + const int32_t out_zp = static_cast(out_zero_point); + + // DMA setup + bool ping_pong_process = false; + bool ping_process_pong = false; + size_t chunk_size = 0; + + int8_t* inp_buff[2]; + uint8_t* out_buff[2]; + + // Check if DRAM buffers are available + bool dram0_available = (ptr_dram0 != nullptr) && (IDMA_BUFFER_SIZE_DRAM0 > 0); + bool dram1_available = (ptr_dram1 != nullptr) && (IDMA_BUFFER_SIZE_DRAM1 > 0); + + // DMA has overhead - only beneficial for larger tensors + // Threshold: 1024 elements (~1KB for int8 input/output) + const size_t DMA_THRESHOLD = 1024; + bool use_dma = (numel >= DMA_THRESHOLD); + + // Strategy 1: Try ping-pong processing (2 input + 2 output buffers) + // Using 50/50 split: both int8/uint8 are 1 byte each + if (use_dma && dram0_available && dram1_available && (numel >= 2)) { + size_t per_buffer = (IDMA_BUFFER_SIZE_DRAM0 / 2); // 50% for int8 input (in bytes) + + // Check if 50/50 split fits in both DRAMs + if ((per_buffer > 0) && + ((IDMA_BUFFER_SIZE_DRAM0 / 2 + IDMA_BUFFER_SIZE_DRAM0 / 2) <= IDMA_BUFFER_SIZE_DRAM0) && + 
((IDMA_BUFFER_SIZE_DRAM1 / 2) > 0)) {
+
+        // Split each DRAM 50/50: input chunk in the first half, output chunk in the second half
+        inp_buff[0] = (int8_t*)ptr_dram0;
+        out_buff[0] = (uint8_t*)((uint8_t*)ptr_dram0 + (IDMA_BUFFER_SIZE_DRAM0 / 2));
+
+        inp_buff[1] = (int8_t*)ptr_dram1;
+        out_buff[1] = (uint8_t*)((uint8_t*)ptr_dram1 + (IDMA_BUFFER_SIZE_DRAM1 / 2));
+        // A chunk must fit the smaller of the two halves, otherwise the DRAM1 buffers overflow when DRAM1 is smaller than DRAM0
+        chunk_size = (per_buffer < (size_t)(IDMA_BUFFER_SIZE_DRAM1 / 2)) ? per_buffer : (size_t)(IDMA_BUFFER_SIZE_DRAM1 / 2);
+        ping_pong_process = true;
+      }
+    }
+
+    // Strategy 2: Fallback to ping-process-pong (1 input + 1 output buffer)
+    // Use full DRAM0 for input, full DRAM1 for output (no split needed)
+    if (use_dma && !ping_pong_process && dram0_available && dram1_available) {
+      size_t inp_capacity = IDMA_BUFFER_SIZE_DRAM0; // Full DRAM0 for int8 input (in bytes)
+      size_t out_capacity = IDMA_BUFFER_SIZE_DRAM1; // Full DRAM1 for uint8 output (in bytes)
+
+      if ((inp_capacity > 0) && (out_capacity >= inp_capacity)) {
+        inp_buff[0] = (int8_t*)ptr_dram0;
+        out_buff[0] = (uint8_t*)ptr_dram1;
+
+        chunk_size = (inp_capacity < out_capacity) ? inp_capacity : out_capacity;
+        ping_process_pong = true;
+      }
+    }
+
+    if (ping_pong_process || ping_process_pong) {
+      const int8_t* ptr_inp = in_data;
+
+      // Writeback input from cache to system memory before DMA reads
+      xthal_dcache_region_writeback((void*)in_data, sizeof(int8_t) * numel);
+
+      /* Initialize DMA Channel 0 (loads) and Channel 1 (stores) */
+      dma_2dm_init(0);
+      dma_2dm_init(1);
+
+      if (ping_pong_process) {
+        // Ping-pong processing for better throughput
+        size_t num_chunks = (numel + chunk_size - 1) / chunk_size;
+
+        if (num_chunks == 0) num_chunks = 1;
+
+        int32_t pp_swap = 0;
+
+        int8_t* ptr_in = (int8_t*)ptr_inp;
+        uint8_t* ptr_out = out_data;
+
+        // Load first chunk via ch0
+        size_t current_chunk = (numel < chunk_size) ?
numel : chunk_size;
+
+        dma_1dm(0, ptr_in, inp_buff[pp_swap], sizeof(int8_t) * current_chunk);
+
+        size_t remaining = numel - current_chunk;
+        ptr_in += current_chunk;
+
+        // Pipeline: each iteration prefetches the next chunk (ch0) while the current one is processed and then stored (ch1)
+        for (size_t i = 0; i < (num_chunks - 1); i++) {
+          size_t next_chunk = (remaining < chunk_size) ? remaining : chunk_size;
+
+          // Wait until the current chunk has landed in inp_buff[pp_swap]
+          idma_hw_wait_all(0);
+
+          // Start loading next chunk into alternate buffer via ch0
+          dma_1dm(0, ptr_in, inp_buff[pp_swap ^ 1], sizeof(int8_t) * next_chunk);
+
+          // Process current chunk (ch0 loads next in parallel)
+          int8_t* out_chunk_int8 = reinterpret_cast(out_buff[pp_swap]);
+          vrelU_quantized(out_chunk_int8, inp_buff[pp_swap], in_zp, out_zp, out_scale, (int)current_chunk);
+
+          // Drain the previous store (issued from the other out_buff) before queuing this one; this is also what frees that buffer for the next iteration
+          idma_hw_wait_all(1);
+
+          // Store result via ch1
+          dma_1dm(1, out_buff[pp_swap], ptr_out, sizeof(uint8_t) * current_chunk);
+
+          ptr_in += next_chunk;
+          ptr_out += current_chunk;
+          remaining -= next_chunk;
+          current_chunk = next_chunk;
+          pp_swap ^= 1;
+        }
+
+        // Last chunk: nothing left to prefetch, so wait for its load, process it, and store it
+        idma_hw_wait_all(0);
+        int8_t* out_last_int8 = reinterpret_cast(out_buff[pp_swap]);
+        vrelU_quantized(out_last_int8, inp_buff[pp_swap], in_zp, out_zp, out_scale, (int)current_chunk);
+
+        idma_hw_wait_all(1);
+        dma_1dm(1, out_buff[pp_swap], ptr_out, sizeof(uint8_t) * current_chunk);
+        idma_hw_wait_all(1);
+
+        // Invalidate output cache: DMA wrote to system memory, cache may have stale data
+        xthal_dcache_region_invalidate(out_data, sizeof(uint8_t) * numel);
+
+      }
+      else if (ping_process_pong) {
+        // Sequential processing through a single input/output buffer pair (inp_buff[0] / out_buff[0])
+        size_t remaining = numel;
+        int8_t* ptr_in = (int8_t*)ptr_inp;
+        uint8_t* ptr_out = out_data;
+
+        while (remaining > 0) {
+          size_t current_chunk = (remaining < chunk_size) ?
remaining : chunk_size; + + // Start load via ch0 (overlaps with any pending ch1 store) + dma_1dm(0, ptr_in, inp_buff[0], sizeof(int8_t) * current_chunk); + // Wait for previous store to complete (out_buff[0] safe to write) + idma_hw_wait_all(1); + // Wait for load to complete + idma_hw_wait_all(0); + + // Process + int8_t* out_chunk_int8 = reinterpret_cast(out_buff[0]); + vrelU_quantized(out_chunk_int8, inp_buff[0], in_zp, out_zp, out_scale, (int)current_chunk); + + // Store via ch1 + dma_1dm(1, out_buff[0], ptr_out, sizeof(uint8_t) * current_chunk); + + ptr_in += current_chunk; + ptr_out += current_chunk; + remaining -= current_chunk; + } + idma_hw_wait_all(1); + + // Invalidate output cache: DMA wrote to system memory, cache may have stale data + xthal_dcache_region_invalidate(out_data, sizeof(uint8_t) * numel); + + } + } else { + // Fallback: use SIMD function directly without DMA + // Writeback+invalidate input: ensures CPU-dirty data reaches system memory, + // then invalidate forces re-read from system memory (fresh data) + xthal_dcache_region_writeback((void*)in_data, sizeof(int8_t) * numel); + xthal_dcache_region_invalidate((void*)in_data, sizeof(int8_t) * numel); + // Use common parameters already computed above + + vrelU_quantized( + out_data_int8, + in_data, + in_zp, + out_zp, + out_scale, + (int)numel); + + // Writeback output from cache to system memory for DMA coherency + xthal_dcache_region_writeback(out_data, sizeof(uint8_t) * numel); + + } + } else { - ET_CHECK_MSG( - false, - "Unhandled input dtype %hhd", - static_cast(input.scalar_type())); + // Fallback: use generic implementation with template dispatching + +#define typed_quantized_relu(ctype, dtype) \ + case executorch::aten::ScalarType::dtype: { \ + quantized_relu_per_tensor_out_( \ + ctx, \ + input, \ + in_zero_point, \ + out_zero_point, \ + out_multiplier, \ + out_shift, \ + output); \ + break; \ + } + + executorch::aten::ScalarType dtype = input.scalar_type(); + switch (dtype) { + 
ET_FORALL_CADENCE_QUANTIZED_TYPES(typed_quantized_relu) + default: + ET_DCHECK_MSG( + false, "Unhandled dtype %s", torch::executor::toString(dtype)); + } + +#undef typed_quantized_relu + } } +// Per-channel quantized_relu_out (with Tensor parameters) template -void quantized_relu_per_tensor_out_( - __ET_UNUSED KernelRuntimeContext& ctx, +void quantized_relu_( const Tensor& input, - const int64_t in_zero_point, + const Tensor& in_zero_point, const int64_t out_zero_point, - const int64_t out_multiplier, - const int64_t out_shift, + const Tensor& out_multiplier, + const Tensor& out_shift, Tensor& output) { + T q_zero_point = in_zero_point.const_data_ptr()[0]; const T* __restrict__ in = input.const_data_ptr(); T* __restrict__ out = output.mutable_data_ptr(); - // Compute the out_scale from out_multiplier and out_shift - const float out_scale = -out_multiplier * 1.0 / (1 << 31) * pow(2, out_shift); + const int32_t* __restrict__ out_multiplier_data = + out_multiplier.const_data_ptr(); + const int32_t* __restrict__ out_shift_data = + out_shift.const_data_ptr(); + + const float out_scale = + -out_multiplier_data[0] * 1.0 / (1 << 31) * pow(2, out_shift_data[0]); for (size_t i = 0, e = input.numel(); i < e; ++i) { - const float temp = in[i] > in_zero_point ? (in[i] - in_zero_point) : 0; - out[i] = kernels::quantize(temp, out_scale, out_zero_point); + const T temp = in[i] > q_zero_point ? 
(in[i] - q_zero_point) : 0; + out[i] = generic::kernels::quantize(temp, out_scale, out_zero_point); } } -void quantized_relu_per_tensor_out( +Tensor& quantized_relu_out( KernelRuntimeContext& ctx, const Tensor& input, - const int64_t in_zero_point, + const Tensor& in_zero_point, const int64_t out_zero_point, - const int64_t out_multiplier, - const int64_t out_shift, + const Tensor& out_multiplier, + const Tensor& out_shift, Tensor& output) { -#define typed_quantized_relu(ctype, dtype) \ +#define typed_quantized_relu_ch(ctype, dtype) \ case executorch::aten::ScalarType::dtype: { \ - quantized_relu_per_tensor_out_( \ - ctx, \ + quantized_relu_( \ input, \ in_zero_point, \ out_zero_point, \ @@ -120,15 +359,16 @@ void quantized_relu_per_tensor_out( executorch::aten::ScalarType dtype = input.scalar_type(); switch (dtype) { - ET_FORALL_CADENCE_QUANTIZED_TYPES(typed_quantized_relu) + ET_FORALL_CADENCE_QUANTIZED_TYPES(typed_quantized_relu_ch) default: ET_DCHECK_MSG( false, "Unhandled dtype %s", torch::executor::toString(dtype)); } -#undef typed_quantized_relu +#undef typed_quantized_relu_ch + return output; } -}; // namespace native -}; // namespace vision -}; // namespace impl +} // namespace native +} // namespace vision +} // namespace impl diff --git a/backends/cadence/vision/operators/op_softmax.cpp b/backends/cadence/vision/operators/op_softmax.cpp index 58ca33c6a0b..3faf2bcd307 100644 --- a/backends/cadence/vision/operators/op_softmax.cpp +++ b/backends/cadence/vision/operators/op_softmax.cpp @@ -6,14 +6,11 @@ * LICENSE file in the root directory of this source tree. 
*/ -#include +#include #include #include #include #include -#include -#include -#include using executorch::aten::ScalarType; using executorch::aten::Tensor; @@ -30,6 +27,8 @@ Tensor& _softmax_out( int64_t dim, bool half_to_float, Tensor& out) { + + (void)ctx; ET_KERNEL_CHECK( @@ -42,9 +41,9 @@ Tensor& _softmax_out( ctx, resize_tensor(out, in.sizes()) == Error::Ok, InvalidArgument, out); ET_KERNEL_CHECK( - ctx, - executorch::runtime::tensors_have_same_dim_order(in, out), - InvalidArgument, + ctx, + executorch::runtime::tensors_have_same_dim_order(in, out), + InvalidArgument, out); // Adjust for negative dim @@ -65,11 +64,73 @@ Tensor& _softmax_out( bool ping_pong_process = false; bool ping_process_pong = false; - if ((d == in.dim() - 1)) { - if (size <= IDMA_BUFF_SIZE / 4 && in.dim() != 1) { - ping_pong_process = true; - } else if (size <= IDMA_BUFF_SIZE / 2) { - ping_process_pong = true; + float32_t *inp_buff[2]; + float32_t *out_buff[2]; + + if ((d == in.dim() - 1)){ + if ((4 * FLT32_SIZE * size <= (IDMA_BUFFER_SIZE_DRAM0 + IDMA_BUFFER_SIZE_DRAM1)) && (in.dim() != 1)){ + // For ping-pong processing we need to have enough buffer to hold 2 input and 2 output blocks + if (2 * FLT32_SIZE * size <= IDMA_BUFFER_SIZE_DRAM0 && 2 * FLT32_SIZE * size <= IDMA_BUFFER_SIZE_DRAM1){ + // Both DRAM0 and DRAM1 can hold 2 input and 2 output blocks + inp_buff[0] = (float32_t *)ptr_dram0; + inp_buff[1] = (float32_t *)ptr_dram1; + out_buff[0] = (float32_t *)(ptr_dram0) + size; + out_buff[1] = (float32_t *)(ptr_dram1) + size; + ping_pong_process = true; + } + else if (4 * FLT32_SIZE * size <= IDMA_BUFFER_SIZE_DRAM0){ + // DRAM0 can hold 2 input and 2 output blocks + inp_buff[0] = (float32_t *)ptr_dram0; + inp_buff[1] = (float32_t *)(ptr_dram0) + size; + out_buff[0] = (float32_t *)(ptr_dram0) + 2 * size; + out_buff[1] = (float32_t *)(ptr_dram0) + 3 * size; + ping_pong_process = true; + } + else if (4 * FLT32_SIZE * size <= IDMA_BUFFER_SIZE_DRAM1){ + // DRAM1 can hold 2 input and 2 
output blocks + inp_buff[0] = (float32_t *)ptr_dram1; + inp_buff[1] = (float32_t *)(ptr_dram1) + size; + out_buff[0] = (float32_t *)(ptr_dram1) + 2 * size; + out_buff[1] = (float32_t *)(ptr_dram1) + 3 * size; + ping_pong_process = true; + } + else if (3 * FLT32_SIZE * size <= IDMA_BUFFER_SIZE_DRAM0 && FLT32_SIZE * size <= IDMA_BUFFER_SIZE_DRAM1){ + // DRAM0 can hold 2 output and 1 input blocks, DRAM1 can hold 1 input block + inp_buff[0] = (float32_t *)ptr_dram0; + inp_buff[1] = (float32_t *)ptr_dram1; + out_buff[0] = (float32_t *)(ptr_dram0) + size; + out_buff[1] = (float32_t *)(ptr_dram0) + 2 * size; + ping_pong_process = true; + } + else if (FLT32_SIZE * size <= IDMA_BUFFER_SIZE_DRAM0 && 3 * FLT32_SIZE * size <= IDMA_BUFFER_SIZE_DRAM1){ + // DRAM1 can hold 2 output and 1 input blocks, DRAM0 can hold 1 input block + inp_buff[0] = (float32_t *)ptr_dram0; + inp_buff[1] = (float32_t *)ptr_dram1; + out_buff[0] = (float32_t *)(ptr_dram1) + size; + out_buff[1] = (float32_t *)(ptr_dram1) + 2 * size; + ping_pong_process = true; + } + } + else if (2 * FLT32_SIZE * size <= (IDMA_BUFFER_SIZE_DRAM0 + IDMA_BUFFER_SIZE_DRAM1)){ + // For ping-process-pong we need to have enough buffer to hold 1 input and 1 output block + if (FLT32_SIZE * size <= IDMA_BUFFER_SIZE_DRAM0 && FLT32_SIZE * size <= IDMA_BUFFER_SIZE_DRAM1){ + // Both DRAM0 and DRAM1 can hold 1 input and 1 output block + inp_buff[0] = (float32_t *)ptr_dram0; + out_buff[0] = (float32_t *)ptr_dram1; + ping_process_pong = true; + } + else if (2 * FLT32_SIZE * size <= IDMA_BUFFER_SIZE_DRAM0){ + // DRAM0 can hold 1 input and 1 output block + inp_buff[0] = (float32_t *)ptr_dram0; + out_buff[0] = (float32_t *)(ptr_dram0) + size; + ping_process_pong = true; + } + else if (2 * FLT32_SIZE * size <= IDMA_BUFFER_SIZE_DRAM1){ + // DRAM1 can hold 1 input and 1 output block + inp_buff[0] = (float32_t *)ptr_dram1; + out_buff[0] = (float32_t *)(ptr_dram1) + size; + ping_process_pong = true; + } } } @@ -79,20 +140,16 @@ Tensor& 
_softmax_out( if (in.dim() > MaxDim) optimized = false; - if (optimized) { - const float* ptr_inp = (float*)in.const_data_ptr(); - float* out_data = (float*)out.mutable_data_ptr(); - - /* Channel 0*/ - idma_init(0, 0, MAX_BLOCK_16, 8, TICK_CYCLES_1, 0, NULL); - idma_init_loop(0, descbuf[0], IDMA_2D_DESC, 1, NULL, NULL); + if (optimized){ + const float32_t *ptr_inp = (float32_t *)in.const_data_ptr(); + float32_t *out_data = (float32_t *)out.mutable_data_ptr(); - /* Channel 1*/ - idma_init(1, 0, MAX_BLOCK_16, 8, TICK_CYCLES_1, 0, NULL); - idma_init_loop(1, descbuf[1], IDMA_2D_DESC, 1, NULL, NULL); + /* Initialize DMA Channel 0 (loads) and Channel 1 (stores) */ + dma_2dm_init(0); + dma_2dm_init(1); if (ping_pong_process) { - for (int i = 0; i < in.dim(); i++) { + for (int i = 0; i < in.dim(); i++){ if (i != d) outer_size *= in.size(i); } @@ -100,60 +157,47 @@ Tensor& _softmax_out( outer_stride = size; stride = size; - int pp_swap = 0; + int32_t pp_swap = 0; - float32_t* ptr_out = out_data; - float32_t* ptr_in = (float32_t*)ptr_inp; + float32_t *ptr_out = out_data; + float32_t *ptr_in = (float32_t *) ptr_inp; - idma_copy_2d_desc( - 0, inpData[pp_swap], ptr_in, 4 * stride, DESC_IDMA_PRIOR_H, 1, 0, 0); - pp_swap = 1; + // Load first chunk via ch0 + dma_1dm(0, ptr_in, inp_buff[pp_swap], 4 * stride); - for (int i = 0; i < (outer_size - 1); i++) { - IDMA_HW_WAIT_ALL(0); - ptr_in += outer_stride; - idma_copy_2d_desc( - 0, - inpData[pp_swap], - ptr_in, - 4 * stride, - DESC_IDMA_PRIOR_H, - 1, - 0, - 0); - pp_swap = pp_swap ^ 1; - - /* PROCESS CALL */ - vsoftmaxf(outData[pp_swap], inpData[pp_swap], stride); - - IDMA_HW_WAIT_ALL(1); - idma_copy_2d_desc( - 1, - ptr_out, - outData[pp_swap], - 4 * stride, - DESC_IDMA_PRIOR_H, - 1, - 0, - 0); - ptr_out += outer_stride; - } + for (int i = 0; i < (outer_size - 1); i++){ + // Wait for current load to complete + idma_hw_wait_all(0); + + ptr_in += outer_stride; + // Start loading next chunk into alternate buffer via ch0 + dma_1dm(0, 
ptr_in, inp_buff[pp_swap ^ 1], 4 * stride); + + /* PROCESS CALL */ + vsoftmaxf(out_buff[pp_swap], inp_buff[pp_swap], stride); - IDMA_HW_WAIT_ALL(0); - pp_swap = pp_swap ^ 1; + // Wait for previous store to complete before reusing out_buff + idma_hw_wait_all(1); + + // Store result via ch1 + dma_1dm(1, out_buff[pp_swap], ptr_out, 4 * stride); + ptr_out += outer_stride; + + pp_swap ^= 1; + } - /* PROCESS CALL */ - vsoftmaxf(outData[pp_swap], inpData[pp_swap], stride); + // Process last chunk + idma_hw_wait_all(0); + vsoftmaxf(out_buff[pp_swap], inp_buff[pp_swap], stride); - IDMA_HW_WAIT_ALL(1); - idma_copy_2d_desc( - 1, ptr_out, outData[pp_swap], 4 * stride, DESC_IDMA_PRIOR_H, 1, 0, 0); + idma_hw_wait_all(1); + dma_1dm(1, out_buff[pp_swap], ptr_out, 4 * stride); + idma_hw_wait_all(1); - IDMA_HW_WAIT_ALL(1); return out; } else if (ping_process_pong) { - for (int i = 0; i < in.dim(); i++) { + for (int i = 0; i < in.dim(); i++){ if (i != d) outer_size *= in.size(i); } @@ -161,23 +205,27 @@ Tensor& _softmax_out( outer_stride = size; stride = size; - float32_t* ptr_out = out_data; - float32_t* ptr_in = (float32_t*)ptr_inp; + float32_t *ptr_out = out_data; + float32_t *ptr_in = (float32_t *) ptr_inp; - for (int i = 0; i < outer_size; i++) { - idma_copy_2d_desc( - 0, data_dram0, ptr_in, 4 * stride, DESC_IDMA_PRIOR_H, 1, 0, 0); - IDMA_HW_WAIT_ALL(0); + for (int i = 0; i < outer_size; i++){ + // Start load via ch0 (overlaps with any pending ch1 store) + dma_1dm(0, ptr_in, inp_buff[0], 4 * stride); + // Wait for previous store to complete + idma_hw_wait_all(1); + // Wait for load to complete + idma_hw_wait_all(0); - vsoftmaxf(data_dram1, data_dram0, stride); + vsoftmaxf(out_buff[0], inp_buff[0], stride); - idma_copy_2d_desc( - 1, ptr_out, data_dram1, 4 * stride, DESC_IDMA_PRIOR_H, 1, 0, 0); - IDMA_HW_WAIT_ALL(1); + // Store via ch1 + dma_1dm(1, out_buff[0], ptr_out, 4 * stride); ptr_in += outer_stride; - ptr_out += outer_stride; - } + ptr_out += outer_stride; + } + 
idma_hw_wait_all(1); + return out; } else { @@ -207,45 +255,51 @@ Tensor& _softmax_out( outer_stride = size; - float* ptr_out = (float*)kernels::allocate_temp_memory( - ctx, out.numel() * sizeof(float)); + executorch::runtime::Result temp_mem_res = ctx.allocate_temp(out.numel() * sizeof(float)); + float* ptr_out = + (float*)(temp_mem_res.ok() ? temp_mem_res.get() : nullptr); ET_KERNEL_CHECK(ctx, ptr_out != nullptr, MemoryAllocationFailed, out); - float* ptr_out1 = (float*)kernels::allocate_temp_memory( - ctx, out.numel() * sizeof(float)); + executorch::runtime::Result temp_mem_res1 = ctx.allocate_temp(out.numel() * sizeof(float)); + float* ptr_out1 = + (float*)(temp_mem_res1.ok() ? temp_mem_res1.get() : nullptr); ET_KERNEL_CHECK(ctx, ptr_out1 != nullptr, MemoryAllocationFailed, out); tensor_transposef( - ptr_out, - ptr_out_shape, - ptr_inp, - ptr_inp_shape, - ptr_permute_vec, - num_out_dims, - num_inp_dims); + ptr_out, + ptr_out_shape, + ptr_inp, + ptr_inp_shape, + ptr_permute_vec, + num_out_dims, + num_inp_dims); for (size_t outer_idx = 0; outer_idx < outer_size; ++outer_idx) { size_t outer = outer_idx * outer_stride; for (size_t inner_idx = 0; inner_idx < stride; ++inner_idx) { size_t base = outer + inner_idx; - - float* ptr_in_data = &ptr_out[base]; - float* ptr_out_data = &ptr_out1[base]; + + float *ptr_in_data = &ptr_out[base]; + float *ptr_out_data = &ptr_out1[base]; vsoftmaxf(ptr_out_data, ptr_in_data, size); } } tensor_transposef( - out_data, - ptr_inp_shape, - ptr_out1, - ptr_out_shape, - ptr_permute_vec, - num_out_dims, - num_inp_dims); + out_data, + ptr_inp_shape, + ptr_out1, + ptr_out_shape, + ptr_permute_vec, + num_out_dims, + num_inp_dims); + + // Writeback output from cache to system memory for DMA coherency + xthal_dcache_region_writeback(out_data, sizeof(float) * in.numel()); + return out; } @@ -270,13 +324,13 @@ Tensor& _softmax_out( size, stride); - const CTYPE temp_sum = + const CTYPE temp_sum = torch::executor::apply_unary_map_reduce_fn( 
[max_in](const CTYPE val_in) { - return std::exp(val_in - max_in); + return std::exp(val_in - max_in); }, [](const CTYPE mapped_in, CTYPE val_accum) { - return val_accum + mapped_in; + return val_accum + mapped_in; }, in_data + base, size, @@ -295,6 +349,7 @@ Tensor& _softmax_out( dim); }); + return out; } diff --git a/backends/cadence/vision/third-party/CMakeLists.txt b/backends/cadence/vision/third-party/CMakeLists.txt new file mode 100644 index 00000000000..12530d95322 --- /dev/null +++ b/backends/cadence/vision/third-party/CMakeLists.txt @@ -0,0 +1,101 @@ +cmake_minimum_required(VERSION 3.10.0) +project(cadence_vision) + +# Collect all source files from the library directory +file(GLOB_RECURSE VISION_LIB_SOURCES + "${CMAKE_CURRENT_SOURCE_DIR}/library/api/*.c" + "${CMAKE_CURRENT_SOURCE_DIR}/library/tables/*.c" + "${CMAKE_CURRENT_SOURCE_DIR}/library/dma.c" + "${CMAKE_CURRENT_SOURCE_DIR}/library/memory_manager.c" + "${CMAKE_CURRENT_SOURCE_DIR}/library/utils.c" +) + +# Create the vision library +add_library(xa_nnlib STATIC ${VISION_LIB_SOURCES}) + +# Set include directories +target_include_directories(xa_nnlib PUBLIC + ${CMAKE_CURRENT_SOURCE_DIR}/include +) + +target_include_directories(xa_nnlib PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR}/include_private +) + +# Set properties for the library +set_target_properties(xa_nnlib PROPERTIES + OUTPUT_NAME "xa_nnlib" + ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/bin" +) + +# Create output directories +file(MAKE_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/bin") + +# ============================================================================ +# libxai_common - Common utilities and data types +# ============================================================================ +set(LIBXAI_COMMON_SOURCES + ${CMAKE_CURRENT_SOURCE_DIR}/libxai_common/src/xai_buildinfo.c + ${CMAKE_CURRENT_SOURCE_DIR}/libxai_common/src/xai_errstr.c + ${CMAKE_CURRENT_SOURCE_DIR}/libxai_common/src/cnn_cast.c +) + +set(LIBXAI_COMMON_INCLUDE_DIRS + 
${CMAKE_CURRENT_SOURCE_DIR}/libxai_common/include + ${CMAKE_CURRENT_SOURCE_DIR}/libxai_common/src +) + +add_library(xai_common STATIC ${LIBXAI_COMMON_SOURCES}) +target_include_directories(xai_common PUBLIC ${LIBXAI_COMMON_INCLUDE_DIRS}) +# Allow XAI kernels to operate on system memory (not just local DRAM). +# Required for cache-mode convolution variants that pass system memory pointers. +target_compile_definitions(xai_common PUBLIC SYS_MEM_TESTING=1) + +# ============================================================================ +# libxai - CNN kernels library +# ============================================================================ +set(LIBXAI_SOURCES + # Main convolution dispatcher (contains xaiConvolved3D) + ${CMAKE_CURRENT_SOURCE_DIR}/libxai/cnn/src/cnn_conv.c + # Convolution dispatcher and variants + ${CMAKE_CURRENT_SOURCE_DIR}/libxai/cnn/src/cnn_conv_VQ.c + ${CMAKE_CURRENT_SOURCE_DIR}/libxai/cnn/src/cnn_conv_MOD.c + ${CMAKE_CURRENT_SOURCE_DIR}/libxai/cnn/src/cnn_conv_MOW.c + ${CMAKE_CURRENT_SOURCE_DIR}/libxai/cnn/src/cnn_conv_SO.c + # Dilated convolution VQ variants + ${CMAKE_CURRENT_SOURCE_DIR}/libxai/cnn/src/cnn_dilated_conv_VQ_MOW.c + ${CMAKE_CURRENT_SOURCE_DIR}/libxai/cnn/src/cnn_dilated_conv_VQ_MOD.c + ${CMAKE_CURRENT_SOURCE_DIR}/libxai/cnn/src/cnn_dilated_conv_VQ_MOD_S16.c + ${CMAKE_CURRENT_SOURCE_DIR}/libxai/cnn/src/cnn_dilated_conv_VQ_MOW_S16.c + ${CMAKE_CURRENT_SOURCE_DIR}/libxai/cnn/src/cnn_dilated_conv_VQ_SO.c + # Dilated convolution non-VQ variants + ${CMAKE_CURRENT_SOURCE_DIR}/libxai/cnn/src/cnn_dilated_conv_MOW.c + ${CMAKE_CURRENT_SOURCE_DIR}/libxai/cnn/src/cnn_dilated_conv_MOD.c + ${CMAKE_CURRENT_SOURCE_DIR}/libxai/cnn/src/cnn_dilated_conv_MOD_S16.c + ${CMAKE_CURRENT_SOURCE_DIR}/libxai/cnn/src/cnn_dilated_conv_MOW_S16.c + ${CMAKE_CURRENT_SOURCE_DIR}/libxai/cnn/src/cnn_dilated_conv_SO.c + # Data transform and helpers + ${CMAKE_CURRENT_SOURCE_DIR}/libxai/cnn/src/cnn_datatransform.c + 
${CMAKE_CURRENT_SOURCE_DIR}/libxai/cnn/src/cnn_helper.c +) + +set(LIBXAI_INCLUDE_DIRS + ${CMAKE_CURRENT_SOURCE_DIR}/libxai/include + ${CMAKE_CURRENT_SOURCE_DIR}/libxai/cnn/src +) + +add_library(xai STATIC ${LIBXAI_SOURCES}) +target_include_directories(xai PUBLIC ${LIBXAI_INCLUDE_DIRS}) +target_include_directories(xai PRIVATE ${LIBXAI_COMMON_INCLUDE_DIRS}) +target_link_libraries(xai PUBLIC xai_common) + +# ============================================================================ +# Export variables for parent CMakeLists.txt +# ============================================================================ +set(XAI_INCLUDE_DIRS + ${LIBXAI_INCLUDE_DIRS} + ${LIBXAI_COMMON_INCLUDE_DIRS} + CACHE INTERNAL "XAI include directories" +) + +set(XAI_LIBRARIES xai xai_common CACHE INTERNAL "XAI libraries") diff --git a/backends/cadence/vision/third-party/dummy.c b/backends/cadence/vision/third-party/dummy.c deleted file mode 100644 index 52fb7c18c38..00000000000 --- a/backends/cadence/vision/third-party/dummy.c +++ /dev/null @@ -1,17 +0,0 @@ -/* Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -/* Dummy source file for non-Xtensa builds - * This file is used when building the vision-nnlib library on platforms - * other than Xtensa, providing empty stubs for compatibility. - * The actual function implementations are provided as stubs via DISCARD_FUN - * in headers when COMPILER_XTENSA is not defined. - */ - -// This file intentionally contains no function definitions and no includes. -// When COMPILER_XTENSA is not defined, all functions are stubbed out -// using the DISCARD_FUN macro in the header files. 
diff --git a/backends/cadence/vision/third-party/include/api.h b/backends/cadence/vision/third-party/include/api.h index efb80c3d76d..b89ab2ac263 100644 --- a/backends/cadence/vision/third-party/include/api.h +++ b/backends/cadence/vision/third-party/include/api.h @@ -69,12 +69,65 @@ N multiple of BBE_SIMD_WIDTH (vsoftmax) void vsoftmaxf(float32_t *y, const float32_t *x, int N); void tensor_transposef(float32_t *restrict ptr_out - ,const int *const ptr_out_shape - ,const float32_t *restrict ptr_inp - ,const int *const ptr_inp_shape - ,const int *restrict ptr_permute_vec - ,int num_out_dims - ,int num_inp_dims); + ,const int *const ptr_out_shape + ,const float32_t *restrict ptr_inp + ,const int *const ptr_inp_shape + ,const int *restrict ptr_permute_vec + ,int num_out_dims + ,int num_inp_dims); + +void quantize_f32_asym8s(int8_t *restrict ptr_out + ,const float32_t *restrict ptr_inp + ,float32_t scale + ,int zero_bias + ,int N); + +void dequantize_asym8s_f32(float32_t *restrict ptr_out + ,const int8_t *restrict ptr_inp + ,float32_t scale + ,int zero_bias + ,int N); + +void maxpool2d_with_indices_j2x2_f32(float32_t *restrict ptr_out + ,const float32_t *restrict ptr_inp + ,int *restrict ptr_indices + ,int inp_height ,int inp_width + ,int out_height ,int out_width + ,int32_t in_pitch_width, int32_t in_pitch_height + ,int32_t out_pitch_width, int32_t out_pitch_height + ,uint8_t kernel_height + ,uint8_t kernel_width); + +void maxpool2d_j2x2_f32(float32_t *restrict ptr_out + ,const float32_t *restrict ptr_inp + ,int inp_height ,int inp_width + ,int out_height ,int out_width + ,int32_t in_pitch_width, int32_t in_pitch_height + ,int32_t out_pitch_width, int32_t out_pitch_height + ,uint8_t kernel_height + ,uint8_t kernel_width); + +void vrelU_quantized( + int8_t* restrict ptr_out, + const int8_t* restrict ptr_inp, + int32_t in_zero_point, + int32_t out_zero_point, + float32_t out_scale, + int N); +void rvaddf(float32_t *restrict z, const float32_t *restrict x, + const 
float32_t *restrict y, int N); + +void simd_mean_pool_2x2_to_1x1_float32(float32_t* restrict output, + const float32_t* restrict input, + int N); + +int32_t rvdot_zeropt( + int32_t init_acc, + const int8_t *restrict x, + const int8_t *restrict y, + int8_t x_zp, + int8_t y_zp, + int N); #ifdef __cplusplus }; diff --git a/backends/cadence/vision/third-party/include/dma.h b/backends/cadence/vision/third-party/include/dma.h new file mode 100644 index 00000000000..6e368bccd91 --- /dev/null +++ b/backends/cadence/vision/third-party/include/dma.h @@ -0,0 +1,42 @@ +/* + * dma.h + * + * Created on: Oct 30, 2025 + * Author: sraut + */ + +#ifndef __DMA_H__ +#define __DMA_H__ + +// Enable DMA for cache-mode input copy (instead of xaiCopyTile3D) +// NOTE: Requires AXI-to-AXI DMA support on the target core +// Uncomment to use DMA 3D transfer in cache executors +// #define USE_DMA_FOR_CACHE_COPY + +#define IDMA_USE_INTR 0 +#define IDMA_USE_MULTICHANNEL 1 +#define CHL_MAX 2 +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +// DMA initialization functions +void dma_2dm_init(int ch); +void dma_3dm_init(int ch); + +// DMA transfer functions +void dma_1dm(int ch,void *_psrc,void *_pdst, int num_bytes); +void dma_2dm(int ch, void *_psrc, void *_pdst, int src_stride, int dst_stride, + int num_bytes, short num_lines); +void dma_3dm(int ch, void *src, void *dst, int src_row_pitch, int dst_row_pitch, + int src_tile_pitch, int dst_tile_pitch, int row_sz, + int nrows, int ntiles) ; + +#ifdef __cplusplus +} +#endif + +#endif /* __DMA_H__ */ diff --git a/backends/cadence/vision/third-party/include/dtypes.h b/backends/cadence/vision/third-party/include/dtypes.h index c12bbf23ac2..1b94594d216 100644 --- a/backends/cadence/vision/third-party/include/dtypes.h +++ b/backends/cadence/vision/third-party/include/dtypes.h @@ -164,23 +164,38 @@ #define inline_ static inline #endif +#ifndef MAX_FLT32 +#define MAX_FLT32 (3.402823466e+38F) +#endif +#ifndef MIN_FLT32 +#define MIN_FLT32 (- 
MAX_FLT32) +#endif +#ifndef MIN_ABS_FLT32 +#define MIN_ABS_FLT32 (1.175494351e-38F) +#endif +#ifndef MAX_INT8 +#define MAX_INT8 (0x7f) +#endif +#ifndef MIN_INT8 +#define MIN_INT8 (- MAX_INT8 - 1) +#endif #ifndef MAX_INT16 -#define MAX_INT16 ((int16_t)0x7FFF) +#define MAX_INT16 (0x7FFF) #endif #ifndef MIN_INT16 -#define MIN_INT16 ((int16_t)0x8000) +#define MIN_INT16 (- MAX_INT16 - 1) /* bare 0x8000 is +32768 (int), not -32768 */ #endif #ifndef MAX_INT32 -#define MAX_INT32 ((int32_t)0x7FFFFFFFL) +#define MAX_INT32 (0x7FFFFFFFL) #endif #ifndef MIN_INT32 -#define MIN_INT32 ((int32_t)0x80000000L) +#define MIN_INT32 (- MAX_INT32 - 1) /* bare 0x80000000L is a positive constant (unsigned long on ILP32) */ #endif #ifndef MIN_INT64 -#define MIN_INT64 ((int64_t)0x8000000000000000LL) +#define MIN_INT64 (- MAX_INT64 - 1) /* bare 0x8000000000000000LL has type unsigned long long */ #endif #ifndef MAX_INT64 -#define MAX_INT64 ((int64_t)0x7fffffffffffffffLL) +#define MAX_INT64 (0x7fffffffffffffffLL) #endif /* size of variables in bytes */ @@ -190,6 +205,22 @@ #define SIZEOF_BYTE(x) sizeof(x) #endif +#ifndef FLT32_SIZE +#define FLT32_SIZE 4 +#endif +#ifndef INT8_SIZE +#define INT8_SIZE 1 +#endif +#ifndef INT16_SIZE +#define INT16_SIZE 2 +#endif +#ifndef INT32_SIZE +#define INT32_SIZE 4 +#endif +#ifndef INT64_SIZE +#define INT64_SIZE 8 +#endif + /*--------------------------------------- special keywords definition restrict keyword means that the memory diff --git a/backends/cadence/vision/third-party/include/dump_tensor.h b/backends/cadence/vision/third-party/include/dump_tensor.h new file mode 100644 index 00000000000..ab2bb219289 --- /dev/null +++ b/backends/cadence/vision/third-party/include/dump_tensor.h @@ -0,0 +1,70 @@ +/* + * Dump output tensor data after each operator for layer-by-layer comparison. + * Include with: #include + * + * Output format: + * LAYER_DUMP : : : dtype= : first=[v0,v1,...] 
: sum= : min= : max= + * + * Compare generic vs optimized: + * grep LAYER_DUMP generic.log > gen_dump.txt + * grep LAYER_DUMP opt.log > opt_dump.txt + * diff gen_dump.txt opt_dump.txt + */ +#pragma once + +#include +#include +#include + +/* ScalarType values: Byte=0, Char=1, Short=2, Int=3, Long=4, Half=5, Float=6 */ + +#define _DUMP_N 16 /* number of leading values to print */ + +#define DUMP_TENSOR(name, tensor) do { \ + const auto _dn = (tensor).numel(); \ + const int _dt = (int)(tensor).scalar_type(); \ + printf("LAYER_DUMP : %s : %d : dtype=%d", #name, (int)_dn, _dt); \ + if (_dt == 6) { /* Float */ \ + const float* _dp = (tensor).const_data_ptr(); \ + int _k = (int)_dn < _DUMP_N ? (int)_dn : _DUMP_N; \ + printf(" : first=["); \ + for (int _i = 0; _i < _k; _i++) printf("%s%.6f", _i?",":"", _dp[_i]); \ + printf("]"); \ + double _sum = 0; float _lo = _dp[0], _hi = _dp[0]; \ + for (int _i = 0; _i < (int)_dn; _i++) { \ + _sum += _dp[_i]; \ + if (_dp[_i] < _lo) _lo = _dp[_i]; \ + if (_dp[_i] > _hi) _hi = _dp[_i]; \ + } \ + printf(" : sum=%.4f : min=%.6f : max=%.6f", _sum, _lo, _hi); \ + } else if (_dt == 1) { /* Char / int8 */ \ + const int8_t* _dp = (tensor).const_data_ptr(); \ + int _k = (int)_dn < _DUMP_N ? (int)_dn : _DUMP_N; \ + printf(" : first=["); \ + for (int _i = 0; _i < _k; _i++) printf("%s%d", _i?",":"", (int)_dp[_i]); \ + printf("]"); \ + int64_t _sum = 0; int _lo = _dp[0], _hi = _dp[0]; \ + for (int _i = 0; _i < (int)_dn; _i++) { \ + _sum += _dp[_i]; \ + if (_dp[_i] < _lo) _lo = _dp[_i]; \ + if (_dp[_i] > _hi) _hi = _dp[_i]; \ + } \ + printf(" : sum=%lld : min=%d : max=%d", (long long)_sum, _lo, _hi); \ + } else if (_dt == 0) { /* Byte / uint8 */ \ + const uint8_t* _dp = (tensor).const_data_ptr(); \ + int _k = (int)_dn < _DUMP_N ? 
(int)_dn : _DUMP_N; \ + printf(" : first=["); \ + for (int _i = 0; _i < _k; _i++) printf("%s%u", _i?",":"", (unsigned)_dp[_i]); \ + printf("]"); \ + int64_t _sum = 0; int _lo = _dp[0], _hi = _dp[0]; \ + for (int _i = 0; _i < (int)_dn; _i++) { \ + _sum += _dp[_i]; \ + if ((int)_dp[_i] < _lo) _lo = _dp[_i]; \ + if ((int)_dp[_i] > _hi) _hi = _dp[_i]; \ + } \ + printf(" : sum=%lld : min=%d : max=%d", (long long)_sum, _lo, _hi); \ + } else { \ + printf(" : (unsupported dtype)"); \ + } \ + printf("\n"); \ +} while(0) diff --git a/backends/cadence/vision/third-party/include/lib.h b/backends/cadence/vision/third-party/include/lib.h new file mode 100644 index 00000000000..4a7e31ee92b --- /dev/null +++ b/backends/cadence/vision/third-party/include/lib.h @@ -0,0 +1,72 @@ +/* ------------------------------------------------------------------------ */ +/* Copyright (c) 2024 by Cadence Design Systems, Inc. ALL RIGHTS RESERVED. */ +/* These coded instructions, statements, and computer programs ('Cadence */ +/* Libraries') are the copyrighted works of Cadence Design Systems Inc. */ +/* Cadence IP is licensed for use with Cadence processor cores only and */ +/* must not be used for any other processors and platforms. Your use of the */ +/* Cadence Libraries is subject to the terms of the license agreement you */ +/* have entered into with Cadence Design Systems, or a sublicense granted */ +/* to you by a direct Cadence licensee. */ +/* ------------------------------------------------------------------------ */ + +#ifndef __LIB_H__ +#define __LIB_H__ + +#include "dtypes.h" +#include "api.h" +#include + +#include "dma.h" +#include "memory_manager.h" +#include "utils.h" + +#if defined COMPILER_XTENSA + +#include +#include +#define IVP_SIMD_WIDTH XCHAL_IVPN_SIMD_WIDTH + +// Performance measurement macros +#define XTPERF_PRINTF(...) 
printf(__VA_ARGS__) +#define TIME_DECL(test) long start_time_##test, end_time_##test; +#define TIME_START(test) { start_time_##test = XT_RSR_CCOUNT(); } +#define TIME_END(test) { end_time_##test = XT_RSR_CCOUNT(); } +#define TIME_DISPLAY(test, opcnt, opname) { long long cycles_##test = end_time_##test - start_time_##test; \ + XTPERF_PRINTF("PERF_LOG : %s : %d : %s : %lld : cycles : %.2f : %s/cycle : %.2f : cycles/%s\n", \ + #test, opcnt, opname, cycles_##test, cycles_##test == 0 ? 0 : (double)(opcnt)/cycles_##test, \ + opname, cycles_##test == 0 ? 0 : 1/((double)(opcnt)/cycles_##test), opname); } + + +// // IDMA Initializations and declarations +// #if XCHAL_HAVE_IDMA +// #ifndef IDMA_USE_MULTICHANNEL +// #define IDMA_USE_MULTICHANNEL 1 +// #endif +// #ifndef CHL_MAX +// #define CHL_MAX 2 +// #endif +// #include +// #endif + +// #ifndef DRAM0_BUFF_SIZE // To be defined at compile time +// #error "DRAM0_BUFF_SIZE not defined" +// #endif + +// #ifndef DRAM1_BUFF_SIZE // To be defined at compile time +// #error "DRAM1_BUFF_SIZE not defined" +// #endif + +// #ifndef PLACE_IN_DRAM0 +// #define PLACE_IN_DRAM0 __attribute__ ((aligned(2*IVP_SIMD_WIDTH), section(".dram0.data"))) +// #endif + +// #ifndef PLACE_IN_DRAM1 +// #define PLACE_IN_DRAM1 __attribute__ ((aligned(2*IVP_SIMD_WIDTH), section(".dram1.data"))) +// #endif + +extern void *ptr_dram0; +extern void *ptr_dram1; + +#endif // COMPILER_XTENSA + +#endif // __LIB_H__ \ No newline at end of file diff --git a/backends/cadence/vision/third-party/include/memory_manager.h b/backends/cadence/vision/third-party/include/memory_manager.h new file mode 100644 index 00000000000..5430d075042 --- /dev/null +++ b/backends/cadence/vision/third-party/include/memory_manager.h @@ -0,0 +1,69 @@ +/* + * memory_manager.h + * + * Created on: Nov 6, 2025 + * Author: sraut + * + * Description: Dynamic memory allocator for DRAM0, DRAM1, and local SRAM regions + * Provides simple arena-style allocation with 64-byte alignment + */ + +#ifndef 
MEMORY_MANAGER_H_ +#define MEMORY_MANAGER_H_ + +#include +#include +#include +#include "../../operators/layer_configs.h" // For IDMA_BUFFER_SIZE_DRAM0/DRAM1 + +// ============================================================================ +// Memory Configuration +// ============================================================================ + +// Cache-mode padded input buffer size (in system memory) +// Must fit the largest padded input tensor for cache-mode layers +// For ResNet layers that don't fit in DRAM tiling (e.g., 56x56x128) +#ifndef CACHE_PADDED_INPUT_SIZE +#define CACHE_PADDED_INPUT_SIZE (1024 * 1024) // 1 MB max +#endif + +// ============================================================================ +// Dynamic Memory Allocator for DRAM0 and DRAM1 +// ============================================================================ + +// Memory pools placed in specific DRAM sections +// Declared extern here, defined in memory_manager.c +extern uint8_t dram0_pool[IDMA_BUFFER_SIZE_DRAM0]; +extern uint8_t dram1_pool[IDMA_BUFFER_SIZE_DRAM1]; + +// Cache-mode padded input buffer (in system memory) +// Used by cache-mode kernels for edge padding +extern int8_t cache_padded_input[CACHE_PADDED_INPUT_SIZE]; + +/** + * @brief Allocate DRAM buffer with SIMD alignment + * @param size Size in bytes to allocate + * @param dram_bank Which DRAM bank (0 or 1) + * @param dram0_used Pointer to current dram0 usage counter + * @param dram1_used Pointer to current dram1 usage counter + * @return Pointer to allocated buffer + */ +int8_t* allocate_dram_buffer(int size, int dram_bank, int* dram0_used, int* dram1_used); + +/** + * @brief Get pointer to cache-mode padded input buffer + * @return Pointer to the padded input buffer (aligned, in system memory) + */ +static inline int8_t* get_cache_padded_input(void) { + return cache_padded_input; +} + +/** + * @brief Get size of cache-mode padded input buffer + * @return Size in bytes + */ +static inline size_t 
get_cache_padded_input_size(void) { + return CACHE_PADDED_INPUT_SIZE; +} + +#endif /* MEMORY_MANAGER_H_ */ diff --git a/backends/cadence/vision/third-party/include/utils.h b/backends/cadence/vision/third-party/include/utils.h new file mode 100644 index 00000000000..eb659c291c8 --- /dev/null +++ b/backends/cadence/vision/third-party/include/utils.h @@ -0,0 +1,182 @@ +/* + * utils.h + * + * Created on: Nov 4, 2025 + * Author: sraut + */ + +#ifndef UTILS_H_ +#define UTILS_H_ + +#include +#include +#include "../libxai_common/include/xai_tile_manager.h" + + +/** + * @brief Increment iterator to temp with carry + * @param temp Pointer to temporary variable + * @param var Current value + * @param bound Upper bound + * @param carry Carry value + * @return New carry value + */ + +// required for windows +#undef min +#undef max +static inline int min(int a, int b) { return a < b ? a : b; } +static inline int max(int a, int b) { return a > b ? a : b; } + + +static inline int inc_iter_to_temp(int *temp, int var, int bound, int carry) { + int new_val = var + carry; + carry = new_val == bound; + *temp = carry ? 
0 : new_val; + return carry; +} + +/** + * @brief Swap two int8_t buffer pointers + * @param a Pointer to first buffer pointer + * @param b Pointer to second buffer pointer + */ +static inline void swap_buffers(int8_t **a, int8_t **b) { + int8_t *t = *a; + *a = *b; + *b = t; +} + +static inline void _proto_FillBuffer_I8(void *buff, int val, unsigned size) { + + unsigned its = size / (2 * XCHAL_IVPN_SIMD_WIDTH); + unsigned rem = size % (2 * XCHAL_IVPN_SIMD_WIDTH); + xb_vec2Nx8 *pDst = (xb_vec2Nx8 *)buff; + valign vaDst = IVP_ZALIGN(); + xb_vec2Nx8 pattern = IVP_MOVVA8(val); + for (unsigned i = 0; i < its; i++) { + IVP_SAV2NX8_XP(pattern, vaDst, pDst, 2 * XCHAL_IVPN_SIMD_WIDTH); + } + IVP_SAV2NX8_XP(pattern, vaDst, pDst, rem); + IVP_SAPOS2NX8_FP(vaDst, pDst); +} + +/** + * @brief Setup a tile3D descriptor for cache-mode input tile + * + * This initializes a tile3D structure pointing to a local buffer with + * proper dimensions, edges, and pitches for convolution operations. + * Used by cache-mode executors where input is copied to SRAM scratch buffer. 
+ * + * @param tile Pointer to tile3D descriptor to initialize + * @param buffer Pointer to data buffer (in SRAM) + * @param dim1_size Width without padding + * @param dim2_size Height without padding + * @param dim3_size Channels + * @param dim1_edge1,dim2_edge1,dim3_edge1 Leading edge padding (left / top / channel start) + * @param dim1_edge2,dim2_edge2,dim3_edge2 Trailing edge padding (right / bottom / channel end) + * @param stride_alignment Dim1 pitch alignment, must be a power of two (typically 2*XCHAL_IVPN_SIMD_WIDTH); <= 0 leaves the pitch unaligned + */ +static inline void setup_tile3d_cache_input( + xai_tile3D* tile, + int8_t* buffer, + int dim1_size, // Width (W) + int dim2_size, // Height (H) + int dim3_size, // Channels (D) + int dim1_edge1, // Left edge + int dim1_edge2, // Right edge + int dim2_edge1, // Top edge + int dim2_edge2, // Bottom edge + int dim3_edge1, // Channel edge start + int dim3_edge2, // Channel edge end + int stride_alignment // Pitch alignment +) { + // Calculate padded dimensions + int padded_dim1 = dim1_size + dim1_edge1 + dim1_edge2; + int padded_dim2 = dim2_size + dim2_edge1 + dim2_edge2; + int padded_dim3 = dim3_size + dim3_edge1 + dim3_edge2; + + // Calculate aligned pitch for dim1 + int dim1_pitch = padded_dim1; + if (stride_alignment > 0) { + dim1_pitch = (padded_dim1 + stride_alignment - 1) & ~(stride_alignment - 1); + } + + // Calculate pitch for dim2 + int dim2_pitch = dim1_pitch * padded_dim2; + + // Calculate total buffer size + int buffer_size = dim2_pitch * padded_dim3; + + // Initialize tile descriptor + XAI_TILE3D_SET_BUFF_PTR(tile, buffer); + XAI_TILE3D_SET_BUFF_SIZE(tile, buffer_size); + XAI_TILE3D_SET_DATA_PTR(tile, buffer + (dim3_edge1 * dim2_pitch) + + (dim2_edge1 * dim1_pitch) + + dim1_edge1); + XAI_TILE3D_SET_DATA_ORDER(tile, XAI_WHD); + XAI_TILE3D_SET_TYPE(tile, XAI_TILE3D_S8); + XAI_TILE3D_SET_FRAME_PTR(tile, 0); + XAI_TILE3D_SET_STATUS_FLAGS(tile, 0); + + // Set dimensions + XAI_TILE3D_SET_DIM1(tile, dim1_size); + XAI_TILE3D_SET_DIM1_EDGE1(tile, dim1_edge1); + XAI_TILE3D_SET_DIM1_EDGE2(tile, dim1_edge2); + XAI_TILE3D_SET_DIM1_PITCH(tile, dim1_pitch); + 
XAI_TILE3D_SET_DIM1_COORD(tile, 0); + + XAI_TILE3D_SET_DIM2(tile, dim2_size); + XAI_TILE3D_SET_DIM2_EDGE1(tile, dim2_edge1); + XAI_TILE3D_SET_DIM2_EDGE2(tile, dim2_edge2); + XAI_TILE3D_SET_DIM2_PITCH(tile, dim2_pitch); + XAI_TILE3D_SET_DIM2_COORD(tile, 0); + + XAI_TILE3D_SET_DIM3(tile, dim3_size); + XAI_TILE3D_SET_DIM3_EDGE1(tile, dim3_edge1); + XAI_TILE3D_SET_DIM3_EDGE2(tile, dim3_edge2); + XAI_TILE3D_SET_DIM3_COORD(tile, 0); +} + +/** + * @brief Setup source tile descriptor for raw input data (before copy) + * + * Used to describe the source input data in system memory before + * copying to the padded SRAM tile. + */ +static inline void setup_tile3d_source( + xai_tile3D* tile, + int8_t* buffer, + int dim1_size, + int dim2_size, + int dim3_size, + int dim1_pitch, + int dim2_pitch +) { + XAI_TILE3D_SET_BUFF_PTR(tile, buffer); + XAI_TILE3D_SET_BUFF_SIZE(tile, dim2_pitch * dim3_size); + XAI_TILE3D_SET_DATA_PTR(tile, buffer); + XAI_TILE3D_SET_DATA_ORDER(tile, XAI_WHD); + XAI_TILE3D_SET_TYPE(tile, XAI_TILE3D_S8); + XAI_TILE3D_SET_FRAME_PTR(tile, 0); + XAI_TILE3D_SET_STATUS_FLAGS(tile, 0); + + XAI_TILE3D_SET_DIM1(tile, dim1_size); + XAI_TILE3D_SET_DIM1_EDGE1(tile, 0); + XAI_TILE3D_SET_DIM1_EDGE2(tile, 0); + XAI_TILE3D_SET_DIM1_PITCH(tile, dim1_pitch); + XAI_TILE3D_SET_DIM1_COORD(tile, 0); + + XAI_TILE3D_SET_DIM2(tile, dim2_size); + XAI_TILE3D_SET_DIM2_EDGE1(tile, 0); + XAI_TILE3D_SET_DIM2_EDGE2(tile, 0); + XAI_TILE3D_SET_DIM2_PITCH(tile, dim2_pitch); + XAI_TILE3D_SET_DIM2_COORD(tile, 0); + + XAI_TILE3D_SET_DIM3(tile, dim3_size); + XAI_TILE3D_SET_DIM3_EDGE1(tile, 0); + XAI_TILE3D_SET_DIM3_EDGE2(tile, 0); + XAI_TILE3D_SET_DIM3_COORD(tile, 0); +} + +#endif /* UTILS_H_ */ diff --git a/backends/cadence/vision/third-party/include_private/common.h b/backends/cadence/vision/third-party/include_private/common.h index 4fc07d8b4d1..e80e5e3775a 100644 --- a/backends/cadence/vision/third-party/include_private/common.h +++ 
b/backends/cadence/vision/third-party/include_private/common.h @@ -33,19 +33,10 @@ #include #include #include -#if XCHAL_HAVE_IDMA -#ifndef IDMA_USE_MULTICHANNEL - #define IDMA_USE_MULTICHANNEL 1 -#endif -#include -#endif #define IVP_SIMD_WIDTH XCHAL_IVPN_SIMD_WIDTH #include "xtensa/config/core-isa.h" #include "xtensa/tie/xt_ivpn.h" -#if XCHAL_HAVE_IDMA -#include "xtensa/idma.h" -#endif #ifdef _MSC_VER #define ALIGN(x) _declspec(align(x)) @@ -70,16 +61,6 @@ #define restrict_clang #endif -// Performance measurement macros -#define XTPERF_PRINTF(...) printf(__VA_ARGS__) -#define TIME_DECL(test) long start_time_##test, end_time_##test; -#define TIME_START(test) { start_time_##test = 0; XT_WSR_CCOUNT(0); } -#define TIME_END(test) { end_time_##test = XT_RSR_CCOUNT(); } -#define TIME_DISPLAY(test, opcnt, opname) { long long cycles_##test = end_time_##test - start_time_##test; \ - XTPERF_PRINTF("PERF_LOG : %s : %d : %s : %lld : cycles : %.2f : %s/cycle : %.2f : cycles/%s\n", \ - #test, opcnt, opname, cycles_##test, cycles_##test == 0 ? 0 : (double)(opcnt)/cycles_##test, \ - opname, cycles_##test == 0 ? 
0 : 1/((double)(opcnt)/cycles_##test), opname); } - //----------------------------------------------------- // log2(BBE_SIMD_WIDTH) //----------------------------------------------------- @@ -190,6 +171,21 @@ #define HAVE_32X32 0 #endif +/*------ INSTRUCTION EMULATIONS ------*/ + +#ifndef IVP_ADDSN_2X32 +#define IVP_ADDSN_2X32(b_, c_) \ + ({ \ + xb_vecN_2x32v a_; \ + xb_vecN_2x64w tmp_a_; \ + tmp_a_ = IVP_MULN_2X32(b_, 1); \ + IVP_MULAN_2X32(tmp_a_, c_, 1); \ + a_ = IVP_PACKVRN_2X64W(tmp_a_, 0); \ + a_; \ + }) +#endif + + #ifdef __cplusplus #define externC extern "C" #else diff --git a/backends/cadence/vision/third-party/include_private/idma_init.h b/backends/cadence/vision/third-party/include_private/idma_init.h deleted file mode 100644 index a885bdf6086..00000000000 --- a/backends/cadence/vision/third-party/include_private/idma_init.h +++ /dev/null @@ -1,36 +0,0 @@ -#ifndef __IDMA__INIT_H__ -#define __IDMA__INIT_H__ - -#include "../include/dtypes.h" -#include "common.h" - - // 4 kb x sizeof(float32_t) = 16 kb DRAM storage. 
Assume 4 buffers (2 input and 2 output) -#define IDMA_BUFF_SIZE 4096 - -#ifndef PLACE_IN_DRAM0 -#define PLACE_IN_DRAM0 \ - __attribute__((aligned(2 * IVP_SIMD_WIDTH), section(".dram0.data"))) -#endif - -#ifndef PLACE_IN_DRAM1 -#define PLACE_IN_DRAM1 \ - __attribute__((aligned(2 * IVP_SIMD_WIDTH), section(".dram1.data"))) -#endif - -float32_t data_dram0[IDMA_BUFF_SIZE / 2] PLACE_IN_DRAM0; -float32_t data_dram1[IDMA_BUFF_SIZE / 2] PLACE_IN_DRAM1; - -float32_t* inpData[2] = {&data_dram0[0], &data_dram1[0]}; -float32_t* outData[2] = { - &data_dram0[IDMA_BUFF_SIZE / 4], - &data_dram1[IDMA_BUFF_SIZE / 4]}; - -IDMA_BUFFER_DEFINE(buffer_idma_ch0, 1, IDMA_2D_DESC); -IDMA_BUFFER_DEFINE(buffer_idma_ch1, 1, IDMA_2D_DESC); - -idma_buffer_t* descbuf[] = { - buffer_idma_ch0, - buffer_idma_ch1, -}; - -#endif // __IDMA__INIT_H__ diff --git a/backends/cadence/vision/third-party/library/api/dequantize.c b/backends/cadence/vision/third-party/library/api/dequantize.c new file mode 100644 index 00000000000..98b707887bd --- /dev/null +++ b/backends/cadence/vision/third-party/library/api/dequantize.c @@ -0,0 +1,81 @@ +/* ------------------------------------------------------------------------ */ +/* Copyright (c) 2024 by Cadence Design Systems, Inc. ALL RIGHTS RESERVED. */ +/* These coded instructions, statements, and computer programs ('Cadence */ +/* Libraries') are the copyrighted works of Cadence Design Systems Inc. */ +/* Cadence IP is licensed for use with Cadence processor cores only and */ +/* must not be used for any other processors and platforms. Your use of the */ +/* Cadence Libraries is subject to the terms of the license agreement you */ +/* have entered into with Cadence Design Systems, or a sublicense granted */ +/* to you by a direct Cadence licensee. 
*/ +/* ------------------------------------------------------------------------ */ + +#include "api.h" +#include "common.h" + +#if !HAVE_VFPU +DISCARD_FUN(void, dequantize_asym8s_f32, (float32_t *restrict ptr_out + ,const int8_t *restrict ptr_inp + ,float32_t scale + ,int zero_bias + ,int N)) +#else +void dequantize_asym8s_f32(float32_t *restrict ptr_out + ,const int8_t *restrict ptr_inp + ,float32_t scale + ,int zero_bias + ,int N) +{ + // Inputs + xb_vecNx8 *p_i = (xb_vecNx8 *)ptr_inp; + xb_vecN_2xf32 *p_o = (xb_vecN_2xf32 *)ptr_out; + + // Loop index + int n; + + // Alignment variables + valign al_i = IVP_LANX8S_PP(p_i); + valign al_o = IVP_ZALIGN(); + + for (n = 0; n < (N >> LOG2_IVP_SIMD_WIDTH); n++) + { + xb_vecNx16 inp; + xb_vecN_2x32v inp1_bias, inp2_bias; + xb_vecN_2xf32 out1, out2; + + IVP_LANX8S_XP(inp, al_i, p_i, IVP_SIMD_WIDTH); + + inp1_bias = IVP_UNPKSNX16_L(inp); + inp2_bias = IVP_UNPKSNX16_H(inp); + + inp1_bias = IVP_SUBN_2X32(inp1_bias, (xb_vecN_2x32v) zero_bias); + out1 = IVP_MULN_2XF32(scale, (xb_vecN_2xf32) inp1_bias); + + inp2_bias = IVP_SUBN_2X32(inp2_bias, (xb_vecN_2x32v) zero_bias); + out2 = IVP_MULN_2XF32(scale, (xb_vecN_2xf32) inp2_bias); + + IVP_SAN_2XF32_IP(out1, al_o, p_o); + IVP_SAN_2XF32_IP(out2, al_o, p_o); + } + if (N & (IVP_SIMD_WIDTH - 1)) // Check if there are remaining elements + { + xb_vecNx16 inp; + xb_vecN_2x32v inp1_bias, inp2_bias; + xb_vecN_2xf32 out1, out2; + + IVP_LANX8S_XP(inp, al_i, p_i, N & (IVP_SIMD_WIDTH - 1)); + + inp1_bias = IVP_UNPKSNX16_L(inp); + inp2_bias = IVP_UNPKSNX16_H(inp); + + inp1_bias = IVP_SUBN_2X32(inp1_bias, (xb_vecN_2x32v) zero_bias); + out1 = IVP_MULN_2XF32(scale, (xb_vecN_2xf32) inp1_bias); + + inp2_bias = IVP_SUBN_2X32(inp2_bias, (xb_vecN_2x32v) zero_bias); + out2 = IVP_MULN_2XF32(scale, (xb_vecN_2xf32) inp2_bias); + + IVP_SAVN_2XF32_XP(out1, al_o, p_o, 4 * (N & (IVP_SIMD_WIDTH - 1))); + IVP_SAVN_2XF32_XP(out2, al_o, p_o, 4 * ((N & (IVP_SIMD_WIDTH - 1)) - (IVP_SIMD_WIDTH >> 1))); + } + 
IVP_SAPOSN_2XF32_FP(al_o, p_o); +} +#endif \ No newline at end of file diff --git a/backends/cadence/vision/third-party/library/api/maxpool2df.c b/backends/cadence/vision/third-party/library/api/maxpool2df.c new file mode 100644 index 00000000000..76faac55ea6 --- /dev/null +++ b/backends/cadence/vision/third-party/library/api/maxpool2df.c @@ -0,0 +1,248 @@ +/* ------------------------------------------------------------------------ */ +/* Copyright (c) 2024 by Cadence Design Systems, Inc. ALL RIGHTS RESERVED. */ +/* These coded instructions, statements, and computer programs ('Cadence */ +/* Libraries') are the copyrighted works of Cadence Design Systems Inc. */ +/* Cadence IP is licensed for use with Cadence processor cores only and */ +/* must not be used for any other processors and platforms. Your use of the */ +/* Cadence Libraries is subject to the terms of the license agreement you */ +/* have entered into with Cadence Design Systems, or a sublicense granted */ +/* to you by a direct Cadence licensee. 
*/ +/* ------------------------------------------------------------------------ */ + +#include "api.h" +#include "common.h" + +#if !HAVE_VFPU +DISCARD_FUN(void, maxpool2d_with_indices_j2x2_f32, (float32_t *restrict ptr_out + ,const float32_t *restrict ptr_inp + ,int *restrict ptr_indices + ,int inp_height ,int inp_width + ,int out_height ,int out_width + ,int32_t in_pitch_width, int32_t in_pitch_height + ,int32_t out_pitch_width, int32_t out_pitch_height + ,uint8_t kernel_height + ,uint8_t kernel_width)) + +DISCARD_FUN(void, maxpool2d_j2x2_f32, (float32_t *restrict ptr_out + ,const float32_t *restrict ptr_inp + ,int inp_height ,int inp_width + ,int out_height ,int out_width + ,int32_t in_pitch_width, int32_t in_pitch_height + ,int32_t out_pitch_width, int32_t out_pitch_height + ,uint8_t kernel_height + ,uint8_t kernel_width)) +#else +void maxpool2d_with_indices_j2x2_f32(float32_t *restrict ptr_out + ,const float32_t *restrict ptr_inp + ,int *restrict ptr_indices + ,int inp_height ,int inp_width + ,int out_height ,int out_width + ,int32_t in_pitch_width, int32_t in_pitch_height + ,int32_t out_pitch_width, int32_t out_pitch_height + ,uint8_t kernel_height + ,uint8_t kernel_width) +{ + const int32_t out_increment = ((IVP_SIMD_WIDTH - kernel_width) / 2) + 1; + + int32_t x, y, kx, ky; + int32_t remX, remXLoad; + + xb_vecN_2xf32* restrict pdvecOut; + xb_vecN_2x32v* restrict pdvecIdx; + xb_vecN_2xf32* restrict pdvecIn; + + valign vaOutData = IVP_ZALIGN(); + + xb_vecN_2xf32 dvecMax1; + xb_vecN_2xf32 dvecMax11, dvecMax12; + xb_vecN_2xf32 dvecData11, dvecData12; + xb_vecN_2x32v dvecKxIdx1, dvecKyIdx1; + xb_vecN_2x32v dvecKyIdx11, dvecKyIdx12; + xb_vecN_2x32v dvecIdx1; + + vboolN_2 dboolGT, dboolEq; + vboolN_2 dboolkyIdxLT; + xb_vecN_2x32v dvecGTKyIdx, dvecEQKyIdx; + xb_vecN_2x32v dvecGTKxIdx, dvecEQKxIdx; + + vboolN_2 dvbKernelType = IVP_EQN_2X32((kernel_width % 2), 0); + + for (x = 0; x < out_width; x += out_increment) { + remX = XT_MIN(out_width - x, out_increment); + 
remXLoad = ((2 * (remX - 1) + kernel_width) > (IVP_SIMD_WIDTH / 2)) ? 1 : 0; + int32_t remXOffset = remXLoad * (IVP_SIMD_WIDTH / 2); + + for (y = 0; y < out_height; y++) { + float* pOut = &ptr_out[y * out_pitch_width + x]; + int32_t* pIdx = &ptr_indices[y * out_pitch_width + x]; + const float* pSrc = ptr_inp + y * in_pitch_width * 2 + x * 2; + pdvecIn = (xb_vecN_2xf32*) pSrc; + + // Initialize max values + dvecMax1 = MIN_FLT32; + dvecMax11 = dvecMax12 = dvecMax1; + + // Initialize index tracking + dvecKxIdx1 = 0; + dvecKyIdx1 = 0; + dvecKyIdx11 = dvecKyIdx12 = 0; + + // ========== KERNEL HEIGHT COMPARISONS ========== + for (ky = 0; ky < kernel_height; ky++) { + IVP_L2UN_2XF32_XP(dvecData11, pdvecIn, remXOffset * sizeof(float)); + IVP_L2UN_2XF32_XP(dvecData12, pdvecIn, (in_pitch_width - remXOffset) * sizeof(float)); + + dboolGT = IVP_OGTN_2XF32(dvecData11, dvecMax11); + dvecMax11 = IVP_MAXN_2XF32(dvecMax11, dvecData11); + dvecKyIdx11 = IVP_MOVN_2X32T(ky, dvecKyIdx11, dboolGT); + + dboolGT = IVP_OGTN_2XF32(dvecData12, dvecMax12); + dvecMax12 = IVP_MAXN_2XF32(dvecMax12, dvecData12); + dvecKyIdx12 = IVP_MOVN_2X32T(ky, dvecKyIdx12, dboolGT); + } + + IVP_DSELN_2XF32I(dvecMax12, dvecMax11, dvecMax12, dvecMax11, IVP_DSELI_32B_DEINTERLEAVE_1); + IVP_DSELN_2X32I(dvecKyIdx12, dvecKyIdx11, dvecKyIdx12, dvecKyIdx11, IVP_DSELI_32B_DEINTERLEAVE_1); + + // ========== KERNEL WIDTH COMPARISONS ========== + for (kx = 0; kx < kernel_width - 1; kx += 2) { + // First comparison + dboolEq = IVP_OEQN_2XF32(dvecMax11, dvecMax1); + dboolGT = IVP_OGTN_2XF32(dvecMax11, dvecMax1); + dvecMax1 = IVP_MAXN_2XF32(dvecMax1, dvecMax11); + + dvecGTKyIdx = IVP_MOVN_2X32T(dvecKyIdx11, dvecKyIdx1, dboolGT); + dvecEQKyIdx = IVP_MOVN_2X32T(dvecKyIdx11, dvecKyIdx1, dboolEq); + dvecKyIdx1 = IVP_MOVN_2X32T(IVP_MINN_2X32(dvecGTKyIdx, dvecEQKyIdx), dvecGTKyIdx, dboolEq); + + dvecGTKxIdx = IVP_MOVN_2X32T(kx, dvecKxIdx1, dboolGT); + dvecEQKxIdx = IVP_MOVN_2X32T(kx, dvecKxIdx1, dboolEq); + dboolkyIdxLT = 
IVP_LTN_2X32(dvecKyIdx1, dvecGTKyIdx); + dvecKxIdx1 = IVP_MOVN_2X32T(IVP_MOVN_2X32T(dvecEQKxIdx, dvecGTKxIdx, dboolkyIdxLT), dvecGTKxIdx, dboolEq); + + dvecMax11 = IVP_SELN_2XF32I(MIN_FLT32, dvecMax11, IVP_SELI_32B_ROTATE_RIGHT_1); + dvecKyIdx11 = IVP_SELN_2X32I(0, dvecKyIdx11, IVP_SELI_32B_ROTATE_RIGHT_1); + + // Second comparison + dboolEq = IVP_OEQN_2XF32(dvecMax12, dvecMax1); + dboolGT = IVP_OGTN_2XF32(dvecMax12, dvecMax1); + dvecMax1 = IVP_MAXN_2XF32(dvecMax1, dvecMax12); + + dvecGTKyIdx = IVP_MOVN_2X32T(dvecKyIdx12, dvecKyIdx1, dboolGT); + dvecEQKyIdx = IVP_MOVN_2X32T(dvecKyIdx12, dvecKyIdx1, dboolEq); + dvecKyIdx1 = IVP_MOVN_2X32T(IVP_MINN_2X32(dvecGTKyIdx, dvecEQKyIdx), dvecGTKyIdx, dboolEq); + + dvecGTKxIdx = IVP_MOVN_2X32T((kx + 1), dvecKxIdx1, dboolGT); + dvecEQKxIdx = IVP_MOVN_2X32T((kx + 1), dvecKxIdx1, dboolEq); + dboolkyIdxLT = IVP_LTN_2X32(dvecKyIdx1, dvecGTKyIdx); + dvecKxIdx1 = IVP_MOVN_2X32T(IVP_MOVN_2X32T(dvecEQKxIdx, dvecGTKxIdx, dboolkyIdxLT), dvecGTKxIdx, dboolEq); + + dvecMax12 = IVP_SELN_2XF32I(MIN_FLT32, dvecMax12, IVP_SELI_32B_ROTATE_RIGHT_1); + dvecKyIdx12 = IVP_SELN_2X32I(0, dvecKyIdx12, IVP_SELI_32B_ROTATE_RIGHT_1); + } + + // final comparison if kernel_width is odd + xb_vecN_2xf32 dvecMaxTest = IVP_MOVN_2XF32T(dvecMax1, dvecMax11, dvbKernelType); + + dboolEq = IVP_OEQN_2XF32(dvecMaxTest, dvecMax1); + dboolGT = IVP_OGTN_2XF32(dvecMaxTest, dvecMax1); + dvecMax1 = IVP_MAXN_2XF32(dvecMax1, dvecMaxTest); + + dvecGTKyIdx = IVP_MOVN_2X32T(IVP_MOVN_2X32T(dvecKyIdx1, dvecKyIdx11, dvbKernelType), dvecKyIdx1, dboolGT); + dvecEQKyIdx = IVP_MOVN_2X32T(IVP_MOVN_2X32T(dvecKyIdx1, dvecKyIdx11, dvbKernelType), dvecKyIdx1, dboolEq); + dvecKyIdx1 = IVP_MOVN_2X32T(IVP_MINN_2X32(dvecGTKyIdx, dvecEQKyIdx), dvecGTKyIdx, dboolEq); + + dvecGTKxIdx = IVP_MOVN_2X32T(kx, dvecKxIdx1, dboolGT); + dvecEQKxIdx = IVP_MOVN_2X32T(kx, dvecKxIdx1, dboolEq); + dboolkyIdxLT = IVP_LTN_2X32(dvecKyIdx1, dvecGTKyIdx); + dvecKxIdx1 = IVP_MOVN_2X32T(IVP_MOVN_2X32T(dvecEQKxIdx, 
dvecGTKxIdx, dboolkyIdxLT), dvecGTKxIdx, dboolEq); + + dvecIdx1 = IVP_ORN_2X32(IVP_SLLIN_2X32(dvecKyIdx1, 4), dvecKxIdx1); + + // ========== STORE OUTPUTS ========== + // Store max values + pdvecOut = (xb_vecN_2xf32*) pOut; + IVP_SAVN_2XF32_XP(dvecMax1, vaOutData, pdvecOut, remX * sizeof(float)); + IVP_SAPOSN_2XF32_FP(vaOutData, pdvecOut); + + // Store indices + pdvecIdx = (xb_vecN_2x32v*) pIdx; + IVP_SAVN_2X32_XP(dvecIdx1, vaOutData, pdvecIdx, remX * sizeof(int32_t)); + IVP_SAPOSN_2X32_FP(vaOutData, pdvecIdx); + } + } +} + +void maxpool2d_j2x2_f32(float32_t *restrict ptr_out + ,const float32_t *restrict ptr_inp + ,int inp_height ,int inp_width + ,int out_height ,int out_width + ,int32_t in_pitch_width, int32_t in_pitch_height + ,int32_t out_pitch_width, int32_t out_pitch_height + ,uint8_t kernel_height + ,uint8_t kernel_width) +{ + const int32_t out_increment = ((IVP_SIMD_WIDTH - kernel_width) / 2) + 1; + int32_t x, y, kx, ky; + int32_t remX, remXLoad; + + xb_vecN_2xf32* restrict pdvecOut; + xb_vecN_2xf32* restrict pdvecIn; + + valign vaOutData = IVP_ZALIGN(); + + xb_vecN_2xf32 dvecMax1; + xb_vecN_2xf32 dvecMax11, dvecMax12; + xb_vecN_2xf32 dvecData11, dvecData12; + + vboolN_2 dvbKernelType = IVP_EQN_2X32((kernel_width % 2), 0); + + for (x = 0; x < out_width; x += out_increment) { + remX = XT_MIN(out_width - x, out_increment); + remXLoad = ((2 * (remX - 1) + kernel_width) > (IVP_SIMD_WIDTH / 2)) ? 
1 : 0; + int32_t remXOffset = remXLoad * (IVP_SIMD_WIDTH / 2); + + for (y = 0; y < out_height; y++) { + float* pOut = &ptr_out[y * out_pitch_width + x]; + const float* pSrc = ptr_inp + y * in_pitch_width * 2 + x * 2; + pdvecIn = (xb_vecN_2xf32*) pSrc; + + // Initialize max values + dvecMax1 = MIN_FLT32; + dvecMax11 = dvecMax12 = dvecMax1; + + // ========== KERNEL HEIGHT COMPARISONS ========== + for (ky = 0; ky < kernel_height; ky++) { + IVP_L2UN_2XF32_XP(dvecData11, pdvecIn, remXOffset * sizeof(float)); + IVP_L2UN_2XF32_XP(dvecData12, pdvecIn, (in_pitch_width - remXOffset) * sizeof(float)); + + dvecMax11 = IVP_MAXN_2XF32(dvecMax11, dvecData11); + dvecMax12 = IVP_MAXN_2XF32(dvecMax12, dvecData12); + } + + IVP_DSELN_2XF32I(dvecMax12, dvecMax11, dvecMax12, dvecMax11, IVP_DSELI_32B_DEINTERLEAVE_1); + + // ========== KERNEL WIDTH COMPARISONS ========== + for (kx = 0; kx < kernel_width - 1; kx += 2) { + // First comparison + dvecMax1 = IVP_MAXN_2XF32(dvecMax1, dvecMax11); + dvecMax11 = IVP_SELN_2XF32I(MIN_FLT32, dvecMax11, IVP_SELI_32B_ROTATE_RIGHT_1); + + // Second comparison + dvecMax1 = IVP_MAXN_2XF32(dvecMax1, dvecMax12); + dvecMax12 = IVP_SELN_2XF32I(MIN_FLT32, dvecMax12, IVP_SELI_32B_ROTATE_RIGHT_1); + } + + // final comparison if kernel_width is odd + xb_vecN_2xf32 dvecMaxTest = IVP_MOVN_2XF32T(dvecMax1, dvecMax11, dvbKernelType); + dvecMax1 = IVP_MAXN_2XF32(dvecMax1, dvecMaxTest); + + // ========== STORE OUTPUTS ========== + // Store max values + pdvecOut = (xb_vecN_2xf32*) pOut; + IVP_SAVN_2XF32_XP(dvecMax1, vaOutData, pdvecOut, remX * sizeof(float)); + IVP_SAPOSN_2XF32_FP(vaOutData, pdvecOut); + } + } +} +#endif /* HAVE_VFPU */ diff --git a/backends/cadence/vision/third-party/library/api/mean.c b/backends/cadence/vision/third-party/library/api/mean.c new file mode 100644 index 00000000000..01c528333c8 --- /dev/null +++ b/backends/cadence/vision/third-party/library/api/mean.c @@ -0,0 +1,110 @@ +/* 
------------------------------------------------------------------------ */ +/* Copyright (c) 2024 by Cadence Design Systems, Inc. ALL RIGHTS RESERVED. */ +/* These coded instructions, statements, and computer programs ('Cadence */ +/* Libraries') are the copyrighted works of Cadence Design Systems Inc. */ +/* Cadence IP is licensed for use with Cadence processor cores only and */ +/* must not be used for any other processors and platforms. Your use of the */ +/* Cadence Libraries is subject to the terms of the license agreement you */ +/* have entered into with Cadence Design Systems, or a sublicense granted */ +/* to you by a direct Cadence licensee. */ +/* ------------------------------------------------------------------------ */ + +/*------------------------------------------------------------------------- + SIMD Mean Pooling Operations + + This module implements optimized mean pooling operations using Xtensa + Vision DSP SIMD intrinsics for float32 data. +-------------------------------------------------------------------------*/ + +#include +#include +#include "api.h" +#include "common.h" +typedef float float32_t; + +#ifndef IVP_SIMD_WIDTH +#define IVP_SIMD_WIDTH XCHAL_IVPN_SIMD_WIDTH +#endif + + + +/*------------------------------------------------------------------------- + SIMD Mean Pooling 2x2 -> 1x1 + + Description: + This function implements mean pooling across 2x2 spatial dimensions for + float32 data using Xtensa SIMD intrinsics. + + Input shape: 1 x C x 2 x 2 (batch=1, channels=C, height=2, width=2) + Output shape: 1 x C x 1 x 1 (batch=1, channels=C, height=1, width=1) + + Algorithm: + - Load 16 float32 elements at a time (4 channels x 2x2 spatial) in ONE vector + - For each channel, compute mean of 4 spatial values (2x2) + - Use SIMD vector operations for efficient computation + + With SIMD width N=32, xb_vecN_2xf32 holds 16 float32 values. 
+ Single load gets all 16 values: ch0[0,0], ch0[0,1], ch0[1,0], ch0[1,1], + ch1[0,0], ch1[0,1], ch1[1,0], ch1[1,1], ch2[0,0], ch2[0,1], ch2[1,0], ch2[1,1], + ch3[0,0], ch3[0,1], ch3[1,0], ch3[1,1] + + Then shuffle to group elements from same channel together, + sum them, and divide by 4 to get the mean. + + Parameters: + Input: + input[num_channels*4] Input tensor in CHW format (channels, 2x2 spatial) + N Number of input float32 elements (num_channels * 4) + Output: + output[num_channels] Output tensor (channels, 1x1 spatial) + + Restrictions: + - N must be a multiple of 2 * IVP_SIMD_WIDTH (one loop iteration); any remainder is not processed + - input and output must be aligned to 64-byte boundary + - input and output must not overlap + +-------------------------------------------------------------------------*/ +void simd_mean_pool_2x2_to_1x1_float32(float32_t* restrict output, + const float32_t* restrict input, + int N) +{ + int n; + xb_vecN_2xf32 vec0, vec1, vec2, vec3; + xb_vecN_2xf32 vec0_0, vec0_1, vec1_0, vec1_1; + xb_vecN_2xf32 v0, v1, v2, v3, sum_all, result; + const xb_vecN_2xf32* restrict pInput = (const xb_vecN_2xf32*)input; + xb_vecN_2xf32* restrict pOutput = (xb_vecN_2xf32*)output; + + if (N <= 0) return; + + __Pragma("no_reorder"); + // __Pragma("loop_count min=1"); + + for (n = 0; n < (N >> (LOG2_IVP_SIMD_WIDTH + 1)); n++) { + // Load 64 float32 values (4 vectors) - 16 channels × 4 values each + IVP_LVN_2XF32_IP(vec0, pInput, 2 * IVP_SIMD_WIDTH); // 0-15 + IVP_LVN_2XF32_IP(vec1, pInput, 2 * IVP_SIMD_WIDTH); // 16-31 + IVP_LVN_2XF32_IP(vec2, pInput, 2 * IVP_SIMD_WIDTH); // 32-47 + IVP_LVN_2XF32_IP(vec3, pInput, 2 * IVP_SIMD_WIDTH); // 48-63 + + // First level: Deinterleave vec0-vec1 and vec2-vec3 pairs (independent) + IVP_DSELN_2XF32I(vec0_0, vec0_1, vec1, vec0, IVP_DSELI_DEINTERLEAVE_2); + IVP_DSELN_2XF32I(vec1_0, vec1_1, vec3, vec2, IVP_DSELI_DEINTERLEAVE_2); + + // Second level: Cross-deinterleave directly to final vectors + IVP_DSELN_2XF32I(v2, v0, vec1_0, vec0_0, IVP_DSELI_DEINTERLEAVE_2); + IVP_DSELN_2XF32I(v3, v1,
vec1_1, vec0_1, IVP_DSELI_DEINTERLEAVE_2); + + // v0=vec3_1 (stride-4, mod 0), v1=vec2_1 (stride-4, mod 1) + // v2=vec3_0 (stride-4, mod 2), v3=vec2_0 (stride-4, mod 3) + + // Fused add: ((v0 + v1) + (v2 + v3)) for better pipelining + sum_all = IVP_ADDN_2XF32(IVP_ADDN_2XF32(v0, v1), IVP_ADDN_2XF32(v2, v3)); + + // Multiply by 0.25 to get mean + result = IVP_MULN_2XF32(sum_all, 0.25f); + + // Store result + IVP_SVN_2XF32_IP(result, pOutput, 2 * IVP_SIMD_WIDTH); + } +} diff --git a/backends/cadence/vision/third-party/library/api/quanitze_relu.c b/backends/cadence/vision/third-party/library/api/quanitze_relu.c new file mode 100644 index 00000000000..97b0a19a654 --- /dev/null +++ b/backends/cadence/vision/third-party/library/api/quanitze_relu.c @@ -0,0 +1,112 @@ + +#include "api.h" +#include "common.h" +#include + +void vrelU_quantized( + int8_t* restrict ptr_out, + const int8_t* restrict ptr_inp, + int32_t in_zero_point, + int32_t out_zero_point, + float32_t out_scale, + int N) +{ + // Pointers + xb_vecNx8 *p_i = (xb_vecNx8 *)ptr_inp; + xb_vecNx8 *p_o = (xb_vecNx8 *)ptr_out; + + // Loop index + int n; + + // Alignment variables + valign al_i = IVP_LANX8S_PP(p_i); + valign al_o = IVP_ZALIGN(); + + // Constants + xb_vecN_2x32v zero_vec = 0; + xb_vecN_2x32v in_zp_vec = (xb_vecN_2x32v)in_zero_point; + xb_vecN_2xf32 out_zp_f32 = (xb_vecN_2xf32)(float32_t)out_zero_point; + xb_vecN_2xf32 min_val = (xb_vecN_2xf32)(-128.0f); + xb_vecN_2xf32 max_val = (xb_vecN_2xf32)(127.0f); + + for (n = 0; n < (N >> LOG2_IVP_SIMD_WIDTH); n++) + { + xb_vecNx16 inp; + xb_vecN_2x32v temp1, temp2; + xb_vecN_2xf32 float1, float2; + xb_vecN_2xf32 result1, result2; + xb_vecNx16 out; + + // Load int8 → sign-extend to 16-bit + IVP_LANX8S_XP(inp, al_i, p_i, IVP_SIMD_WIDTH); + + // Unpack 16-bit → two 32-bit vectors (16 elements each) + temp1 = IVP_UNPKSNX16_L(inp); + temp2 = IVP_UNPKSNX16_H(inp); + + // Integer operations: SUB in_zero_point + temp1 = IVP_SUBN_2X32(temp1, in_zp_vec); + temp2 = 
IVP_SUBN_2X32(temp2, in_zp_vec); + + // ReLU: MAX(temp, 0) + temp1 = IVP_MAXN_2X32(temp1, zero_vec); + temp2 = IVP_MAXN_2X32(temp2, zero_vec); + + // Convert int32 → float32 (implicit cast) + float1 = (xb_vecN_2xf32)temp1; + float2 = (xb_vecN_2xf32)temp2; + + // FMA: out_zero_point + temp * out_scale + result1 = out_zp_f32; + IVP_MULAN_2XF32(result1, float1, out_scale); + result2 = out_zp_f32; + IVP_MULAN_2XF32(result2, float2, out_scale); + + // Clamp to [-128, 127] and round to nearest integer + result1 = IVP_FIRINTN_2XF32(IVP_MAXN_2XF32(IVP_MINN_2XF32(result1, max_val), min_val)); + result2 = IVP_FIRINTN_2XF32(IVP_MAXN_2XF32(IVP_MINN_2XF32(result2, max_val), min_val)); + + // Pack float → int16 → int8 (no explicit conversion needed) + out = IVP_MOVNX16_FROMN_2X32(IVP_SELN_2X32I(result2, result1, IVP_SELI_EXTRACT_1_OF_2_OFF_0)); + IVP_SANX8S_IP(out, al_o, p_o); + } + + // Handle remaining elements (tail) + if (N & (IVP_SIMD_WIDTH - 1)) + { + xb_vecNx16 inp; + xb_vecN_2x32v temp1, temp2; + xb_vecN_2xf32 float1, float2; + xb_vecN_2xf32 result1, result2; + xb_vecNx16 out; + + IVP_LANX8S_XP(inp, al_i, p_i, N & (IVP_SIMD_WIDTH - 1)); + + temp1 = IVP_UNPKSNX16_L(inp); + temp2 = IVP_UNPKSNX16_H(inp); + + temp1 = IVP_SUBN_2X32(temp1, in_zp_vec); + temp2 = IVP_SUBN_2X32(temp2, in_zp_vec); + + temp1 = IVP_MAXN_2X32(temp1, zero_vec); + temp2 = IVP_MAXN_2X32(temp2, zero_vec); + + float1 = (xb_vecN_2xf32)temp1; + float2 = (xb_vecN_2xf32)temp2; + + result1 = out_zp_f32; + IVP_MULAN_2XF32(result1, float1, out_scale); + result2 = out_zp_f32; + IVP_MULAN_2XF32(result2, float2, out_scale); + + // Clamp to [-128, 127] and round to nearest integer + result1 = IVP_FIRINTN_2XF32(IVP_MAXN_2XF32(IVP_MINN_2XF32(result1, max_val), min_val)); + result2 = IVP_FIRINTN_2XF32(IVP_MAXN_2XF32(IVP_MINN_2XF32(result2, max_val), min_val)); + + // Pack float → int16 → int8 (no explicit conversion needed) + out = IVP_MOVNX16_FROMN_2X32(IVP_SELN_2X32I(result2, result1, IVP_SELI_EXTRACT_1_OF_2_OFF_0)); 
+ IVP_SAVNX8S_XP(out, al_o, p_o, (N & (IVP_SIMD_WIDTH - 1))); + } + + IVP_SAPOSNX8S_FP(al_o, p_o); +} diff --git a/backends/cadence/vision/third-party/library/api/quantizef.c b/backends/cadence/vision/third-party/library/api/quantizef.c new file mode 100644 index 00000000000..7803a812f84 --- /dev/null +++ b/backends/cadence/vision/third-party/library/api/quantizef.c @@ -0,0 +1,79 @@ +/* ------------------------------------------------------------------------ */ +/* Copyright (c) 2024 by Cadence Design Systems, Inc. ALL RIGHTS RESERVED. */ +/* These coded instructions, statements, and computer programs ('Cadence */ +/* Libraries') are the copyrighted works of Cadence Design Systems Inc. */ +/* Cadence IP is licensed for use with Cadence processor cores only and */ +/* must not be used for any other processors and platforms. Your use of the */ +/* Cadence Libraries is subject to the terms of the license agreement you */ +/* have entered into with Cadence Design Systems, or a sublicense granted */ +/* to you by a direct Cadence licensee. */ +/* ------------------------------------------------------------------------ */ + +#include "api.h" +#include "common.h" + +#if !HAVE_VFPU +DISCARD_FUN(void, quantize_f32_asym8s, (int8_t *restrict ptr_out + ,const float32_t *restrict ptr_inp + ,float32_t scale + ,int zero_bias + ,int N)) +#else +void quantize_f32_asym8s(int8_t *restrict ptr_out + ,const float32_t *restrict ptr_inp + ,float32_t scale + ,int zero_bias + ,int N) +{ + // Inputs + xb_vecN_2xf32 *p_i = (xb_vecN_2xf32 *)ptr_inp; + xb_vecNx8 *p_o = (xb_vecNx8 *)ptr_out; + float32_t one_by_scaleF = (float32_t) (1.0f / scale); + float32_t one_by_scale = (one_by_scaleF > (float32_t) MAX_FLT32 ? 
(float32_t) MAX_FLT32 : (float32_t) (1.0f / scale)); + + // Loop index + int n; + + // Alignment variables + valign al_i = IVP_LAN_2XF32_PP(p_i); + valign al_o = IVP_ZALIGN(); + + for (n = 0; n < (N >> LOG2_IVP_SIMD_WIDTH); n++) + { + xb_vecN_2xf32 inp1, inp2; + xb_vecN_2xf32 inp1_scaled, inp2_scaled; + xb_vecN_2xf32 out1, out2; + xb_vecNx16 out; + + IVP_LAN_2XF32_IP(inp1, al_i, p_i); + IVP_LAN_2XF32_IP(inp2, al_i, p_i); + inp1_scaled = (float32_t) zero_bias; + IVP_MULAN_2XF32(inp1_scaled, inp1, one_by_scale); + inp2_scaled = (float32_t) zero_bias; + IVP_MULAN_2XF32(inp2_scaled, inp2, one_by_scale); + out1 = IVP_FIRINTN_2XF32(IVP_MAXN_2XF32(IVP_MINN_2XF32(inp1_scaled, (xb_vecN_2xf32) MAX_INT8), (xb_vecN_2xf32) MIN_INT8)); + out2 = IVP_FIRINTN_2XF32(IVP_MAXN_2XF32(IVP_MINN_2XF32(inp2_scaled, (xb_vecN_2xf32) MAX_INT8), (xb_vecN_2xf32) MIN_INT8)); + out = IVP_MOVNX16_FROMN_2X32(IVP_SELN_2X32I(out2, out1, IVP_SELI_EXTRACT_1_OF_2_OFF_0)); + IVP_SANX8S_IP(out, al_o, p_o); + } + if (N & (IVP_SIMD_WIDTH - 1)) // Check if there are remaining elements + { + xb_vecN_2xf32 inp1, inp2; + xb_vecN_2xf32 inp1_scaled, inp2_scaled; + xb_vecN_2xf32 out1, out2; + xb_vecNx16 out; + + IVP_LAVN_2XF32_XP(inp1, al_i, p_i, 4 * (N & (IVP_SIMD_WIDTH - 1))); + IVP_LAVN_2XF32_XP(inp2, al_i, p_i, 4 * ((N & (IVP_SIMD_WIDTH - 1)) - (IVP_SIMD_WIDTH >> 1))); + inp1_scaled = (float32_t) zero_bias; + IVP_MULAN_2XF32(inp1_scaled, inp1, one_by_scale); + inp2_scaled = (float32_t) zero_bias; + IVP_MULAN_2XF32(inp2_scaled, inp2, one_by_scale); + out1 = IVP_FIRINTN_2XF32(IVP_MAXN_2XF32(IVP_MINN_2XF32(inp1_scaled, (xb_vecN_2xf32) MAX_INT8), (xb_vecN_2xf32) MIN_INT8)); + out2 = IVP_FIRINTN_2XF32(IVP_MAXN_2XF32(IVP_MINN_2XF32(inp2_scaled, (xb_vecN_2xf32) MAX_INT8), (xb_vecN_2xf32) MIN_INT8)); + out = IVP_MOVNX16_FROMN_2X32(IVP_SELN_2X32I(out2, out1, IVP_SELI_EXTRACT_1_OF_2_OFF_0)); + IVP_SAVNX8S_XP(out, al_o, p_o, (N & (IVP_SIMD_WIDTH - 1))); + } + IVP_SAPOSNX8S_FP(al_o, p_o); +} +#endif \ No newline at end of 
file diff --git a/backends/cadence/vision/third-party/library/api/vaddf.c b/backends/cadence/vision/third-party/library/api/vaddf.c new file mode 100644 index 00000000000..2e64703a194 --- /dev/null +++ b/backends/cadence/vision/third-party/library/api/vaddf.c @@ -0,0 +1,124 @@ +/* ------------------------------------------------------------------------ */ +/* Copyright (c) 2024 by Cadence Design Systems, Inc. ALL RIGHTS RESERVED. */ +/* These coded instructions, statements, and computer programs ('Cadence */ +/* Libraries') are the copyrighted works of Cadence Design Systems Inc. */ +/* Cadence IP is licensed for use with Cadence processor cores only and */ +/* must not be used for any other processors and platforms. Your use of the */ +/* Cadence Libraries is subject to the terms of the license agreement you */ +/* have entered into with Cadence Design Systems, or a sublicense granted */ +/* to you by a direct Cadence licensee. */ +/* ------------------------------------------------------------------------ */ +/* IntegrIT, Ltd. www.integrIT.com, info@integrIT.com */ +/* */ +/* NatureDSP_Baseband Library */ +/* */ +/* This library contains copyrighted materials, trade secrets and other */ +/* proprietary information of IntegrIT, Ltd. This software is licensed for */ +/* use with Cadence processor cores only and must not be used for any other */ +/* processors and platforms. The license to use these sources was given to */ +/* Cadence, Inc. under Terms and Condition of a Software License Agreement */ +/* between Cadence, Inc. and IntegrIT, Ltd. */ +/* ------------------------------------------------------------------------ */ +/* Copyright (C) 2009-2022 IntegrIT, Limited. */ +/* All Rights Reserved. */ +/* ------------------------------------------------------------------------ */ +/* + NatureDSP_Baseband library. Vector Operations + Real Vectors Sum +*/ + +/* Cross-platform data type definitions. */ +/* Common helper macros. 
*/ +#include "api.h" +#include "common.h" +#define IVP_SIMD_WIDTH XCHAL_IVPN_SIMD_WIDTH +/* Vector Operations. */ + + +/*------------------------------------------------------------------------- +Real Vectors Sum + +Description: These routines perform pairwise summation of real vectors. + +Representation: +rvadd Signed fixed-point format. 16-bit inputs, 16-bit saturated results +rvadd_32b Signed fixed-point format. 32-bit inputs, 32-bit saturated results +rvadd_fp16 IEEE-754 Std. half precision floating-point format for + input/output data +rvaddf IEEE-754 Std. single precision floating-point format for + input/output data +rvadd_f64 IEEE-754 Std. double precision floating-point format for + input/output data + +Parameters: +Input: +x[N] Input vector +y[N] Input vector +N Length of vectors +Output: +z[N] Sum of input vectors + +Restrictions: +z,x,y Must not overlap +z,x,y Aligned on 2*BBE_SIMD_WIDTH-byte boundary +N Multiple of BBE_SIMD_WIDTH (rvadd,rvadd_fp16) + Multiple of BBE_SIMD_WIDTH/2 (rvadd_32b, rvaddf) + Multiple of BBE_SIMD_WIDTH/4 (rvadd_f64) +-------------------------------------------------------------------------*/ +void rvaddf(float32_t *restrict z, const float32_t *restrict x, + const float32_t *restrict y, int N) { +#if (1) + int n; + xb_vecN_2xf32 x0, y0, z0; + xb_vecN_2xf32 x1, y1, z1; + const xb_vecN_2xf32 *restrict pX = (const xb_vecN_2xf32 *)x; + const xb_vecN_2xf32 *restrict pY = (const xb_vecN_2xf32 *)y; + xb_vecN_2xf32 *restrict pZ = (xb_vecN_2xf32 *)z; + NASSERT_ALIGN(x, (2 * IVP_SIMD_WIDTH)); + NASSERT_ALIGN(y, (2 * IVP_SIMD_WIDTH)); + NASSERT_ALIGN(z, (2 * IVP_SIMD_WIDTH)); + NASSERT(N % (IVP_SIMD_WIDTH / 2) == 0); + if (N <= 0) + return; + __Pragma("no_reorder"); + __Pragma("no_reorder"); + + for (n = 0; n < (N >> (LOG2_IVP_SIMD_WIDTH-1)); n++) { + IVP_LVN_2XF32_IP(x0, pX, 2 * IVP_SIMD_WIDTH); + IVP_LVN_2XF32_IP(y0, pY, 2 * IVP_SIMD_WIDTH); + z0 = IVP_ADDN_2XF32(x0, y0); + IVP_SVN_2XF32_IP(z0, pZ, 2 * IVP_SIMD_WIDTH); + } + + if (N &
((IVP_SIMD_WIDTH>>1) - 1)) { + valign vx0 = IVP_LAN_2XF32_PP(pX); + valign vy0 = IVP_LAN_2XF32_PP(pY); + valign vz0 = IVP_ZALIGN(); + + IVP_LAVN_2XF32_XP(x0, vx0, pX, 2 * ((IVP_SIMD_WIDTH>>1) - 1)); + IVP_LAVN_2XF32_XP(y0, vy0, pY, 2 * ((IVP_SIMD_WIDTH>>1) - 1)); + z0 = IVP_ADDN_2XF32(x0, y0); + IVP_SAVN_2XF32_XP(z0, vz0, pZ, 2 * ((IVP_SIMD_WIDTH>>1) - 1)); + IVP_SAPOSN_2XF32_FP(vz0, pZ); + } +#else + int n; + xtfloat x0, y0, z0; + const xtfloat *restrict pX = (const xtfloat *)x; + const xtfloat *restrict pY = (const xtfloat *)y; + xtfloat *restrict pZ = (xtfloat *)z; + NASSERT_ALIGN(x, (2 * IVP_SIMD_WIDTH)); + NASSERT_ALIGN(y, (2 * IVP_SIMD_WIDTH)); + NASSERT_ALIGN(z, (2 * IVP_SIMD_WIDTH)); + NASSERT(N % (IVP_SIMD_WIDTH / 2) == 0); + if (N <= 0) + return; + + for (n = 0; n < (N); n++) { + XT_LSIP(x0, pX, sizeof(xtfloat)); + XT_LSIP(y0, pY, sizeof(xtfloat)); + z0 = XT_ADD_S(x0, y0); + XT_SSIP(z0, pZ, sizeof(xtfloat)); + } +#endif +} diff --git a/backends/cadence/vision/third-party/library/api/vdot_zeropt.c b/backends/cadence/vision/third-party/library/api/vdot_zeropt.c new file mode 100644 index 00000000000..9c0b25956d5 --- /dev/null +++ b/backends/cadence/vision/third-party/library/api/vdot_zeropt.c @@ -0,0 +1,123 @@ +/* ------------------------------------------------------------------------ */ +/* Copyright (c) 2024 by Cadence Design Systems, Inc. ALL RIGHTS RESERVED. */ +/* These coded instructions, statements, and computer programs ('Cadence */ +/* Libraries') are the copyrighted works of Cadence Design Systems Inc. */ +/* Cadence IP is licensed for use with Cadence processor cores only and */ +/* must not be used for any other processors and platforms. Your use of the */ +/* Cadence Libraries is subject to the terms of the license agreement you */ +/* have entered into with Cadence Design Systems, or a sublicense granted */ +/* to you by a direct Cadence licensee. 
*/ +/* ------------------------------------------------------------------------ */ + +#include +#include "api.h" +#include "common.h" + +// Macro to emulate reduction of N 32-bit elements from Nx48 +#define IVP_RADDNX32W_EMULATED(vecNx48) ({ \ + xb_vecN_2x32v q0 = IVP_CVT32SNX48H(vecNx48); \ + xb_vecN_2x32v q1 = IVP_CVT32SNX48L(vecNx48); \ + xb_int32v s0 = IVP_RADDN_2X32(q0); \ + xb_int32v s1 = IVP_RADDN_2X32(q1); \ + s0 + s1; \ +}) + +/*------------------------------------------------------------------------- + Vector Dot Product with Zero-Point Subtraction + + Description: This routine performs dot product of two quantized int8 vectors + with zero-point subtraction applied before multiplication: + result = init_acc + sum((x[i] - x_zp) * (y[i] - y_zp)) for i=0..N-1 + + This is commonly used in quantized neural network operations where + zero-point offset needs to be removed before computation. + + Representation: + rvdot_zeropt Signed fixed-point format. 8-bit inputs, 32-bit result + + Parameters: + Input: + init_acc Initial accumulator value (int32) + x[N] Input vector (int8) + y[N] Input vector (int8) + x_zp Zero-point for x vector (int8) + y_zp Zero-point for y vector (int8) + N Length of vectors + + Output: + Returns 32-bit accumulated dot product result + + Restrictions: + x,y Aligned on 2*BBE_SIMD_WIDTH-byte boundary preferred + N Any positive value (tail handling included) +-------------------------------------------------------------------------*/ +int32_t rvdot_zeropt( + int32_t init_acc, + const int8_t *restrict x, + const int8_t *restrict y, + int8_t x_zp, + int8_t y_zp, + int N) { + + const xb_vecNx8 *restrict pX = (const xb_vecNx8 *)x; + const xb_vecNx8 *restrict pY = (const xb_vecNx8 *)y; + + xb_vecNx48 acc = 0; // Initialize accumulator to zero + xb_vecNx16 vx, vy; + xb_vecNx16 vx_shifted, vy_shifted; + + int k; + + if (N <= 0) + return init_acc; + + // Process in chunks of IVP_SIMD_WIDTH (typically 32 elements) using Nx16 + for (k = 0; k < (N >> 
LOG2_IVP_SIMD_WIDTH); k++) { + // Load vectors as Nx8 with sign-extension to Nx16 (loads N int8 elements) + IVP_LVNX8S_IP(vx, pX, IVP_SIMD_WIDTH); + IVP_LVNX8S_IP(vy, pY, IVP_SIMD_WIDTH); + + // Subtract zero-points in 16-bit: (x - x_zp), (y - y_zp) + vx_shifted = IVP_SUBNX16(vx, (int16_t)x_zp); + vy_shifted = IVP_SUBNX16(vy, (int16_t)y_zp); + + // Multiply-accumulate: acc += (x - x_zp) * (y - y_zp) + IVP_MULANX16(acc, vx_shifted, vy_shifted); + } + + // Handle remaining elements with SIMD + int processed = k << LOG2_IVP_SIMD_WIDTH; + int remaining = N - processed; + + if (remaining > 0) { + valign vaX = IVP_LANX8S_PP((const xb_vecNx8 *)pX); + valign vaY = IVP_LANX8S_PP((const xb_vecNx8 *)pY); + + // Load remaining elements with variable alignment + IVP_LAVNX8S_XP(vx, vaX, (const xb_vecNx8 *)pX, remaining); + IVP_LAVNX8S_XP(vy, vaY, (const xb_vecNx8 *)pY, remaining); + + // Subtract zero-points in 16-bit + vx_shifted = IVP_SUBNX16(vx, (int16_t)x_zp); + vy_shifted = IVP_SUBNX16(vy, (int16_t)y_zp); + + // Create mask for valid elements (true for indices < remaining) + vboolN mask = IVP_LTNX16(IVP_SEQNX16(), remaining); + + // Zero out invalid positions: keep valid values, replace invalid with 0 + vx_shifted = IVP_MOVNX16T(vx_shifted, IVP_ZERONX16(), mask); + vy_shifted = IVP_MOVNX16T(vy_shifted, IVP_ZERONX16(), mask); + + // Multiply-accumulate for tail (accumulate into same acc) + // Invalid positions are 0*0 = 0, so they don't contribute + IVP_MULANX16(acc, vx_shifted, vy_shifted); + } + + // Reduce accumulator to single int32 (after all elements processed) + int32_t result = IVP_RADDNX32W_EMULATED(acc); + + // Add initial accumulator value + result += init_acc; + + return result; +} diff --git a/backends/cadence/vision/third-party/library/api/vsoftmaxf.c b/backends/cadence/vision/third-party/library/api/vsoftmaxf.c index 27487c75d6c..7e85a8b9c73 100644 --- a/backends/cadence/vision/third-party/library/api/vsoftmaxf.c +++ 
b/backends/cadence/vision/third-party/library/api/vsoftmaxf.c @@ -63,33 +63,23 @@ y[N] result, Q7.8 or floating point x,y Must not overlap -------------------------------------------------------------------------*/ -#define IVP_ADDSN_2X32(b_, c_) \ - ({ \ - xb_vecN_2x32v a_; \ - xb_vecN_2x64w tmp_a_; \ - tmp_a_ = IVP_MULN_2X32(b_, 1); \ - IVP_MULAN_2X32(tmp_a_, c_, 1); \ - a_ = IVP_PACKVRN_2X64W(tmp_a_, 0); \ - a_; \ - }) - #if !HAVE_VFPU -DISCARD_FUN(void, vsoftmaxf, (float32_t * y, const float32_t* x, int N)) +DISCARD_FUN(void, vsoftmaxf, (float32_t * y, const float32_t *x, int N)) #else -void vsoftmaxf(float32_t* y, const float32_t* x, int N) { +void vsoftmaxf(float32_t *y, const float32_t *x, int N) { #if !defined(IVP_MULN_2X32) #else - const int* pTbl = (const int*)expftbl_Q30; + const int *pTbl = (const int *)expftbl_Q30; #endif - const xb_vecN_2xf32* restrict pX; - xb_vecN_2xf32* restrict pY; + const xb_vecN_2xf32 *restrict pX; + xb_vecN_2xf32 *restrict pY; xb_vecN_2xf32 norm, ysum, xmax; int n; valign al_X, al_R, al_Y; if (N < 0) return; xmax = minusInff.f; - pX = (const xb_vecN_2xf32*)x; + pX = (const xb_vecN_2xf32 *)x; al_X = IVP_LAN_2XF32_PP(pX); al_Y = IVP_ZALIGN(); for (n = 0; n < (N >> (LOG2_IVP_SIMD_WIDTH - 1)); n++) { @@ -99,17 +89,17 @@ void vsoftmaxf(float32_t* y, const float32_t* x, int N) { } if (N & (IVP_SIMD_WIDTH / 2 - 1)) { xb_vecN_2xf32 x; - IVP_LAVN_2XF32_XP( - x, al_X, pX, sizeof(float32_t) * (N & (IVP_SIMD_WIDTH / 2 - 1))); - IVP_MAXNUMN_2XF32T( - xmax, xmax, x, IVP_LTRSN_2((N & (IVP_SIMD_WIDTH / 2 - 1)))); + IVP_LAVN_2XF32_XP(x, al_X, pX, + sizeof(float32_t) * (N & (IVP_SIMD_WIDTH / 2 - 1))); + IVP_MAXNUMN_2XF32T(xmax, xmax, x, + IVP_LTRSN_2((N & (IVP_SIMD_WIDTH / 2 - 1)))); } xmax = IVP_REPN_2XF32(IVP_RMAXNUMN_2XF32(xmax), 0); __Pragma("no_reorder"); ysum = 0.f; - pX = (const xb_vecN_2xf32*)x; - pY = (xb_vecN_2xf32*)y; + pX = (const xb_vecN_2xf32 *)x; + pY = (xb_vecN_2xf32 *)y; al_X = IVP_LAN_2XF32_PP(pX); { vboolN_2 bnan; @@ -163,8 
+153,8 @@ void vsoftmaxf(float32_t* y, const float32_t* x, int N) { } if (N & (IVP_SIMD_WIDTH / 2 - 1)) { xb_vecN_2xf32 x; - IVP_LAVN_2XF32_XP( - x, al_X, pX, sizeof(float32_t) * (N & (IVP_SIMD_WIDTH / 2 - 1))); + IVP_LAVN_2XF32_XP(x, al_X, pX, + sizeof(float32_t) * (N & (IVP_SIMD_WIDTH / 2 - 1))); x = IVP_SUBN_2XF32(x, xmax); bnan |= IVP_UNN_2XF32(x, x); { @@ -206,18 +196,18 @@ void vsoftmaxf(float32_t* y, const float32_t* x, int N) { zout = IVP_MULN_2XF32(gf, IVP_MOVN_2XF32_FROMN_2X32(exp)); x = zout; } - IVP_ADDN_2XF32T( - ysum, ysum, x, IVP_LTRSN_2((N & (IVP_SIMD_WIDTH / 2 - 1)))); - IVP_SAVN_2XF32_XP( - x, al_Y, pY, sizeof(float32_t) * (N & (IVP_SIMD_WIDTH / 2 - 1))); + IVP_ADDN_2XF32T(ysum, ysum, x, + IVP_LTRSN_2((N & (IVP_SIMD_WIDTH / 2 - 1)))); + IVP_SAVN_2XF32_XP(x, al_Y, pY, + sizeof(float32_t) * (N & (IVP_SIMD_WIDTH / 2 - 1))); } IVP_SAPOSN_2XF32_FP(al_Y, pY); ysum = IVP_MOVN_2XF32T(qNaNf.f, ysum, bnan); } norm = XT_RECIP_S(IVP_RADDN_2XF32(ysum)); __Pragma("no_reorder"); - pX = (const xb_vecN_2xf32*)y; - pY = (xb_vecN_2xf32*)y; + pX = (const xb_vecN_2xf32 *)y; + pY = (xb_vecN_2xf32 *)y; al_R = IVP_LAN_2XF32_PP(pX); @@ -229,11 +219,11 @@ void vsoftmaxf(float32_t* y, const float32_t* x, int N) { } if (N & (IVP_SIMD_WIDTH / 2 - 1)) { xb_vecN_2xf32 x; - IVP_LAVN_2XF32_XP( - x, al_R, pX, sizeof(float32_t) * (N & (IVP_SIMD_WIDTH / 2 - 1))); + IVP_LAVN_2XF32_XP(x, al_R, pX, + sizeof(float32_t) * (N & (IVP_SIMD_WIDTH / 2 - 1))); x = IVP_MULN_2XF32(x, norm); - IVP_SAVN_2XF32_XP( - x, al_Y, pY, sizeof(float32_t) * (N & (IVP_SIMD_WIDTH / 2 - 1))); + IVP_SAVN_2XF32_XP(x, al_Y, pY, + sizeof(float32_t) * (N & (IVP_SIMD_WIDTH / 2 - 1))); } IVP_SAPOSN_2XF32_FP(al_Y, pY); diff --git a/backends/cadence/vision/third-party/library/dma.c b/backends/cadence/vision/third-party/library/dma.c new file mode 100644 index 00000000000..199c9a5debf --- /dev/null +++ b/backends/cadence/vision/third-party/library/dma.c @@ -0,0 +1,62 @@ +/* + * dma.c + * + * Created on: Oct 30, 2025 + * 
Author: sraut
+ */
+
+#include <stdint.h>
+
+#include "lib.h"
+
+// We assume that the DSP uses multichannel IDMA with 2 channels available for 2D transfers (e.g., ping-pong buffers)
+// and 1 channel for 3D transfers.
+
+IDMA_BUFFER_DEFINE(buffer_idma_ch0, 2 * CHL_MAX, IDMA_2D_DESC);
+IDMA_BUFFER_DEFINE(buffer_idma_ch1, 2 * CHL_MAX, IDMA_2D_DESC);
+IDMA_BUFFER_DEFINE(buffer_idma_ch3, 2 * CHL_MAX, IDMA_64B_DESC);
+
+// Descriptor buffers of the 2D channels, indexed by channel number.
+idma_buffer_t * descbuf[] = {
+    buffer_idma_ch0,
+    buffer_idma_ch1,
+};
+
+// Pointers to DRAM buffers used by softmax
+void *ptr_dram0 = (void *)dram0_pool;
+void *ptr_dram1 = (void *)dram1_pool;
+
+// iDMA error callback registered by the init functions below; the error
+// details are intentionally ignored.
+void err_cb_func(const idma_error_details_t *error) {
+  (void) error;
+}
+
+// Initialise channel ch for 3D transfers: set up the channel, then attach the
+// 64-byte descriptor buffer in loop mode with no completion callback.
+void dma_3dm_init(int ch) {
+  idma_init(ch, 0, MAX_BLOCK_16, 16, TICK_CYCLES_8, 100000, err_cb_func);
+  idma_init_loop(ch, buffer_idma_ch3, IDMA_64B_DESC, CHL_MAX, NULL, NULL);
+}
+
+// Initialise channel ch for 2D transfers using its entry in descbuf.
+// Channels without a descbuf entry are ignored instead of indexing past the
+// end of the table.
+void dma_2dm_init(int ch) {
+  if (ch < 0 || ch >= (int)(sizeof(descbuf) / sizeof(descbuf[0]))) {
+    return;
+  }
+  idma_init(ch, 0, MAX_BLOCK_16, 8, TICK_CYCLES_1, 0, err_cb_func);
+  idma_init_loop(ch, descbuf[ch], IDMA_2D_DESC, CHL_MAX, NULL, NULL);
+}
+
+// Queue a 3D copy (ntiles tiles of nrows rows of row_sz bytes) from src to dst
+// on channel ch. Pitches are passed through unchanged; the status returned by
+// the iDMA library is discarded.
+void dma_3dm(int ch, void *src, void *dst, int src_row_pitch, int dst_row_pitch,
+             int src_tile_pitch, int dst_tile_pitch, int row_sz,
+             int nrows, int ntiles) {
+  // idma_copy_3d_desc64 receives pointers to the addresses rather than the
+  // addresses themselves. Passing &src/&dst of the pointer-sized arguments
+  // lets the library read past them when it fetches a 64-bit address, so the
+  // addresses are widened into real 64-bit objects first.
+  // NOTE(review): assumes the desc64 API reads 64-bit values through these
+  // pointers - confirm against the libidma reference.
+  uint64_t src_addr = (uint64_t)(uintptr_t)src;
+  uint64_t dst_addr = (uint64_t)(uintptr_t)dst;
+
+  (void) idma_copy_3d_desc64(ch, &dst_addr, &src_addr, DESC_IDMA_PRIOR_L /*Default*/, row_sz,
+                             nrows, ntiles, src_row_pitch, dst_row_pitch,
+                             src_tile_pitch, dst_tile_pitch);
+}
+
+
+// Queue a 2D copy of num_lines rows of num_bytes bytes on channel ch.
+// Note the argument order: this wrapper takes the source first, the iDMA call
+// takes the destination first.
+void dma_2dm(int ch,void *_psrc,void *_pdst, int src_stride, int dst_stride,
+             int num_bytes, short num_lines) {
+  (void) idma_copy_2d_desc(ch, _pdst, _psrc, num_bytes,
+                           DESC_IDMA_PRIOR_L /*Default*/, num_lines, src_stride,
+                           dst_stride);
+}
+
+// Queue a linear copy of num_bytes bytes, expressed as a 2D transfer with a
+// single row and zero strides.
+void dma_1dm(int ch,void *_psrc,void *_pdst, int num_bytes) {
+  (void) idma_copy_2d_desc(ch, _pdst, _psrc, num_bytes, DESC_IDMA_PRIOR_L /*Default*/,
+                           1, 0, 0);
+}
+
+
+
diff --git a/backends/cadence/vision/third-party/library/memory_manager.c b/backends/cadence/vision/third-party/library/memory_manager.c
new file mode 100644
index 00000000000..14e0eeddd6a
--- /dev/null
+++
b/backends/cadence/vision/third-party/library/memory_manager.c
@@ -0,0 +1,61 @@
+/*
+ * memory_manager.c
+ *
+ * Created on: Dec 8, 2025
+ * Author: Suraj Raut
+ *
+ * Description: Definition of DRAM memory pools and local SRAM scratch buffer.
+ * These must be defined in exactly one compilation unit.
+ */
+
+#include "lib.h"
+#include // For XCHAL_IVPN_SIMD_WIDTH
+
+// Memory pools placed in specific DRAM sections
+// These are the actual storage for the DRAM pools
+__attribute__((section(".dram0.data"))) __attribute__((aligned(64*2)))
+uint8_t dram0_pool[IDMA_BUFFER_SIZE_DRAM0];
+
+__attribute__((section(".dram1.data"))) __attribute__((aligned(64*2)))
+uint8_t dram1_pool[IDMA_BUFFER_SIZE_DRAM1];
+
+// Cache-mode padded input buffer (in system memory)
+// Used by cache-mode kernels for edge padding before convolution
+// This buffer is accessed through the processor's data cache
+__attribute__((aligned(64*2)))
+int8_t cache_padded_input[CACHE_PADDED_INPUT_SIZE]; // 1 MB max
+
+/**
+ * Bump-allocate a buffer from one of the two DRAM pools.
+ *
+ * The selected bank's running offset is advanced by size rounded up to a
+ * multiple of 2 * XCHAL_IVPN_SIMD_WIDTH bytes. Nothing is freed here.
+ *
+ * @param size       Number of bytes requested; must not be negative.
+ * @param dram_bank  0 selects dram0_pool, any other value selects dram1_pool.
+ * @param dram0_used In/out byte offset already consumed from dram0_pool.
+ * @param dram1_used In/out byte offset already consumed from dram1_pool.
+ * @return Start of the buffer, or NULL (offset left untouched) when the
+ *         selected bank's offset pointer is NULL, size or the offset is
+ *         negative, or size bytes do not fit in what is left of the pool.
+ */
+int8_t* allocate_dram_buffer(int size, int dram_bank, int* dram0_used, int* dram1_used) {
+  const int align = 2 * XCHAL_IVPN_SIMD_WIDTH;
+  uint8_t* pool = (dram_bank == 0) ? dram0_pool : dram1_pool;
+  int* used = (dram_bank == 0) ? dram0_used : dram1_used;
+  const int capacity =
+      (dram_bank == 0) ? (int)sizeof(dram0_pool) : (int)sizeof(dram1_pool);
+  int8_t* ptr;
+
+  // Never hand out memory beyond the end of the pool; an exhausted pool
+  // yields NULL instead of an out-of-bounds pointer.
+  if (used == NULL || size < 0 || *used < 0 || size > capacity - *used) {
+    return NULL;
+  }
+
+  ptr = (int8_t*)(pool + *used);
+  // Advance by the size rounded up to the alignment.
+  *used += (size + (align - 1)) & ~(align - 1);
+
+  return ptr;
+}
diff --git a/backends/cadence/vision/third-party/library/tables/expf_tbl.c b/backends/cadence/vision/third-party/library/tables/expf_tbl.c
index f1c6f3d44ae..0ed5dd22257 100644
--- a/backends/cadence/vision/third-party/library/tables/expf_tbl.c
+++ b/backends/cadence/vision/third-party/library/tables/expf_tbl.c
@@ -42,28 +42,22 @@ p(order)=p(order)-(sum(p)-2);
 */
 const int32_t ALIGN_2SIMD expftbl_Q30[8] = {
-    234841,
-    1329551,
-    10400465,
-    59570027,
-    257946177,
-    744260763,
-    1073741824,
-    0 /* Padding to allow for vector loads
*/ + 234841, 1329551, 10400465, 59570027, + 257946177, 744260763, 1073741824, 0 /* Padding to allow for vector loads */ }; const union ufloat32uint32 ALIGN_2SIMD expfminmax[2] = /* minimum and maximum arguments of expf() input */ { {0xc2ce8ed0}, /*-1.0327893066e+002f */ - {0x42b17218} /* 8.8722839355e+001f */ + {0x42b17218} /* 8.8722839355e+001f */ }; const int32_t invln2_Q30 = 1549082005L; /* 1/ln(2), Q30 */ const union ufloat32uint32 ALIGN_2SIMD log2_e[2] = { {0x3fb8aa3b}, /* 1.4426950216 */ - {0x32a57060} /* 1.9259629891e-008 */ + {0x32a57060} /* 1.9259629891e-008 */ }; /* @@ -76,10 +70,5 @@ p(order)=p(order)-(sum(p)-2); num2hex(single(p)); */ const union ufloat32uint32 ALIGN_2SIMD expftblf[] = { - {0x39655635}, - {0x3aa24c7a}, - {0x3c1eb2d1}, - {0x3d633ddb}, - {0x3e75ff24}, - {0x3f317212}, - {0x3f800000}}; + {0x39655635}, {0x3aa24c7a}, {0x3c1eb2d1}, {0x3d633ddb}, + {0x3e75ff24}, {0x3f317212}, {0x3f800000}}; diff --git a/backends/cadence/vision/third-party/library/tables/inff_tbl.c b/backends/cadence/vision/third-party/library/tables/inff_tbl.c index 8464ee9f549..9b2bf62e6bf 100644 --- a/backends/cadence/vision/third-party/library/tables/inff_tbl.c +++ b/backends/cadence/vision/third-party/library/tables/inff_tbl.c @@ -31,7 +31,7 @@ #include "dtypes.h" const union ufloat32uint32 minusInff = {0xff800000}; /* -Inf */ -const union ufloat32uint32 plusInff = {0x7f800000}; /* +Inf */ +const union ufloat32uint32 plusInff = {0x7f800000}; /* +Inf */ const union ufloat32uint32 realmaxf = { 0x7f7fffff}; /* maximum floating point number */ const union ufloat32uint32 realminf = { diff --git a/backends/cadence/vision/third-party/library/tables/nanf_tbl.c b/backends/cadence/vision/third-party/library/tables/nanf_tbl.c index f165234fce4..27c5f437b9a 100644 --- a/backends/cadence/vision/third-party/library/tables/nanf_tbl.c +++ b/backends/cadence/vision/third-party/library/tables/nanf_tbl.c @@ -27,9 +27,9 @@ */ /* Portable data types. 
*/ +#include "dtypes.h" /* NaN values for single precision routines. */ #include "nanf_tbl.h" -#include "dtypes.h" const union ufloat32uint32 sNaNf = {0x7f800001}; /* Signalling NaN */ const union ufloat32uint32 qNaNf = {0x7fc00000}; /* Quiet NaN */ diff --git a/backends/cadence/vision/third-party/library/utils.c b/backends/cadence/vision/third-party/library/utils.c new file mode 100644 index 00000000000..05366f88b2e --- /dev/null +++ b/backends/cadence/vision/third-party/library/utils.c @@ -0,0 +1,26 @@ +/* + * utils.c + * + * Created on: Nov 4, 2025 + * Author: sraut + */ + +#include +#include +#include +#include + + + +//static inline int inc_iter_to_temp(int *temp, int var, int bound, int carry) { +// int new_val = var + carry; +// carry = new_val == bound; +// *temp = carry ? 0 : new_val; +// return carry; +//} +// +//static inline void swap_buffers(uint8_t **a, uint8_t **b) { +// uint8_t *t = *a; +// *a = *b; +// *b = t; +//} diff --git a/backends/cadence/vision/third-party/libxai/cnn/src/cnn_conv.c b/backends/cadence/vision/third-party/libxai/cnn/src/cnn_conv.c new file mode 100644 index 00000000000..fba6f8bcef4 --- /dev/null +++ b/backends/cadence/vision/third-party/libxai/cnn/src/cnn_conv.c @@ -0,0 +1,1668 @@ +/* + * Copyright (c) 2018 by Cadence Design Systems, Inc. ALL RIGHTS RESERVED. + * These coded instructions, statements, and computer programs are the + * copyrighted works and confidential proprietary information of + * Cadence Design Systems Inc. They may be adapted and modified by bona fide + * purchasers for internal use, but neither the original nor any adapted + * or modified version may be disclosed or distributed to third parties + * in any manner, medium, or form, in whole or in part, without the prior + * written consent of Cadence Design Systems Inc. This software and its + * derivatives are to be executed solely on products incorporating a Cadence + * Design Systems processor. 
+ */
+
+#include "xai_cnn.h"
+#include
+#if ((XCHAL_VISION_TYPE >= 6))
+
+/******************************************************************************
+ * xaiConvolve3D - generic 3D convolution entry point
+ * Asks xaiGetConvolve3DVariant() for the variant matching the tiles and
+ * parameters and forwards all arguments to it.
+ * Returns XAI_ERR_NULLARG when inTile, coeffTile or param is null,
+ * XAI_ERR_NO_VARIANT when no variant matches, otherwise the variant's result.
+ *****************************************************************************/
+XAI_ERR_TYPE xaiConvolve3D(const xai_pTile3D inTile,
+                           const xai_pTile4D coeffTile,
+                           const xai_pArray biasArray,
+                           xai_pTile3D outTile,
+                           xai_cnn_conv_params *param)
+{
+  /* Signature shared by every convolution variant */
+  typedef XAI_ERR_TYPE (*convVariantFn)(const xai_pTile3D inTile,
+                                        const xai_pTile4D coeffTile,
+                                        const xai_pArray biasArray,
+                                        xai_pTile3D outTile,
+                                        xai_cnn_conv_params* param);
+  convVariantFn variant;
+
+  /* inTile, coeffTile and param are needed by the variant lookup itself */
+  if ((!inTile) || (!coeffTile) || (!param))
+  {
+    return(XAI_ERR_NULLARG);
+  }
+
+  variant = (convVariantFn) xaiGetConvolve3DVariant(inTile, coeffTile, biasArray, outTile, param);
+  if (variant != NULL)
+  {
+    return(variant(inTile, coeffTile, biasArray, outTile, param));
+  }
+  return(XAI_ERR_NO_VARIANT);
+}
+
+/**************************************************************************************
+* xaiGetConvolve3DVariant - 3D convolution variant lookup
+* Selects a variant from the coefficient data order, the input data order or
+* type, the kernel size and (for MOW) the stride.
+* Returns the variant's address cast to XAI_ERR_TYPE *, or NULL when an argument
+* is null or no variant matches.
+**************************************************************************************/
+XAI_ERR_TYPE *xaiGetConvolve3DVariant(const xai_pTile3D inTile,
+                                      const xai_pTile4D coeffTile,
+                                      const xai_pArray biasArray,
+                                      xai_pTile3D outTile,
+                                      xai_cnn_conv_params *param)
+{
+  if ((!inTile) || (!coeffTile) || (!param))
+  {
+    return(NULL);
+  }
+
+  const xai_cnn_data_order coeffOrder = XAI_TILE4D_GET_DATA_ORDER(coeffTile);
+  const xai_cnn_data_order inOrder    = XAI_TILE3D_GET_DATA_ORDER(inTile);
+  int32_t kWidth, kHeight;
+  int32_t kSquare; /* kernel edge when width == height, otherwise 0 (generic MxN) */
+  uint8_t stride;
+
+  if (coeffOrder == XAI_NDWH)
+  {
+    /* MOD variants */
+    kWidth  = XAI_TILE4D_GET_DIM3(coeffTile);
+    kHeight = XAI_TILE4D_GET_DIM4(coeffTile);
+    kSquare = (kWidth == kHeight) ? kWidth : 0;
+
+    if (inOrder == XAI_WHD)
+    {
+      switch (kSquare)
+      {
+        case 1:  return((XAI_ERR_TYPE *) &xaiConvolve3D_S_1x1_S8S8IXCa2_MOD_WHD_DWH);
+        case 3:  return((XAI_ERR_TYPE *) &xaiConvolve3D_S_3x3_S8S8IXCa2_MOD_WHD_DWH);
+        case 5:  return((XAI_ERR_TYPE *) &xaiConvolve3D_S_5x5_S8S8IXCa2_MOD_WHD_DWH);
+        case 7:  return((XAI_ERR_TYPE *) &xaiConvolve3D_S_7x7_S8S8IXCa2_MOD_WHD_DWH);
+        default: return((XAI_ERR_TYPE *) &xaiConvolve3D_S_MxN_S8S8IXCa2_MOD_WHD_DWH);
+      }
+    }
+    else if (inOrder == XAI_DWH)
+    {
+      if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S8))
+      {
+        switch (kSquare)
+        {
+          case 1:  return((XAI_ERR_TYPE *) &xaiConvolve3D_S_1x1_S8S8IXCa2_MOD_DWH);
+          case 3:  return((XAI_ERR_TYPE *) &xaiConvolve3D_S_3x3_S8S8IXCa2_MOD_DWH);
+          case 5:  return((XAI_ERR_TYPE *) &xaiConvolve3D_S_5x5_S8S8IXCa2_MOD_DWH);
+          case 7:  return((XAI_ERR_TYPE *) &xaiConvolve3D_S_7x7_S8S8IXCa2_MOD_DWH);
+          default: return((XAI_ERR_TYPE *) &xaiConvolve3D_S_MxN_S8S8IXCa2_MOD_DWH);
+        }
+      }
+      else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_U8))
+      {
+        /* Only 1x1 and 3x3 are dispatched for U8 input; anything else ends in NULL */
+        switch (kSquare)
+        {
+          case 1:  return((XAI_ERR_TYPE *) &xaiConvolve3D_S_1x1_U8S8IXCa2_MOD_DWH);
+          case 3:  return((XAI_ERR_TYPE *) &xaiConvolve3D_S_3x3_U8S8IXCa2_MOD_DWH);
+          default: break;
+        }
+      }
+    }
+  }
+  else if (coeffOrder == XAI_WHDN)
+  {
+    /* MOW variants: only strides 1, 2 and 4 are dispatched */
+    stride  = XAI_CNN_CONV_GET_STRIDE(param);
+    kWidth  = XAI_TILE4D_GET_DIM1(coeffTile);
+    kHeight = XAI_TILE4D_GET_DIM2(coeffTile);
+    kSquare = (kWidth == kHeight) ? kWidth : 0;
+
+    switch (kSquare)
+    {
+      case 1:
+        if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S8))
+        {
+          switch (stride)
+          {
+            case 1:  return((XAI_ERR_TYPE *) &xaiConvolve3D_S_1x1j1_S8S8IX_MOW_WHD);
+            case 2:  return((XAI_ERR_TYPE *) &xaiConvolve3D_S_1x1j2_S8S8IX_MOW_WHD);
+            case 4:  return((XAI_ERR_TYPE *) &xaiConvolve3D_S_1x1j4_S8S8IX_MOW_WHD);
+            default: break;
+          }
+        }
+        else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_U8))
+        {
+          switch (stride)
+          {
+            case 1:  return((XAI_ERR_TYPE *) &xaiConvolve3D_S_1x1j1_U8S8IX_MOW_WHD);
+            case 2:  return((XAI_ERR_TYPE *) &xaiConvolve3D_S_1x1j2_U8S8IX_MOW_WHD);
+            case 4:  return((XAI_ERR_TYPE *) &xaiConvolve3D_S_1x1j4_U8S8IX_MOW_WHD);
+            default: break;
+          }
+        }
+        break;
+
+      case 3:
+        if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S8))
+        {
+          switch (stride)
+          {
+            case 1:  return((XAI_ERR_TYPE *) &xaiConvolve3D_S_3x3j1_S8S8IX_MOW_WHD);
+            case 2:  return((XAI_ERR_TYPE *) &xaiConvolve3D_S_3x3j2_S8S8IX_MOW_WHD);
+            case 4:  return((XAI_ERR_TYPE *) &xaiConvolve3D_S_3x3j4_S8S8IX_MOW_WHD);
+            default: break;
+          }
+        }
+        else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_U8))
+        {
+          switch (stride)
+          {
+            case 1:  return((XAI_ERR_TYPE *) &xaiConvolve3D_S_3x3j1_U8S8IX_MOW_WHD);
+            case 2:  return((XAI_ERR_TYPE *) &xaiConvolve3D_S_3x3j2_U8S8IX_MOW_WHD);
+            case 4:  return((XAI_ERR_TYPE *) &xaiConvolve3D_S_3x3j4_U8S8IX_MOW_WHD);
+            default: break;
+          }
+        }
+        break;
+
+      case 5:
+        if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S8))
+        {
+          switch (stride)
+          {
+            case 1:  return((XAI_ERR_TYPE *) &xaiConvolve3D_S_5x5j1_S8S8IX_MOW_WHD);
+            case 2:  return((XAI_ERR_TYPE *) &xaiConvolve3D_S_5x5j2_S8S8IX_MOW_WHD);
+            case 4:  return((XAI_ERR_TYPE *) &xaiConvolve3D_S_5x5j4_S8S8IX_MOW_WHD);
+            default: break;
+          }
+        }
+        else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_U8))
+        {
+          switch (stride)
+          {
+            case 1:  return((XAI_ERR_TYPE *) &xaiConvolve3D_S_5x5j1_U8S8IX_MOW_WHD);
+            case 2:  return((XAI_ERR_TYPE *) &xaiConvolve3D_S_5x5j2_U8S8IX_MOW_WHD);
+            case 4:  return((XAI_ERR_TYPE *) &xaiConvolve3D_S_5x5j4_U8S8IX_MOW_WHD);
+            default: break;
+          }
+        }
+        break;
+
+      case 7:
+        if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S8))
+        {
+          switch (stride)
+          {
+            case 1:  return((XAI_ERR_TYPE *) &xaiConvolve3D_S_7x7j1_S8S8IX_MOW_WHD);
+            case 2:  return((XAI_ERR_TYPE *) &xaiConvolve3D_S_7x7j2_S8S8IX_MOW_WHD);
+            case 4:  return((XAI_ERR_TYPE *) &xaiConvolve3D_S_7x7j4_S8S8IX_MOW_WHD);
+            default: break;
+          }
+        }
+        else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_U8))
+        {
+          switch (stride)
+          {
+            case 1:  return((XAI_ERR_TYPE *) &xaiConvolve3D_S_7x7j1_U8S8IX_MOW_WHD);
+            case 2:  return((XAI_ERR_TYPE *) &xaiConvolve3D_S_7x7j2_U8S8IX_MOW_WHD);
+            case 4:  return((XAI_ERR_TYPE *) &xaiConvolve3D_S_7x7j4_U8S8IX_MOW_WHD);
+            default: break;
+          }
+        }
+        break;
+
+      default:
+        /* Any other kernel shape uses the generic MxN variants */
+        if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S8))
+        {
+          switch (stride)
+          {
+            case 1:  return((XAI_ERR_TYPE *) &xaiConvolve3D_S_MxNj1_S8S8IX_MOW_WHD);
+            case 2:  return((XAI_ERR_TYPE *) &xaiConvolve3D_S_MxNj2_S8S8IX_MOW_WHD);
+            case 4:  return((XAI_ERR_TYPE *) &xaiConvolve3D_S_MxNj4_S8S8IX_MOW_WHD);
+            default: break;
+          }
+        }
+        else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_U8))
+        {
+          switch (stride)
+          {
+            case 1:  return((XAI_ERR_TYPE *) &xaiConvolve3D_S_MxNj1_U8S8IX_MOW_WHD);
+            case 2:  return((XAI_ERR_TYPE *) &xaiConvolve3D_S_MxNj2_U8S8IX_MOW_WHD);
+            case 4:  return((XAI_ERR_TYPE *) &xaiConvolve3D_S_MxNj4_U8S8IX_MOW_WHD);
+            default: break;
+          }
+        }
+        break;
+    }
+  }
+  else if (coeffOrder == XAI_DWHN)
+  {
+    /* SO variants */
+    if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S8))
+    {
+      return((XAI_ERR_TYPE *) &xaiConvolve3D_S_MxN_S8S8IX_SO_DWH);
+    }
+    else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_U8))
+    {
+      return((XAI_ERR_TYPE *) &xaiConvolve3D_S_MxN_U8S8IX_SO_DWH);
+    }
+  }
+
+  /* No variant for this combination */
+  return(NULL);
+}
+
+/******************************************************************************
+ * 3D convolution general version for dilation functions
+ * Calls a
specific dilated convolution function based on parameters
+ *****************************************************************************/
+XAI_ERR_TYPE xaiConvolved3D(const xai_pTile3D inTile,
+                            const xai_pTile4D coeffTile,
+                            const xai_pArray biasArray,
+                            xai_pTile3D outTile,
+                            const xai_cnn_conv_params *param)
+{
+  /* Signature shared by every dilated convolution variant */
+  typedef XAI_ERR_TYPE (*dilatedVariantFn)(const xai_pTile3D inTile,
+                                           const xai_pTile4D coeffTile,
+                                           const xai_pArray biasArray,
+                                           xai_pTile3D outTile,
+                                           const xai_cnn_conv_params* param);
+  dilatedVariantFn variant;
+
+  /* inTile, coeffTile and param are needed by xaiGetConvolved3DVariant to
+   * derive the variant, so reject null values before the lookup */
+  if ((!inTile) || (!coeffTile) || (!param))
+  {
+    return(XAI_ERR_NULLARG);
+  }
+
+  /* Look up the variant and forward the call when one exists */
+  variant = (dilatedVariantFn) xaiGetConvolved3DVariant(inTile, coeffTile, biasArray, outTile, param);
+  if (variant != NULL)
+  {
+    return(variant(inTile, coeffTile, biasArray, outTile, param));
+  }
+  return(XAI_ERR_NO_VARIANT);
+}
+
+/*********************************************************************************************
+* 3D dilated convolution helper function
+* Returns the function pointer of a specific dilated convolution variant based on parameters
+*********************************************************************************************/
+XAI_ERR_TYPE *xaiGetConvolved3DVariant(const xai_pTile3D inTile,
+                                       const xai_pTile4D coeffTile,
+                                       const xai_pArray biasArray,
+                                       xai_pTile3D outTile,
+                                       const xai_cnn_conv_params *param)
+{
+  if ((!inTile) || (!coeffTile) || (!param))
+  {
+    return(NULL);
+  }
+
+  uint8_t stride;
+  uint8_t dilation;
+  xai_cnn_data_order coeffOrder = XAI_TILE4D_GET_DATA_ORDER(coeffTile);
+
+
+  int32_t kWidth, kHeight;
+  xai_cnn_data_order inOrder = XAI_TILE3D_GET_DATA_ORDER(inTile);
+
+  if (coeffOrder == XAI_NDWH)
+  {
+    /* MOD
variants */ + kWidth = XAI_TILE4D_GET_DIM3(coeffTile); + kHeight = XAI_TILE4D_GET_DIM4(coeffTile); + + if (inOrder == XAI_WHD) + { + if (kWidth == 1 && kHeight == 1) + { + return((XAI_ERR_TYPE *) &xaiConvolved3D_S_1x1_S8S8IXCa2_MOD_WHD_DWH); + } + else if (kWidth == 2 && kHeight == 2) + { + return((XAI_ERR_TYPE *) &xaiConvolved3D_S_2x2_S8S8IXCa2_MOD_WHD_DWH); + } + else if (kWidth == 3 && kHeight == 3) + { + return((XAI_ERR_TYPE *) &xaiConvolved3D_S_3x3_S8S8IXCa2_MOD_WHD_DWH); + } + else if (kWidth == 4 && kHeight == 4) + { + return((XAI_ERR_TYPE *) &xaiConvolved3D_S_4x4_S8S8IXCa2_MOD_WHD_DWH); + } + else if (kWidth == 5 && kHeight == 5) + { + return((XAI_ERR_TYPE *) &xaiConvolved3D_S_5x5_S8S8IXCa2_MOD_WHD_DWH); + } + else if (kWidth == 7 && kHeight == 7) + { + return((XAI_ERR_TYPE *) &xaiConvolved3D_S_7x7_S8S8IXCa2_MOD_WHD_DWH); + } + else + { + return((XAI_ERR_TYPE *) &xaiConvolved3D_S_MxN_S8S8IXCa2_MOD_WHD_DWH); + } + } + else if (inOrder == XAI_DWH) + { + if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S8) && XAI_TILE4D_CHECK_TYPE(coeffTile, XAI_S8)) + { + if (XAI_CNN_CONV_GET_STRIDEX(param) != XAI_CNN_CONV_GET_STRIDEY(param)) + { + return((XAI_ERR_TYPE *) &xaiConvolved3D_S_MxN_S8S8IXCa2_MOD_DWH); + } + else if (XAI_CNN_CONV_GET_STRIDE(param) != 1 && XAI_CNN_CONV_GET_STRIDE(param) != 2 \ + && XAI_CNN_CONV_GET_STRIDE(param) != 4) + { + return((XAI_ERR_TYPE *) &xaiConvolved3D_S_MxN_S8S8IXCa2_MOD_DWH); + } + else if (kWidth == 1 && kHeight == 1) + { + return((XAI_ERR_TYPE *) &xaiConvolved3D_S_1x1_S8S8IXCa2_MOD_DWH); + } + else if (kWidth == 2 && kHeight == 2) + { + return((XAI_ERR_TYPE *) &xaiConvolved3D_S_2x2_S8S8IXCa2_MOD_DWH); + } + else if (kWidth == 3 && kHeight == 3) + { + return((XAI_ERR_TYPE *) &xaiConvolved3D_S_3x3_S8S8IXCa2_MOD_DWH); + } + else if (kWidth == 4 && kHeight == 4) + { + return((XAI_ERR_TYPE *) &xaiConvolved3D_S_4x4_S8S8IXCa2_MOD_DWH); + } + else if (kWidth == 5 && kHeight == 5) + { + return((XAI_ERR_TYPE *) &xaiConvolved3D_S_5x5_S8S8IXCa2_MOD_DWH); + 
} + else if (kWidth == 7 && kHeight == 7) + { + return((XAI_ERR_TYPE *) &xaiConvolved3D_S_7x7_S8S8IXCa2_MOD_DWH); + } + else + { + return((XAI_ERR_TYPE *) &xaiConvolved3D_S_MxN_S8S8IXCa2_MOD_DWH); + } + } + else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_U8) && XAI_TILE4D_CHECK_TYPE(coeffTile, XAI_S8)) + { + if (kWidth == 1 && kHeight == 1) + { + return((XAI_ERR_TYPE *) &xaiConvolved3D_S_1x1_U8S8IXCa2_MOD_DWH); + } + else if (kWidth == 3 && kHeight == 3) + { + return((XAI_ERR_TYPE *) &xaiConvolved3D_S_3x3_U8S8IXCa2_MOD_DWH); + } + else + { + return((XAI_ERR_TYPE *) &xaiConvolved3D_S_MxN_U8S8IXCa2_MOD_DWH); + } + } + else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S16) && XAI_TILE4D_CHECK_TYPE(coeffTile, XAI_S16)) + { + return((XAI_ERR_TYPE *) &xaiConvolved3D_S_MxN_S16S16I16_MOD_DWH); + } + } + } + else if (coeffOrder == XAI_WHDN) + { + /* MOW variants */ + stride = XAI_CNN_CONV_GET_STRIDE(param); + dilation = XAI_CNN_CONV_GET_DILATION(param); + kWidth = XAI_TILE4D_GET_DIM1(coeffTile); + kHeight = XAI_TILE4D_GET_DIM2(coeffTile); + + if (kWidth == 1 && kHeight == 1) + { + if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S8)) + { + if (stride == 1) + { + if (dilation == 1) + { + return((XAI_ERR_TYPE *) &xaiConvolved3D_S_1x1j1d1_S8S8IX_MOW_WHD); + } + } + else if (stride == 2) + { + if (dilation == 1) + { + return((XAI_ERR_TYPE *) &xaiConvolved3D_S_1x1j2d1_S8S8IX_MOW_WHD); + } + } + else if (stride == 4) + { + if (dilation == 1) + { + return((XAI_ERR_TYPE *) &xaiConvolved3D_S_1x1j4d1_S8S8IX_MOW_WHD); + } + } + } + else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_U8)) + { + if (stride == 1) + { + if (dilation == 1) + { + return((XAI_ERR_TYPE *) &xaiConvolved3D_S_1x1j1d1_U8S8IX_MOW_WHD); + } + } + else if (stride == 2) + { + if (dilation == 1) + { + return((XAI_ERR_TYPE *) &xaiConvolved3D_S_1x1j2d1_U8S8IX_MOW_WHD); + } + } + else if (stride == 4) + { + if (dilation == 1) + { + return((XAI_ERR_TYPE *) &xaiConvolved3D_S_1x1j4d1_U8S8IX_MOW_WHD); + } + } + } + else if (XAI_TILE3D_CHECK_TYPE(inTile, 
XAI_S16)) + { + if (stride == 1) + { + if (dilation == 1) + { + return((XAI_ERR_TYPE *) &xaiConvolved3D_S_MxNj1d1_S16S16I16_MOW_WHD); + } + } + else if (stride == 2) + { + if (dilation == 1) + { + return((XAI_ERR_TYPE *) &xaiConvolved3D_S_MxNj2d1_S16S16I16_MOW_WHD); + } + } + else if (stride == 4) + { + if (dilation == 1) + { + return((XAI_ERR_TYPE *) &xaiConvolved3D_S_MxNj4d1_S16S16I16_MOW_WHD); + } + } + } +#if 0 /* F16 disabled - no implementation available */ + else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_F16)) + { + if (stride == 1) + { + if (dilation == 1) + { + return((XAI_ERR_TYPE *) &xaiConvolved3D_S_1x1j1d1_F16_MOW_WHD); + } + } + } +#endif + } + else if (kWidth == 2 && kHeight == 2) + { + if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S8)) + { + if (stride == 1) + { + if (dilation == 1) + { + return((XAI_ERR_TYPE *) &xaiConvolved3D_S_2x2j1d1_S8S8IX_MOW_WHD); + } + } + } + else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_U8)) + { + if (stride == 1) + { + if (dilation == 1) + { + return((XAI_ERR_TYPE *) &xaiConvolved3D_S_2x2j1d1_U8S8IX_MOW_WHD); + } + } + } + else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S16)) + { + if (stride == 1) + { + if (dilation == 1) + { + return((XAI_ERR_TYPE *) &xaiConvolved3D_S_MxNj1d1_S16S16I16_MOW_WHD); + } + } + if (stride == 2) + { + if (dilation == 1) + { + return((XAI_ERR_TYPE *) &xaiConvolved3D_S_MxNj2d1_S16S16I16_MOW_WHD); + } + } + if (stride == 4) + { + if (dilation == 1) + { + return((XAI_ERR_TYPE *) &xaiConvolved3D_S_MxNj4d1_S16S16I16_MOW_WHD); + } + } + } +#if 0 /* F16 disabled - no implementation available */ + else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_F16)) + { + if (stride == 1) + { + if (dilation == 1) + { + return((XAI_ERR_TYPE *) &xaiConvolved3D_S_2x2j1d1_F16_MOW_WHD); + } + } + } +#endif + } + else if (kWidth == 3 && kHeight == 3) + { + if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S8)) + { + if (stride == 1) + { + if (dilation == 1) + { + return((XAI_ERR_TYPE *) &xaiConvolved3D_S_3x3j1d1_S8S8IX_MOW_WHD); + } + else if (dilation == 2) + 
{ + return((XAI_ERR_TYPE *) &xaiConvolved3D_S_3x3j1d2_S8S8IX_MOW_WHD); + } + else if (dilation == 4) + { + return((XAI_ERR_TYPE *) &xaiConvolved3D_S_3x3j1d4_S8S8IX_MOW_WHD); + } + } + else if (stride == 2) + { + if (dilation == 1) + { + return((XAI_ERR_TYPE *) &xaiConvolved3D_S_3x3j2d1_S8S8IX_MOW_WHD); + } + } + else if (stride == 4) + { + if (dilation == 1) + { + return((XAI_ERR_TYPE *) &xaiConvolved3D_S_3x3j4d1_S8S8IX_MOW_WHD); + } + } + } + else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_U8)) + { + if (stride == 1) + { + if (dilation == 1) + { + return((XAI_ERR_TYPE *) &xaiConvolved3D_S_3x3j1d1_U8S8IX_MOW_WHD); + } + else if (dilation == 2) + { + return((XAI_ERR_TYPE *) &xaiConvolved3D_S_3x3j1d2_U8S8IX_MOW_WHD); + } + else if (dilation == 4) + { + return((XAI_ERR_TYPE *) &xaiConvolved3D_S_3x3j1d4_U8S8IX_MOW_WHD); + } + } + else if (stride == 2) + { + if (dilation == 1) + { + return((XAI_ERR_TYPE *) &xaiConvolved3D_S_3x3j2d1_U8S8IX_MOW_WHD); + } + } + else if (stride == 4) + { + if (dilation == 1) + { + return((XAI_ERR_TYPE *) &xaiConvolved3D_S_3x3j4d1_U8S8IX_MOW_WHD); + } + } + } + else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S16)) + { + if (stride == 1) + { + if (dilation == 1) + { + return((XAI_ERR_TYPE *) &xaiConvolved3D_S_MxNj1d1_S16S16I16_MOW_WHD); + } + } + else if (stride == 2) + { + if (dilation == 1) + { + return((XAI_ERR_TYPE *) &xaiConvolved3D_S_MxNj2d1_S16S16I16_MOW_WHD); + } + } + else if (stride == 4) + { + if (dilation == 1) + { + return((XAI_ERR_TYPE *) &xaiConvolved3D_S_MxNj4d1_S16S16I16_MOW_WHD); + } + } + } +#if 0 /* F16 disabled - no implementation available */ + else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_F16)) + { + if (stride == 1) + { + if (dilation == 1) + { + return((XAI_ERR_TYPE *) &xaiConvolved3D_S_3x3j1d1_F16_MOW_WHD); + } + } + if (stride == 2) + { + if (dilation == 1) + { + return((XAI_ERR_TYPE *) &xaiConvolved3D_S_3x3j2d1_F16_MOW_WHD); + } + } + } +#endif + } + else if (kWidth == 4 && kHeight == 4) + { + if (XAI_TILE3D_CHECK_TYPE(inTile, 
XAI_S8)) + { + if (stride == 1) + { + if (dilation == 1) + { + return((XAI_ERR_TYPE *) &xaiConvolved3D_S_4x4j1d1_S8S8IX_MOW_WHD); + } + } + } + else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_U8)) + { + if (stride == 1) + { + if (dilation == 1) + { + return((XAI_ERR_TYPE *) &xaiConvolved3D_S_4x4j1d1_U8S8IX_MOW_WHD); + } + } + } + else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S16)) + { + if (stride == 1) + { + if (dilation == 1) + { + return((XAI_ERR_TYPE *) &xaiConvolved3D_S_MxNj1d1_S16S16I16_MOW_WHD); + } + } + if (stride == 2) + { + if (dilation == 1) + { + return((XAI_ERR_TYPE *) &xaiConvolved3D_S_MxNj2d1_S16S16I16_MOW_WHD); + } + } + if (stride == 4) + { + if (dilation == 1) + { + return((XAI_ERR_TYPE *) &xaiConvolved3D_S_MxNj4d1_S16S16I16_MOW_WHD); + } + } + } + } + else if (kWidth == 5 && kHeight == 5) + { + if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S8)) + { + if (stride == 1) + { + if (dilation == 1) + { + return((XAI_ERR_TYPE *) &xaiConvolved3D_S_5x5j1d1_S8S8IX_MOW_WHD); + } + else if (dilation == 2) + { + return((XAI_ERR_TYPE *) &xaiConvolved3D_S_5x5j1d2_S8S8IX_MOW_WHD); + } + else if (dilation == 4) + { + return((XAI_ERR_TYPE *) &xaiConvolved3D_S_5x5j1d4_S8S8IX_MOW_WHD); + } + } + else if (stride == 2) + { + if (dilation == 1) + { + return((XAI_ERR_TYPE *) &xaiConvolved3D_S_5x5j2d1_S8S8IX_MOW_WHD); + } + } + else if (stride == 4) + { + if (dilation == 1) + { + return((XAI_ERR_TYPE *) &xaiConvolved3D_S_5x5j4d1_S8S8IX_MOW_WHD); + } + } + } + else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_U8)) + { + if (stride == 1) + { + if (dilation == 1) + { + return((XAI_ERR_TYPE *) &xaiConvolved3D_S_5x5j1d1_U8S8IX_MOW_WHD); + } + else if (dilation == 2) + { + return((XAI_ERR_TYPE *) &xaiConvolved3D_S_5x5j1d2_U8S8IX_MOW_WHD); + } + else if (dilation == 4) + { + return((XAI_ERR_TYPE *) &xaiConvolved3D_S_5x5j1d4_U8S8IX_MOW_WHD); + } + } + else if (stride == 2) + { + if (dilation == 1) + { + return((XAI_ERR_TYPE *) &xaiConvolved3D_S_5x5j2d1_U8S8IX_MOW_WHD); + } + } + else if (stride == 4) 
+ { + if (dilation == 1) + { + return((XAI_ERR_TYPE *) &xaiConvolved3D_S_5x5j4d1_U8S8IX_MOW_WHD); + } + } + } + else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S16)) + { + if (stride == 1) + { + if (dilation == 1) + { + return((XAI_ERR_TYPE *) &xaiConvolved3D_S_MxNj1d1_S16S16I16_MOW_WHD); + } + } + else if (stride == 2) + { + if (dilation == 1) + { + return((XAI_ERR_TYPE *) &xaiConvolved3D_S_MxNj2d1_S16S16I16_MOW_WHD); + } + } + else if (stride == 4) + { + if (dilation == 1) + { + return((XAI_ERR_TYPE *) &xaiConvolved3D_S_MxNj4d1_S16S16I16_MOW_WHD); + } + } + } + } + else if (kWidth == 7 && kHeight == 7) + { + if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S8)) + { + if (stride == 1) + { + if (dilation == 1) + { + return((XAI_ERR_TYPE *) &xaiConvolved3D_S_7x7j1d1_S8S8IX_MOW_WHD); + } + else if (dilation == 2) + { + return((XAI_ERR_TYPE *) &xaiConvolved3D_S_7x7j1d2_S8S8IX_MOW_WHD); + } + else if (dilation == 4) + { + return((XAI_ERR_TYPE *) &xaiConvolved3D_S_7x7j1d4_S8S8IX_MOW_WHD); + } + } + else if (stride == 2) + { + if (dilation == 1) + { + return((XAI_ERR_TYPE *) &xaiConvolved3D_S_7x7j2d1_S8S8IX_MOW_WHD); + } + } + else if (stride == 4) + { + if (dilation == 1) + { + return((XAI_ERR_TYPE *) &xaiConvolved3D_S_7x7j4d1_S8S8IX_MOW_WHD); + } + } + } + else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_U8)) + { + if (stride == 1) + { + if (dilation == 1) + { + return((XAI_ERR_TYPE *) &xaiConvolved3D_S_7x7j1d1_U8S8IX_MOW_WHD); + } + else if (dilation == 2) + { + return((XAI_ERR_TYPE *) &xaiConvolved3D_S_7x7j1d2_U8S8IX_MOW_WHD); + } + else if (dilation == 4) + { + return((XAI_ERR_TYPE *) &xaiConvolved3D_S_7x7j1d4_U8S8IX_MOW_WHD); + } + } + else if (stride == 2) + { + if (dilation == 1) + { + return((XAI_ERR_TYPE *) &xaiConvolved3D_S_7x7j2d1_U8S8IX_MOW_WHD); + } + } + else if (stride == 4) + { + if (dilation == 1) + { + return((XAI_ERR_TYPE *) &xaiConvolved3D_S_7x7j4d1_U8S8IX_MOW_WHD); + } + } + } + else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S16)) + { + if (stride == 1) + { + if (dilation == 
1) + { + return((XAI_ERR_TYPE *) &xaiConvolved3D_S_MxNj1d1_S16S16I16_MOW_WHD); + } + } + else if (stride == 2) + { + if (dilation == 1) + { + return((XAI_ERR_TYPE *) &xaiConvolved3D_S_MxNj2d1_S16S16I16_MOW_WHD); + } + } + else if (stride == 4) + { + if (dilation == 1) + { + return((XAI_ERR_TYPE *) &xaiConvolved3D_S_MxNj4d1_S16S16I16_MOW_WHD); + } + } + } + } + else + { + if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S8)) + { + if (stride == 1) + { + if (dilation == 1) + { + return((XAI_ERR_TYPE *) &xaiConvolved3D_S_MxNj1d1_S8S8IX_MOW_WHD); + } + else if (dilation == 2) + { + return((XAI_ERR_TYPE *) &xaiConvolved3D_S_MxNj1d2_S8S8IX_MOW_WHD); + } + else if (dilation == 4) + { + return((XAI_ERR_TYPE *) &xaiConvolved3D_S_MxNj1d4_S8S8IX_MOW_WHD); + } + } + else if (stride == 2) + { + if (dilation == 1) + { + return((XAI_ERR_TYPE *) &xaiConvolved3D_S_MxNj2d1_S8S8IX_MOW_WHD); + } + } + else if (stride == 4) + { + if (dilation == 1) + { + return((XAI_ERR_TYPE *) &xaiConvolved3D_S_MxNj4d1_S8S8IX_MOW_WHD); + } + } + } + else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_U8)) + { + if (stride == 1) + { + if (dilation == 1) + { + return((XAI_ERR_TYPE *) &xaiConvolved3D_S_MxNj1d1_U8S8IX_MOW_WHD); + } + else if (dilation == 2) + { + return((XAI_ERR_TYPE *) &xaiConvolved3D_S_MxNj1d2_U8S8IX_MOW_WHD); + } + else if (dilation == 4) + { + return((XAI_ERR_TYPE *) &xaiConvolved3D_S_MxNj1d4_U8S8IX_MOW_WHD); + } + } + else if (stride == 2) + { + if (dilation == 1) + { + return((XAI_ERR_TYPE *) &xaiConvolved3D_S_MxNj2d1_U8S8IX_MOW_WHD); + } + } + else if (stride == 4) + { + if (dilation == 1) + { + return((XAI_ERR_TYPE *) &xaiConvolved3D_S_MxNj4d1_U8S8IX_MOW_WHD); + } + } + } + else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S16)) + { + if (stride == 1) + { + if (dilation == 1) + { + return((XAI_ERR_TYPE *) &xaiConvolved3D_S_MxNj1d1_S16S16I16_MOW_WHD); + } + } + else if (stride == 2) + { + if (dilation == 1) + { + return((XAI_ERR_TYPE *) &xaiConvolved3D_S_MxNj2d1_S16S16I16_MOW_WHD); + } + } + else if (stride 
== 4) + { + if (dilation == 1) + { + return((XAI_ERR_TYPE *) &xaiConvolved3D_S_MxNj4d1_S16S16I16_MOW_WHD); + } + } + } +#if 0 /* F16 disabled - no implementation available */ + else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_F16)) + { + if (stride == 1) + { + if (dilation == 1) + { + return((XAI_ERR_TYPE *) &xaiConvolved3D_S_MxNj1d1_F16_MOW_WHD); + } + } + } +#endif + } + } + else if (coeffOrder == XAI_DWHN) + { + /* SO variants */ + if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S8)) + { + return((XAI_ERR_TYPE *) &xaiConvolved3D_S_MxN_S8S8IX_SO_DWH); + } + else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_U8)) + { + return((XAI_ERR_TYPE *) &xaiConvolved3D_S_MxN_U8S8IX_SO_DWH); + } + } + + return(NULL); +} + +/****************************************************************************** + * Depthwise convolution general version + * Calls a specific depthwise convolution function based on parameters + *****************************************************************************/ +XAI_ERR_TYPE xaiDepthwiseConvolve2D(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param) +{ + /* The arguments inTile, coeffTile and param are used by xaiGetDepthwiseConvolve2DVariant + * helper function, to derive the appropriate convolution variant. outTile is not checked here: the helper returns NULL for a NULL outTile, which is reported below as XAI_ERR_NO_VARIANT rather than XAI_ERR_NULLARG */ + if ((!inTile) || (!coeffTile) || (!param)) + { + return(XAI_ERR_NULLARG); + } + + /* Function pointer type matching the signature of this dispatcher, used to call the selected variant */ + typedef XAI_ERR_TYPE (*fDepthwiseConvPtr)(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params* param); + + /* Getting the function pointer of the convolution variant using xaiGetDepthwiseConvolve2DVariant function */ + fDepthwiseConvPtr xaiDepthwiseConvolve2D_opt = (fDepthwiseConvPtr) xaiGetDepthwiseConvolve2DVariant(inTile, + coeffTile, + biasArray, + outTile, + param); + + if (xaiDepthwiseConvolve2D_opt == NULL) + { + return(XAI_ERR_NO_VARIANT); + } + else + { +
return(xaiDepthwiseConvolve2D_opt(inTile, coeffTile, biasArray, outTile, param)); + } +} + +/************************************************************************************** +* 2D depthwise convolution helper function +* Returns the function pointer of a specific depthwise convolution variant based +* on parameters; returns NULL for a NULL inTile/coeffTile/outTile/param, for an inTile/outTile data-order mismatch, or when no variant matches +**************************************************************************************/ +XAI_ERR_TYPE *xaiGetDepthwiseConvolve2DVariant(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param) +{ + if ((!inTile) || (!coeffTile) || (!outTile) || (!param)) + { + return(NULL); + } + if (!(XAI_TILE3D_GET_DATA_ORDER(inTile) == XAI_TILE3D_GET_DATA_ORDER(outTile))) + { + return(NULL); + } + + xai_cnn_data_order inOrder = XAI_TILE3D_GET_DATA_ORDER(inTile); + xai_cnn_data_order coeffOrder = XAI_TILE3D_GET_DATA_ORDER(coeffTile); + int32_t kWidth, kHeight; + + if (coeffOrder == XAI_DWH) + { + /* MOD variants */ + kWidth = XAI_TILE3D_GET_DIM2(coeffTile); + kHeight = XAI_TILE3D_GET_DIM3(coeffTile); + if (inOrder == XAI_DWH) + { + if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S8) || XAI_TILE3D_CHECK_TYPE(inTile, XAI_U8)) + { + if (XAI_CNN_CONV_GET_STRIDEX(param) != XAI_CNN_CONV_GET_STRIDEY(param)) + { + if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S8)) + { + return((XAI_ERR_TYPE *) &xaiDepthwiseConvolve2D_S_MxN_S8S8IXCa2_MOD_DWH); + } + else + { + return((XAI_ERR_TYPE *) &xaiDepthwiseConvolve2D_S_MxN_U8S8IXCa2_MOD_DWH); + } + } + else if (XAI_CNN_CONV_GET_STRIDE(param) != 1 && XAI_CNN_CONV_GET_STRIDE(param) != 2 \ + && XAI_CNN_CONV_GET_STRIDE(param) != 4) + { + if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S8)) + { + return((XAI_ERR_TYPE *) &xaiDepthwiseConvolve2D_S_MxN_S8S8IXCa2_MOD_DWH); + } + else + { + return((XAI_ERR_TYPE *) &xaiDepthwiseConvolve2D_S_MxN_U8S8IXCa2_MOD_DWH); + } + } + else if (kWidth == 3 && kHeight == 3) + { + if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S8)) + { +
return((XAI_ERR_TYPE *) &xaiDepthwiseConvolve2D_S_3x3_S8S8IXCa2_MOD_DWH); + } + else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_U8)) + { + return((XAI_ERR_TYPE *) &xaiDepthwiseConvolve2D_S_3x3_U8S8IXCa2_MOD_DWH); + } + } + else if (kWidth == 5 && kHeight == 5) + { + if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S8)) + { + return((XAI_ERR_TYPE *) &xaiDepthwiseConvolve2D_S_5x5_S8S8IXCa2_MOD_DWH); + } + else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_U8)) + { + return((XAI_ERR_TYPE *) &xaiDepthwiseConvolve2D_S_5x5_U8S8IXCa2_MOD_DWH); + } + } + else if (kWidth == 7 && kHeight == 7 && XAI_TILE3D_CHECK_TYPE(inTile, XAI_S8)) + { + return((XAI_ERR_TYPE *) &xaiDepthwiseConvolve2D_S_7x7_S8S8IXCa2_MOD_DWH); + } + else + { + if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S8)) + { + return((XAI_ERR_TYPE *) &xaiDepthwiseConvolve2D_S_MxN_S8S8IXCa2_MOD_DWH); + } + else + { + return((XAI_ERR_TYPE *) &xaiDepthwiseConvolve2D_S_MxN_U8S8IXCa2_MOD_DWH); + } + } + } /* if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S8) || XAI_TILE3D_CHECK_TYPE(inTile, XAI_U8)) */ + else if (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) || XAI_TILE3D_CHECK_TYPE(outTile, XAI_U16)) + { + return((XAI_ERR_TYPE *) &xaiDepthwiseConvolve2D_S_MxN_S16S16I16_MOD_DWH); + } /* if (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) || XAI_TILE3D_CHECK_TYPE(outTile, XAI_U16))*/ + } /* if(inOrder == XAI_DWH) */ + } /* if(coeffOrder == XAI_DWH) */ + else if (coeffOrder == XAI_WHD) + { + /* MOW variants */ + uint8_t stride = XAI_CNN_CONV_GET_STRIDE(param); + kWidth = XAI_TILE3D_GET_DIM1(coeffTile); + kHeight = XAI_TILE3D_GET_DIM2(coeffTile); + if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S8) || XAI_TILE3D_CHECK_TYPE(inTile, XAI_U8)) + { + if (kWidth == 3 && kHeight == 3) + { + if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S8)) + { + if (stride == 1) + { + return((XAI_ERR_TYPE *) &xaiDepthwiseConvolve2D_S_3x3j1_S8S8IX_MOW_WHD); + } + else if (stride == 2) + { + return((XAI_ERR_TYPE *) &xaiDepthwiseConvolve2D_S_3x3j2_S8S8IX_MOW_WHD); + } + else if (stride == 4) + { + return((XAI_ERR_TYPE *) 
&xaiDepthwiseConvolve2D_S_3x3j4_S8S8IX_MOW_WHD); + } + } + else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_U8)) + { + if (stride == 1) + { + return((XAI_ERR_TYPE *) &xaiDepthwiseConvolve2D_S_3x3j1_U8S8IX_MOW_WHD); + } + else if (stride == 2) + { + return((XAI_ERR_TYPE *) &xaiDepthwiseConvolve2D_S_3x3j2_U8S8IX_MOW_WHD); + } + else if (stride == 4) + { + return((XAI_ERR_TYPE *) &xaiDepthwiseConvolve2D_S_3x3j4_U8S8IX_MOW_WHD); + } + } + } + else if (kWidth == 5 && kHeight == 5) + { + if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S8)) + { + if (stride == 1) + { + return((XAI_ERR_TYPE *) &xaiDepthwiseConvolve2D_S_5x5j1_S8S8IX_MOW_WHD); + } + else if (stride == 2) + { + return((XAI_ERR_TYPE *) &xaiDepthwiseConvolve2D_S_5x5j2_S8S8IX_MOW_WHD); + } + else if (stride == 4) + { + return((XAI_ERR_TYPE *) &xaiDepthwiseConvolve2D_S_5x5j4_S8S8IX_MOW_WHD); + } + } + else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_U8)) + { + if (stride == 1) + { + return((XAI_ERR_TYPE *) &xaiDepthwiseConvolve2D_S_5x5j1_U8S8IX_MOW_WHD); + } + else if (stride == 2) + { + return((XAI_ERR_TYPE *) &xaiDepthwiseConvolve2D_S_5x5j2_U8S8IX_MOW_WHD); + } + else if (stride == 4) + { + return((XAI_ERR_TYPE *) &xaiDepthwiseConvolve2D_S_5x5j4_U8S8IX_MOW_WHD); + } + } + } + else if (kWidth == 7 && kHeight == 7) + { + if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S8)) + { + if (stride == 1) + { + return((XAI_ERR_TYPE *) &xaiDepthwiseConvolve2D_S_7x7j1_S8S8IX_MOW_WHD); + } + else if (stride == 2) + { + return((XAI_ERR_TYPE *) &xaiDepthwiseConvolve2D_S_7x7j2_S8S8IX_MOW_WHD); + } + else if (stride == 4) + { + return((XAI_ERR_TYPE *) &xaiDepthwiseConvolve2D_S_7x7j4_S8S8IX_MOW_WHD); + } + } + else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_U8)) + { + if (stride == 1) + { + return((XAI_ERR_TYPE *) &xaiDepthwiseConvolve2D_S_7x7j1_U8S8IX_MOW_WHD); + } + else if (stride == 2) + { + return((XAI_ERR_TYPE *) &xaiDepthwiseConvolve2D_S_7x7j2_U8S8IX_MOW_WHD); + } + else if (stride == 4) + { + return((XAI_ERR_TYPE *) 
&xaiDepthwiseConvolve2D_S_7x7j4_U8S8IX_MOW_WHD); + } + } + } + else + { + /* MOW Variants */ + if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S8)) + { + if (stride == 1) + { + return((XAI_ERR_TYPE *) &xaiDepthwiseConvolve2D_S_MxNj1_S8S8IX_MOW_WHD); + } + else if (stride == 2) + { + return((XAI_ERR_TYPE *) &xaiDepthwiseConvolve2D_S_MxNj2_S8S8IX_MOW_WHD); + } + else if (stride == 4) + { + return((XAI_ERR_TYPE *) &xaiDepthwiseConvolve2D_S_MxNj4_S8S8IX_MOW_WHD); + } + } + else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_U8)) + { + if (stride == 1) + { + return((XAI_ERR_TYPE *) &xaiDepthwiseConvolve2D_S_MxNj1_U8S8IX_MOW_WHD); + } + else if (stride == 2) + { + return((XAI_ERR_TYPE *) &xaiDepthwiseConvolve2D_S_MxNj2_U8S8IX_MOW_WHD); + } + else if (stride == 4) + { + return((XAI_ERR_TYPE *) &xaiDepthwiseConvolve2D_S_MxNj4_U8S8IX_MOW_WHD); + } + } + } +/* #if XCHAL_VISION_QUAD_MAC_TYPE != 0 */ + } /* if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S8) || XAI_TILE3D_CHECK_TYPE(inTile, XAI_U8)) */ + else if (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) || XAI_TILE3D_CHECK_TYPE(outTile, XAI_U16)) + { + if (stride == 1) + { + return((XAI_ERR_TYPE *) &xaiDepthwiseConvolve2D_S_MxNj1_S16S16I16_MOW_WHD); + } + else if (stride == 2) + { + return((XAI_ERR_TYPE *) &xaiDepthwiseConvolve2D_S_MxNj2_S16S16I16_MOW_WHD); + } + else if (stride == 4) + { + return((XAI_ERR_TYPE *) &xaiDepthwiseConvolve2D_S_MxNj4_S16S16I16_MOW_WHD); + } + } /* if (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) || XAI_TILE3D_CHECK_TYPE(outTile, XAI_U16)) */ + } /* if(coeffOrder == XAI_WHD) */ + return(NULL); +} + +/****************************************************************************** + * Depthwise dilated convolution general version + * Calls a specific depthwise dilated convolution function based on parameters + *****************************************************************************/ +XAI_ERR_TYPE xaiDepthwiseConvolved2D(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, +
const xai_cnn_depthwiseDilatedConv_params *param) +{ + /* The arguments inTile, coeffTile and param are used by xaiGetDepthwiseConvolved2DVariant + * helper function, to derive the appropriate convolution variant */ + if ((!inTile) || (!coeffTile) || (!param)) + { + return(XAI_ERR_NULLARG); + } + + /* Function pointer type matching the signature of this dispatcher, used to call the selected variant */ + typedef XAI_ERR_TYPE (*fDepthwiseConvdPtr)(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_depthwiseDilatedConv_params* param); + + /* Getting the function pointer of the convolution variant using xaiGetDepthwiseConvolved2DVariant function */ + fDepthwiseConvdPtr xaiDepthwiseConvolved2D_opt = (fDepthwiseConvdPtr) xaiGetDepthwiseConvolved2DVariant(inTile, coeffTile, biasArray, outTile, param); + + if (xaiDepthwiseConvolved2D_opt == NULL) + { + return(XAI_ERR_NO_VARIANT); + } + else + { + return(xaiDepthwiseConvolved2D_opt(inTile, coeffTile, biasArray, outTile, param)); + } +} + +/************************************************************************************** +* 2D depthwise dilated convolution helper function +* Returns the function pointer of a specific depthwise dilated convolution variant +* based on parameters; returns NULL for a NULL inTile/coeffTile/param or when no variant matches +**************************************************************************************/ +XAI_ERR_TYPE *xaiGetDepthwiseConvolved2DVariant(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_depthwiseDilatedConv_params *param) +{ + if ((!inTile) || (!coeffTile) || (!param)) + { + return(NULL); + } + + xai_cnn_data_order inOrder = XAI_TILE3D_GET_DATA_ORDER(inTile); + xai_cnn_data_order coeffOrder = XAI_TILE3D_GET_DATA_ORDER(coeffTile); +#if (XCHAL_HAVE_SUPERGATHER == 0) + int32_t depthMultiplier = XAI_CNN_DEPTHWISE_DILATED_CONV_GET_DEPTH_MULTIPLIER(param); +#endif + uint8_t stride; + uint8_t dilation; + + int32_t kWidth, kHeight; + if (coeffOrder == XAI_DWH) + { + kWidth =
XAI_TILE3D_GET_DIM2(coeffTile); + kHeight = XAI_TILE3D_GET_DIM3(coeffTile); + /* MOD variants */ + if (inOrder == XAI_DWH) + { + if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_U8)) + { + return((XAI_ERR_TYPE *) &xaiDepthwiseConvolved2D_S_MxN_U8S8IX_MOD_DWH); + } + else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S8)) + { +#if (XCHAL_HAVE_SUPERGATHER == 0) + if (kWidth == 3 && kHeight == 3 && depthMultiplier != 8) + { + return((XAI_ERR_TYPE *) &xaiDepthwiseConvolved2D_S_3x3_S8S8IX_MOD_DWH); + } + else if (kWidth == 5 && kHeight == 5 && depthMultiplier != 8) + { + return((XAI_ERR_TYPE *) &xaiDepthwiseConvolved2D_S_5x5_S8S8IX_MOD_DWH); + } + else if (kWidth == 7 && kHeight == 7 && depthMultiplier != 8) + { + return((XAI_ERR_TYPE *) &xaiDepthwiseConvolved2D_S_7x7_S8S8IX_MOD_DWH); + } +#else + if (kWidth == 3 && kHeight == 3) + { + return((XAI_ERR_TYPE *) &xaiDepthwiseConvolved2D_S_3x3_S8S8IX_MOD_DWH); + } + else if (kWidth == 5 && kHeight == 5) + { + return((XAI_ERR_TYPE *) &xaiDepthwiseConvolved2D_S_5x5_S8S8IX_MOD_DWH); + } + else if (kWidth == 7 && kHeight == 7) + { + return((XAI_ERR_TYPE *) &xaiDepthwiseConvolved2D_S_7x7_S8S8IX_MOD_DWH); + } +#endif + else + { + return((XAI_ERR_TYPE *) &xaiDepthwiseConvolved2D_S_MxN_S8S8IX_MOD_DWH); + } + } + else /* (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S16)) */ + { + if (kWidth == 3 && kHeight == 3) + { + return((XAI_ERR_TYPE *) &xaiDepthwiseConvolved2D_S_3x3_S16S16I16_MOD_DWH); + } + else if (kWidth == 5 && kHeight == 5) + { + return((XAI_ERR_TYPE *) &xaiDepthwiseConvolved2D_S_5x5_S16S16I16_MOD_DWH); + } + else + { + return((XAI_ERR_TYPE *) &xaiDepthwiseConvolved2D_S_MxN_S16S16I16_MOD_DWH); + } + } + } + } + /*else*/ if (coeffOrder == XAI_WHD) + { + /* MOW variants */ + + stride = XAI_CNN_DEPTHWISE_DILATED_CONV_GET_STRIDE(param); + dilation = XAI_CNN_DEPTHWISE_DILATED_CONV_GET_DILATION(param); +//#endif + /*if(kWidth == 3 && kHeight == 3) + { + if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S8)) + { + if(stride == 1) + { + if(dilation == 2) + { + return 
((XAI_ERR_TYPE *)&xaiDepthwiseConvolved2D_S_3x3j1d2_S8S8IX_MOW_WHD); + } + else if(dilation == 4) + { + return ((XAI_ERR_TYPE *)&xaiDepthwiseConvolved2D_S_3x3j1d4_S8S8IX_MOW_WHD); + } + } + } + else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_U8)) + { + if(stride == 1) + { + if(dilation == 2) + { + return ((XAI_ERR_TYPE *)&xaiDepthwiseConvolved2D_S_3x3j1d2_U8S8IX_MOW_WHD); + } + else if(dilation == 4) + { + return ((XAI_ERR_TYPE *)&xaiDepthwiseConvolved2D_S_3x3j1d4_U8S8IX_MOW_WHD); + } + } + } + }*/ + /*else if(kWidth == 5 && kHeight == 5) + { + if (xaiTile3DCheckType(inTile, XAI_S8)) + { + if(stride == 1) + { + if(dilation == 2) + { + return ((XAI_ERR_TYPE *)&xaiDepthwiseConvolved2D_S_5x5j1d2_S8S8IX_MOW_WHD); + } + else if(dilation == 4) + { + return ((XAI_ERR_TYPE *)&xaiDepthwiseConvolved2D_S_5x5j1d4_S8S8IX_MOW_WHD); + } + } + } + else if (xaiTile3DCheckType(inTile, XAI_U8)) + { + if(stride == 1) + { + if(dilation == 2) + { + return ((XAI_ERR_TYPE *)&xaiDepthwiseConvolved2D_S_5x5j1d2_U8S8IX_MOW_WHD); + } + else if(dilation == 4) + { + return ((XAI_ERR_TYPE *)&xaiDepthwiseConvolved2D_S_5x5j1d4_U8S8IX_MOW_WHD); + } + } + } + } + else if(kWidth == 7 && kHeight == 7) + { + if (xaiTile3DCheckType(inTile, XAI_S8)) + { + if(stride == 1) + { + if(dilation == 2) + { + return ((XAI_ERR_TYPE *)&xaiDepthwiseConvolved2D_S_7x7j1d2_S8S8IX_MOW_WHD); + } + else if(dilation == 4) + { + return ((XAI_ERR_TYPE *)&xaiDepthwiseConvolved2D_S_7x7j1d4_S8S8IX_MOW_WHD); + } + } + } + else if (xaiTile3DCheckType(inTile, XAI_U8)) + { + if(stride == 1) + { + if(dilation == 2) + { + return ((XAI_ERR_TYPE *)&xaiDepthwiseConvolved2D_S_7x7j1d2_U8S8IX_MOW_WHD); + } + else if(dilation == 4) + { + return ((XAI_ERR_TYPE *)&xaiDepthwiseConvolved2D_S_7x7j1d4_U8S8IX_MOW_WHD); + } + } + } + }*/ + /* else */ + { + if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S8)) + { + if (stride == 1) + { + if (dilation == 2) + { + return((XAI_ERR_TYPE *) &xaiDepthwiseConvolved2D_S_MxNj1d2_S8S8IX_MOW_WHD); + } + else if (dilation == 
4) + { + return((XAI_ERR_TYPE *) &xaiDepthwiseConvolved2D_S_MxNj1d4_S8S8IX_MOW_WHD); + } + } + } + else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_U8)) + { + if (stride == 1) + { + if (dilation == 2) + { + return((XAI_ERR_TYPE *) &xaiDepthwiseConvolved2D_S_MxNj1d2_U8S8IX_MOW_WHD); + } + else if (dilation == 4) + { + return((XAI_ERR_TYPE *) &xaiDepthwiseConvolved2D_S_MxNj1d4_U8S8IX_MOW_WHD); + } + } + } +//#endif + } + } + return(NULL); +} +#endif //if ((XCHAL_VISION_TYPE >= 6)) diff --git a/backends/cadence/vision/third-party/libxai/cnn/src/cnn_conv_MOD.c b/backends/cadence/vision/third-party/libxai/cnn/src/cnn_conv_MOD.c new file mode 100644 index 00000000000..6a704532585 --- /dev/null +++ b/backends/cadence/vision/third-party/libxai/cnn/src/cnn_conv_MOD.c @@ -0,0 +1,510 @@ +/* + * Copyright (c) 2018 by Cadence Design Systems, Inc. ALL RIGHTS RESERVED. + * These coded instructions, statements, and computer programs are the + * copyrighted works and confidential proprietary information of + * Cadence Design Systems Inc. They may be adapted and modified by bona fide + * purchasers for internal use, but neither the original nor any adapted + * or modified version may be disclosed or distributed to third parties + * in any manner, medium, or form, in whole or in part, without the prior + * written consent of Cadence Design Systems Inc. This software and its + * derivatives are to be executed solely on products incorporating a Cadence + * Design Systems processor. 
+ */ +#include "xai_cnn.h" +#include "xai_intrin.h" +#include "limits.h" + +#if ((XCHAL_VISION_TYPE >= 6)) + +/****************************************************************************************** +* MOD WHD variants +******************************************************************************************/ + + +/***************************************************************************** +* xaiConvolve3D_S_1x1_S8S8IXCa2_MOD_WHD_DWH +* **************************************************************************/ + +/****************************************************************************/ +/* Description : P6 optimized implementation of 3D convolution */ +/* Inputs : Input Data Tile, Coeff Data Tile, Bias Array, */ +/* CNN convolution params structure */ +/* Outputs : XI Error Code */ +/* InOuts : Output Tile; param (dilation X/Y forced to 1) */ +/* Assumptions : InData, CoeffData are S8 */ +/* biasArray is signed 32b, value not exceeding signed 24b */ +/* OutData is S8 / U8 / S16 */ +/* Kernel Size is 1x1xDxN */ +/* Input is in WHD and Output is in DWH format */ +/* Coeff is in NDWH format */ +/* CoeffDim1Pitch is aligned to 2N (Ca2) */ +/****************************************************************************/ + +XAI_ERR_TYPE xaiConvolve3D_S_1x1_S8S8IXCa2_MOD_WHD_DWH( + const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + xai_cnn_conv_params *param + ) +{ + /* Error Checks */ + XAI_ERROR_CHECKS() + { + XAI_CHECK_POINTER(param); + } + XAI_CNN_CONV_SET_DILATION_XY(param, 1, 1); + + return(xaiConvolved3D_S_1x1_S8S8IXCa2_MOD_WHD_DWH(inTile, coeffTile, biasArray, outTile, param)); + + return(XAI_ERROR_STATUS()); +} + +/***************************************************************************** +* xaiConvolve3D_S_3x3_S8S8IXCa2_MOD_WHD_DWH +* **************************************************************************/ + +/****************************************************************************/ +/* Description : P6 optimized
implementation of 3D convolution */ +/* Inputs : Input Data Tile, Coeff Data Tile, Bias Array, */ +/* CNN convolution params structure */ +/* Outputs : XI Error Code */ +/* InOuts : Output Tile; param (dilation X/Y forced to 1) */ +/* Assumptions : InData, CoeffData are S8 */ +/* biasArray is signed 32b, value not exceeding signed 24b */ +/* OutData is S8 / U8 / S16 */ +/* Kernel Size is 3x3xDxN */ +/* Input is in WHD and Output is in DWH format */ +/* Coeff is in NDWH format */ +/* CoeffDim1Pitch is aligned to 2N (Ca2) */ +/****************************************************************************/ +XAI_ERR_TYPE xaiConvolve3D_S_3x3_S8S8IXCa2_MOD_WHD_DWH( + const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + xai_cnn_conv_params *param + ) +{ + /* Error Checks */ + XAI_ERROR_CHECKS() + { + XAI_CHECK_POINTER(param); + } + XAI_CNN_CONV_SET_DILATION_XY(param, 1, 1); + + return(xaiConvolved3D_S_3x3_S8S8IXCa2_MOD_WHD_DWH(inTile, coeffTile, biasArray, outTile, param)); + + return(XAI_ERROR_STATUS()); +} + +/***************************************************************************** +* xaiConvolve3D_S_5x5_S8S8IXCa2_MOD_WHD_DWH +* **************************************************************************/ + +/****************************************************************************/ +/* Description : P6 optimized implementation of 3D convolution */ +/* Inputs : Input Data Tile, Coeff Data Tile, Bias Array, */ +/* CNN convolution params structure */ +/* Outputs : XI Error Code */ +/* InOuts : Output Tile; param (dilation X/Y forced to 1) */ +/* Assumptions : InData, CoeffData are S8 */ +/* biasArray is signed 32b, value not exceeding signed 24b */ +/* OutData is S8 / U8 / S16 */ +/* Kernel Size is 5x5xDxN */ +/* Input is in WHD and Output is in DWH format */ +/* Coeff is in NDWH format */ +/* CoeffDim1Pitch is aligned to 2N (Ca2) */ +/****************************************************************************/ + +XAI_ERR_TYPE xaiConvolve3D_S_5x5_S8S8IXCa2_MOD_WHD_DWH( +
const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + xai_cnn_conv_params *param + ) +{ + /* Error Checks */ + XAI_ERROR_CHECKS() + { + XAI_CHECK_POINTER(param); + } + XAI_CNN_CONV_SET_DILATION_XY(param, 1, 1); + + return(xaiConvolved3D_S_5x5_S8S8IXCa2_MOD_WHD_DWH(inTile, coeffTile, biasArray, outTile, param)); + + return(XAI_ERROR_STATUS()); +} + +/***************************************************************************** +* xaiConvolve3D_S_7x7_S8S8IXCa2_MOD_WHD_DWH +* **************************************************************************/ + +/****************************************************************************/ +/* Description : P6 optimized implementation of 3D convolution. */ +/* Stride values = 1, 2 and 4 are supported */ +/* Implementation also supports dilation >= 1 for stride = 1 */ +/* and dilation = 1 for stride = 2, 4; this entry point forces dilation = 1 */ +/* Inputs : Input Data Tile, Coeff Data Tile, Bias Array, */ +/* CNN convolution params structure */ +/* Outputs : XI Error Code */ +/* InOuts : Output Tile; param (dilation X/Y forced to 1) */ +/* Assumptions : InData, CoeffData are S8 */ +/* biasArray is signed 32b, value not exceeding signed 24b */ +/* OutData is S8 / U8 / S16 */ +/* Kernel Size is 7x7xDxN */ +/* Input is in WHD and Output is in DWH format */ +/* Coeff is in NDWH format */ +/* CoeffDim1Pitch is aligned to 2N (Ca2) */ +/****************************************************************************/ + +XAI_ERR_TYPE xaiConvolve3D_S_7x7_S8S8IXCa2_MOD_WHD_DWH( + const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + xai_cnn_conv_params *param + ) +{ + /* Error Checks */ + XAI_ERROR_CHECKS() + { + XAI_CHECK_POINTER(param); + } + XAI_CNN_CONV_SET_DILATION_XY(param, 1, 1); + + return(xaiConvolved3D_S_7x7_S8S8IXCa2_MOD_WHD_DWH(inTile, coeffTile, biasArray, outTile, param)); + + return(XAI_ERROR_STATUS()); +} +
+/***************************************************************************** +* xaiConvolve3D_S_MxN_S8S8IXCa2_MOD_WHD_DWH +* **************************************************************************/ + +/****************************************************************************/ +/* Description : P6 optimized implementation of 3D convolution */ +/* Stride values = 1, 2 and 4 are supported */ +/* Implementation also supports dilation >= 1 for stride = 1 */ +/* and dilation = 1 for stride = 2, 4; this entry point forces dilation = 1 */ +/* Inputs : Input Data Tile, Coeff Data Tile, Bias Array, */ +/* CNN convolution params structure */ +/* Outputs : XI Error Code */ +/* InOuts : Output Tile; param (dilation X/Y forced to 1) */ +/* Assumptions : InData, CoeffData are S8 */ +/* biasArray is signed 32b, value not exceeding signed 24b */ +/* OutData is S8 / U8 / S16 */ +/* Kernel Size is MxNxDxNk */ +/* Input is in WHD and Output is in DWH format */ +/* Coeff is in NDWH format */ +/* CoeffDim1Pitch is aligned to 2N (Ca2) */ +/****************************************************************************/ + +XAI_ERR_TYPE xaiConvolve3D_S_MxN_S8S8IXCa2_MOD_WHD_DWH( + const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + xai_cnn_conv_params *param + ) +{ + /* Error Checks */ + XAI_ERROR_CHECKS() + { + XAI_CHECK_POINTER(param); + } + XAI_CNN_CONV_SET_DILATION_XY(param, 1, 1); + + return(xaiConvolved3D_S_MxN_S8S8IXCa2_MOD_WHD_DWH(inTile, coeffTile, biasArray, outTile, param)); + + return(XAI_ERROR_STATUS()); +} + +/****************************************************************************************** +* MOD DWH variants +******************************************************************************************/ + + + +/***************************************************************************** +* xaiConvolve3D_S_1x1_S8S8IXCa2_MOD_DWH +* **************************************************************************/ +
+/****************************************************************************/ +/* Description : P6 optimized implementation of 3D convolution */ +/* Inputs : Input Data Tile, Coeff Data Tile, Bias Array, */ +/* CNN convolution params structure */ +/* Outputs : XI Error Code */ +/* InOuts : Output Tile; param (dilation X/Y forced to 1) */ +/* Assumptions : InData, CoeffData are S8 */ +/* biasArray is signed 32b, value not exceeding signed 24b */ +/* OutData is S8 / U8 / S16 */ +/* Kernel Size is 1x1xDxN */ +/* Input and Output are in DWH format */ +/* Coeff is in NDWH format */ +/* CoeffDim1Pitch is aligned to 2N (Ca2) */ +/****************************************************************************/ + +XAI_ERR_TYPE xaiConvolve3D_S_1x1_S8S8IXCa2_MOD_DWH( + const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + xai_cnn_conv_params *param + ) +{ + /* Error Checks */ + XAI_ERROR_CHECKS() + { + XAI_CHECK_POINTER(param); + } + XAI_CNN_CONV_SET_DILATION_XY(param, 1, 1); + + return(xaiConvolved3D_S_1x1_S8S8IXCa2_MOD_DWH(inTile, coeffTile, biasArray, outTile, param)); + + return(XAI_ERROR_STATUS()); +} + +/***************************************************************************** +* xaiConvolve3D_S_1x1_U8S8IXCa2_MOD_DWH +* **************************************************************************/ + +/****************************************************************************/ +/* Description : P6 optimized implementation of 3D convolution */ +/* Inputs : Input Data Tile, Coeff Data Tile, Bias Array, */ +/* CNN convolution params structure */ +/* Outputs : XI Error Code */ +/* InOuts : Output Tile; param (dilation X/Y forced to 1) */ +/* Assumptions : InData is U8, CoeffData is S8 */ +/* biasArray is signed 32b, value not exceeding signed 24b */ +/* OutData is S8 / U8 / S16 */ +/* Kernel Size is 1x1xDxN */ +/* Input and Output are in DWH format */ +/* Coeff is in NDWH format */ +/* CoeffDim1Pitch is aligned to 2N (Ca2) */
+/****************************************************************************/ + +XAI_ERR_TYPE xaiConvolve3D_S_1x1_U8S8IXCa2_MOD_DWH( + const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + xai_cnn_conv_params *param + ) +{ + /* Error Checks */ + XAI_ERROR_CHECKS() + { + XAI_CHECK_POINTER(param); + } + XAI_CNN_CONV_SET_DILATION_XY(param, 1, 1); + + return(xaiConvolved3D_S_1x1_U8S8IXCa2_MOD_DWH(inTile, coeffTile, biasArray, outTile, param)); + + return(XAI_ERROR_STATUS()); +} + +/***************************************************************************** +* xaiConvolve3D_S_3x3_S8S8IXCa2_MOD_DWH +* **************************************************************************/ + +/****************************************************************************/ +/* Description : P6 optimized implementation of 3D convolution */ +/* Inputs : Input Data Tile, Coeff Data Tile, Bias Array, */ +/* CNN convolution params structure */ +/* Outputs : XI Error Code */ +/* InOuts : Output Tile; param (dilation X/Y forced to 1) */ +/* Assumptions : InData, CoeffData are S8 */ +/* biasArray is signed 32b, value not exceeding signed 24b */ +/* OutData is S8 / U8 / S16 */ +/* Kernel Size is 3x3xDxN */ +/* Input and Output are in DWH format */ +/* Coeff is in NDWH format */ +/* CoeffDim1Pitch is aligned to 2N (Ca2) */ +/****************************************************************************/ +XAI_ERR_TYPE xaiConvolve3D_S_3x3_S8S8IXCa2_MOD_DWH( + const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + xai_cnn_conv_params *param + ) +{ + /* Error Checks */ + XAI_ERROR_CHECKS() + { + XAI_CHECK_POINTER(param); + } + XAI_CNN_CONV_SET_DILATION_XY(param, 1, 1); + + return(xaiConvolved3D_S_3x3_S8S8IXCa2_MOD_DWH(inTile, coeffTile, biasArray, outTile, param)); + + return(XAI_ERROR_STATUS()); +} + +/***************************************************************************** +* xaiConvolve3D_S_3x3_U8S8IXCa2_MOD_DWH
+* **************************************************************************/ + +/****************************************************************************/ +/* Description : P6 optimized implementation of 3D convolution */ +/* Inputs : Input Data Tile, Coeff Data Tile, Bias Array, */ +/* CNN convolution params structure */ +/* Outputs : XI Error Code */ +/* InOuts : Output Tile; param (dilation X/Y forced to 1) */ +/* Assumptions : InData is U8, CoeffData is S8 */ +/* biasArray is signed 32b, value not exceeding signed 24b */ +/* OutData is S8 / U8 / S16 */ +/* Kernel Size is 3x3xDxN */ +/* Input and Output are in DWH format */ +/* Coeff is in NDWH format */ +/* CoeffDim1Pitch is aligned to 2N (Ca2) */ +/****************************************************************************/ +XAI_ERR_TYPE xaiConvolve3D_S_3x3_U8S8IXCa2_MOD_DWH( + const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + xai_cnn_conv_params *param + ) +{ + /* Error Checks */ + XAI_ERROR_CHECKS() + { + XAI_CHECK_POINTER(param); + } + XAI_CNN_CONV_SET_DILATION_XY(param, 1, 1); + + return(xaiConvolved3D_S_3x3_U8S8IXCa2_MOD_DWH(inTile, coeffTile, biasArray, outTile, param)); + + return(XAI_ERROR_STATUS()); +} + +/***************************************************************************** +* xaiConvolve3D_S_5x5_S8S8IXCa2_MOD_DWH +* **************************************************************************/ + +/****************************************************************************/ +/* Description : P6 optimized implementation of 3D convolution */ +/* Inputs : Input Data Tile, Coeff Data Tile, Bias Array, */ +/* CNN convolution params structure */ +/* Outputs : XI Error Code */ +/* InOuts : Output Tile; param (dilation X/Y forced to 1) */ +/* Assumptions : InData, CoeffData are S8 */ +/* biasArray is signed 32b, value not exceeding signed 24b */ +/* OutData is S8 / U8 / S16 */ +/* Kernel Size is 5x5xDxN */ +/* Input and Output are in DWH format */ +/* Coeff is in NDWH format */ +/* CoeffDim1Pitch is
aligned to 2N (Ca2) */ +/****************************************************************************/ + +XAI_ERR_TYPE xaiConvolve3D_S_5x5_S8S8IXCa2_MOD_DWH( + const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + xai_cnn_conv_params *param + ) +{ + /* Error Checks */ + XAI_ERROR_CHECKS() + { + XAI_CHECK_POINTER(param); + } + XAI_CNN_CONV_SET_DILATION_XY(param, 1, 1); + + return(xaiConvolved3D_S_5x5_S8S8IXCa2_MOD_DWH(inTile, coeffTile, biasArray, outTile, param)); + + return(XAI_ERROR_STATUS()); +} + +/***************************************************************************** +* xaiConvolve3D_S_7x7_S8S8IXCa2_MOD_DWH +* **************************************************************************/ + +/****************************************************************************/ +/* Description : P6 optimized implementation of 3D convolution */ +/* Stride values = 1, 2 and 4 are supported. */ +/* Implementation also supports dilation >= 1 for stride = 1 */ +/* and dilation = 1 for stride = 2, 4; this entry point forces dilation = 1 */ +/* Inputs : Input Data Tile, Coeff Data Tile, Bias Array, */ +/* CNN convolution params structure */ +/* Outputs : XI Error Code */ +/* InOuts : Output Tile; param (dilation X/Y forced to 1) */ +/* Assumptions : InData, CoeffData are S8 */ +/* biasArray is signed 32b, value not exceeding signed 24b */ +/* OutData is S8 / U8 / S16 */ +/* Kernel Size is 7x7xDxN */ +/* Input and Output are in DWH format */ +/* Coeff is in NDWH format */ +/* CoeffDim1Pitch is aligned to 2N (Ca2) */ +/****************************************************************************/ + +XAI_ERR_TYPE xaiConvolve3D_S_7x7_S8S8IXCa2_MOD_DWH( + const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + xai_cnn_conv_params *param + ) +{ + /* Error Checks */ + XAI_ERROR_CHECKS() + { + XAI_CHECK_POINTER(param); + } + XAI_CNN_CONV_SET_DILATION_XY(param, 1, 1); + + return(xaiConvolved3D_S_7x7_S8S8IXCa2_MOD_DWH(inTile, coeffTile,
biasArray, outTile, param)); + + return(XAI_ERROR_STATUS()); +} + +/***************************************************************************** +* xaiConvolve3D_S_MxN_S8S8IXCa2_MOD_DWH +* **************************************************************************/ + +/****************************************************************************/ +/* Description : P6 optimized implementation of 3D convolution */ +/* Stride values = 1, 2 and 4 are supported */ +/* Implementation also supports dilation >= 1 for stride = 1 */ +/* and dilation = 1 for stride = 2, 4; this entry point forces dilation = 1 */ +/* Inputs : Input Data Tile, Coeff Data Tile, Bias Array, */ +/* CNN convolution params structure */ +/* Outputs : XI Error Code */ +/* InOuts : Output Tile; param (dilation X/Y forced to 1) */ +/* Assumptions : InData, CoeffData are S8 */ +/* biasArray is signed 32b, value not exceeding signed 24b */ +/* OutData is S8 / U8 / S16 */ +/* Kernel Size is MxNxDxNk. M and N sizes are less than or */ +/* equal to 15. */ +/* Input and Output are in DWH format */ +/* Coeff is in NDWH format */ +/* CoeffDim1Pitch is aligned to 2N (Ca2) */ +/****************************************************************************/ + +XAI_ERR_TYPE xaiConvolve3D_S_MxN_S8S8IXCa2_MOD_DWH( + const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + xai_cnn_conv_params *param + ) +{ + /* Error Checks */ + XAI_ERROR_CHECKS() + { + XAI_CHECK_POINTER(param); + } + XAI_CNN_CONV_SET_DILATION_XY(param, 1, 1); + + return(xaiConvolved3D_S_MxN_S8S8IXCa2_MOD_DWH(inTile, coeffTile, biasArray, outTile, param)); + + return(XAI_ERROR_STATUS()); +} + +/******************************* end of MOD variants ***************************************/ +/*******************************************************************************************/ +#endif //if ((XCHAL_VISION_TYPE >= 6)) diff --git a/backends/cadence/vision/third-party/libxai/cnn/src/cnn_conv_MOW.c b/backends/cadence/vision/third-party/libxai/cnn/src/cnn_conv_MOW.c new file
mode 100644 index 00000000000..1e9a385aa63 --- /dev/null +++ b/backends/cadence/vision/third-party/libxai/cnn/src/cnn_conv_MOW.c @@ -0,0 +1,25 @@ +/* + * Copyright (c) 2018 by Cadence Design Systems, Inc. ALL RIGHTS RESERVED. + * These coded instructions, statements, and computer programs are the + * copyrighted works and confidential proprietary information of + * Cadence Design Systems Inc. They may be adapted and modified by bona fide + * purchasers for internal use, but neither the original nor any adapted + * or modified version may be disclosed or distributed to third parties + * in any manner, medium, or form, in whole or in part, without the prior + * written consent of Cadence Design Systems Inc. This software and its + * derivatives are to be executed solely on products incorporating a Cadence + * Design Systems processor. + */ + +#include "xai_cnn.h" +#include "xai_intrin.h" + +#if ((XCHAL_VISION_TYPE >= 6)) + +#define INPUT_DATA_TYPE UNSIGNED8BIT +#include "cnn_conv_MOW.h" +#undef INPUT_DATA_TYPE + +#define INPUT_DATA_TYPE SIGNED8BIT +#include "cnn_conv_MOW.h" +#endif //if ((XCHAL_VISION_TYPE >= 6)) diff --git a/backends/cadence/vision/third-party/libxai/cnn/src/cnn_conv_MOW.h b/backends/cadence/vision/third-party/libxai/cnn/src/cnn_conv_MOW.h new file mode 100644 index 00000000000..eec1cedea17 --- /dev/null +++ b/backends/cadence/vision/third-party/libxai/cnn/src/cnn_conv_MOW.h @@ -0,0 +1,738 @@ +/* + * Copyright (c) 2018 by Cadence Design Systems, Inc. ALL RIGHTS RESERVED. + * These coded instructions, statements, and computer programs are the + * copyrighted works and confidential proprietary information of + * Cadence Design Systems Inc. 
They may be adapted and modified by bona fide + * purchasers for internal use, but neither the original nor any adapted + * or modified version may be disclosed or distributed to third parties + * in any manner, medium, or form, in whole or in part, without the prior + * written consent of Cadence Design Systems Inc. This software and its + * derivatives are to be executed solely on products incorporating a Cadence + * Design Systems processor. + */ +#if ((XCHAL_VISION_TYPE >= 6)) + +#define MAKE_NAME_IMPL(name, MORPH_FNAME_SPECIFIER_IDT, suffix) name ## _ ## MORPH_FNAME_SPECIFIER_IDT ## suffix + +#if INPUT_DATA_TYPE == UNSIGNED8BIT + +#define MAKE_NAME(name, suffix) MAKE_NAME_IMPL(name, U8, suffix) +#define MORPH_IDT_CHECK XAI_CHECK_TILE3D_U8 +#define MORPH_IDT_SCALAR uint8_t +#define MORPH_IDT_2Nx8 xb_vec2Nx8U +#define MORPH_OP_PRIME_2Nx8 IVP_LA2NX8U_PP +#define MORPH_OP_ALIGN_LOAD_2Nx8 IVP_LV2NX8U_XP +#define MORPH_OP_LOAD_2Nx8 IVP_LA2NX8U_XP +#define MORPH_OP_LOAD_2Nx8_IP IVP_LA2NX8U_IP +#define MORPH_OP_LOAD_2Nx8_VARIABLE IVP_LAV2NX8U_XP +#define MORPH_OP_MULA IVP_MULUSA2N8XR16 +#define MORPH_OP_MUL4TA IVP_MULUS4TA2N8XR8 +#define MORPH_OP_MULQA IVP_MULUSQA2N8XR8 +#define MORPH_OP_MULPA IVP_MULUSPA2N8XR16 + +#elif INPUT_DATA_TYPE == SIGNED8BIT + +#undef MAKE_NAME +#undef MORPH_IDT_CHECK +#undef MORPH_IDT_SCALAR +#undef MORPH_IDT_2Nx8 +#undef MORPH_OP_PRIME_2Nx8 +#undef MORPH_OP_ALIGN_LOAD_2Nx8 +#undef MORPH_OP_LOAD_2Nx8_IP +#undef MORPH_OP_LOAD_2Nx8_VARIABLE +#undef MORPH_OP_LOAD_2Nx8 +#undef MORPH_OP_MULA +#undef MORPH_OP_MUL4TA +#undef MORPH_OP_MULQA +#undef MORPH_OP_MULPA + +#define MAKE_NAME(name, suffix) MAKE_NAME_IMPL(name, S8, suffix) +#define MORPH_IDT_CHECK XAI_CHECK_TILE3D_S8 +#define MORPH_IDT_SCALAR int8_t +#define MORPH_IDT_2Nx8 xb_vec2Nx8 +#define MORPH_OP_PRIME_2Nx8 IVP_LA2NX8_PP +#define MORPH_OP_ALIGN_LOAD_2Nx8 IVP_LV2NX8_XP +#define MORPH_OP_LOAD_2Nx8 IVP_LA2NX8_XP +#define MORPH_OP_LOAD_2Nx8_IP IVP_LA2NX8_IP +#define 
MORPH_OP_LOAD_2Nx8_VARIABLE IVP_LAV2NX8_XP +#define MORPH_OP_MULA IVP_MULA2N8XR16 +#define MORPH_OP_MUL4TA IVP_MUL4TA2N8XR8 +#define MORPH_OP_MULQA IVP_MULQA2N8XR8 +#define MORPH_OP_MULPA IVP_MULPA2N8XR16 +#endif + +/****************************************************************************************** +* MOW Stride 1 variants +******************************************************************************************/ + +/****************************************************************************************** +* MAKE_NAME(xaiConvolve3D_S_1x1j1, S8IX_MOW_WHD) +* ***************************************************************************************/ + +/******************************************************************************/ +/* Description : P6 optimized generic implementation for 1x1 3D convolution. */ +/* Based on MORPH pre-processor specifiers, code implementation */ +/* is generated during preprocessing stage. This method can be */ +/* used to generate 1x1 3D convolution function for U8 bit and */ +/* S8 bit input data with input stride equal to 1 */ +/* Inputs : Input Data Tile, Coeff Data Tile, Bias Array, */ +/* CNN convolution params structure */ +/* Outputs : XI Error Code */ +/* InOuts : Output Tile */ +/* Assumptions : CoeffData is S8 */ +/* biasArray is signed 32b, value not exceeding signed 24b */ +/* OutData is S8 / U8 / S16 */ +/* Kernel Size is 1x1xDxN */ +/* Input and Output are in WHD format */ +/* Coeff is in WHDN format */ +/******************************************************************************/ + +/******************** xaiConvolve3D_S_1x1j1_S8S8IX_MOW_WHD *********************/ +/******************** xaiConvolve3D_S_1x1j1_U8S8IX_MOW_WHD *********************/ + +XAI_ERR_TYPE MAKE_NAME(xaiConvolve3D_S_1x1j1, S8IX_MOW_WHD) ( + const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + xai_cnn_conv_params * param + ) +{ + /* Error Checks */ + XAI_ERROR_CHECKS() + { +
XAI_CHECK_POINTER(param); + } + XAI_CNN_CONV_SET_DILATION_XY(param, 1, 1); + + return(MAKE_NAME(xaiConvolved3D_S_1x1j1d1, S8IX_MOW_WHD) (inTile, coeffTile, biasArray, outTile, param)); + + return(XAI_ERROR_STATUS()); +} + +/***************************************************************************** +* MAKE_NAME(xaiConvolve3D_S_3x3j1, S8IX_MOW_WHD) +* **************************************************************************/ + +/******************** xaiConvolve3D_S_3x3j1_S8S8IX_MOW_WHD *********************/ +/******************** xaiConvolve3D_S_3x3j1_U8S8IX_MOW_WHD *********************/ + +XAI_ERR_TYPE MAKE_NAME(xaiConvolve3D_S_3x3j1, S8IX_MOW_WHD) ( + const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + xai_cnn_conv_params * param + ) +{ + /* Error Checks */ + XAI_ERROR_CHECKS() + { + XAI_CHECK_POINTER(param); + } + XAI_CNN_CONV_SET_DILATION_XY(param, 1, 1); + + return(MAKE_NAME(xaiConvolved3D_S_3x3j1d1, S8IX_MOW_WHD) (inTile, coeffTile, biasArray, outTile, param)); + + return(XAI_ERROR_STATUS()); +} + + + +/****************************************************************************************** +* MAKE_NAME(xaiConvolve3D_S_5x5j1, S8IX_MOW_WHD) +* ***************************************************************************************/ + +/******************************************************************************/ +/* Description : P6 optimized generic implementation for 5x5 3D convolution. */ +/* Based on MORPH pre-processor specifiers, code implementation */ +/* is generated during preprocessing stage. 
This method can be */ +/* used to generate 5x5 3D convolution function for U8 bit and */ +/* S8 bit input data with input stride equal to 1 */ +/* Inputs : Input Data Tile, Coeff Data Tile, Bias Array, */ +/* CNN convolution params structure */ +/* Outputs : XI Error Code */ +/* InOuts : Output Tile */ +/* Assumptions : CoeffData is S8 */ +/* biasArray is signed 32b, value not exceeding signed 24b */ +/* OutData is S8 / U8 / S16 */ +/* Kernel Size is 5x5xDxN */ +/* Input and Output are in WHD format */ +/* Coeff is in WHDN format */ +/******************************************************************************/ + +/******************** xaiConvolve3D_S_5x5j1_S8S8IX_MOW_WHD *********************/ +/******************** xaiConvolve3D_S_5x5j1_U8S8IX_MOW_WHD *********************/ + +XAI_ERR_TYPE MAKE_NAME(xaiConvolve3D_S_5x5j1, S8IX_MOW_WHD) ( + const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + xai_cnn_conv_params * param + ) +{ + /* Error Checks */ + XAI_ERROR_CHECKS() + { + XAI_CHECK_POINTER(param); + } + XAI_CNN_CONV_SET_DILATION_XY(param, 1, 1); + + return(MAKE_NAME(xaiConvolved3D_S_5x5j1d1, S8IX_MOW_WHD) (inTile, coeffTile, biasArray, outTile, param)); + + return(XAI_ERROR_STATUS()); +} + +/****************************************************************************************** +* MAKE_NAME(xaiConvolve3D_S_7x7j1, S8IX_MOW_WHD) +* ***************************************************************************************/ + +/******************************************************************************/ +/* Description : P6 optimized generic implementation for 7x7 3D convolution. */ +/* Based on MORPH pre-processor specifiers, code implementation */ +/* is generated during preprocessing stage. 
This method can be */ +/* used to generate 7x7 3D convolution function for U8 bit and */ +/* S8 bit input data with input stride equal to 1 */ +/* Inputs : Input Data Tile, Coeff Data Tile, Bias Array, */ +/* CNN convolution params structure */ +/* Outputs : XI Error Code */ +/* InOuts : Output Tile */ +/* Assumptions : CoeffData is S8 */ +/* biasArray is signed 32b, value not exceeding signed 24b */ +/* OutData is S8 / U8 / S16 */ +/* Kernel Size is 7x7xDxN */ +/* Input and Output are in WHD format */ +/* Coeff is in WHDN format */ +/******************************************************************************/ + +/******************** xaiConvolve3D_S_7x7j1_S8S8IX_MOW_WHD *********************/ +/******************** xaiConvolve3D_S_7x7j1_U8S8IX_MOW_WHD *********************/ + +XAI_ERR_TYPE MAKE_NAME(xaiConvolve3D_S_7x7j1, S8IX_MOW_WHD) ( + const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + xai_cnn_conv_params * param + ) +{ + /* Error Checks */ + XAI_ERROR_CHECKS() + { + XAI_CHECK_POINTER(param); + } + XAI_CNN_CONV_SET_DILATION_XY(param, 1, 1); + + return(MAKE_NAME(xaiConvolved3D_S_7x7j1d1, S8IX_MOW_WHD) (inTile, coeffTile, biasArray, outTile, param)); + + return(XAI_ERROR_STATUS()); +} + +/****************************************************************************************** +* MAKE_NAME(xaiConvolve3D_S_MxNj1, S8IX_MOW_WHD) +* ***************************************************************************************/ +/******************************************************************************/ +/* Description : P6 optimized generic implementation for MxN 3D convolution. */ +/* Based on MORPH pre-processor specifiers, code implementation */ +/* is generated during preprocessing stage. 
This method can be */ +/* used to generate MxN 3D convolution function for U8 bit and */ +/* S8 bit input data with input stride equal to 1 */ +/* Inputs : Input Data Tile, Coeff Data Tile, Bias Array, */ +/* CNN convolution params structure */ +/* Outputs : XI Error Code */ +/* InOuts : Output Tile */ +/* Assumptions : CoeffData is S8 */ +/* biasArray is signed 32b, value not exceeding signed 24b */ +/* OutData is S8 / U8 / S16 */ +/* Kernel Size is MxNxDxN */ +/* Input and Output are in WHD format */ +/* Coeff is in WHDN format */ +/******************************************************************************/ + +/******************** xaiConvolve3D_S_MxNj1_S8S8IX_MOW_WHD *********************/ +/******************** xaiConvolve3D_S_MxNj1_U8S8IX_MOW_WHD *********************/ + +XAI_ERR_TYPE MAKE_NAME(xaiConvolve3D_S_MxNj1, S8IX_MOW_WHD) ( + const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + xai_cnn_conv_params * param + ) +{ + /* Error Checks */ + XAI_ERROR_CHECKS() + { + XAI_CHECK_POINTER(param); + } + XAI_CNN_CONV_SET_DILATION_XY(param, 1, 1); + + return(MAKE_NAME(xaiConvolved3D_S_MxNj1d1, S8IX_MOW_WHD) (inTile, coeffTile, biasArray, outTile, param)); + + return(XAI_ERROR_STATUS()); +} + +/****************************************************************************************** +* MOW Stride 2 variants +******************************************************************************************/ + + +/***************************************************************************** +* MAKE_NAME(xaiConvolve3D_S_1x1j2, S8IX_MOW_WHD) +* **************************************************************************/ + +/******************************************************************************/ +/* Description : P6 optimized generic implementation for 1x1 3D convolution. */ +/* Based on MORPH pre-processor specifiers, code implementation */ +/* is generated during preprocessing stage.
This method can be */ +/* used to generate 1x1 3D convolution function for U8 bit and */ +/* S8 bit input data with input stride equal to 2 */ +/* Inputs : Input Data Tile, Coeff Data Tile, Bias Array, */ +/* CNN convolution params structure */ +/* Outputs : XI Error Code */ +/* InOuts : Output Tile */ +/* Assumptions : CoeffData is S8 */ +/* biasArray is signed 32b, value not exceeding signed 24b */ +/* OutData is S8 / U8 / S16 */ +/* Kernel Size is 1x1xDxN */ +/* Input and Output are in WHD format */ +/* Coeff is in WHDN format */ +/******************************************************************************/ + +/******************** xaiConvolve3D_S_1x1j2_S8S8IX_MOW_WHD *********************/ +/******************** xaiConvolve3D_S_1x1j2_U8S8IX_MOW_WHD *********************/ + +XAI_ERR_TYPE MAKE_NAME(xaiConvolve3D_S_1x1j2, S8IX_MOW_WHD) ( + const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + xai_cnn_conv_params * param + ) +{ + /* Error Checks */ + XAI_ERROR_CHECKS() + { + XAI_CHECK_POINTER(param); + } + XAI_CNN_CONV_SET_DILATION_XY(param, 1, 1); + + return(MAKE_NAME(xaiConvolved3D_S_1x1j2d1, S8IX_MOW_WHD) (inTile, coeffTile, biasArray, outTile, param)); + + return(XAI_ERROR_STATUS()); +} + +/****************************************************************************************** +* MAKE_NAME(xaiConvolve3D_S_3x3j2, S8IX_MOW_WHD) +* ***************************************************************************************/ + +/******************************************************************************/ +/* Description : P6 optimized generic implementation for 3x3 3D convolution. */ +/* Based on MORPH pre-processor specifiers, code implementation */ +/* is generated during preprocessing stage. 
This method can be */ +/* used to generate 3x3 3D convolution function for U8 bit and */ +/* S8 bit input data with input stride equal to 2 */ +/* Inputs : Input Data Tile, Coeff Data Tile, Bias Array, */ +/* CNN convolution params structure */ +/* Outputs : XI Error Code */ +/* InOuts : Output Tile */ +/* Assumptions : CoeffData is S8 */ +/* biasArray is signed 32b, value not exceeding signed 24b */ +/* OutData is S8 / U8 / S16 */ +/* Kernel Size is 3x3xDxN */ +/* Input and Output are in WHD format */ +/* Coeff is in WHDN format */ +/******************************************************************************/ + +/******************** xaiConvolve3D_S_3x3j2_S8S8IX_MOW_WHD *********************/ +/******************** xaiConvolve3D_S_3x3j2_U8S8IX_MOW_WHD *********************/ + +XAI_ERR_TYPE MAKE_NAME(xaiConvolve3D_S_3x3j2, S8IX_MOW_WHD) ( + const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + xai_cnn_conv_params * param + ) +{ + /* Error Checks */ + XAI_ERROR_CHECKS() + { + XAI_CHECK_POINTER(param); + } + XAI_CNN_CONV_SET_DILATION_XY(param, 1, 1); + + return(MAKE_NAME(xaiConvolved3D_S_3x3j2d1, S8IX_MOW_WHD) (inTile, coeffTile, biasArray, outTile, param)); + + return(XAI_ERROR_STATUS()); +} + +/****************************************************************************************** +* MAKE_NAME(xaiConvolve3D_S_5x5j2, S8IX_MOW_WHD) +* ***************************************************************************************/ + +/******************************************************************************/ +/* Description : P6 optimized generic implementation for 5x5 3D convolution. */ +/* Based on MORPH pre-processor specifiers, code implementation */ +/* is generated during preprocessing stage. 
This method can be */ +/* used to generate 5x5 3D convolution function for U8 bit and */ +/* S8 bit input data with input stride equal to 2 */ +/* Inputs : Input Data Tile, Coeff Data Tile, Bias Array, */ +/* CNN convolution params structure */ +/* Outputs : XI Error Code */ +/* InOuts : Output Tile */ +/* Assumptions : CoeffData is S8 */ +/* biasArray is signed 32b, value not exceeding signed 24b */ +/* OutData is S8 / U8 / S16 */ +/* Kernel Size is 5x5xDxN */ +/* Input and Output are in WHD format */ +/* Coeff is in WHDN format */ +/******************************************************************************/ + +/******************** xaiConvolve3D_S_5x5j2_S8S8IX_MOW_WHD *********************/ +/******************** xaiConvolve3D_S_5x5j2_U8S8IX_MOW_WHD *********************/ + +XAI_ERR_TYPE MAKE_NAME(xaiConvolve3D_S_5x5j2, S8IX_MOW_WHD) ( + const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + xai_cnn_conv_params * param + ) +{ + /* Error Checks */ + XAI_ERROR_CHECKS() + { + XAI_CHECK_POINTER(param); + } + XAI_CNN_CONV_SET_DILATION_XY(param, 1, 1); + + return(MAKE_NAME(xaiConvolved3D_S_5x5j2d1, S8IX_MOW_WHD) (inTile, coeffTile, biasArray, outTile, param)); + + return(XAI_ERROR_STATUS()); +} + +/****************************************************************************************** +* MAKE_NAME(xaiConvolve3D_S_7x7j2, S8IX_MOW_WHD) +* ***************************************************************************************/ + +/******************************************************************************/ +/* Description : P6 optimized generic implementation for 7x7 3D convolution. */ +/* Based on MORPH pre-processor specifiers, code implementation */ +/* is generated during preprocessing stage. 
This method can be */ +/* used to generate 7x7 3D convolution function for U8 bit and */ +/* S8 bit input data with input stride equal to 2 */ +/* Inputs : Input Data Tile, Coeff Data Tile, Bias Array, */ +/* CNN convolution params structure */ +/* Outputs : XI Error Code */ +/* InOuts : Output Tile */ +/* Assumptions : CoeffData is S8 */ +/* biasArray is signed 32b, value not exceeding signed 24b */ +/* OutData is S8 / U8 / S16 */ +/* Kernel Size is 7x7xDxN */ +/* Input and Output are in WHD format */ +/* Coeff is in WHDN format */ +/******************************************************************************/ + +/******************** xaiConvolve3D_S_7x7j2_S8S8IX_MOW_WHD *********************/ +/******************** xaiConvolve3D_S_7x7j2_U8S8IX_MOW_WHD *********************/ + +XAI_ERR_TYPE MAKE_NAME(xaiConvolve3D_S_7x7j2, S8IX_MOW_WHD) ( + const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + xai_cnn_conv_params * param + ) +{ + /* Error Checks */ + XAI_ERROR_CHECKS() + { + XAI_CHECK_POINTER(param); + } + XAI_CNN_CONV_SET_DILATION_XY(param, 1, 1); + + return(MAKE_NAME(xaiConvolved3D_S_7x7j2d1, S8IX_MOW_WHD) (inTile, coeffTile, biasArray, outTile, param)); + + return(XAI_ERROR_STATUS()); +} + +/****************************************************************************************** +* MAKE_NAME(xaiConvolve3D_S_MxNj2, S8IX_MOW_WHD) +* ***************************************************************************************/ +/******************************************************************************/ +/* Description : P6 optimized generic implementation for MxN 3D convolution. */ +/* Based on MORPH pre-processor specifiers, code implementation */ +/* is generated during preprocessing stage. 
This method can be */ +/* used to generate MxN 3D convolution function for U8 bit and */ +/* S8 bit input data with input stride equal to 2 */ +/* Inputs : Input Data Tile, Coeff Data Tile, Bias Array, */ +/* CNN convolution params structure */ +/* Outputs : XI Error Code */ +/* InOuts : Output Tile */ +/* Assumptions : CoeffData is S8 */ +/* biasArray is signed 32b, value not exceeding signed 24b */ +/* OutData is S8 / U8 / S16 */ +/* Kernel Size is MxNxDxN */ +/* Input and Output are in WHD format */ +/* Coeff is in WHDN format */ +/******************************************************************************/ + +/******************** xaiConvolve3D_S_MxNj2_S8S8IX_MOW_WHD *********************/ +/******************** xaiConvolve3D_S_MxNj2_U8S8IX_MOW_WHD *********************/ + +XAI_ERR_TYPE MAKE_NAME(xaiConvolve3D_S_MxNj2, S8IX_MOW_WHD) ( + const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + xai_cnn_conv_params * param + ) +{ + /* Error Checks */ + XAI_ERROR_CHECKS() + { + XAI_CHECK_POINTER(param); + } + XAI_CNN_CONV_SET_DILATION_XY(param, 1, 1); + + return(MAKE_NAME(xaiConvolved3D_S_MxNj2d1, S8IX_MOW_WHD) (inTile, coeffTile, biasArray, outTile, param)); + + return(XAI_ERROR_STATUS()); +} + +/****************************************************************************************** +* MOW Stride 4 variants +******************************************************************************************/ + +/****************************************************************************************** +* MAKE_NAME(xaiConvolve3D_S_1x1j4, S8IX_MOW_WHD) +* ***************************************************************************************/ + +/******************************************************************************/ +/* Description : P6 optimized generic implementation for 1x1 3D convolution. */ +/* Based on MORPH pre-processor specifiers, code implementation */ +/* is generated during preprocessing stage.
This method can be */ +/* used to generate 1x1 3D convolution function for U8 bit and */ +/* S8 bit input data with input stride equal to 4 */ +/* Inputs : Input Data Tile, Coeff Data Tile, Bias Array, */ +/* CNN convolution params structure */ +/* Outputs : XI Error Code */ +/* InOuts : Output Tile */ +/* Assumptions : CoeffData is S8 */ +/* biasArray is signed 32b, value not exceeding signed 24b */ +/* OutData is S8 / U8 / S16 */ +/* Kernel Size is 1x1xDxN */ +/* Input and Output are in WHD format */ +/* Coeff is in WHDN format */ +/******************************************************************************/ + +/******************** xaiConvolve3D_S_1x1j4_S8S8IX_MOW_WHD *********************/ +/******************** xaiConvolve3D_S_1x1j4_U8S8IX_MOW_WHD *********************/ + +XAI_ERR_TYPE MAKE_NAME(xaiConvolve3D_S_1x1j4, S8IX_MOW_WHD) ( + const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + xai_cnn_conv_params * param + ) +{ + /* Error Checks */ + XAI_ERROR_CHECKS() + { + XAI_CHECK_POINTER(param); + } + XAI_CNN_CONV_SET_DILATION_XY(param, 1, 1); + + return(MAKE_NAME(xaiConvolved3D_S_1x1j4d1, S8IX_MOW_WHD) (inTile, coeffTile, biasArray, outTile, param)); + + return(XAI_ERROR_STATUS()); +} + +/****************************************************************************************** +* MAKE_NAME(xaiConvolve3D_S_3x3j4, S8IX_MOW_WHD) +* ***************************************************************************************/ + +/******************************************************************************/ +/* Description : P6 optimized generic implementation for 3x3 3D convolution. */ +/* Based on MORPH pre-processor specifiers, code implementation */ +/* is generated during preprocessing stage. 
This method can be */ +/* used to generate 3x3 3D convolution function for U8 bit and */ +/* S8 bit input data with input stride equal to 4 */ +/* Inputs : Input Data Tile, Coeff Data Tile, Bias Array, */ +/* CNN convolution params structure */ +/* Outputs : XI Error Code */ +/* InOuts : Output Tile */ +/* Assumptions : CoeffData is S8 */ +/* biasArray is signed 32b, value not exceeding signed 24b */ +/* OutData is S8 / U8 / S16 */ +/* Kernel Size is 3x3xDxN */ +/* Input and Output are in WHD format */ +/* Coeff is in WHDN format */ +/******************************************************************************/ + +/******************** xaiConvolve3D_S_3x3j4_S8S8IX_MOW_WHD *********************/ +/******************** xaiConvolve3D_S_3x3j4_U8S8IX_MOW_WHD *********************/ + +XAI_ERR_TYPE MAKE_NAME(xaiConvolve3D_S_3x3j4, S8IX_MOW_WHD) ( + const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + xai_cnn_conv_params * param + ) +{ + /* Error Checks */ + XAI_ERROR_CHECKS() + { + XAI_CHECK_POINTER(param); + } + XAI_CNN_CONV_SET_DILATION_XY(param, 1, 1); + + return(MAKE_NAME(xaiConvolved3D_S_3x3j4d1, S8IX_MOW_WHD) (inTile, coeffTile, biasArray, outTile, param)); + + return(XAI_ERROR_STATUS()); +} + +/****************************************************************************************** +* MAKE_NAME(xaiConvolve3D_S_5x5j4, S8IX_MOW_WHD) +* ***************************************************************************************/ + +/******************************************************************************/ +/* Description : P6 optimized generic implementation for 5x5 3D convolution. */ +/* Based on MORPH pre-processor specifiers, code implementation */ +/* is generated during preprocessing stage. 
This method can be */ +/* used to generate 5x5 3D convolution function for U8 bit and */ +/* S8 bit input data with input stride equal to 4 */ +/* Inputs : Input Data Tile, Coeff Data Tile, Bias Array, */ +/* CNN convolution params structure */ +/* Outputs : XI Error Code */ +/* InOuts : Output Tile */ +/* Assumptions : CoeffData is S8 */ +/* biasArray is signed 32b, value not exceeding signed 24b */ +/* OutData is S8 / U8 / S16 */ +/* Kernel Size is 5x5xDxN */ +/* Input and Output are in WHD format */ +/* Coeff is in WHDN format */ +/******************************************************************************/ + +/******************** xaiConvolve3D_S_5x5j4_S8S8IX_MOW_WHD *********************/ +/******************** xaiConvolve3D_S_5x5j4_U8S8IX_MOW_WHD *********************/ + +XAI_ERR_TYPE MAKE_NAME(xaiConvolve3D_S_5x5j4, S8IX_MOW_WHD) ( + const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + xai_cnn_conv_params * param + ) +{ + /* Error Checks */ + XAI_ERROR_CHECKS() + { + XAI_CHECK_POINTER(param); + } + XAI_CNN_CONV_SET_DILATION_XY(param, 1, 1); + + return(MAKE_NAME(xaiConvolved3D_S_5x5j4d1, S8IX_MOW_WHD) (inTile, coeffTile, biasArray, outTile, param)); + + return(XAI_ERROR_STATUS()); +} + +/****************************************************************************************** +* MAKE_NAME(xaiConvolve3D_S_7x7j4, S8IX_MOW_WHD) +* ***************************************************************************************/ + +/******************************************************************************/ +/* Description : P6 optimized generic implementation for 7x7 3D convolution. */ +/* Based on MORPH pre-processor specifiers, code implementation */ +/* is generated during preprocessing stage. 
This method can be */ +/* used to generate 7x7 3D convolution function for U8 bit and */ +/* S8 bit input data with input stride equal to 4 */ +/* Inputs : Input Data Tile, Coeff Data Tile, Bias Array, */ +/* CNN convolution params structure */ +/* Outputs : XI Error Code */ +/* InOuts : Output Tile */ +/* Assumptions : CoeffData is S8 */ +/* biasArray is signed 32b, value not exceeding signed 24b */ +/* OutData is S8 / U8 / S16 */ +/* Kernel Size is 7x7xDxN */ +/* Input and Output are in WHD format */ +/* Coeff is in WHDN format */ +/******************************************************************************/ + +/******************** xaiConvolve3D_S_7x7j4_S8S8IX_MOW_WHD *********************/ +/******************** xaiConvolve3D_S_7x7j4_U8S8IX_MOW_WHD *********************/ + +XAI_ERR_TYPE MAKE_NAME(xaiConvolve3D_S_7x7j4, S8IX_MOW_WHD) ( + const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + xai_cnn_conv_params * param + ) +{ + /* Error Checks */ + XAI_ERROR_CHECKS() + { + XAI_CHECK_POINTER(param); + } + XAI_CNN_CONV_SET_DILATION_XY(param, 1, 1); + + return(MAKE_NAME(xaiConvolved3D_S_7x7j4d1, S8IX_MOW_WHD) (inTile, coeffTile, biasArray, outTile, param)); + + return(XAI_ERROR_STATUS()); +} + + +/****************************************************************************************** +* MAKE_NAME(xaiConvolve3D_S_MxNj4, S8IX_MOW_WHD) +* ***************************************************************************************/ + +/******************************************************************************/ +/* Description : P6 optimized generic implementation for MxN 3D convolution. */ +/* Based on MORPH pre-processor specifiers, code implementation */ +/* is generated during preprocessing stage. 
This method can be */ +/* used to generate MxN 3D convolution function for U8 bit and */ +/* S8 bit input data with input stride equal to 4 */ +/* Inputs : Input Data Tile, Coeff Data Tile, Bias Array, */ +/* CNN convolution params structure */ +/* Outputs : XI Error Code */ +/* InOuts : Output Tile */ +/* Assumptions : CoeffData is S8 */ +/* biasArray is signed 32b, value not exceeding signed 24b */ +/* OutData is S8 / U8 / S16 */ +/* Kernel Size is MxNxDxN */ +/* Input and Output are in WHD format */ +/* Coeff is in WHDN format */ +/******************************************************************************/ + +/******************** xaiConvolve3D_S_MxNj4_S8S8IX_MOW_WHD *********************/ +/******************** xaiConvolve3D_S_MxNj4_U8S8IX_MOW_WHD *********************/ + +XAI_ERR_TYPE MAKE_NAME(xaiConvolve3D_S_MxNj4, S8IX_MOW_WHD) ( + const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + xai_cnn_conv_params * param + ) +{ + /* Error Checks */ + XAI_ERROR_CHECKS() + { + XAI_CHECK_POINTER(param); + } + XAI_CNN_CONV_SET_DILATION_XY(param, 1, 1); + + return(MAKE_NAME(xaiConvolved3D_S_MxNj4d1, S8IX_MOW_WHD) (inTile, coeffTile, biasArray, outTile, param)); + + return(XAI_ERROR_STATUS()); +} + +/********************************** end of MOW variants ************************************/ +/*******************************************************************************************/ +#endif //if ((XCHAL_VISION_TYPE >= 6)) + diff --git a/backends/cadence/vision/third-party/libxai/cnn/src/cnn_conv_SO.c b/backends/cadence/vision/third-party/libxai/cnn/src/cnn_conv_SO.c new file mode 100644 index 00000000000..366ad6b8b2f --- /dev/null +++ b/backends/cadence/vision/third-party/libxai/cnn/src/cnn_conv_SO.c @@ -0,0 +1,27 @@ +/* + * Copyright (c) 2018 by Cadence Design Systems, Inc. ALL RIGHTS RESERVED. 
+ * These coded instructions, statements, and computer programs are the + * copyrighted works and confidential proprietary information of + * Cadence Design Systems Inc. They may be adapted and modified by bona fide + * purchasers for internal use, but neither the original nor any adapted + * or modified version may be disclosed or distributed to third parties + * in any manner, medium, or form, in whole or in part, without the prior + * written consent of Cadence Design Systems Inc. This software and its + * derivatives are to be executed solely on products incorporating a Cadence + * Design Systems processor. + */ + +#include "xai_cnn.h" +#include "xai_intrin.h" + +#if ((XCHAL_VISION_TYPE >= 6)) + +#undef DILATED_SO_VQ_CONV + +#define INPUT_DATA_TYPE UNSIGNED8BIT +#include "cnn_conv_SO.h" +#undef INPUT_DATA_TYPE + +#define INPUT_DATA_TYPE SIGNED8BIT +#include "cnn_conv_SO.h" +#endif //if ((XCHAL_VISION_TYPE >= 6)) diff --git a/backends/cadence/vision/third-party/libxai/cnn/src/cnn_conv_SO.h b/backends/cadence/vision/third-party/libxai/cnn/src/cnn_conv_SO.h new file mode 100644 index 00000000000..1cd413bba04 --- /dev/null +++ b/backends/cadence/vision/third-party/libxai/cnn/src/cnn_conv_SO.h @@ -0,0 +1,110 @@ +/* + * Copyright (c) 2018 by Cadence Design Systems, Inc. ALL RIGHTS RESERVED. + * These coded instructions, statements, and computer programs are the + * copyrighted works and confidential proprietary information of + * Cadence Design Systems Inc. They may be adapted and modified by bona fide + * purchasers for internal use, but neither the original nor any adapted + * or modified version may be disclosed or distributed to third parties + * in any manner, medium, or form, in whole or in part, without the prior + * written consent of Cadence Design Systems Inc. This software and its + * derivatives are to be executed solely on products incorporating a Cadence + * Design Systems processor. 
+ */ + +#if ((XCHAL_VISION_TYPE >= 6)) + +#define MAKE_NAME_IMPL(name, MORPH_FNAME_SPECIFIER_IDT, suffix) name ## _ ## MORPH_FNAME_SPECIFIER_IDT ## suffix + +#if INPUT_DATA_TYPE == UNSIGNED8BIT + +#define MAKE_NAME(name, suffix) MAKE_NAME_IMPL(name, U8, suffix) +#define MORPH_IDT_CHECK XAI_CHECK_TILE3D_U8 +#define MORPH_IDT_SCALAR uint8_t +#define MORPH_IDT_2Nx8 xb_vec2Nx8U +#define MORPH_OP_PRIME_2Nx8 IVP_LA2NX8U_PP +#define MORPH_OP_ALIGN_LOAD_2Nx8 IVP_LV2NX8U_XP +#define MORPH_OP_LOAD_2Nx8 IVP_LA2NX8U_XP +#define MORPH_OP_LOAD_2Nx8_IP IVP_LA2NX8U_IP +#define MORPH_OP_LOAD_2Nx8_VARIABLE IVP_LAV2NX8U_XP +#define MORPH_OP_MULA IVP_MULUSA2N8XR16 +#define MORPH_OP_MULPA IVP_MULUSPA2NX8 + + +#elif INPUT_DATA_TYPE == SIGNED8BIT + +#undef MAKE_NAME +#undef MORPH_IDT_CHECK +#undef MORPH_IDT_SCALAR +#undef MORPH_IDT_2Nx8 +#undef MORPH_OP_PRIME_2Nx8 +#undef MORPH_OP_ALIGN_LOAD_2Nx8 +#undef MORPH_OP_LOAD_2Nx8_IP +#undef MORPH_OP_LOAD_2Nx8_VARIABLE +#undef MORPH_OP_LOAD_2Nx8 +#undef MORPH_OP_MULA +#undef MORPH_OP_MULPA + + +#define MAKE_NAME(name, suffix) MAKE_NAME_IMPL(name, S8, suffix) +#define MORPH_IDT_CHECK XAI_CHECK_TILE3D_S8 +#define MORPH_IDT_SCALAR int8_t +#define MORPH_IDT_2Nx8 xb_vec2Nx8 +#define MORPH_OP_PRIME_2Nx8 IVP_LA2NX8_PP +#define MORPH_OP_ALIGN_LOAD_2Nx8 IVP_LV2NX8_XP +#define MORPH_OP_LOAD_2Nx8 IVP_LA2NX8_XP +#define MORPH_OP_LOAD_2Nx8_IP IVP_LA2NX8_IP +#define MORPH_OP_LOAD_2Nx8_VARIABLE IVP_LAV2NX8_XP +#define MORPH_OP_MULA IVP_MULA2N8XR16 +#define MORPH_OP_MULPA IVP_MULPA2NX8 +#endif + +/****************************************************************************************** +* SO(Single output) variants +******************************************************************************************/ + +/***************************************************************************/ +/* xaiConvolve3D_S_MxN_S8_SO_DWH/xaiConvolve3D_S_MxN_U8_SO_DWH */ +/***************************************************************************/ + 
+/***********************************************************************/ +/* Description : P6 Optimized implementation of 3D convolution in SO */ +/* Vectorization Approach. */ +/* Inputs : Input Data Tile, Coeff Data Tile, Bias Array, */ +/* CNN convolution params structure */ +/* Outputs : XI Error Code */ +/* InOuts : Output Tile */ +/* Assumptions : InData is S8/U8 */ +/* CoeffData is S8 */ +/* OutData is S8 / U8 / S16 */ +/* Kernel Size is close to that of Input Size. */ +/* Input and Output is in DWH format. */ +/* Coeff is in DWHN format. */ +/* dim1Size of Input Tile is equal to dim1Pitch of Input */ +/* Tile. */ +/***********************************************************************/ + +/******************* xaiConvolve3D_S_MxN_S8S8IX_SO_DWH ********************/ +/******************* xaiConvolve3D_S_MxN_U8S8IX_SO_DWH ********************/ + +XAI_ERR_TYPE MAKE_NAME(xaiConvolve3D_S_MxN, S8IX_SO_DWH) ( + const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + xai_cnn_conv_params * param + ) +{ + /* Error Checks */ + XAI_ERROR_CHECKS() + { + XAI_CHECK_POINTER(param); + } + + XAI_CNN_CONV_SET_DILATION_XY(param, 1, 1); + return(MAKE_NAME(xaiConvolved3D_S_MxN, S8IX_SO_DWH) (inTile, coeffTile, biasArray, outTile, param)); + return(XAI_ERROR_STATUS()); +} + +/****************************** end of SO variants *****************************************/ +/*******************************************************************************************/ +#endif //if ((XCHAL_VISION_TYPE >= 6)) diff --git a/backends/cadence/vision/third-party/libxai/cnn/src/cnn_conv_VQ.c b/backends/cadence/vision/third-party/libxai/cnn/src/cnn_conv_VQ.c new file mode 100644 index 00000000000..5b173b24002 --- /dev/null +++ b/backends/cadence/vision/third-party/libxai/cnn/src/cnn_conv_VQ.c @@ -0,0 +1,1371 @@ +/* + * Copyright (c) 2018 by Cadence Design Systems, Inc. ALL RIGHTS RESERVED. 
+ * These coded instructions, statements, and computer programs are the + * copyrighted works and confidential proprietary information of + * Cadence Design Systems Inc. They may be adapted and modified by bona fide + * purchasers for internal use, but neither the original nor any adapted + * or modified version may be disclosed or distributed to third parties + * in any manner, medium, or form, in whole or in part, without the prior + * written consent of Cadence Design Systems Inc. This software and its + * derivatives are to be executed solely on products incorporating a Cadence + * Design Systems processor. + */ + +#include "xai_cnn.h" + +#if ((XCHAL_VISION_TYPE >= 6)) + +/****************************************************************************** + * 3D VQ convolution general version for dilation functions + * Calls a specific dilated VQ convolution function based on parameters + *****************************************************************************/ +XAI_ERR_TYPE xaiConvolvedVQ3D(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param) +{ + /* The arguments inTile, coeffTile and param are used by xaiGetConvolvedVQ3DVariant + * helper function, to derive the appropriate convolution variant */ + if ((!inTile) || (!coeffTile) || (!param)) + { + return(XAI_ERR_NULLARG); + } + + /* Function Pointer */ + typedef XAI_ERR_TYPE (*fConvdVQPtr)(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params* param); + + /* Getting the function pointer of the convolution variant using xaiGetConvolvedVQ3DVariant function */ + fConvdVQPtr xaiConvolveVQ3D_opt = + (fConvdVQPtr) xaiGetConvolvedVQ3DVariant(inTile, coeffTile, biasArray, outputScaleArray, outTile, param); + + if (xaiConvolveVQ3D_opt == NULL) + { + return(XAI_ERR_NO_VARIANT);
+ } + else + { + return(xaiConvolveVQ3D_opt(inTile, coeffTile, biasArray, outputScaleArray, outTile, param)); + } +} + +/********************************************************************************************* +* 3D VQ dilated convolution helper function +* Returns the function pointer of a specific dilated convolution variant based on parameters +*********************************************************************************************/ +XAI_ERR_TYPE *xaiGetConvolvedVQ3DVariant(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param) +{ + if ((!inTile) || (!coeffTile) || (!param)) + { + return(NULL); + } + + uint8_t stride; + uint8_t dilation; + xai_cnn_data_order coeffOrder = XAI_TILE4D_GET_DATA_ORDER(coeffTile); + + int32_t kWidth, kHeight; + xai_cnn_data_order inOrder = XAI_TILE3D_GET_DATA_ORDER(inTile); + + if (coeffOrder == XAI_NDWH) + { + /* MOD variants */ + kWidth = XAI_TILE4D_GET_DIM3(coeffTile); + kHeight = XAI_TILE4D_GET_DIM4(coeffTile); + + if (inOrder == XAI_WHD) + { + if (kWidth == 1 && kHeight == 1) + { + return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_1x1_S8S8IXCa2_MOD_WHD_DWH); + } + else if (kWidth == 2 && kHeight == 2) + { + return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_2x2_S8S8IXCa2_MOD_WHD_DWH); + } + else if (kWidth == 3 && kHeight == 3) + { + return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_3x3_S8S8IXCa2_MOD_WHD_DWH); + } + else if (kWidth == 4 && kHeight == 4) + { + return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_4x4_S8S8IXCa2_MOD_WHD_DWH); + } + else if (kWidth == 5 && kHeight == 5) + { + return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_5x5_S8S8IXCa2_MOD_WHD_DWH); + } + else if (kWidth == 7 && kHeight == 7) + { + return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_7x7_S8S8IXCa2_MOD_WHD_DWH); + } + else + { + return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_MxN_S8S8IXCa2_MOD_WHD_DWH); + } + } + else if (inOrder == XAI_DWH) + { + if 
(XAI_TILE3D_CHECK_TYPE(inTile, XAI_S8) && XAI_TILE4D_CHECK_TYPE(coeffTile, XAI_S8)) + { + if (XAI_CNN_CONV_GET_STRIDEX(param) != XAI_CNN_CONV_GET_STRIDEY(param)) + { + return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_MxN_S8S8IXCa2_MOD_DWH); + } + else if (XAI_CNN_CONV_GET_STRIDE(param) != 1 && XAI_CNN_CONV_GET_STRIDE(param) != 2 \ + && XAI_CNN_CONV_GET_STRIDE(param) != 4) + { + return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_MxN_S8S8IXCa2_MOD_DWH); + } + else if (kWidth == 1 && kHeight == 1) + { + return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_1x1_S8S8IXCa2_MOD_DWH); + } + else if (kWidth == 2 && kHeight == 2) + { + return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_2x2_S8S8IXCa2_MOD_DWH); + } + else if (kWidth == 3 && kHeight == 3) + { + return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_3x3_S8S8IXCa2_MOD_DWH); + } + else if (kWidth == 4 && kHeight == 4) + { + return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_4x4_S8S8IXCa2_MOD_DWH); + } + else if (kWidth == 5 && kHeight == 5) + { + return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_5x5_S8S8IXCa2_MOD_DWH); + } + else if (kWidth == 7 && kHeight == 7) + { + return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_7x7_S8S8IXCa2_MOD_DWH); + } + else + { + return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_MxN_S8S8IXCa2_MOD_DWH); + } + } + else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_U8) && XAI_TILE4D_CHECK_TYPE(coeffTile, XAI_S8)) + { + if (kWidth == 1 && kHeight == 1) + { + return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_1x1_U8S8IXCa2_MOD_DWH); + } + else + { + return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_MxN_U8S8IXCa2_MOD_DWH); + } + } + else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S16) && XAI_TILE4D_CHECK_TYPE(coeffTile, XAI_S16)) + { + return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_MxN_S16S16I16_MOD_DWH); + } + } + } + else if (coeffOrder == XAI_WHDN) + { + /* MOW variants */ + stride = XAI_CNN_CONV_GET_STRIDE(param); + dilation = XAI_CNN_CONV_GET_DILATION(param); + kWidth = XAI_TILE4D_GET_DIM1(coeffTile); + kHeight = XAI_TILE4D_GET_DIM2(coeffTile); + if (kWidth == 1 && kHeight == 1) 
+ { + if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S8)) + { + if (stride == 1) + { + if (dilation == 1) + { + return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_1x1j1d1_S8S8IX_MOW_WHD); + } + } + else if (stride == 2) + { + if (dilation == 1) + { + return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_1x1j2d1_S8S8IX_MOW_WHD); + } + } + else if (stride == 4) + { + if (dilation == 1) + { + return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_1x1j4d1_S8S8IX_MOW_WHD); + } + } + } + else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_U8)) + { + if (stride == 1) + { + if (dilation == 1) + { + return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_1x1j1d1_U8S8IX_MOW_WHD); + } + } + else if (stride == 2) + { + if (dilation == 1) + { + return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_1x1j2d1_U8S8IX_MOW_WHD); + } + } + else if (stride == 4) + { + if (dilation == 1) + { + return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_1x1j4d1_U8S8IX_MOW_WHD); + } + } + } + else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S16)) + { + if (stride == 1) + { + if (dilation == 1) + { + return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_MxNj1d1_S16S16I16_MOW_WHD); + } + } + else if (stride == 2) + { + if (dilation == 1) + { + return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_MxNj2d1_S16S16I16_MOW_WHD); + } + } + else if (stride == 4) + { + if (dilation == 1) + { + return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_MxNj4d1_S16S16I16_MOW_WHD); + } + } + } + } + else if (kWidth == 2 && kHeight == 2) + { + if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S8)) + { + if (stride == 1) + { + if (dilation == 1) + { + return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_2x2j1d1_S8S8IX_MOW_WHD); + } + } + } + else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_U8)) + { + if (stride == 1) + { + if (dilation == 1) + { + return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_2x2j1d1_U8S8IX_MOW_WHD); + } + } + } + else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S16)) + { + if (stride == 1) + { + if (dilation == 1) + { + return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_MxNj1d1_S16S16I16_MOW_WHD); + } + } + if (stride == 2) + { + if (dilation == 1) + 
{ + return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_MxNj2d1_S16S16I16_MOW_WHD); + } + } + if (stride == 4) + { + if (dilation == 1) + { + return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_MxNj4d1_S16S16I16_MOW_WHD); + } + } + } + } + else if (kWidth == 3 && kHeight == 3) + { + if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S8)) + { + if (stride == 1) + { + if (dilation == 1) + { + return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_3x3j1d1_S8S8IX_MOW_WHD); + } + else if (dilation == 2) + { + return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_3x3j1d2_S8S8IX_MOW_WHD); + } + else if (dilation == 4) + { + return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_3x3j1d4_S8S8IX_MOW_WHD); + } + } + else if (stride == 2) + { + if (dilation == 1) + { + return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_3x3j2d1_S8S8IX_MOW_WHD); + } + } + else if (stride == 4) + { + if (dilation == 1) + { + return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_3x3j4d1_S8S8IX_MOW_WHD); + } + } + } + else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_U8)) + { + if (stride == 1) + { + if (dilation == 1) + { + return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_3x3j1d1_U8S8IX_MOW_WHD); + } + else if (dilation == 2) + { + return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_3x3j1d2_U8S8IX_MOW_WHD); + } + else if (dilation == 4) + { + return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_3x3j1d4_U8S8IX_MOW_WHD); + } + } + else if (stride == 2) + { + if (dilation == 1) + { + return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_3x3j2d1_U8S8IX_MOW_WHD); + } + } + else if (stride == 4) + { + if (dilation == 1) + { + return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_3x3j4d1_U8S8IX_MOW_WHD); + } + } + } + else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S16)) + { + if (stride == 1) + { + if (dilation == 1) + { + return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_MxNj1d1_S16S16I16_MOW_WHD); + } + } + else if (stride == 2) + { + if (dilation == 1) + { + return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_MxNj2d1_S16S16I16_MOW_WHD); + } + } + else if (stride == 4) + { + if (dilation == 1) + { + return((XAI_ERR_TYPE *) 
&xaiConvolvedVQ3D_S_MxNj4d1_S16S16I16_MOW_WHD); + } + } + } + } + else if (kWidth == 4 && kHeight == 4) + { + if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S8)) + { + if (stride == 1) + { + if (dilation == 1) + { + return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_4x4j1d1_S8S8IX_MOW_WHD); + } + } + } + else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_U8)) + { + if (stride == 1) + { + if (dilation == 1) + { + return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_4x4j1d1_U8S8IX_MOW_WHD); + } + } + } + else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S16)) + { + if (stride == 1) + { + if (dilation == 1) + { + return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_MxNj1d1_S16S16I16_MOW_WHD); + } + } + if (stride == 2) + { + if (dilation == 1) + { + return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_MxNj2d1_S16S16I16_MOW_WHD); + } + } + if (stride == 4) + { + if (dilation == 1) + { + return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_MxNj4d1_S16S16I16_MOW_WHD); + } + } + } + } + else if (kWidth == 5 && kHeight == 5) + { + if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S8)) + { + if (stride == 1) + { + if (dilation == 1) + { + return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_5x5j1d1_S8S8IX_MOW_WHD); + } + else if (dilation == 2) + { + return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_5x5j1d2_S8S8IX_MOW_WHD); + } + else if (dilation == 4) + { + return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_5x5j1d4_S8S8IX_MOW_WHD); + } + } + else if (stride == 2) + { + if (dilation == 1) + { + return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_5x5j2d1_S8S8IX_MOW_WHD); + } + } + else if (stride == 4) + { + if (dilation == 1) + { + return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_5x5j4d1_S8S8IX_MOW_WHD); + } + } + } + else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_U8)) + { + if (stride == 1) + { + if (dilation == 1) + { + return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_5x5j1d1_U8S8IX_MOW_WHD); + } + else if (dilation == 2) + { + return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_5x5j1d2_U8S8IX_MOW_WHD); + } + else if (dilation == 4) + { + return((XAI_ERR_TYPE *) 
&xaiConvolvedVQ3D_S_5x5j1d4_U8S8IX_MOW_WHD); + } + } + else if (stride == 2) + { + if (dilation == 1) + { + return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_5x5j2d1_U8S8IX_MOW_WHD); + } + } + else if (stride == 4) + { + if (dilation == 1) + { + return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_5x5j4d1_U8S8IX_MOW_WHD); + } + } + } + else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S16)) + { + if (stride == 1) + { + if (dilation == 1) + { + return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_MxNj1d1_S16S16I16_MOW_WHD); + } + } + else if (stride == 2) + { + if (dilation == 1) + { + return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_MxNj2d1_S16S16I16_MOW_WHD); + } + } + else if (stride == 4) + { + if (dilation == 1) + { + return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_MxNj4d1_S16S16I16_MOW_WHD); + } + } + } + } + else if (kWidth == 7 && kHeight == 7) + { + if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S8)) + { + if (stride == 1) + { + if (dilation == 1) + { + return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_7x7j1d1_S8S8IX_MOW_WHD); + } + else if (dilation == 2) + { + return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_7x7j1d2_S8S8IX_MOW_WHD); + } + else if (dilation == 4) + { + return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_7x7j1d4_S8S8IX_MOW_WHD); + } + } + else if (stride == 2) + { + if (dilation == 1) + { + return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_7x7j2d1_S8S8IX_MOW_WHD); + } + } + else if (stride == 4) + { + if (dilation == 1) + { + return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_7x7j4d1_S8S8IX_MOW_WHD); + } + } + } + else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_U8)) + { + if (stride == 1) + { + if (dilation == 1) + { + return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_7x7j1d1_U8S8IX_MOW_WHD); + } + else if (dilation == 2) + { + return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_7x7j1d2_U8S8IX_MOW_WHD); + } + else if (dilation == 4) + { + return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_7x7j1d4_U8S8IX_MOW_WHD); + } + } + else if (stride == 2) + { + if (dilation == 1) + { + return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_7x7j2d1_U8S8IX_MOW_WHD); + 
} + } + else if (stride == 4) + { + if (dilation == 1) + { + return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_7x7j4d1_U8S8IX_MOW_WHD); + } + } + } + else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S16)) + { + if (stride == 1) + { + if (dilation == 1) + { + return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_MxNj1d1_S16S16I16_MOW_WHD); + } + } + else if (stride == 2) + { + if (dilation == 1) + { + return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_MxNj2d1_S16S16I16_MOW_WHD); + } + } + else if (stride == 4) + { + if (dilation == 1) + { + return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_MxNj4d1_S16S16I16_MOW_WHD); + } + } + } + } + else + { + if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S8)) + { + if (stride == 1) + { + if (dilation == 1) + { + return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_MxNj1d1_S8S8IX_MOW_WHD); + } + else if (dilation == 2) + { + return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_MxNj1d2_S8S8IX_MOW_WHD); + } + else if (dilation == 4) + { + return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_MxNj1d4_S8S8IX_MOW_WHD); + } + } + else if (stride == 2) + { + if (dilation == 1) + { + return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_MxNj2d1_S8S8IX_MOW_WHD); + } + } + else if (stride == 4) + { + if (dilation == 1) + { + return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_MxNj4d1_S8S8IX_MOW_WHD); + } + } + } + else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_U8)) + { + if (stride == 1) + { + if (dilation == 1) + { + return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_MxNj1d1_U8S8IX_MOW_WHD); + } + else if (dilation == 2) + { + return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_MxNj1d2_U8S8IX_MOW_WHD); + } + else if (dilation == 4) + { + return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_MxNj1d4_U8S8IX_MOW_WHD); + } + } + else if (stride == 2) + { + if (dilation == 1) + { + return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_MxNj2d1_U8S8IX_MOW_WHD); + } + } + else if (stride == 4) + { + if (dilation == 1) + { + return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_MxNj4d1_U8S8IX_MOW_WHD); + } + } + } + else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S16)) + { + if (stride == 
1) + { + if (dilation == 1) + { + return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_MxNj1d1_S16S16I16_MOW_WHD); + } + } + else if (stride == 2) + { + if (dilation == 1) + { + return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_MxNj2d1_S16S16I16_MOW_WHD); + } + } + else if (stride == 4) + { + if (dilation == 1) + { + return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_MxNj4d1_S16S16I16_MOW_WHD); + } + } + } + } + } + else if (coeffOrder == XAI_DWHN) + { + /* SO variants */ + if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S8)) + { + return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_MxN_S8S8IX_SO_DWH); + } + else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_U8)) + { + return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_MxN_U8S8IX_SO_DWH); + } + } +//#else +#if 0 + xai_cnn_data_order inOrder = XAI_TILE3D_GET_DATA_ORDER(inTile); + + if (coeffOrder == XAI_DWHN) + { + /* SO variants */ + if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S8)) + { + return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_MxN_S8S8IX_SO_DWH); + } + else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_U8)) + { + return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_MxN_U8S8IX_SO_DWH); + } + } + else if (coeffOrder == XAI_NDWH) + { + if (inOrder == XAI_DWH) + { + if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S16)) + { + return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_MxN_S16S16I16_MOD_DWH); + } + } + } + else if (coeffOrder == XAI_WHDN) + { + /* MOW variants */ + stride = XAI_CNN_CONV_GET_STRIDE(param); + dilation = XAI_CNN_CONV_GET_DILATION(param); + + if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S16)) + { + if (stride == 1) + { + if (dilation == 1) + { + return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_MxNj1d1_S16S16I16_MOW_WHD); + } + } + else if (stride == 2) + { + if (dilation == 1) + { + return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_MxNj2d1_S16S16I16_MOW_WHD); + } + } + else if (stride == 4) + { + if (dilation == 1) + { + return((XAI_ERR_TYPE *) &xaiConvolvedVQ3D_S_MxNj4d1_S16S16I16_MOW_WHD); + } + } + } + } +#endif + + return(NULL); +} + 
+/****************************************************************************** + * Depthwise VQ convolution general version + * Calls a specific depthwise VQ convolution function based on parameters + * This is just a dummy function. Actual function will have proper checking + *****************************************************************************/ +XAI_ERR_TYPE xaiDepthwiseConvolveVQ2D(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param) +{ + /* The arguments inTile, coeffTile and param are used by xaiGetDepthwiseConvolveVQ2DVariant + * helper function, to derive the appropriate convolution variant */ + if ((!inTile) || (!coeffTile) || (!param)) + { + return(XAI_ERR_NULLARG); + } + + /* Function Pointer */ + typedef XAI_ERR_TYPE (*fDepthwiseConvVQPtr)(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params* param); + + /* Getting the function pointer of the VQ depthwise convolution variant using */ + /* xaiGetDepthwiseConvolveVQ2DVariant function */ + fDepthwiseConvVQPtr xaiDepthwiseConvolveVQ2D_opt = (fDepthwiseConvVQPtr) xaiGetDepthwiseConvolveVQ2DVariant(inTile, + coeffTile, + biasArray, + outputScaleArray, + outTile, + param); + + if (xaiDepthwiseConvolveVQ2D_opt == NULL) + { + return(XAI_ERR_NO_VARIANT); + } + else + { + return(xaiDepthwiseConvolveVQ2D_opt(inTile, coeffTile, biasArray, outputScaleArray, + outTile, param)); + } +} + +/************************************************************************************** +* Depthwise VQ convolution helper function +* Returns the function pointer of a specific depthwise VQ convolution variant based on parameters +**************************************************************************************/ +XAI_ERR_TYPE *xaiGetDepthwiseConvolveVQ2DVariant(const
xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param) +{ + if ((!inTile) || (!coeffTile) || (!outTile) || (!param)) + { + return(NULL); + } + if (!(XAI_TILE3D_GET_DATA_ORDER(inTile) == XAI_TILE3D_GET_DATA_ORDER(outTile))) + { + return(NULL); + } + + xai_cnn_data_order inOrder = XAI_TILE3D_GET_DATA_ORDER(inTile); + xai_cnn_data_order coeffOrder = XAI_TILE3D_GET_DATA_ORDER(coeffTile); + int32_t kWidth, kHeight; + + if (coeffOrder == XAI_DWH) + { + /* MOD variants */ + kWidth = XAI_TILE3D_GET_DIM2(coeffTile); + kHeight = XAI_TILE3D_GET_DIM3(coeffTile); + if (inOrder == XAI_DWH) + { + if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S8) || XAI_TILE3D_CHECK_TYPE(inTile, XAI_U8)) + { + if (XAI_CNN_CONV_GET_STRIDEX(param) != XAI_CNN_CONV_GET_STRIDEY(param)) + { + if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S8)) + { + return((XAI_ERR_TYPE *) &xaiDepthwiseConvolveVQ2D_S_MxN_S8S8IXCa2_MOD_DWH); + } + else + { + return((XAI_ERR_TYPE *) &xaiDepthwiseConvolveVQ2D_S_MxN_U8S8IXCa2_MOD_DWH); + } + } + else if (XAI_CNN_CONV_GET_STRIDE(param) != 1 && XAI_CNN_CONV_GET_STRIDE(param) != 2 \ + && XAI_CNN_CONV_GET_STRIDE(param) != 4) + { + if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S8)) + { + return((XAI_ERR_TYPE *) &xaiDepthwiseConvolveVQ2D_S_MxN_S8S8IXCa2_MOD_DWH); + } + else + { + return((XAI_ERR_TYPE *) &xaiDepthwiseConvolveVQ2D_S_MxN_U8S8IXCa2_MOD_DWH); + } + } + else if (kWidth == 3 && kHeight == 3) + { + if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S8)) + { + return((XAI_ERR_TYPE *) &xaiDepthwiseConvolveVQ2D_S_3x3_S8S8IXCa2_MOD_DWH); + } + else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_U8)) + { + return((XAI_ERR_TYPE *) &xaiDepthwiseConvolveVQ2D_S_3x3_U8S8IXCa2_MOD_DWH); + } + } + else if (kWidth == 5 && kHeight == 5) + { + if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S8)) + { + return((XAI_ERR_TYPE *) &xaiDepthwiseConvolveVQ2D_S_5x5_S8S8IXCa2_MOD_DWH); + } + else if (XAI_TILE3D_CHECK_TYPE(inTile, 
XAI_U8)) + { + return((XAI_ERR_TYPE *) &xaiDepthwiseConvolveVQ2D_S_5x5_U8S8IXCa2_MOD_DWH); + } + } + else if (kWidth == 7 && kHeight == 7 && XAI_TILE3D_CHECK_TYPE(inTile, XAI_S8)) + { + return((XAI_ERR_TYPE *) &xaiDepthwiseConvolveVQ2D_S_7x7_S8S8IXCa2_MOD_DWH); + } + else + { + if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S8)) + { + return((XAI_ERR_TYPE *) &xaiDepthwiseConvolveVQ2D_S_MxN_S8S8IXCa2_MOD_DWH); + } + else + { + return((XAI_ERR_TYPE *) &xaiDepthwiseConvolveVQ2D_S_MxN_U8S8IXCa2_MOD_DWH); + } + } + } /* if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S8) || XAI_TILE3D_CHECK_TYPE(inTile, XAI_U8)) */ + else if (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) || XAI_TILE3D_CHECK_TYPE(outTile, XAI_U16)) + { + return((XAI_ERR_TYPE *) &xaiDepthwiseConvolveVQ2D_S_MxN_S16S16I16_MOD_DWH); + } /* if (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) || XAI_TILE3D_CHECK_TYPE(outTile, XAI_U16))*/ + } /* if(inOrder == XAI_DWH) */ + } /* if (coeffOrder == XAI_DWH) */ + else if (coeffOrder == XAI_WHD) + { + /* MOW variants */ + uint8_t stride = XAI_CNN_CONV_GET_STRIDE(param); + kWidth = XAI_TILE3D_GET_DIM1(coeffTile); + kHeight = XAI_TILE3D_GET_DIM2(coeffTile); + if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S8) || XAI_TILE3D_CHECK_TYPE(inTile, XAI_U8)) + { + /* MOW variants */ + if (kWidth == 3 && kHeight == 3) + { + if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S8)) + { + if (stride == 1) + { + return((XAI_ERR_TYPE *) &xaiDepthwiseConvolveVQ2D_S_3x3j1_S8S8IX_MOW_WHD); + } + else if (stride == 2) + { + return((XAI_ERR_TYPE *) &xaiDepthwiseConvolveVQ2D_S_3x3j2_S8S8IX_MOW_WHD); + } + else if (stride == 4) + { + return((XAI_ERR_TYPE *) &xaiDepthwiseConvolveVQ2D_S_3x3j4_S8S8IX_MOW_WHD); + } + } + else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_U8)) + { + if (stride == 1) + { + return((XAI_ERR_TYPE *) &xaiDepthwiseConvolveVQ2D_S_3x3j1_U8S8IX_MOW_WHD); + } + else if (stride == 2) + { + return((XAI_ERR_TYPE *) &xaiDepthwiseConvolveVQ2D_S_3x3j2_U8S8IX_MOW_WHD); + } + else if (stride == 4) + { + return((XAI_ERR_TYPE *) 
&xaiDepthwiseConvolveVQ2D_S_3x3j4_U8S8IX_MOW_WHD); + } + } + } + else if (kWidth == 5 && kHeight == 5) + { + if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S8)) + { + if (stride == 1) + { + return((XAI_ERR_TYPE *) &xaiDepthwiseConvolveVQ2D_S_5x5j1_S8S8IX_MOW_WHD); + } + else if (stride == 2) + { + return((XAI_ERR_TYPE *) &xaiDepthwiseConvolveVQ2D_S_5x5j2_S8S8IX_MOW_WHD); + } + else if (stride == 4) + { + return((XAI_ERR_TYPE *) &xaiDepthwiseConvolveVQ2D_S_5x5j4_S8S8IX_MOW_WHD); + } + } + else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_U8)) + { + if (stride == 1) + { + return((XAI_ERR_TYPE *) &xaiDepthwiseConvolveVQ2D_S_5x5j1_U8S8IX_MOW_WHD); + } + else if (stride == 2) + { + return((XAI_ERR_TYPE *) &xaiDepthwiseConvolveVQ2D_S_5x5j2_U8S8IX_MOW_WHD); + } + else if (stride == 4) + { + return((XAI_ERR_TYPE *) &xaiDepthwiseConvolveVQ2D_S_5x5j4_U8S8IX_MOW_WHD); + } + } + } + else if (kWidth == 7 && kHeight == 7) + { + if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S8)) + { + if (stride == 1) + { + return((XAI_ERR_TYPE *) &xaiDepthwiseConvolveVQ2D_S_7x7j1_S8S8IX_MOW_WHD); + } + else if (stride == 2) + { + return((XAI_ERR_TYPE *) &xaiDepthwiseConvolveVQ2D_S_7x7j2_S8S8IX_MOW_WHD); + } + else if (stride == 4) + { + return((XAI_ERR_TYPE *) &xaiDepthwiseConvolveVQ2D_S_7x7j4_S8S8IX_MOW_WHD); + } + } + else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_U8)) + { + if (stride == 1) + { + return((XAI_ERR_TYPE *) &xaiDepthwiseConvolveVQ2D_S_7x7j1_U8S8IX_MOW_WHD); + } + else if (stride == 2) + { + return((XAI_ERR_TYPE *) &xaiDepthwiseConvolveVQ2D_S_7x7j2_U8S8IX_MOW_WHD); + } + else if (stride == 4) + { + return((XAI_ERR_TYPE *) &xaiDepthwiseConvolveVQ2D_S_7x7j4_U8S8IX_MOW_WHD); + } + } + } + else + { + if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S8)) + { + if (stride == 1) + { + return((XAI_ERR_TYPE *) &xaiDepthwiseConvolveVQ2D_S_MxNj1_S8S8IX_MOW_WHD); + } + else if (stride == 2) + { + return((XAI_ERR_TYPE *) &xaiDepthwiseConvolveVQ2D_S_MxNj2_S8S8IX_MOW_WHD); + } + else if (stride == 4) + { + return((XAI_ERR_TYPE *) 
&xaiDepthwiseConvolveVQ2D_S_MxNj4_S8S8IX_MOW_WHD); + } + } + else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_U8)) + { + if (stride == 1) + { + return((XAI_ERR_TYPE *) &xaiDepthwiseConvolveVQ2D_S_MxNj1_U8S8IX_MOW_WHD); + } + else if (stride == 2) + { + return((XAI_ERR_TYPE *) &xaiDepthwiseConvolveVQ2D_S_MxNj2_U8S8IX_MOW_WHD); + } + else if (stride == 4) + { + return((XAI_ERR_TYPE *) &xaiDepthwiseConvolveVQ2D_S_MxNj4_U8S8IX_MOW_WHD); + } + } + } +/* #if XCHAL_VISION_QUAD_MAC_TYPE != 0 */ + } /* if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S8) || XAI_TILE3D_CHECK_TYPE(inTile, XAI_U8)) */ + else if (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) || XAI_TILE3D_CHECK_TYPE(outTile, XAI_U16)) + { + if (stride == 1) + { + return((XAI_ERR_TYPE *) &xaiDepthwiseConvolveVQ2D_S_MxNj1_S16S16I16_MOW_WHD); + } + else if (stride == 2) + { + return((XAI_ERR_TYPE *) &xaiDepthwiseConvolveVQ2D_S_MxNj2_S16S16I16_MOW_WHD); + } + else if (stride == 4) + { + return((XAI_ERR_TYPE *) &xaiDepthwiseConvolveVQ2D_S_MxNj4_S16S16I16_MOW_WHD); + } + } /* if (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) || XAI_TILE3D_CHECK_TYPE(outTile, XAI_U16)) */ + } /* if(coeffOrder == XAI_WHD) */ + return(NULL); +} + +/****************************************************************************** + * Depthwise dilated VQ convolution general version + * Calls a specific depthwise VQ convolution function based in parameters + * This is just a dummy function. 
Actual function will have proper checking + *****************************************************************************/ +XAI_ERR_TYPE xaiDepthwiseConvolvedVQ2D(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_depthwiseDilatedConv_params *param) +{ + /* The arguments inTile, coeffTile and param are used by the xaiGetDepthwiseConvolvedVQ2DVariant + * helper function to derive the appropriate convolution variant, so NULL is rejected up front */ + if ((!inTile) || (!coeffTile) || (!param)) + { + return(XAI_ERR_NULLARG); + } + + /* Function pointer type with the same parameter list as this function; the selected variant is called through it */ + typedef XAI_ERR_TYPE (*fDepthwiseConvdVQPtr)(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_depthwiseDilatedConv_params* param); + + /* Getting the function pointer of the VQ depthwise dilated convolution variant */ + /* using the xaiGetDepthwiseConvolvedVQ2DVariant function */ + fDepthwiseConvdVQPtr xaiDepthwiseConvolvedVQ2D_opt = (fDepthwiseConvdVQPtr) xaiGetDepthwiseConvolvedVQ2DVariant(inTile, + coeffTile, + biasArray, + outputScaleArray, + outTile, + param); + + if (xaiDepthwiseConvolvedVQ2D_opt == NULL) + { + return(XAI_ERR_NO_VARIANT); /* the helper returned NULL: no variant for these parameters */ + } + else + { + return(xaiDepthwiseConvolvedVQ2D_opt(inTile, coeffTile, biasArray, outputScaleArray, + outTile, param)); + } +} + +/************************************************************************************** +* Depthwise dilated VQ convolution helper function +* Returns the function pointer of a specific depthwise dilated VQ convolution variant +* based on parameters, or NULL if inTile, coeffTile or param is NULL or no variant matches +**************************************************************************************/ +XAI_ERR_TYPE *xaiGetDepthwiseConvolvedVQ2DVariant(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const
xai_cnn_depthwiseDilatedConv_params *param) +{ + if ((!inTile) || (!coeffTile) || (!param)) + { + return(NULL); + } + + xai_cnn_data_order inOrder = XAI_TILE3D_GET_DATA_ORDER(inTile); + xai_cnn_data_order coeffOrder = XAI_TILE3D_GET_DATA_ORDER(coeffTile); +#if (XCHAL_HAVE_SUPERGATHER == 0) + int32_t depthMultiplier = XAI_CNN_DEPTHWISE_DILATED_CONV_GET_DEPTH_MULTIPLIER(param); +#endif + uint8_t stride; + uint8_t dilation; + + int32_t kWidth, kHeight; + if (coeffOrder == XAI_DWH) + { + /* MOD variants: kernel width and height are read from coefficient-tile dims 2 and 3 */ + kWidth = XAI_TILE3D_GET_DIM2(coeffTile); + kHeight = XAI_TILE3D_GET_DIM3(coeffTile); + if (inOrder == XAI_DWH) + { + if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_U8)) + { + return((XAI_ERR_TYPE *) &xaiDepthwiseConvolvedVQ2D_S_MxN_U8S8IX_MOD_DWH); + } + else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S8)) + { /* when XCHAL_HAVE_SUPERGATHER is 0, depthMultiplier == 8 skips the fixed-size kernels and uses the generic MxN one */ +#if (XCHAL_HAVE_SUPERGATHER == 0) + if (kWidth == 3 && kHeight == 3 && depthMultiplier != 8) + { + return((XAI_ERR_TYPE *) &xaiDepthwiseConvolvedVQ2D_S_3x3_S8S8IX_MOD_DWH); + } + else if (kWidth == 5 && kHeight == 5 && depthMultiplier != 8) + { + return((XAI_ERR_TYPE *) &xaiDepthwiseConvolvedVQ2D_S_5x5_S8S8IX_MOD_DWH); + } + else if (kWidth == 7 && kHeight == 7 && depthMultiplier != 8) + { + return((XAI_ERR_TYPE *) &xaiDepthwiseConvolvedVQ2D_S_7x7_S8S8IX_MOD_DWH); + } +#else + if (kWidth == 3 && kHeight == 3) + { + return((XAI_ERR_TYPE *) &xaiDepthwiseConvolvedVQ2D_S_3x3_S8S8IX_MOD_DWH); + } + else if (kWidth == 5 && kHeight == 5) + { + return((XAI_ERR_TYPE *) &xaiDepthwiseConvolvedVQ2D_S_5x5_S8S8IX_MOD_DWH); + } + else if (kWidth == 7 && kHeight == 7) + { + return((XAI_ERR_TYPE *) &xaiDepthwiseConvolvedVQ2D_S_7x7_S8S8IX_MOD_DWH); + } +#endif + else + { + return((XAI_ERR_TYPE *) &xaiDepthwiseConvolvedVQ2D_S_MxN_S8S8IX_MOD_DWH); + } + } + else /* assumed XAI_S16 -- NOTE(review): the type is not checked here, so every non-U8/S8 input reaches this branch; confirm the S16 kernels reject other types */ + { + if (kWidth == 3 && kHeight == 3) + { + return((XAI_ERR_TYPE *) &xaiDepthwiseConvolvedVQ2D_S_3x3_S16S16I16_MOD_DWH); + } + else if (kWidth == 5 && kHeight == 5) + { +
return((XAI_ERR_TYPE *) &xaiDepthwiseConvolvedVQ2D_S_5x5_S16S16I16_MOD_DWH); + } + else + { + return((XAI_ERR_TYPE *) &xaiDepthwiseConvolvedVQ2D_S_MxN_S16S16I16_MOD_DWH); + } + } + } + } + /*else*/ if (coeffOrder == XAI_WHD) + { + /* MOW variants: only S8/U8 input with stride 1 and dilation 2 or 4 is dispatched below; anything else returns NULL */ + + stride = XAI_CNN_DEPTHWISE_DILATED_CONV_GET_STRIDE(param); + dilation = XAI_CNN_DEPTHWISE_DILATED_CONV_GET_DILATION(param); +//#endif + /* Disabled fixed-size MOW kernels. NOTE: kWidth/kHeight are never assigned on this WHD path, so they must be read from the coefficient tile before this is re-enabled. if(kWidth == 3 && kHeight == 3) + { + if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S8)) + { + if(stride == 1) + { + if(dilation == 2) + { + return ((XAI_ERR_TYPE *)&xaiDepthwiseConvolvedVQ2D_S_3x3j1d2_S8S8IX_MOW_WHD); + } + else if(dilation == 4) + { + return ((XAI_ERR_TYPE *)&xaiDepthwiseConvolvedVQ2D_S_3x3j1d4_S8S8IX_MOW_WHD); + } + } + } + else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_U8)) + { + if(stride == 1) + { + if(dilation == 2) + { + return ((XAI_ERR_TYPE *)&xaiDepthwiseConvolvedVQ2D_S_3x3j1d2_U8S8IX_MOW_WHD); + } + else if(dilation == 4) + { + return ((XAI_ERR_TYPE *)&xaiDepthwiseConvolvedVQ2D_S_3x3j1d4_U8S8IX_MOW_WHD); + } + } + } + } + else if(kWidth == 5 && kHeight == 5) + { + if (xaiTile3DCheckType(inTile, XAI_S8)) + { + if(stride == 1) + { + if(dilation == 2) + { + return ((XAI_ERR_TYPE *)&xaiDepthwiseConvolvedVQ2D_S_5x5j1d2_S8S8IX_MOW_WHD); + } + else if(dilation == 4) + { + return ((XAI_ERR_TYPE *)&xaiDepthwiseConvolvedVQ2D_S_5x5j1d4_S8S8IX_MOW_WHD); + } + } + } + else if (xaiTile3DCheckType(inTile, XAI_U8)) + { + if(stride == 1) + { + if(dilation == 2) + { + return ((XAI_ERR_TYPE *)&xaiDepthwiseConvolvedVQ2D_S_5x5j1d2_U8S8IX_MOW_WHD); + } + else if(dilation == 4) + { + return ((XAI_ERR_TYPE *)&xaiDepthwiseConvolvedVQ2D_S_5x5j1d4_U8S8IX_MOW_WHD); + } + } + } + } + else if(kWidth == 7 && kHeight == 7) + { + if (xaiTile3DCheckType(inTile, XAI_S8)) + { + if(stride == 1) + { + if(dilation == 2) + { + return ((XAI_ERR_TYPE *)&xaiDepthwiseConvolvedVQ2D_S_7x7j1d2_S8S8IX_MOW_WHD); + } + else if(dilation == 4) + { + return ((XAI_ERR_TYPE
*)&xaiDepthwiseConvolvedVQ2D_S_7x7j1d4_S8S8IX_MOW_WHD); + } + } + } + else if (xaiTile3DCheckType(inTile, XAI_U8)) + { + if(stride == 1) + { + if(dilation == 2) + { + return ((XAI_ERR_TYPE *)&xaiDepthwiseConvolvedVQ2D_S_7x7j1d2_U8S8IX_MOW_WHD); + } + else if(dilation == 4) + { + return ((XAI_ERR_TYPE *)&xaiDepthwiseConvolvedVQ2D_S_7x7j1d4_U8S8IX_MOW_WHD); + } + } + } + } + else*/ + { + if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S8)) + { + if (stride == 1) + { + if (dilation == 2) + { + return((XAI_ERR_TYPE *) &xaiDepthwiseConvolvedVQ2D_S_MxNj1d2_S8S8IX_MOW_WHD); + } + else if (dilation == 4) + { + return((XAI_ERR_TYPE *) &xaiDepthwiseConvolvedVQ2D_S_MxNj1d4_S8S8IX_MOW_WHD); + } + } + } + else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_U8)) + { + if (stride == 1) + { + if (dilation == 2) + { + return((XAI_ERR_TYPE *) &xaiDepthwiseConvolvedVQ2D_S_MxNj1d2_U8S8IX_MOW_WHD); + } + else if (dilation == 4) + { + return((XAI_ERR_TYPE *) &xaiDepthwiseConvolvedVQ2D_S_MxNj1d4_U8S8IX_MOW_WHD); + } + } + } +//#endif + } + } + return(NULL); +} +#endif //if ((XCHAL_VISION_TYPE >= 6)) diff --git a/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dataConversion3D_AsymQ_S8IX.h b/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dataConversion3D_AsymQ_S8IX.h new file mode 100644 index 00000000000..22a248658b2 --- /dev/null +++ b/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dataConversion3D_AsymQ_S8IX.h @@ -0,0 +1,421 @@ +/* + * Copyright (c) 2024 by Cadence Design Systems, Inc. ALL RIGHTS RESERVED. + * These coded instructions, statements, and computer programs are the + * copyrighted works and confidential proprietary information of + * Cadence Design Systems Inc. 
They may be adapted and modified by bona fide + * purchasers for internal use, but neither the original nor any adapted + * or modified version may be disclosed or distributed to third parties + * in any manner, medium, or form, in whole or in part, without the prior + * written consent of Cadence Design Systems Inc. This software and its + * derivatives are to be executed solely on products incorporating a Cadence + * Design Systems processor. + */ + +#if ((XCHAL_VISION_TYPE >= 6)) + +#undef MAKE_NAME_IMPL +#undef MAKE_NAME +#undef MORPH_ODT_CHECK_TILE3D +#undef MORPH_ODT_SCALAR +#undef MORPH_ODT_VECTOR +#undef MORPH_OP_SA_IP +#undef MORPH_OP_SAV_XP +#undef MORPH_OP_SAPOS_FP +#undef MORPH_PACK_ROUND_CLAMP_LIMITS_ASYMQ /* the MORPH_* macros are redefined below per OUTPUT_DATA_TYPE -- presumably so this header can be included once per output type; confirm against the including .c file */ + +#if OUTPUT_DATA_TYPE == SIGNED8BIT + +#define MAKE_NAME_IMPL(name, MORPH_FNAME_SPECIFIER) name ## _ ## MORPH_FNAME_SPECIFIER +#define MAKE_NAME(name) MAKE_NAME_IMPL(name, S8S8) +#define MORPH_ODT_CHECK_TILE3D XAI_CHECK_TILE3D_S8 +#define MORPH_ODT_SCALAR int8_t +#define MORPH_ODT_VECTOR xb_vecNx8 +#define MORPH_OP_SA_IP IVP_SANX8S_IP +#define MORPH_OP_SAV_XP IVP_SAVNX8S_XP +#define MORPH_OP_SAPOS_FP IVP_SAPOSNX8S_FP + +#define MORPH_PACK_ROUND_CLAMP_LIMITS_ASYMQ(vecOut, vecAcc, shift) { \ + vecOut = IVP_PACKVRNX48(vecAcc, shift); \ +} + +#elif OUTPUT_DATA_TYPE == UNSIGNED8BIT + +#define MAKE_NAME_IMPL(name, MORPH_FNAME_SPECIFIER) name ## _ ## MORPH_FNAME_SPECIFIER +#define MAKE_NAME(name) MAKE_NAME_IMPL(name, S8U8) +#define MORPH_ODT_CHECK_TILE3D XAI_CHECK_TILE3D_U8 +#define MORPH_ODT_SCALAR uint8_t +#define MORPH_ODT_VECTOR xb_vecNx8U +#define MORPH_OP_SA_IP IVP_SANX8U_IP +#define MORPH_OP_SAV_XP IVP_SAVNX8U_XP +#define MORPH_OP_SAPOS_FP IVP_SAPOSNX8U_FP + +#define MORPH_PACK_ROUND_CLAMP_LIMITS_ASYMQ(vecOut, vecAcc, shift) { \ + vecOut = IVP_PACKVRNX48(vecAcc, shift); \ + vecOut = IVP_MAXNX16(IVP_MINNX16(vecOut, (xb_vecNx16) UCHAR_MAX), (xb_vecNx16) 0); \ +} + +#elif OUTPUT_DATA_TYPE == SIGNED16BIT + +#define MAKE_NAME_IMPL(name,
MORPH_FNAME_SPECIFIER) name ## _ ## MORPH_FNAME_SPECIFIER +#define MAKE_NAME(name) MAKE_NAME_IMPL(name, S8S16) +#define MORPH_ODT_CHECK_TILE3D XAI_CHECK_TILE3D_S16 +#define MORPH_ODT_SCALAR int16_t +#define MORPH_ODT_VECTOR xb_vecNx16 +#define MORPH_OP_SA_IP IVP_SANX16_IP +#define MORPH_OP_SAV_XP IVP_SAVNX16_XP +#define MORPH_OP_SAPOS_FP IVP_SAPOSNX16_FP + +#define MORPH_PACK_ROUND_CLAMP_LIMITS_ASYMQ(vecOut, vecAcc, shift) { \ + vecOut = IVP_PACKVRNX48(vecAcc, shift); \ +} + +#elif OUTPUT_DATA_TYPE == UNSIGNED16BIT + +#define MAKE_NAME_IMPL(name, MORPH_FNAME_SPECIFIER) name ## _ ## MORPH_FNAME_SPECIFIER +#define MAKE_NAME(name) MAKE_NAME_IMPL(name, S8U16) +#define MORPH_ODT_CHECK_TILE3D XAI_CHECK_TILE3D_U16 +#define MORPH_ODT_SCALAR uint16_t +#define MORPH_ODT_VECTOR xb_vecNx16U +#define MORPH_OP_SA_IP IVP_SANX16U_IP +#define MORPH_OP_SAV_XP IVP_SAVNX16U_XP +#define MORPH_OP_SAPOS_FP IVP_SAPOSNX16U_FP + +#define MORPH_PACK_ROUND_CLAMP_LIMITS_ASYMQ(vecOut, vecAcc, shift) { \ + xb_vecN_2x32v hvecAccEven = IVP_PACKVRNX48_0(vecAcc, shift); \ + xb_vecN_2x32v hvecAccOdd = IVP_PACKVRNX48_1(vecAcc, shift); \ + hvecAccEven = IVP_MAXN_2X32(IVP_MINN_2X32(hvecAccEven, (xb_vecN_2x32v) USHRT_MAX), (xb_vecN_2x32v) 0); \ + hvecAccOdd = IVP_MAXN_2X32(IVP_MINN_2X32(hvecAccOdd, (xb_vecN_2x32v) USHRT_MAX), (xb_vecN_2x32v) 0); \ + xb_vecNx16U vecAccEven = IVP_MOVNX16U_FROMNX16(IVP_MOVNX16_FROMN_2X32(hvecAccEven)); \ + xb_vecNx16U vecAccOdd = IVP_MOVNX16U_FROMNX16(IVP_MOVNX16_FROMN_2X32(hvecAccOdd)); \ + vecOut = IVP_SELNX16UI(vecAccOdd, vecAccEven, IVP_SELI_16B_INTERLEAVE_1_EVEN); \ +} +#endif + +/*********************** xaiDataConversion3D_AsymQ_S8IX ************************/ +/* Description : P6 implementation for conversion from either of the following */ +/* 1) S8_SYM to S8_ASYM */ +/* 2) S8_ASYM to S8_SYM */ +/* 3) S8_ASYM to S8_ASYM */ +/* 4) S8_ASYM to U8_SYM */ +/* 5) S8_ASYM to S16_SYM */ +/* 6) S8_ASYM to U16_SYM */ +/* Inputs : Input Tile, fixUp, scale, shift */ +/* Outputs
: XI Error Code */ +/* InOuts : Output Tile */ +/* Assumptions : InData is signed 8bit */ +/*******************************************************************************/ + +/********************* xaiDataConversion3D_AsymQ_S8S8 *************************/ +/********************* xaiDataConversion3D_AsymQ_S8U8 *************************/ +/********************* xaiDataConversion3D_AsymQ_S8S16 *************************/ +/********************* xaiDataConversion3D_AsymQ_S8U16 *************************/ + +XAI_ERR_TYPE MAKE_NAME (xaiDataConversion3D_AsymQ)(const xai_pTile3D inTile, + xai_pTile3D outTile, + const int16_t fixUp, + const uint16_t scale, + const uint8_t shift) +{ + /* Error Checks: tile types, DRAM placement, equal sizes, no overlap, shift < 24, fixUp range, matching data order */ + XAI_ERROR_CHECKS() + { + XAI_CHECK_TILE3D_S8(inTile); + MORPH_ODT_CHECK_TILE3D(outTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile); + XAI_CHECK_TILE3D_SIZE_EQ(inTile, outTile); + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTile); + XAI_CHECK_ERROR(shift < 24, XAI_ERR_NORM, \ + "Shift = %hhu, value should be less than 24", shift); + XAI_CHECK_ERROR((fixUp >= SHRT_MIN) && (fixUp <= SHRT_MAX), XAI_ERR_NORM, \ + "\nfixUp = %hi, value must be greater than or equal to -32768 and less than 32768", fixUp); /* this condition is always true for an int16_t fixUp */ + XAI_CHECK_ERROR(XAI_TILE3D_GET_DATA_ORDER(inTile) == XAI_TILE3D_GET_DATA_ORDER(outTile), XAI_ERR_BADARG, \ + "\nData Order of InputTile = %d, OutputTile = %d\nData Order of InputTile and OutputTile should be same", \ + XAI_TILE3D_GET_DATA_ORDER(inTile), XAI_TILE3D_GET_DATA_ORDER(outTile)); + } + + /* Get Tile Parameters */ + const int32_t dim1Size = XAI_TILE3D_GET_DIM1(inTile); + const int32_t dim2Size = XAI_TILE3D_GET_DIM2(inTile); + const int32_t dim3Size = XAI_TILE3D_GET_DIM3(inTile); + const int32_t inTilePitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile); + const int32_t inTilePitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile); + const int32_t outTilePitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile); + const int32_t outTilePitch2 =
XAI_TILE3D_GET_DIM2_PITCH(outTile); + const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile); /* output element size; multiplied into the length argument of the variable-length stores */ + + valign vaOut = IVP_ZALIGN(); + + /* Get Data Pointers */ + int8_t *pInput = (int8_t *) XAI_TILE3D_GET_DATA_PTR(inTile); + MORPH_ODT_SCALAR *pOutput = (MORPH_ODT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(outTile); + + /* Vectorization width */ + const int32_t vectorizationWidth = XCHAL_IVPN_SIMD_WIDTH; + const int32_t vectorizationWidth2X = vectorizationWidth * 2; + const int32_t vectorizationWidth3X = vectorizationWidth * 3; + const int32_t vectorizationWidth4X = vectorizationWidth * 4; + + /* Loop variables */ + int32_t x, y, z; + + /* Input and Output pointers */ + xb_vecNx8 *restrict pvecIn; + MORPH_ODT_VECTOR *restrict pvecOut; + + /* Input and Output data vectors */ + xb_vecNx16 vecInData0, vecInData1, vecInData2, vecInData3; + xb_vecNx16 vecOut0, vecOut1, vecOut2, vecOut3; + + /* Accumulators */ + xb_vecNx48 vecAcc1, vecAcc2, vecAcc3, vecAcc4; + + xb_vecNx16U vecScale = (xb_vecNx16U) (scale); + + // Assumes that (fixUp << shift) fits in the S32 range; this is not checked, and with shift allowed up to 23 a large |fixUp| would overflow int32_t + int32_t fixUpShift = (fixUp << shift); + xb_vecNx48 vecFixUpShift = fixUpShift; + + /******************************************************************************/ + /* The overall design approach is split into 2 parts */ + /* 1. When both the input and the output dim1 pitch equal the tile width */ + /* (inTilePitch1 == dim1Size and outTilePitch1 == dim1Size) */ + /* - If the above condition holds, the elements to be converted lie in */ + /* contiguous memory along the first dimension, so whole rows can be */ + /* processed as one flat run and vectorization is used effectively */ + /* */ + /* 2. When the input or the output dim1 pitch differs from the tile width */ + /* (any other pitch configuration) */ + /* - In this scenario, the elements to be converted lie in */ + /* non-contiguous memory locations. 
*/ + /* In order to do vectorization across first dimension, output data */ + /* pointers need to be updated based on output tile size and output tile */ + /* pitch. */ + /******************************************************************************/ + + if ((inTilePitch1 == dim1Size) && (outTilePitch1 == dim1Size)) + { + /******************************************************************************/ + /* Data exist in contiguous memory location with respect to first dimension */ + /******************************************************************************/ + + /* Input and Output vectors */ + xb_vecNx16 vecInData; + xb_vecNx16 vecOut; + + /* Initialize max loop counter */ + int32_t dim3MaxLoopCount = dim3Size; + int32_t maxLoopCount = dim1Size * dim2Size; + + /* Updated Loop count based on tile dimension configuration */ + if ((inTilePitch2 == maxLoopCount) && (outTilePitch2 == maxLoopCount)) + { + /**********************************************************************/ + /* Data exist in contiguous memory location with respect to first and */ + /* second dimension */ + /**********************************************************************/ + + /* Update max loop counter: the whole tile is handled as a single flat run */ + dim3MaxLoopCount = 1; + maxLoopCount *= dim3Size; + } + for (z = 0; z < dim3MaxLoopCount; z++) + { + /* Initialize input and output data pointer */ + pvecIn = (xb_vecNx8 *) (pInput + (z * inTilePitch2)); + pvecOut = (MORPH_ODT_VECTOR *) (pOutput + (z * outTilePitch2)); + valign vaInData = IVP_LANX8S_PP(pvecIn); + int32_t varlen; + + for (x = 0; x < maxLoopCount - vectorizationWidth; x += vectorizationWidth) + { + /* Load input data */ + IVP_LANX8S_IP(vecInData, vaInData, pvecIn); + + // Initializing the 48-bit accumulator with the 32-bit "fixUpShift" value + vecAcc1 = vecFixUpShift; + IVP_MULUSANX16(vecAcc1, vecScale, vecInData); + // Packing the outcome to appropriate range + MORPH_PACK_ROUND_CLAMP_LIMITS_ASYMQ(vecOut, vecAcc1, shift); + + /* Store output data */ +
MORPH_OP_SA_IP(vecOut, vaOut, pvecOut); + } + + varlen = (maxLoopCount - x); /* elements left after the full-vector loop (at most vectorizationWidth); written by the variable-length store below */ + IVP_LANX8S_IP(vecInData, vaInData, pvecIn); + + // Initializing the 48-bit accumulator with the 32-bit "fixUpShift" value + vecAcc1 = vecFixUpShift; + IVP_MULUSANX16(vecAcc1, vecScale, vecInData); + // Packing the outcome to appropriate range + MORPH_PACK_ROUND_CLAMP_LIMITS_ASYMQ(vecOut, vecAcc1, shift); + + /* Store output data */ + MORPH_OP_SAV_XP(vecOut, vaOut, pvecOut, (varlen * bytesPerPixel)); + MORPH_OP_SAPOS_FP(vaOut, pvecOut); + } + } + else + { + /* else block is executed if input tile pitch is not equal to input tile width or input tile */ + /* pitch is not equal to output tile pitch */ + + for (z = 0; z < dim3Size; z++) /* along 3rd dimension */ + { + x = 0; + /* Loop Unroll=4 along 1st dimension */ + for (; x < (dim1Size - vectorizationWidth3X); x += vectorizationWidth4X) + { + /* Initialize input and output data pointer */ + int8_t * pIn = &pInput[(z * inTilePitch2) + x]; + MORPH_ODT_SCALAR *pOut = &pOutput[(z * outTilePitch2) + x]; + int32_t varLen = dim1Size - (x + vectorizationWidth3X); + + for (y = 0; y < dim2Size; y++) /* along 2nd dimension */ + { + // Adjusting the input and output data pointers + pvecIn = (xb_vecNx8 *) (pIn + (y * inTilePitch1)); + pvecOut = (MORPH_ODT_VECTOR *) (pOut + (y * outTilePitch1)); + + /* Load Input data */ + valign vaInData = IVP_LANX8S_PP(pvecIn); + IVP_LANX8S_IP(vecInData0, vaInData, pvecIn); + IVP_LANX8S_IP(vecInData1, vaInData, pvecIn); + IVP_LANX8S_IP(vecInData2, vaInData, pvecIn); + IVP_LANX8S_IP(vecInData3, vaInData, pvecIn); + + // Initializing the 48-bit accumulators with the 32-bit "fixUpShift" value + vecAcc1 = vecFixUpShift; + vecAcc2 = vecFixUpShift; + vecAcc3 = vecFixUpShift; + vecAcc4 = vecFixUpShift; + + IVP_MULUSANX16(vecAcc1, vecScale, vecInData0); + IVP_MULUSANX16(vecAcc2, vecScale, vecInData1); + IVP_MULUSANX16(vecAcc3, vecScale, vecInData2); + IVP_MULUSANX16(vecAcc4, vecScale, vecInData3); + + // Packing the outcome
to appropriate range + MORPH_PACK_ROUND_CLAMP_LIMITS_ASYMQ(vecOut0, vecAcc1, shift); + MORPH_PACK_ROUND_CLAMP_LIMITS_ASYMQ(vecOut1, vecAcc2, shift); + MORPH_PACK_ROUND_CLAMP_LIMITS_ASYMQ(vecOut2, vecAcc3, shift); + MORPH_PACK_ROUND_CLAMP_LIMITS_ASYMQ(vecOut3, vecAcc4, shift); + + /* Store output data */ + MORPH_OP_SA_IP(vecOut0, vaOut, pvecOut); + MORPH_OP_SA_IP(vecOut1, vaOut, pvecOut); + MORPH_OP_SA_IP(vecOut2, vaOut, pvecOut); + MORPH_OP_SAV_XP(vecOut3, vaOut, pvecOut, (varLen * bytesPerPixel)); + MORPH_OP_SAPOS_FP(vaOut, pvecOut); + } + } + if (x < (dim1Size - vectorizationWidth2X)) + { + /* Initialize input and output data pointer */ + int8_t * pIn = &pInput[(z * inTilePitch2) + x]; + MORPH_ODT_SCALAR *pOut = &pOutput[(z * outTilePitch2) + x]; + int32_t varLen = dim1Size - (x + vectorizationWidth2X); + + for (y = 0; y < dim2Size; y++) /* along 2nd dimension */ + { + // Adjusting the input and output data pointers + pvecIn = (xb_vecNx8 *) (pIn + (y * inTilePitch1)); + pvecOut = (MORPH_ODT_VECTOR *) (pOut + (y * outTilePitch1)); + + /* Load input data */ + valign vaInData = IVP_LANX8S_PP(pvecIn); + IVP_LANX8S_IP(vecInData0, vaInData, pvecIn); + IVP_LANX8S_IP(vecInData1, vaInData, pvecIn); + IVP_LANX8S_IP(vecInData2, vaInData, pvecIn); + + // Initializing the 48-bit accumulators with the 32-bit "fixUpShift" value + vecAcc1 = vecFixUpShift; + vecAcc2 = vecFixUpShift; + vecAcc3 = vecFixUpShift; + + IVP_MULUSANX16(vecAcc1, vecScale, vecInData0); + IVP_MULUSANX16(vecAcc2, vecScale, vecInData1); + IVP_MULUSANX16(vecAcc3, vecScale, vecInData2); + + // Packing the outcome to appropriate range + MORPH_PACK_ROUND_CLAMP_LIMITS_ASYMQ(vecOut0, vecAcc1, shift); + MORPH_PACK_ROUND_CLAMP_LIMITS_ASYMQ(vecOut1, vecAcc2, shift); + MORPH_PACK_ROUND_CLAMP_LIMITS_ASYMQ(vecOut2, vecAcc3, shift); + + /* Store output data */ + MORPH_OP_SA_IP(vecOut0, vaOut, pvecOut); + MORPH_OP_SA_IP(vecOut1, vaOut, pvecOut); + MORPH_OP_SAV_XP(vecOut2, vaOut, pvecOut, (varLen * bytesPerPixel)); +
MORPH_OP_SAPOS_FP(vaOut, pvecOut); + } + } + else if (x < (dim1Size - vectorizationWidth)) + { + /* Initialize input and output data pointer */ + int8_t * pIn = &pInput[(z * inTilePitch2) + x]; + MORPH_ODT_SCALAR *pOut = &pOutput[(z * outTilePitch2) + x]; + int32_t varLen = dim1Size - (x + vectorizationWidth); + + for (y = 0; y < dim2Size; y++) /* along 2nd dimension */ + { + // Adjusting the input and output data pointers + pvecIn = (xb_vecNx8 *) (pIn + (y * inTilePitch1)); + pvecOut = (MORPH_ODT_VECTOR *) (pOut + (y * outTilePitch1)); + + /* Load input data */ + valign vaInData = IVP_LANX8S_PP(pvecIn); + IVP_LANX8S_IP(vecInData0, vaInData, pvecIn); + IVP_LANX8S_IP(vecInData1, vaInData, pvecIn); + + // Initializing the 48-bit accumulators with the 32-bit "fixUpShift" value + vecAcc1 = vecFixUpShift; + vecAcc2 = vecFixUpShift; + + IVP_MULUSANX16(vecAcc1, vecScale, vecInData0); + IVP_MULUSANX16(vecAcc2, vecScale, vecInData1); + + // Packing the outcome to appropriate range + MORPH_PACK_ROUND_CLAMP_LIMITS_ASYMQ(vecOut0, vecAcc1, shift); + MORPH_PACK_ROUND_CLAMP_LIMITS_ASYMQ(vecOut1, vecAcc2, shift); + + /* Store output data */ + MORPH_OP_SA_IP(vecOut0, vaOut, pvecOut); + MORPH_OP_SAV_XP(vecOut1, vaOut, pvecOut, (varLen * bytesPerPixel)); + MORPH_OP_SAPOS_FP(vaOut, pvecOut); + } + } + else if (x < dim1Size) + { + /* Initialize input and output data pointer */ + int8_t * pIn = &pInput[(z * inTilePitch2) + x]; + MORPH_ODT_SCALAR *pOut = &pOutput[(z * outTilePitch2) + x]; + int32_t varLen = (dim1Size - x); + + for (y = 0; y < dim2Size; y++) /* along 2nd dimension */ + { + // Adjusting the input and output data pointers + pvecIn = (xb_vecNx8 *) (pIn + (y * inTilePitch1)); + pvecOut = (MORPH_ODT_VECTOR *) (pOut + (y * outTilePitch1)); + + /* Load input data */ + valign vaInData = IVP_LANX8S_PP(pvecIn); + IVP_LANX8S_IP(vecInData0, vaInData, pvecIn); + + // Initializing the 48-bit accumulator with the 32-bit "fixUpShift" value + vecAcc1 = vecFixUpShift; +
IVP_MULUSANX16(vecAcc1, vecScale, vecInData0); + + // Packing the outcome to appropriate range + MORPH_PACK_ROUND_CLAMP_LIMITS_ASYMQ(vecOut0, vecAcc1, shift); + + /* Store output data */ + MORPH_OP_SAV_XP(vecOut0, vaOut, pvecOut, (varLen * bytesPerPixel)); + MORPH_OP_SAPOS_FP(vaOut, pvecOut); + } + } + } + } + return(XAI_ERROR_STATUS()); +} +#endif //if ((XCHAL_VISION_TYPE >= 6)) diff --git a/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dataConversion3D_I16I8.h b/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dataConversion3D_I16I8.h new file mode 100644 index 00000000000..37fb11f0ca8 --- /dev/null +++ b/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dataConversion3D_I16I8.h @@ -0,0 +1,367 @@ +/* + * Copyright (c) 2018 by Cadence Design Systems, Inc. ALL RIGHTS RESERVED. + * These coded instructions, statements, and computer programs are the + * copyrighted works and confidential proprietary information of + * Cadence Design Systems Inc. They may be adapted and modified by bona fide + * purchasers for internal use, but neither the original nor any adapted + * or modified version may be disclosed or distributed to third parties + * in any manner, medium, or form, in whole or in part, without the prior + * written consent of Cadence Design Systems Inc. This software and its + * derivatives are to be executed solely on products incorporating a Cadence + * Design Systems processor. 
+ */ + +#if ((XCHAL_VISION_TYPE >= 6)) + +#undef MAKE_NAME_IMPL +#undef MAKE_NAME +#undef MORPH_IDT_TILECHECK +#undef MORPH_IDT_SCALAR +#undef MORPH_IDT_VECTOR +#undef MORPH_OP_PRIME +#undef MORPH_OP_LOAD_IP +#undef MORPH_OP_MUL + +#if INPUT_DATA_TYPE == SIGNED16BIT + +#define MAKE_NAME_IMPL(name, MORPH_FNAME_SPECIFIER) name ## MORPH_FNAME_SPECIFIER +#define MAKE_NAME(name) MAKE_NAME_IMPL(name, S16I8) +#define MORPH_IDT_TILECHECK XAI_CHECK_TILE3D_S16 +#define MORPH_IDT_SCALAR int16_t +#define MORPH_IDT_VECTOR xb_vecNx16 +#define MORPH_OP_PRIME IVP_LANX16_PP +#define MORPH_OP_LOAD_IP IVP_LANX16_IP +#define MORPH_OP_MUL IVP_MULUSNX16 + +#elif INPUT_DATA_TYPE == UNSIGNED16BIT + +#define MAKE_NAME_IMPL(name, MORPH_FNAME_SPECIFIER) name ## MORPH_FNAME_SPECIFIER +#define MAKE_NAME(name) MAKE_NAME_IMPL(name, U16I8) +#define MORPH_IDT_TILECHECK XAI_CHECK_TILE3D_U16 +#define MORPH_IDT_SCALAR uint16_t +#define MORPH_IDT_VECTOR xb_vecNx16U +#define MORPH_OP_PRIME IVP_LANX16U_PP +#define MORPH_OP_LOAD_IP IVP_LANX16U_IP +#define MORPH_OP_MUL IVP_MULUUNX16 +#endif + +/********************* xaiDataConversion3D_S16/U16I8 ***************************/ +/* Description : P6 implementation for conversion from S16/U16 to S8 / U8 */ +/* depending on Output Tile type */ +/* Inputs : Input Tile, scale, shift */ +/* Outputs : XI Error Code */ +/* InOuts : Output Tile */ +/* Assumptions : InData is signed/unsigned 16bit */ +/******************************************************************************/ + +XAI_ERR_TYPE MAKE_NAME (xaiDataConversion3D_)(const xai_pTile3D inTile, + xai_pTile3D outTile, + const uint16_t scale, + const uint8_t shift) +{ + /* Error Checks */ + XAI_ERROR_CHECKS() + { + MORPH_IDT_TILECHECK(inTile); + XAI_CHECK_TILE3D_I8(outTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile); + XAI_CHECK_TILE3D_SIZE_EQ(inTile, outTile); + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTile); + XAI_CHECK_ERROR(shift < 32, XAI_ERR_NORM, \ + 
"Shift value = %hhu, which should be less than 32", shift); + XAI_CHECK_ERROR(XAI_TILE3D_GET_DATA_ORDER(inTile) == XAI_TILE3D_GET_DATA_ORDER(outTile), XAI_ERR_BADARG, \ + "\nData Order of InputTile = %d, OutputTile = %d\nData Order of InputTile and OutputTile should be same", \ + XAI_TILE3D_GET_DATA_ORDER(inTile), XAI_TILE3D_GET_DATA_ORDER(outTile)); + } + + /* Get Tile Parameters */ + const int32_t dim1Size = XAI_TILE3D_GET_DIM1(inTile); + const int32_t dim2Size = XAI_TILE3D_GET_DIM2(inTile); + const int32_t dim3Size = XAI_TILE3D_GET_DIM3(inTile); + const int32_t inTilePitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile); + const int32_t inTilePitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile); + const int32_t outTilePitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile); + const int32_t outTilePitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile); + + const int16_t minLim = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8)) ? SCHAR_MIN : 0; + const int16_t maxLim = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8)) ? SCHAR_MAX : UCHAR_MAX; + + /* Get Data Pointers */ + MORPH_IDT_SCALAR *pInput = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(inTile); + int8_t *pOutput = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile); + valign vaOut = IVP_ZALIGN(); + + /* vectorization width */ + const int32_t vectorizationWidth = XCHAL_IVPN_SIMD_WIDTH; + const int32_t vectorizationWidth2X = vectorizationWidth * 2; + const int32_t vectorizationWidth3X = vectorizationWidth * 3; + const int32_t vectorizationWidth4X = vectorizationWidth * 4; + + /* loop variables */ + int32_t x, y, z; + + /* input and output pointers */ + MORPH_IDT_VECTOR * restrict pvecIn; + xb_vecNx8U * restrict pvecOut; + + /******************************************************************************/ + /* The overall design approach is split into 2 parts */ + /* 1. 
When input tile pitch is equal to input tile width and input tile pitch */ + /* is equal to output tile pitch */ + /* - If above condition holds good, data elements for which data */ + /* conversion from signed 16 bit to S8/U8 bit need to done present in */ + /* in contiguous memory location. Hence vectorization can be utilized */ + /* effectively */ + /* */ + /* 2. When input tile pitch is not equal to input tile size or input tile */ + /* pitch is not equal to output tile pitch */ + /* - In this scenario, data elements for which data conversion from signed */ + /* 16 bit to S8/U8 bit need to done exist in non-contiguous memory */ + /* location. In order to do vectorization across first dimension, */ + /* output data pointers need to be updated based on output tile size */ + /* and output tile pitch */ + /******************************************************************************/ + + if ((inTilePitch1 == dim1Size) && (outTilePitch1 == dim1Size)) + { + /******************************************************************************/ + /* Data exist in contiguous memory location with respect to first dimension */ + /******************************************************************************/ + + /* input data vectors */ + MORPH_IDT_VECTOR vecInData; + /* Initialize max loop counter */ + int32_t dim3MaxLoopCount = dim3Size; + int32_t maxLoopCount = dim1Size * dim2Size; + + /* Updated Loop count based on tile dimension configuration */ + if ((inTilePitch2 == maxLoopCount) && (outTilePitch2 == maxLoopCount)) + { + /**********************************************************************/ + /* Data exist in contiguous memory location with respect to first and */ + /* second dimension */ + /**********************************************************************/ + + /* Update max loop counter */ + dim3MaxLoopCount = 1; + maxLoopCount *= dim3Size; + } + for (z = 0; z < dim3MaxLoopCount; z++) + { + /* initialize input and output data pointer */ + pvecIn = 
(MORPH_IDT_VECTOR *) (pInput + (z * inTilePitch2)); + pvecOut = (xb_vecNx8U *) (pOutput + (z * outTilePitch2)); + + valign vaInData = MORPH_OP_PRIME(pvecIn); + xb_vecNx16 vecOut; + + for (x = 0; x < maxLoopCount - vectorizationWidth; x += vectorizationWidth) + { + /* load input data */ + MORPH_OP_LOAD_IP(vecInData, vaInData, pvecIn); + + /* apply scale and shift to input data. + * multiplying with scale results in 32 way 48-bit + * data to which shift is applied, so final result is + * 32 way 16 bit. + */ + vecOut = IVP_PACKVRNX48(MORPH_OP_MUL((xb_vecNx16U) scale, vecInData), shift); + + vecOut = IVP_MAXNX16(IVP_MINNX16(vecOut, (xb_vecNx16) maxLim), (xb_vecNx16) minLim); + + /* store output data */ + IVP_SANX8U_IP(vecOut, vaOut, pvecOut); + } + /* load input data */ + MORPH_OP_LOAD_IP(vecInData, vaInData, pvecIn); + + /* apply scale and shift to input data. + * multiplying with scale results in 32 way 48-bit + * data to which shift is applied, so final result is + * 32 way 16 bit. + */ + vecOut = IVP_PACKVRNX48(MORPH_OP_MUL((xb_vecNx16U) scale, vecInData), shift); + + vecOut = IVP_MAXNX16(IVP_MINNX16(vecOut, (xb_vecNx16) maxLim), (xb_vecNx16) minLim); + + /* store output data */ + IVP_SAVNX8U_XP(vecOut, vaOut, pvecOut, (maxLoopCount - x)); + IVP_SAPOSNX8U_FP(vaOut, pvecOut); + } + } + else + { + /* else block is executed if input tile pitch is not equal to input tile width or input tile */ + /* pitch is not equal to output tile pitch */ + + for (z = 0; z < dim3Size; z++) /* along 3rd dimension */ + { + x = 0; + /* Loop Unroll=4 along 1st dimension */ + for (; x < (dim1Size - vectorizationWidth3X); x += vectorizationWidth4X) + { + /* Initialize input and output data pointer */ + MORPH_IDT_SCALAR * pIn = &pInput[z * inTilePitch2 + x]; + int8_t *pOut = &pOutput[z * outTilePitch2 + x]; + int32_t varLen = dim1Size - (x + vectorizationWidth3X); + + for (y = 0; y < dim2Size; y++) /* along 2nd dimension */ + { + /* input and output data vectors */ + MORPH_IDT_VECTOR 
vecInData0, vecInData1, vecInData2, vecInData3; + xb_vecNx16 vecOut0, vecOut1, vecOut2, vecOut3; + + pvecIn = (MORPH_IDT_VECTOR *) (pIn + (y * inTilePitch1)); + pvecOut = (xb_vecNx8U *) (pOut + (y * outTilePitch1)); + valign vaInData = MORPH_OP_PRIME(pvecIn); + /* load input data */ + MORPH_OP_LOAD_IP(vecInData0, vaInData, pvecIn); + MORPH_OP_LOAD_IP(vecInData1, vaInData, pvecIn); + MORPH_OP_LOAD_IP(vecInData2, vaInData, pvecIn); + MORPH_OP_LOAD_IP(vecInData3, vaInData, pvecIn); + + /* apply scale and shift to input data. + * multiplying with scale results in 32 way 48-bit + * data to which shift is applied, so final result is + * 32 way 16 bit. + */ + vecOut0 = IVP_PACKVRNX48(MORPH_OP_MUL((xb_vecNx16U) scale, vecInData0), shift); + vecOut0 = IVP_MAXNX16(IVP_MINNX16(vecOut0, (xb_vecNx16) maxLim), (xb_vecNx16) minLim); + + vecOut1 = IVP_PACKVRNX48(MORPH_OP_MUL((xb_vecNx16U) scale, vecInData1), shift); + vecOut1 = IVP_MAXNX16(IVP_MINNX16(vecOut1, (xb_vecNx16) maxLim), (xb_vecNx16) minLim); + + vecOut2 = IVP_PACKVRNX48(MORPH_OP_MUL((xb_vecNx16U) scale, vecInData2), shift); + vecOut2 = IVP_MAXNX16(IVP_MINNX16(vecOut2, (xb_vecNx16) maxLim), (xb_vecNx16) minLim); + + vecOut3 = IVP_PACKVRNX48(MORPH_OP_MUL((xb_vecNx16U) scale, vecInData3), shift); + vecOut3 = IVP_MAXNX16(IVP_MINNX16(vecOut3, (xb_vecNx16) maxLim), (xb_vecNx16) minLim); + + /* Store output data */ + IVP_SANX8U_IP(vecOut0, vaOut, pvecOut); + IVP_SANX8U_IP(vecOut1, vaOut, pvecOut); + IVP_SANX8U_IP(vecOut2, vaOut, pvecOut); + IVP_SAVNX8U_XP(vecOut3, vaOut, pvecOut, varLen); + IVP_SAPOSNX8U_FP(vaOut, pvecOut); + } + } + if (x < (dim1Size - vectorizationWidth2X)) + { + /* Initialize input and output data pointer */ + MORPH_IDT_SCALAR * pIn = &pInput[z * inTilePitch2 + x]; + int8_t *pOut = &pOutput[z * outTilePitch2 + x]; + int32_t varLen = dim1Size - (x + vectorizationWidth2X); + + for (y = 0; y < dim2Size; y++) /* along 2nd dimension */ + { + /* input and output data vectors */ + MORPH_IDT_VECTOR vecInData0, 
vecInData1, vecInData2; + xb_vecNx16 vecOut0, vecOut1, vecOut2; + + pvecIn = (MORPH_IDT_VECTOR *) (pIn + (y * inTilePitch1)); + pvecOut = (xb_vecNx8U *) (pOut + (y * outTilePitch1)); + valign vaInData = MORPH_OP_PRIME(pvecIn); + + /* load input data */ + MORPH_OP_LOAD_IP(vecInData0, vaInData, pvecIn); + MORPH_OP_LOAD_IP(vecInData1, vaInData, pvecIn); + MORPH_OP_LOAD_IP(vecInData2, vaInData, pvecIn); + + /* apply scale and shift to input data. + * multiplying with scale results in 32 way 48-bit + * data to which shift is applied, so final result is + * 32 way 16 bit. + */ + vecOut0 = IVP_PACKVRNX48(MORPH_OP_MUL((xb_vecNx16U) scale, vecInData0), shift); + vecOut0 = IVP_MAXNX16(IVP_MINNX16(vecOut0, (xb_vecNx16) maxLim), (xb_vecNx16) minLim); + + vecOut1 = IVP_PACKVRNX48(MORPH_OP_MUL((xb_vecNx16U) scale, vecInData1), shift); + vecOut1 = IVP_MAXNX16(IVP_MINNX16(vecOut1, (xb_vecNx16) maxLim), (xb_vecNx16) minLim); + + vecOut2 = IVP_PACKVRNX48(MORPH_OP_MUL((xb_vecNx16U) scale, vecInData2), shift); + vecOut2 = IVP_MAXNX16(IVP_MINNX16(vecOut2, (xb_vecNx16) maxLim), (xb_vecNx16) minLim); + + /* Store output data */ + IVP_SANX8U_IP(vecOut0, vaOut, pvecOut); + IVP_SANX8U_IP(vecOut1, vaOut, pvecOut); + IVP_SAVNX8U_XP(vecOut2, vaOut, pvecOut, varLen); + IVP_SAPOSNX8U_FP(vaOut, pvecOut); + } + } + else if (x < (dim1Size - vectorizationWidth)) + { + /* Initialize input and output data pointer */ + MORPH_IDT_SCALAR * pIn = &pInput[z * inTilePitch2 + x]; + int8_t *pOut = &pOutput[z * outTilePitch2 + x]; + int32_t varLen = dim1Size - (x + vectorizationWidth); + + for (y = 0; y < dim2Size; y++) /* along 2nd dimension */ + { + /* input and output data vectors */ + MORPH_IDT_VECTOR vecInData0, vecInData1; + xb_vecNx16 vecOut0, vecOut1; + + pvecIn = (MORPH_IDT_VECTOR *) (pIn + (y * inTilePitch1)); + pvecOut = (xb_vecNx8U *) (pOut + (y * outTilePitch1)); + valign vaInData = MORPH_OP_PRIME(pvecIn); + + /* load input data */ + MORPH_OP_LOAD_IP(vecInData0, vaInData, pvecIn); + 
MORPH_OP_LOAD_IP(vecInData1, vaInData, pvecIn); + + /* apply scale and shift to input data. + * multiplying with scale results in 32 way 48-bit + * data to which shift is applied, so final result is + * 32 way 16 bit. + */ + vecOut0 = IVP_PACKVRNX48(MORPH_OP_MUL((xb_vecNx16U) scale, vecInData0), shift); + vecOut0 = IVP_MAXNX16(IVP_MINNX16(vecOut0, (xb_vecNx16) maxLim), (xb_vecNx16) minLim); + + vecOut1 = IVP_PACKVRNX48(MORPH_OP_MUL((xb_vecNx16U) scale, vecInData1), shift); + vecOut1 = IVP_MAXNX16(IVP_MINNX16(vecOut1, (xb_vecNx16) maxLim), (xb_vecNx16) minLim); + + /* Store output data */ + IVP_SANX8U_IP(vecOut0, vaOut, pvecOut); + IVP_SAVNX8U_XP(vecOut1, vaOut, pvecOut, varLen); + IVP_SAPOSNX8U_FP(vaOut, pvecOut); + } + } + else if (x < dim1Size) + { + /* Initialize input and output data pointer */ + MORPH_IDT_SCALAR * pIn = &pInput[z * inTilePitch2 + x]; + int8_t *pOut = &pOutput[z * outTilePitch2 + x]; + int32_t varLen = dim1Size - x; + + for (y = 0; y < dim2Size; y++) /* along 2nd dimension */ + { + /* input and output data vectors */ + MORPH_IDT_VECTOR vecInData0; + xb_vecNx16 vecOut0; + + pvecIn = (MORPH_IDT_VECTOR *) (pIn + (y * inTilePitch1)); + pvecOut = (xb_vecNx8U *) (pOut + (y * outTilePitch1)); + valign vaInData = MORPH_OP_PRIME(pvecIn); + + /* load input data */ + MORPH_OP_LOAD_IP(vecInData0, vaInData, pvecIn); + + /* apply scale and shift to input data. + * multiplying with scale results in 32 way 48-bit + * data to which shift is applied, so final result is + * 32 way 16 bit. 
+ */ + vecOut0 = IVP_PACKVRNX48(MORPH_OP_MUL((xb_vecNx16U) scale, vecInData0), shift); + vecOut0 = IVP_MAXNX16(IVP_MINNX16(vecOut0, (xb_vecNx16) maxLim), (xb_vecNx16) minLim); + + /* Store output data */ + IVP_SAVNX8U_XP(vecOut0, vaOut, pvecOut, varLen); + IVP_SAPOSNX8U_FP(vaOut, pvecOut); + } + } + } + } + return(XAI_ERROR_STATUS()); +} +#endif //if ((XCHAL_VISION_TYPE >= 6)) diff --git a/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dataConversion3D_I8I32.h b/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dataConversion3D_I8I32.h new file mode 100644 index 00000000000..bddbd3058ca --- /dev/null +++ b/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dataConversion3D_I8I32.h @@ -0,0 +1,414 @@ +/* + * Copyright (c) 2018 by Cadence Design Systems, Inc. ALL RIGHTS RESERVED. + * These coded instructions, statements, and computer programs are the + * copyrighted works and confidential proprietary information of + * Cadence Design Systems Inc. They may be adapted and modified by bona fide + * purchasers for internal use, but neither the original nor any adapted + * or modified version may be disclosed or distributed to third parties + * in any manner, medium, or form, in whole or in part, without the prior + * written consent of Cadence Design Systems Inc. This software and its + * derivatives are to be executed solely on products incorporating a Cadence + * Design Systems processor. 
+ */ + +#if ((XCHAL_VISION_TYPE >= 6)) + +#undef MAKE_NAME_IMPL +#undef MAKE_NAME +#undef MORPH_IDT_TILECHECK +#undef MORPH_IDT_SCALAR +#undef MORPH_IDT_VECTOR +#undef MORPH_IDT_VECTORI8 +#undef MORPH_OP_PRIME +#undef MORPH_OP_LOAD_IP +#undef MORPH_OP_MUL + +#if INPUT_DATA_TYPE == SIGNED8BIT + +#define MAKE_NAME_IMPL(name, MORPH_FNAME_SPECIFIER) name ## _ ## MORPH_FNAME_SPECIFIER +#define MAKE_NAME(name) MAKE_NAME_IMPL(name, S8I32) +#define MORPH_IDT_TILECHECK XAI_CHECK_TILE3D_S8 +#define MORPH_IDT_SCALAR int8_t +#define MORPH_IDT_VECTOR xb_vecNx16 +#define MORPH_IDT_VECTORI8 xb_vecNx8 +#define MORPH_OP_PRIME IVP_LANX8S_PP +#define MORPH_OP_LOAD_IP IVP_LANX8S_IP +#define MORPH_OP_MUL IVP_MULUSNX16 + +#elif INPUT_DATA_TYPE == UNSIGNED8BIT + +#define MAKE_NAME_IMPL(name, MORPH_FNAME_SPECIFIER) name ## _ ## MORPH_FNAME_SPECIFIER +#define MAKE_NAME(name) MAKE_NAME_IMPL(name, U8I32) +#define MORPH_IDT_TILECHECK XAI_CHECK_TILE3D_U8 +#define MORPH_IDT_SCALAR uint8_t +#define MORPH_IDT_VECTOR xb_vecNx16U +#define MORPH_IDT_VECTORI8 xb_vecNx8U +#define MORPH_OP_PRIME IVP_LANX8U_PP +#define MORPH_OP_LOAD_IP IVP_LANX8U_IP +#define MORPH_OP_MUL IVP_MULUUNX16 +#endif + + +/********************* xaiDataConversion3D_I8I32 ************************/ +/* Description : P6 implementation for conversion of an S8/U8 input tile to a 32-bit (I32) output tile; each element is multiplied by scale, packed with shift (IVP_PACKVRN_2X64W) and lower-bounded by minLim */ +/* Inputs : Input Tile, scale (uint16_t), shift (uint8_t, checked to be < 24 in the error-check block) */ +/* Outputs : XAI error code (value of XAI_ERROR_STATUS()) */ +/* InOuts : Output Tile */ +/* Assumptions : InData is signed 8bit (S8I32 variant) or unsigned 8bit (U8I32 variant); the error checks require equal tile sizes, equal data order and non-overlapping tiles */ +/***********************************************************************/ +/********************* xaiDataConversion3D_S8I32 ************************/ 
+/********************* xaiDataConversion3D_U8I32 ************************/ +XAI_ERR_TYPE MAKE_NAME (xaiDataConversion3D)(const xai_pTile3D inTile, + xai_pTile3D outTile, + const uint16_t scale, + const uint8_t shift) +{ + /* Error Checks */ + XAI_ERROR_CHECKS() + { + MORPH_IDT_TILECHECK(inTile); + XAI_CHECK_TILE3D_I32(outTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile); +
XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile); + XAI_CHECK_TILE3D_SIZE_EQ(inTile, outTile); + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTile); + XAI_CHECK_ERROR(shift < 24, XAI_ERR_NORM, \ + "Shift = %hhu, value should be less than 24", shift); + XAI_CHECK_ERROR(XAI_TILE3D_GET_DATA_ORDER(inTile) == XAI_TILE3D_GET_DATA_ORDER(outTile), XAI_ERR_BADARG, \ + "\nData Order of InputTile = %d, OutputTile = %d\nData Order of InputTile and OutputTile should be same", \ + XAI_TILE3D_GET_DATA_ORDER(inTile), XAI_TILE3D_GET_DATA_ORDER(outTile)); + } + + /* Get Tile Parameters */ + const int32_t dim1Size = XAI_TILE3D_GET_DIM1(inTile); + const int32_t dim2Size = XAI_TILE3D_GET_DIM2(inTile); + const int32_t dim3Size = XAI_TILE3D_GET_DIM3(inTile); + const int32_t inTilePitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile); + const int32_t inTilePitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile); + const int32_t outTilePitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile); + const int32_t outTilePitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile); + + valign vaOut = IVP_ZALIGN(); + + /* Get Data Pointers */ + MORPH_IDT_SCALAR *pInput = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(inTile); + int32_t *pOutput = (int32_t *) XAI_TILE3D_GET_DATA_PTR(outTile); + + /* vectorization width: input elements handled per vector, and its multiples used by the unrolled paths */ + const int32_t vectorizationWidth = XCHAL_IVPN_SIMD_WIDTH; + const int32_t vectorizationWidth2X = vectorizationWidth * 2; + const int32_t vectorizationWidth3X = vectorizationWidth * 3; + const int32_t vectorizationWidth4X = vectorizationWidth * 4; + + const int32_t minLim = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S32)) ? INT_MIN : 0; /* lower clamp applied with IVP_MAXN_2X32: INT_MIN for an XAI_S32 output tile, 0 otherwise */ + + /* loop variables */ + int32_t x, y, z; + + + /* input and output pointers */ + MORPH_IDT_VECTORI8 *restrict pvecIn; + xb_vecN_2x32v *restrict pvecOut; + + + /******************************************************************************/ + /* The overall design approach is split into 2 parts */ + /* 1.
When input tile pitch is equal to input tile width and input tile pitch */ + /* is equal to output tile pitch */ + /* - If above condition holds good, data elements for which data */ + /* conversion from I8 bit to I32 bit needs to be done are in contiguous */ + /* memory location. Hence vectorization can be utilized effectively */ + /* */ + /* 2. When input tile pitch is not equal to input tile size or input tile */ + /* pitch is not equal to output tile pitch */ + /* - In this scenario, data elements for which data conversion from I8 bit to */ + /* I32 bit needs to be done exist in non-contiguous memory location. */ + /* In order to do vectorization across first dimension, output data */ + /* pointers need to be updated based on output tile size and output tile */ + /* pitch. */ + /******************************************************************************/ + + if ((inTilePitch1 == dim1Size) && (outTilePitch1 == dim1Size)) + { + /******************************************************************************/ + /* Data exist in contiguous memory location with respect to first dimension */ + /******************************************************************************/ + + /* input and output vectors */ + MORPH_IDT_VECTOR vecInData; + + /* Initialize max loop counter */ + int32_t dim3MaxLoopCount = dim3Size; + int32_t maxLoopCount = dim1Size * dim2Size; + + xb_vecN_2x32v vecOutL, vecOutH; + /* Updated Loop count based on tile dimension configuration */ + if ((inTilePitch2 == maxLoopCount) && (outTilePitch2 == maxLoopCount)) + { + /**********************************************************************/ + /* Data exist in contiguous memory location with respect to first and */ + /* second dimension */ + /**********************************************************************/ + + /* Update max loop counter */ + dim3MaxLoopCount = 1; + maxLoopCount *= dim3Size; + } + for (z = 0; z < dim3MaxLoopCount; z++) + { + /* initialize input and output data pointer */ + pvecIn =
(MORPH_IDT_VECTORI8 *) (pInput + (z * inTilePitch2)); + pvecOut = (xb_vecN_2x32v *) (pOutput + (z * outTilePitch2)); + valign vaInData = MORPH_OP_PRIME(pvecIn); + int32_t varlen; + + for (x = 0; x < maxLoopCount - vectorizationWidth; x += vectorizationWidth) + { + /* Load input data */ + MORPH_OP_LOAD_IP(vecInData, vaInData, pvecIn); + + xb_vecNx48 vecIntRes = MORPH_OP_MUL((xb_vecNx16U) scale, vecInData); /* scale * input, held in a wide xb_vecNx48 vector */ + xb_vecN_2x64w vecOutIntm1 = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecIntRes), IVP_CVT64SNX48LL(vecIntRes)); + vecOutL = IVP_PACKVRN_2X64W(vecOutIntm1, shift); + vecOutL = IVP_MAXN_2X32(vecOutL, (xb_vecN_2x32v) minLim); + + xb_vecN_2x64w vecOutIntm2 = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecIntRes), IVP_CVT64SNX48HL(vecIntRes)); + vecOutH = IVP_PACKVRN_2X64W(vecOutIntm2, shift); + vecOutH = IVP_MAXN_2X32(vecOutH, (xb_vecN_2x32v) minLim); + /* store output data: the result is two xb_vecN_2x32v halves, written as vecOutL then vecOutH */ + IVP_SAN_2X32_IP(vecOutL, vaOut, pvecOut); + IVP_SAN_2X32_IP(vecOutH, vaOut, pvecOut); + } + varlen = (maxLoopCount - x); /* elements left over after the full-vector loop */ + MORPH_OP_LOAD_IP(vecInData, vaInData, pvecIn); + + + xb_vecNx48 vecIntRes = MORPH_OP_MUL((xb_vecNx16U) scale, vecInData); + + xb_vecN_2x64w vecOutIntm1 = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecIntRes), IVP_CVT64SNX48LL(vecIntRes)); + vecOutL = IVP_PACKVRN_2X64W(vecOutIntm1, shift); + vecOutL = IVP_MAXN_2X32(vecOutL, (xb_vecN_2x32v) minLim); + + xb_vecN_2x64w vecOutIntm2 = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecIntRes), IVP_CVT64SNX48HL(vecIntRes)); + vecOutH = IVP_PACKVRN_2X64W(vecOutIntm2, shift); + vecOutH = IVP_MAXN_2X32(vecOutH, (xb_vecN_2x32v) minLim); + + /* store output data; the length arguments look like byte counts (varlen << 2 is 4 bytes per int32_t element, and the high half is reduced by XCHAL_IVPN_SIMD_WIDTH << 1) - TODO confirm against the intrinsic reference */ + IVP_SAVN_2X32_XP(vecOutL, vaOut, pvecOut, (varlen << 2)); + IVP_SAVN_2X32_XP(vecOutH, vaOut, pvecOut, ((varlen << 2) - (XCHAL_IVPN_SIMD_WIDTH << 1))); + IVP_SAPOSN_2X32_FP(vaOut, pvecOut); + } + } + else + { + /* else block is executed if input tile pitch is not equal to input tile width or input tile */ + /* pitch is not equal to output tile pitch */ + MORPH_IDT_VECTOR vecInData0, vecInData1, vecInData2,
vecInData3; + xb_vecN_2x32v vecOut0L, vecOut0H, vecOut1L, vecOut1H, vecOut2L, vecOut2H, vecOut3L, vecOut3H; + + for (z = 0; z < dim3Size; z++) /* along 3rd dimension */ + { + x = 0; + /* Loop Unroll=4 along 1st dimension: four vectors per row, the last one stored with a variable length */ + for (; x < (dim1Size - vectorizationWidth3X); x += vectorizationWidth4X) + { + /* Initialize input and output data pointer */ + MORPH_IDT_SCALAR * pIn = &pInput[z * inTilePitch2 + x]; + int32_t *pOut = &pOutput[z * outTilePitch2 + x]; + int32_t varLen = dim1Size - (x + vectorizationWidth3X); /* elements of the row left for the 4th vector */ + + for (y = 0; y < dim2Size; y++) /* along 2nd dimension */ + { + pvecIn = (MORPH_IDT_VECTORI8 *) (pIn + (y * inTilePitch1)); + pvecOut = (xb_vecN_2x32v *) (pOut + (y * outTilePitch1)); + + valign vaInData = MORPH_OP_PRIME(pvecIn); + /* load input data */ + MORPH_OP_LOAD_IP(vecInData0, vaInData, pvecIn); + MORPH_OP_LOAD_IP(vecInData1, vaInData, pvecIn); + MORPH_OP_LOAD_IP(vecInData2, vaInData, pvecIn); + MORPH_OP_LOAD_IP(vecInData3, vaInData, pvecIn); + + xb_vecNx48 vecIntRes0 = MORPH_OP_MUL((xb_vecNx16U) scale, vecInData0); + xb_vecNx48 vecIntRes1 = MORPH_OP_MUL((xb_vecNx16U) scale, vecInData1); + xb_vecNx48 vecIntRes2 = MORPH_OP_MUL((xb_vecNx16U) scale, vecInData2); + xb_vecNx48 vecIntRes3 = MORPH_OP_MUL((xb_vecNx16U) scale, vecInData3); + + xb_vecN_2x64w vecOutIntm1 = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecIntRes0), IVP_CVT64SNX48LL(vecIntRes0)); + + vecOut0L = IVP_PACKVRN_2X64W(vecOutIntm1, shift); + vecOut0L = IVP_MAXN_2X32(vecOut0L, (xb_vecN_2x32v) minLim); + xb_vecN_2x64w vecOutIntm2 = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecIntRes0), IVP_CVT64SNX48HL(vecIntRes0)); + vecOut0H = IVP_PACKVRN_2X64W(vecOutIntm2, shift); + vecOut0H = IVP_MAXN_2X32(vecOut0H, (xb_vecN_2x32v) minLim); + + vecOutIntm1 = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecIntRes1), IVP_CVT64SNX48LL(vecIntRes1)); + vecOut1L = IVP_PACKVRN_2X64W(vecOutIntm1, shift); + vecOut1L = IVP_MAXN_2X32(vecOut1L, (xb_vecN_2x32v) minLim); + vecOutIntm2 = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecIntRes1),
IVP_CVT64SNX48HL(vecIntRes1)); + vecOut1H = IVP_PACKVRN_2X64W(vecOutIntm2, shift); + vecOut1H = IVP_MAXN_2X32(vecOut1H, (xb_vecN_2x32v) minLim); + + vecOutIntm1 = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecIntRes2), IVP_CVT64SNX48LL(vecIntRes2)); + + vecOut2L = IVP_PACKVRN_2X64W(vecOutIntm1, shift); + vecOut2L = IVP_MAXN_2X32(vecOut2L, (xb_vecN_2x32v) minLim); + vecOutIntm2 = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecIntRes2), IVP_CVT64SNX48HL(vecIntRes2)); + vecOut2H = IVP_PACKVRN_2X64W(vecOutIntm2, shift); + vecOut2H = IVP_MAXN_2X32(vecOut2H, (xb_vecN_2x32v) minLim); + + vecOutIntm1 = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecIntRes3), IVP_CVT64SNX48LL(vecIntRes3)); + vecOut3L = IVP_PACKVRN_2X64W(vecOutIntm1, shift); + vecOut3L = IVP_MAXN_2X32(vecOut3L, (xb_vecN_2x32v) minLim); + vecOutIntm2 = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecIntRes3), IVP_CVT64SNX48HL(vecIntRes3)); + vecOut3H = IVP_PACKVRN_2X64W(vecOutIntm2, shift); + vecOut3H = IVP_MAXN_2X32(vecOut3H, (xb_vecN_2x32v) minLim); + + + + /* Store output data: three full vectors, then the 4th with a variable length */ + IVP_SAN_2X32_IP(vecOut0L, vaOut, pvecOut); + IVP_SAN_2X32_IP(vecOut0H, vaOut, pvecOut); + IVP_SAN_2X32_IP(vecOut1L, vaOut, pvecOut); + IVP_SAN_2X32_IP(vecOut1H, vaOut, pvecOut); + IVP_SAN_2X32_IP(vecOut2L, vaOut, pvecOut); + IVP_SAN_2X32_IP(vecOut2H, vaOut, pvecOut); + + IVP_SAVN_2X32_XP(vecOut3L, vaOut, pvecOut, (varLen << 2)); + IVP_SAVN_2X32_XP(vecOut3H, vaOut, pvecOut, ((varLen << 2) - (XCHAL_IVPN_SIMD_WIDTH << 1))); + IVP_SAPOSN_2X32_FP(vaOut, pvecOut); + } + } + if (x < (dim1Size - vectorizationWidth2X)) /* more than 2 vectors of the row remain: process 3 */ + { + /* Initialize input and output data pointer */ + MORPH_IDT_SCALAR *pIn = &pInput[z * inTilePitch2 + x]; + int32_t *pOut = &pOutput[z * outTilePitch2 + x]; + int32_t varLen = dim1Size - (x + vectorizationWidth2X); + + for (y = 0; y < dim2Size; y++) /* along 2nd dimension */ + { + pvecIn = (MORPH_IDT_VECTORI8 *) (pIn + (y * inTilePitch1)); + pvecOut = (xb_vecN_2x32v *) (pOut + (y * outTilePitch1)); + valign vaInData = MORPH_OP_PRIME(pvecIn); + /* load input
data */ + MORPH_OP_LOAD_IP(vecInData0, vaInData, pvecIn); + MORPH_OP_LOAD_IP(vecInData1, vaInData, pvecIn); + MORPH_OP_LOAD_IP(vecInData2, vaInData, pvecIn); + + xb_vecNx48 vecIntRes0 = MORPH_OP_MUL((xb_vecNx16U) scale, vecInData0); + + xb_vecNx48 vecIntRes1 = MORPH_OP_MUL((xb_vecNx16U) scale, vecInData1); + + xb_vecNx48 vecIntRes2 = MORPH_OP_MUL((xb_vecNx16U) scale, vecInData2); + + xb_vecN_2x64w vecOutIntm1 = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecIntRes0), IVP_CVT64SNX48LL(vecIntRes0)); + vecOut0L = IVP_PACKVRN_2X64W(vecOutIntm1, shift); + vecOut0L = IVP_MAXN_2X32(vecOut0L, (xb_vecN_2x32v) minLim); + xb_vecN_2x64w vecOutIntm2 = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecIntRes0), IVP_CVT64SNX48HL(vecIntRes0)); + vecOut0H = IVP_PACKVRN_2X64W(vecOutIntm2, shift); + vecOut0H = IVP_MAXN_2X32(vecOut0H, (xb_vecN_2x32v) minLim); + + vecOutIntm1 = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecIntRes1), IVP_CVT64SNX48LL(vecIntRes1)); + vecOut1L = IVP_PACKVRN_2X64W(vecOutIntm1, shift); + vecOut1L = IVP_MAXN_2X32(vecOut1L, (xb_vecN_2x32v) minLim); + vecOutIntm2 = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecIntRes1), IVP_CVT64SNX48HL(vecIntRes1)); + vecOut1H = IVP_PACKVRN_2X64W(vecOutIntm2, shift); + vecOut1H = IVP_MAXN_2X32(vecOut1H, (xb_vecN_2x32v) minLim); + + vecOutIntm1 = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecIntRes2), IVP_CVT64SNX48LL(vecIntRes2)); + vecOut2L = IVP_PACKVRN_2X64W(vecOutIntm1, shift); + vecOut2L = IVP_MAXN_2X32(vecOut2L, (xb_vecN_2x32v) minLim); + vecOutIntm2 = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecIntRes2), IVP_CVT64SNX48HL(vecIntRes2)); + vecOut2H = IVP_PACKVRN_2X64W(vecOutIntm2, shift); + vecOut2H = IVP_MAXN_2X32(vecOut2H, (xb_vecN_2x32v) minLim); + + /* Store output data: two full vectors, then the 3rd with a variable length */ + IVP_SAN_2X32_IP(vecOut0L, vaOut, pvecOut); + IVP_SAN_2X32_IP(vecOut0H, vaOut, pvecOut); + IVP_SAN_2X32_IP(vecOut1L, vaOut, pvecOut); + IVP_SAN_2X32_IP(vecOut1H, vaOut, pvecOut); + IVP_SAVN_2X32_XP(vecOut2L, vaOut, pvecOut, (varLen << 2)); + IVP_SAVN_2X32_XP(vecOut2H, vaOut, pvecOut, ((varLen << 2) -
(XCHAL_IVPN_SIMD_WIDTH << 1))); + IVP_SAPOSN_2X32_FP(vaOut, pvecOut); + } + } + else if (x < (dim1Size - vectorizationWidth)) /* more than 1 vector of the row remains: process 2 */ + { + /* Initialize input and output data pointer */ + MORPH_IDT_SCALAR *pIn = &pInput[z * inTilePitch2 + x]; + int32_t *pOut = &pOutput[z * outTilePitch2 + x]; + int32_t varLen = dim1Size - (x + vectorizationWidth); + + for (y = 0; y < dim2Size; y++) /* along 2nd dimension */ + { + pvecIn = (MORPH_IDT_VECTORI8 *) (pIn + (y * inTilePitch1)); + pvecOut = (xb_vecN_2x32v *) (pOut + (y * outTilePitch1)); + valign vaInData = MORPH_OP_PRIME(pvecIn); + + /* load input data */ + MORPH_OP_LOAD_IP(vecInData0, vaInData, pvecIn); + MORPH_OP_LOAD_IP(vecInData1, vaInData, pvecIn); + + xb_vecNx48 vecIntRes0 = MORPH_OP_MUL((xb_vecNx16U) scale, vecInData0); + + xb_vecNx48 vecIntRes1 = MORPH_OP_MUL((xb_vecNx16U) scale, vecInData1); + + xb_vecN_2x64w vecOutIntm1 = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecIntRes0), IVP_CVT64SNX48LL(vecIntRes0)); + vecOut0L = IVP_PACKVRN_2X64W(vecOutIntm1, shift); + vecOut0L = IVP_MAXN_2X32(vecOut0L, (xb_vecN_2x32v) minLim); + xb_vecN_2x64w vecOutIntm2 = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecIntRes0), IVP_CVT64SNX48HL(vecIntRes0)); + vecOut0H = IVP_PACKVRN_2X64W(vecOutIntm2, shift); + vecOut0H = IVP_MAXN_2X32(vecOut0H, (xb_vecN_2x32v) minLim); + + vecOutIntm1 = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecIntRes1), IVP_CVT64SNX48LL(vecIntRes1)); + vecOut1L = IVP_PACKVRN_2X64W(vecOutIntm1, shift); + vecOut1L = IVP_MAXN_2X32(vecOut1L, (xb_vecN_2x32v) minLim); + vecOutIntm2 = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecIntRes1), IVP_CVT64SNX48HL(vecIntRes1)); + vecOut1H = IVP_PACKVRN_2X64W(vecOutIntm2, shift); + vecOut1H = IVP_MAXN_2X32(vecOut1H, (xb_vecN_2x32v) minLim); + + + /* Store output data: one full vector, then the 2nd with a variable length */ + IVP_SAN_2X32_IP(vecOut0L, vaOut, pvecOut); + IVP_SAN_2X32_IP(vecOut0H, vaOut, pvecOut); + IVP_SAVN_2X32_XP(vecOut1L, vaOut, pvecOut, (varLen << 2)); + IVP_SAVN_2X32_XP(vecOut1H, vaOut, pvecOut, ((varLen << 2) - (XCHAL_IVPN_SIMD_WIDTH << 1))); +
IVP_SAPOSN_2X32_FP(vaOut, pvecOut); + } + } + else if (x < dim1Size) /* at most 1 vector of the row remains */ + { + /* Initialize input and output data pointer */ + MORPH_IDT_SCALAR *pIn = &pInput[z * inTilePitch2 + x]; + int32_t *pOut = &pOutput[z * outTilePitch2 + x]; + int32_t varLen = dim1Size - x; + + for (y = 0; y < dim2Size; y++) /* along 2nd dimension */ + { + pvecIn = (MORPH_IDT_VECTORI8 *) (pIn + (y * inTilePitch1)); + pvecOut = (xb_vecN_2x32v *) (pOut + (y * outTilePitch1)); + valign vaInData = MORPH_OP_PRIME(pvecIn); + /* load input data */ + MORPH_OP_LOAD_IP(vecInData0, vaInData, pvecIn); + + xb_vecNx48 vecIntRes0 = MORPH_OP_MUL((xb_vecNx16U) scale, vecInData0); + + xb_vecN_2x64w vecOutIntm1 = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecIntRes0), IVP_CVT64SNX48LL(vecIntRes0)); + vecOut0L = IVP_PACKVRN_2X64W(vecOutIntm1, shift); + vecOut0L = IVP_MAXN_2X32(vecOut0L, (xb_vecN_2x32v) minLim); + xb_vecN_2x64w vecOutIntm2 = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecIntRes0), IVP_CVT64SNX48HL(vecIntRes0)); + vecOut0H = IVP_PACKVRN_2X64W(vecOutIntm2, shift); + vecOut0H = IVP_MAXN_2X32(vecOut0H, (xb_vecN_2x32v) minLim); + + /* Store output data: a single vector with a variable length */ + IVP_SAVN_2X32_XP(vecOut0L, vaOut, pvecOut, (varLen << 2)); + IVP_SAVN_2X32_XP(vecOut0H, vaOut, pvecOut, ((varLen << 2) - (XCHAL_IVPN_SIMD_WIDTH << 1))); + IVP_SAPOSN_2X32_FP(vaOut, pvecOut); + } + } + } + } + return(XAI_ERROR_STATUS()); +} +#endif //if ((XCHAL_VISION_TYPE >= 6)) diff --git a/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dataConversion3D_S32IX.h b/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dataConversion3D_S32IX.h new file mode 100644 index 00000000000..a5484d43644 --- /dev/null +++ b/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dataConversion3D_S32IX.h @@ -0,0 +1,307 @@ +/* + * Copyright (c) 2018 by Cadence Design Systems, Inc. ALL RIGHTS RESERVED. + * These coded instructions, statements, and computer programs are the + * copyrighted works and confidential proprietary information of + * Cadence Design Systems Inc.
They may be adapted and modified by bona fide + * purchasers for internal use, but neither the original nor any adapted + * or modified version may be disclosed or distributed to third parties + * in any manner, medium, or form, in whole or in part, without the prior + * written consent of Cadence Design Systems Inc. This software and its + * derivatives are to be executed solely on products incorporating a Cadence + * Design Systems processor. + */ + +#if ((XCHAL_VISION_TYPE >= 6)) + +#undef MAKE_NAME_IMPL +#undef MAKE_NAME +#undef MORPH_ODT_TILECHECK +#undef MORPH_ODT_SCALAR +#undef MORPH_ODT_VECTOR +#undef MIN_VAL +#undef MAX_VAL +#undef MORPH_STORE_SA_IP +#undef MORPH_STORE_SAV_XP +#undef MORPH_FLUSH_SAPOS +#undef BytesPerPixel + +#define MAKE_NAME_IMPL(name, MORPH_FNAME_SPECIFIER) name ## MORPH_FNAME_SPECIFIER + +#if ((INPUT_DATA_TYPE == SIGNED32BIT) && (OUTPUT_DATA_TYPE == SIGNED8BIT)) +#define MAKE_NAME(name) MAKE_NAME_IMPL(name, S8) +#define MORPH_ODT_TILECHECK XAI_CHECK_TILE3D_S8 +#define MORPH_ODT_SCALAR int8_t +#define MORPH_ODT_VECTOR xb_vecNx8 +#define MIN_VAL SCHAR_MIN +#define MAX_VAL SCHAR_MAX +#define MORPH_STORE_SA_IP IVP_SANX8S_IP +#define MORPH_STORE_SAV_XP IVP_SAVNX8S_XP +#define MORPH_FLUSH_SAPOS IVP_SAPOSNX8S_FP +#define BytesPerPixel 1 + + +#elif ((INPUT_DATA_TYPE == SIGNED32BIT) && (OUTPUT_DATA_TYPE == UNSIGNED8BIT)) +#define MAKE_NAME(name) MAKE_NAME_IMPL(name, U8) +#define MORPH_ODT_TILECHECK XAI_CHECK_TILE3D_U8 +#define MORPH_ODT_SCALAR uint8_t +#define MORPH_ODT_VECTOR xb_vecNx8U +#define MIN_VAL 0 +#define MAX_VAL UCHAR_MAX +#define MORPH_STORE_SA_IP IVP_SANX8U_IP +#define MORPH_STORE_SAV_XP IVP_SAVNX8U_XP +#define MORPH_FLUSH_SAPOS IVP_SAPOSNX8U_FP +#define BytesPerPixel 1 + +#elif ((INPUT_DATA_TYPE == SIGNED32BIT) && (OUTPUT_DATA_TYPE == SIGNED16BIT)) +#define MAKE_NAME(name) MAKE_NAME_IMPL(name, S16) +#define MORPH_ODT_TILECHECK XAI_CHECK_TILE3D_S16 +#define MORPH_ODT_SCALAR int16_t +#define MORPH_ODT_VECTOR xb_vecNx16 +#define 
MIN_VAL SHRT_MIN +#define MAX_VAL SHRT_MAX +#define MORPH_STORE_SA_IP IVP_SANX16_IP +#define MORPH_STORE_SAV_XP IVP_SAVNX16_XP +#define MORPH_FLUSH_SAPOS IVP_SAPOSNX16_FP +#define BytesPerPixel 2 + +#elif ((INPUT_DATA_TYPE == SIGNED32BIT) && (OUTPUT_DATA_TYPE == UNSIGNED16BIT)) +#define MAKE_NAME(name) MAKE_NAME_IMPL(name, U16) +#define MORPH_ODT_TILECHECK XAI_CHECK_TILE3D_U16 +#define MORPH_ODT_SCALAR uint16_t +#define MORPH_ODT_VECTOR xb_vecNx16U +#define MIN_VAL 0 +#define MAX_VAL USHRT_MAX +#define MORPH_STORE_SA_IP IVP_SANX16U_IP +#define MORPH_STORE_SAV_XP IVP_SAVNX16U_XP +#define MORPH_FLUSH_SAPOS IVP_SAPOSNX16U_FP +#define BytesPerPixel 2 +#endif + + +/********************* xaiDataConversion3D_S32IX ******************************/ +/* Description : P6 implementation for conversion from S32 to S8 /U8/S16/U16 */ +/* depending on Output Tile type */ +/* Inputs : Input Tile, scale, shift */ +/* Outputs : XI Error Code */ +/* InOuts : Output Tile */ +/* Assumptions : InData is signed 32 bit */ +/******************************************************************************/ +/********************* xaiDataConversion3D_S32S8 *****************************/ +/********************* xaiDataConversion3D_S32U8 *****************************/ +/********************* xaiDataConversion3D_S32S16 ******************************/ +/********************* xaiDataConversion3D_S32U16 *****************************/ +XAI_ERR_TYPE MAKE_NAME (xaiDataConversion3D_S32)(const xai_pTile3D inTile, + xai_pTile3D outTile, + const uint16_t scale, + const uint8_t shift) +{ + /* Error Checks */ + XAI_ERROR_CHECKS() + { + XAI_CHECK_TILE3D_S32(inTile); + MORPH_ODT_TILECHECK(outTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile); + XAI_CHECK_TILE3D_SIZE_EQ(inTile, outTile); + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTile); + XAI_CHECK_ERROR(shift < 32, XAI_ERR_NORM, \ + "Shift = %hhu, value should be less than 32", shift); + 
XAI_CHECK_ERROR(XAI_TILE3D_GET_DATA_ORDER(inTile) == XAI_TILE3D_GET_DATA_ORDER(outTile), XAI_ERR_BADARG, \ + "\nData Order of InputTile = %d, OutputTile = %d\nData Order of InputTile and OutputTile should be same", \ + XAI_TILE3D_GET_DATA_ORDER(inTile), XAI_TILE3D_GET_DATA_ORDER(outTile)); + } + + /* Get Tile Parameters */ + const int32_t dim1Size = XAI_TILE3D_GET_DIM1(inTile); + const int32_t dim2Size = XAI_TILE3D_GET_DIM2(inTile); + const int32_t dim3Size = XAI_TILE3D_GET_DIM3(inTile); + const int32_t inTilePitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile); + const int32_t inTilePitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile); + const int32_t outTilePitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile); + const int32_t outTilePitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile); + + int32_t minLim = MIN_VAL; + int32_t maxLim = MAX_VAL; + + /* Get Data Pointers */ + int32_t *pInput = (int32_t *) XAI_TILE3D_GET_DATA_PTR(inTile); + MORPH_ODT_SCALAR *pOutput = (MORPH_ODT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(outTile); + + valign vaOut = IVP_ZALIGN(); + + /* vectorization width */ + const int32_t vectorizationWidth = XCHAL_IVPN_SIMD_WIDTH / 2; + const int32_t vectorizationWidth2X = vectorizationWidth * 2; + + /* loop variables */ + int32_t x, y, z; + + /* input and output pointers */ + xb_vecN_2x32v * restrict pvecIn; + MORPH_ODT_VECTOR * restrict pvecOut; + + xb_vecN_2x64w vec0scaledIn64B, vec1scaledIn64B; + + /* SCALE*/ + xb_vecNx16U vecScale = (xb_vecNx16U) (scale); + + /******************************************************************************/ + /* The overall design approach is split into 2 parts */ + /* 1. When input tile pitch is equal to input tile width and input tile pitch */ + /* is equal to output tile pitch */ + /* - If above condition holds good, data elements for which data */ + /* conversion from signed 32 bit to S8/U8 bit need to done present in */ + /* in contiguous memory location. Hence vectorization can be utilized */ + /* effectively */ + /* */ + /* 2. 
When input tile pitch is not equal to input tile size or input tile */ + /* pitch is not equal to output tile pitch */ + /* - In this scenario, data elements for which data conversion from signed */ + /* 32 bit to S8/U8 bit need to done exist in non-contiguous memory */ + /* location. In order to do vectorization across first dimension, */ + /* output data pointers need to be updated based on output tile size */ + /* and output tile pitch */ + /******************************************************************************/ + + if ((inTilePitch1 == dim1Size) && (outTilePitch1 == dim1Size)) + { + /******************************************************************************/ + /* Data exist in contiguous memory location with respect to first dimension */ + /******************************************************************************/ + + /* input data vectors */ + xb_vecN_2x32v vecInData0, vecInData1; + + /* Initialize max loop counter */ + int32_t dim3MaxLoopCount = dim3Size; + int32_t maxLoopCount = dim1Size * dim2Size; + + /* Updated Loop count based on tile dimension configuration */ + if ((inTilePitch2 == maxLoopCount) && (outTilePitch2 == maxLoopCount)) + { + /**********************************************************************/ + /* Data exist in contiguous memory location with respect to first and */ + /* second dimension */ + /**********************************************************************/ + + /* Update max loop counter */ + dim3MaxLoopCount = 1; + maxLoopCount *= dim3Size; + } + for (z = 0; z < dim3MaxLoopCount; z++) + { + /* initialize input and output data pointer */ + pvecIn = (xb_vecN_2x32v *) (pInput + (z * inTilePitch2)); + pvecOut = (MORPH_ODT_VECTOR *) (pOutput + (z * outTilePitch2)); + + valign vaInData = IVP_LAN_2X32_PP(pvecIn); + xb_vecNx16 vecOut, vecOut0, vecOut1; + x = 0; + for (; x < maxLoopCount - vectorizationWidth2X; x += vectorizationWidth2X) + { + /* Load input data */ + IVP_LAN_2X32_IP(vecInData0, vaInData, pvecIn); + 
IVP_LAN_2X32_IP(vecInData1, vaInData, pvecIn); + + /* Multiply U16 scale with S32 input and store in 64-bit wide vector */ + vec0scaledIn64B = IVP_MULUSN_2X16X32_0(vecScale, vecInData0); + vec1scaledIn64B = IVP_MULUSN_2X16X32_0(vecScale, vecInData1); + + /* Pack the 64-bit wide vector into a 32-bit vector, by applying shift */ + xb_vecN_2x32v vec0scaledIn32B = IVP_PACKVRN_2X64W(vec0scaledIn64B, shift); + xb_vecN_2x32v vec1scaledIn32B = IVP_PACKVRN_2X64W(vec1scaledIn64B, shift); + + /* CLAMP the 32bit scaled-shift data to minLim and maxLim, & store it + * in 16-bit vector, whose odd lanes (1, 3, 5...) are zeroes.*/ + vecOut0 = IVP_MOVNX16_FROMN_2X32(IVP_MAXN_2X32(IVP_MINN_2X32(vec0scaledIn32B, (xb_vecN_2x32v) maxLim), (xb_vecN_2x32v) minLim)); + vecOut1 = IVP_MOVNX16_FROMN_2X32(IVP_MAXN_2X32(IVP_MINN_2X32(vec1scaledIn32B, (xb_vecN_2x32v) maxLim), (xb_vecN_2x32v) minLim)); + + /* Select the actual data present at even lanes, i.e. 0, 2, 4,... */ + vecOut = IVP_SELNX16I(vecOut1, vecOut0, IVP_SELI_EXTRACT_1_OF_2_OFF_0); + + /* store output data */ + MORPH_STORE_SA_IP(vecOut, vaOut, pvecOut); + } + + /* Load remaining input data (fewer than two full vectors may be left). NOTE(review): the main loop advances x by vectorizationWidth2X per two vector loads, i.e. one load covers vectorizationWidth elements, yet the second tail load subtracts only (vectorizationWidth >> 1) - confirm the requested byte count cannot exceed the elements that actually remain. */ + IVP_LAVN_2X32_XP(vecInData0, vaInData, pvecIn, (maxLoopCount - x) * 4); + IVP_LAVN_2X32_XP(vecInData1, vaInData, pvecIn, ((maxLoopCount - x) - (vectorizationWidth >> 1)) * 4); + + /* Multiply U16 scale with S32 input and store in 64-bit wide vector */ + vec0scaledIn64B = IVP_MULUSN_2X16X32_0(vecScale, vecInData0); + vec1scaledIn64B = IVP_MULUSN_2X16X32_0(vecScale, vecInData1); + + /* Pack the 64-bit wide vector into a 32-bit vector, by applying shift */ + xb_vecN_2x32v vec0scaledIn32B = IVP_PACKVRN_2X64W(vec0scaledIn64B, shift); + xb_vecN_2x32v vec1scaledIn32B = IVP_PACKVRN_2X64W(vec1scaledIn64B, shift); + + /* CLAMP the 32bit scaled-shift data to minLim and maxLim, & store it + * in 16-bit vector, whose odd lanes (1, 3, 5...) 
are zeroes.*/ + vecOut0 = IVP_MOVNX16_FROMN_2X32(IVP_MAXN_2X32(IVP_MINN_2X32(vec0scaledIn32B, (xb_vecN_2x32v) maxLim), (xb_vecN_2x32v) minLim)); + vecOut1 = IVP_MOVNX16_FROMN_2X32(IVP_MAXN_2X32(IVP_MINN_2X32(vec1scaledIn32B, (xb_vecN_2x32v) maxLim), (xb_vecN_2x32v) minLim)); + + /* Select the actual data present at even lanes, i.e. 0, 2, 4,... */ + vecOut = IVP_SELNX16I(vecOut1, vecOut0, IVP_SELI_EXTRACT_1_OF_2_OFF_0); + + /* store output data */ + MORPH_STORE_SAV_XP(vecOut, vaOut, pvecOut, (maxLoopCount - x) * BytesPerPixel); + MORPH_FLUSH_SAPOS(vaOut, pvecOut); + } + } + else + { + /* else block is executed if input tile pitch is not equal to input tile width or input tile */ + /* pitch is not equal to output tile pitch */ + + for (z = 0; z < dim3Size; z++) /* along 3rd dimension */ + { + x = 0; + for (; x < dim1Size; x += vectorizationWidth2X) /* Load two vectors along 1st dimension*/ + { + /* Initialize input and output data pointer */ + int32_t * pIn = &pInput[z * inTilePitch2 + x]; + MORPH_ODT_SCALAR *pOut = &pOutput[z * outTilePitch2 + x]; + int32_t varLen = dim1Size - x; + + for (y = 0; y < dim2Size; y++) /* along 2nd dimension */ + { + /* input and output data vectors */ + xb_vecN_2x32v vecInData0, vecInData1; + xb_vecNx16 vecOut0, vecOut1, vecOut; + + pvecIn = (xb_vecN_2x32v *) (pIn + (y * inTilePitch1)); + pvecOut = (MORPH_ODT_VECTOR *) (pOut + (y * outTilePitch1)); + + /* Load input data */ + valign vaInData = IVP_LAN_2X32_PP(pvecIn); + IVP_LAVN_2X32_XP(vecInData0, vaInData, pvecIn, varLen * 4); + IVP_LAVN_2X32_XP(vecInData1, vaInData, pvecIn, (varLen - (vectorizationWidth >> 1)) * 4); + + /* Multiply U16 scale with S32 input and store in 64-bit wide vector */ + vec0scaledIn64B = IVP_MULUSN_2X16X32_0(vecScale, vecInData0); + vec1scaledIn64B = IVP_MULUSN_2X16X32_0(vecScale, vecInData1); + + /* Pack the 64-bit wide vector in 32 bit vecotr, by applying shift */ + xb_vecN_2x32v vec0scaledIn32B = IVP_PACKVRN_2X64W(vec0scaledIn64B, shift); + xb_vecN_2x32v 
vec1scaledIn32B = IVP_PACKVRN_2X64W(vec1scaledIn64B, shift); + + /* CLAMP the 32bit scaled-shift data to minLim and maxLim, & store it + * in 16-bit vector, whose odd lanes (1, 3, 5...) are zeroes.*/ + vecOut0 = IVP_MOVNX16_FROMN_2X32(IVP_MAXN_2X32(IVP_MINN_2X32(vec0scaledIn32B, (xb_vecN_2x32v) maxLim), (xb_vecN_2x32v) minLim)); + vecOut1 = IVP_MOVNX16_FROMN_2X32(IVP_MAXN_2X32(IVP_MINN_2X32(vec1scaledIn32B, (xb_vecN_2x32v) maxLim), (xb_vecN_2x32v) minLim)); + + /* Select the actual data present at even lanes, i.e. 0, 2, 4,... */ + vecOut = IVP_SELNX16I(vecOut1, vecOut0, IVP_SELI_EXTRACT_1_OF_2_OFF_0); + + /* Store output data */ + MORPH_STORE_SAV_XP(vecOut, vaOut, pvecOut, varLen * BytesPerPixel); + MORPH_FLUSH_SAPOS(vaOut, pvecOut); + } + } + } + } + return(XAI_ERROR_STATUS()); +} +#endif //#if ((XCHAL_VISION_TYPE >= 6)) + + diff --git a/backends/cadence/vision/third-party/libxai/cnn/src/cnn_datatransform.c b/backends/cadence/vision/third-party/libxai/cnn/src/cnn_datatransform.c new file mode 100644 index 00000000000..dc7fb38a576 --- /dev/null +++ b/backends/cadence/vision/third-party/libxai/cnn/src/cnn_datatransform.c @@ -0,0 +1,7835 @@ +/* + * Copyright (c) 2022 by Cadence Design Systems, Inc. ALL RIGHTS RESERVED. + * These coded instructions, statements, and computer programs are the + * copyrighted works and confidential proprietary information of + * Cadence Design Systems Inc. They may be adapted and modified by bona fide + * purchasers for internal use, but neither the original nor any adapted + * or modified version may be disclosed or distributed to third parties + * in any manner, medium, or form, in whole or in part, without the prior + * written consent of Cadence Design Systems Inc. This software and its + * derivatives are to be executed solely on products incorporating a Cadence + * Design Systems processor. 
+ */ + +#include +#include "xai_cnn.h" +#include "xai_intrin.h" + +#if ((XCHAL_VISION_TYPE >= 6)) + +#undef INPUT_DATA_TYPE +#undef OUTPUT_DATA_TYPE + +#define INPUT_DATA_TYPE INTEGER8BIT +#include "cnn_fill_tile.h" +#undef INPUT_DATA_TYPE + +#define INPUT_DATA_TYPE INTEGER16BIT +#include "cnn_fill_tile.h" +#undef INPUT_DATA_TYPE + +#if (XCHAL_HAVE_VISION_HP_VFPU == 1) +#define INPUT_DATA_TYPE FLOAT16BIT +#include "cnn_fill_tile.h" +#undef INPUT_DATA_TYPE +#endif + +#if (XCHAL_HAVE_VISION_SP_VFPU == 1) +#define INPUT_DATA_TYPE FLOAT32BIT +#include "cnn_fill_tile.h" +#undef INPUT_DATA_TYPE +#endif + +#define INPUT_DATA_TYPE INTEGER8BIT +#include "cnn_extend_edge.h" +#undef INPUT_DATA_TYPE + +#define INPUT_DATA_TYPE INTEGER16BIT +#include "cnn_extend_edge.h" +#undef INPUT_DATA_TYPE + +#if (XCHAL_HAVE_VISION_HP_VFPU == 1) +#define INPUT_DATA_TYPE FLOAT16BIT +#include "cnn_extend_edge.h" +#undef INPUT_DATA_TYPE +#endif + +#if (XCHAL_HAVE_VISION_SP_VFPU == 1) +#define INPUT_DATA_TYPE FLOAT32BIT +#include "cnn_extend_edge.h" +#undef INPUT_DATA_TYPE +#endif + +#define INPUT_DATA_TYPE SIGNED16BIT +#include "cnn_dataConversion3D_I16I8.h" +#undef INPUT_DATA_TYPE + +#define INPUT_DATA_TYPE UNSIGNED16BIT +#include "cnn_dataConversion3D_I16I8.h" +#undef INPUT_DATA_TYPE + +#define INPUT_DATA_TYPE SIGNED8BIT +#include "cnn_dataConversion3D_I8I32.h" +#undef INPUT_DATA_TYPE + +#define INPUT_DATA_TYPE UNSIGNED8BIT +#include "cnn_dataConversion3D_I8I32.h" +#undef INPUT_DATA_TYPE + +#define INPUT_DATA_TYPE SIGNED32BIT +#define OUTPUT_DATA_TYPE SIGNED8BIT +#include "cnn_dataConversion3D_S32IX.h" +#undef INPUT_DATA_TYPE +#undef OUTPUT_DATA_TYPE + +#define INPUT_DATA_TYPE SIGNED32BIT +#define OUTPUT_DATA_TYPE UNSIGNED8BIT +#include "cnn_dataConversion3D_S32IX.h" +#undef INPUT_DATA_TYPE +#undef OUTPUT_DATA_TYPE + +#define INPUT_DATA_TYPE SIGNED32BIT +#define OUTPUT_DATA_TYPE SIGNED16BIT +#include "cnn_dataConversion3D_S32IX.h" +#undef INPUT_DATA_TYPE +#undef OUTPUT_DATA_TYPE + +#define 
INPUT_DATA_TYPE SIGNED32BIT +#define OUTPUT_DATA_TYPE UNSIGNED16BIT +#include "cnn_dataConversion3D_S32IX.h" +#undef INPUT_DATA_TYPE +#undef OUTPUT_DATA_TYPE + +#define OUTPUT_DATA_TYPE SIGNED8BIT +#include "cnn_dataConversion3D_AsymQ_S8IX.h" +#undef OUTPUT_DATA_TYPE + +#define OUTPUT_DATA_TYPE UNSIGNED8BIT +#include "cnn_dataConversion3D_AsymQ_S8IX.h" +#undef OUTPUT_DATA_TYPE + +#define OUTPUT_DATA_TYPE SIGNED16BIT +#include "cnn_dataConversion3D_AsymQ_S8IX.h" +#undef OUTPUT_DATA_TYPE + +#define OUTPUT_DATA_TYPE UNSIGNED16BIT +#include "cnn_dataConversion3D_AsymQ_S8IX.h" +#undef OUTPUT_DATA_TYPE + +#define PACK_ROUND_U16(vecOut1, vecInData1, Scale, Shift) { \ + xb_vecNx48 acc = IVP_MULUSNX16((xb_vecNx16U) Scale, vecInData1); \ + xb_vecN_2x32v m_outEven = IVP_PACKVRNX48_0(acc, Shift); \ + xb_vecN_2x32v m_outOdd = IVP_PACKVRNX48_1(acc, Shift); \ + m_outEven = IVP_MAXN_2X32(IVP_MINN_2X32(m_outEven, (xb_vecN_2x32v) USHRT_MAX), (xb_vecN_2x32v) 0); \ + m_outOdd = IVP_MAXN_2X32(IVP_MINN_2X32(m_outOdd, (xb_vecN_2x32v) USHRT_MAX), (xb_vecN_2x32v) 0); \ + xb_vecNx16U temp1 = IVP_MOVNX16U_FROMNX16(IVP_MOVNX16_FROMN_2X32(m_outEven)); \ + xb_vecNx16U temp2 = IVP_MOVNX16U_FROMNX16(IVP_MOVNX16_FROMN_2X32(m_outOdd)); \ + vecOut1 = IVP_SELNX16UI(temp2, temp1, IVP_SELI_16B_INTERLEAVE_1_EVEN); \ +} + +/*************************** xaiFillTile3D ***********************************/ +/* Description : General API for FillTile3D optimized implementation */ +/* Calls one of the FillTile3D functions based */ +/* on the parameters */ +/* Inputs : constant value to fill, fillEdgeExtension */ +/* Outputs : XI Error Code */ +/* InOuts : Destination Tile */ +/****************************************************************************/ +XAI_ERR_TYPE xaiFillTile3D(xai_pTile3D dstTile, + const int32_t value, + xai_bool fillEdgeExtension) +{ + if (!dstTile) + { + return(XAI_ERR_NULLARG); + } + + if (XAI_TILE3D_CHECK_TYPE(dstTile, XAI_S8) || XAI_TILE3D_CHECK_TYPE(dstTile, XAI_U8)) + { + 
return(xaiFillTile3D_I8(dstTile, value, fillEdgeExtension)); + } + else if (XAI_TILE3D_CHECK_TYPE(dstTile, XAI_S16) || XAI_TILE3D_CHECK_TYPE(dstTile, XAI_U16)) + { + return(xaiFillTile3D_I16(dstTile, value, fillEdgeExtension)); + } +#if (XCHAL_HAVE_VISION_HP_VFPU == 1) + else if (XAI_TILE3D_CHECK_TYPE(dstTile, XAI_F16)) + { + return(xaiFillTile3D_F16(dstTile, value, fillEdgeExtension)); + } +#endif +#if (XCHAL_HAVE_VISION_SP_VFPU == 1) + else if (XAI_TILE3D_CHECK_TYPE(dstTile, XAI_F32)) + { + return(xaiFillTile3D_F32(dstTile, value, fillEdgeExtension)); + } +#endif + return(XAI_ERR_NO_VARIANT); +} + +/************************* xaiExtendEdgesConst3D *****************************/ +/* Description : General API for ExtendEdgesConst3D optimized implementation*/ +/* Calls one of the ExtendEdgesConst3D functions based */ +/* on the parameters */ +/* Inputs : constant value to fill the edges */ +/* Outputs : XI Error Code */ +/* InOuts : Destination Tile */ +/****************************************************************************/ +XAI_ERR_TYPE xaiExtendEdgesConst3D(xai_pTile3D dstTile, + const int32_t value, + xai_size3D frame3DSize) +{ + if (!dstTile) + { + return(XAI_ERR_NULLARG); + } + + if (XAI_TILE3D_CHECK_TYPE(dstTile, XAI_S8) || XAI_TILE3D_CHECK_TYPE(dstTile, XAI_U8)) + { + return(xaiExtendEdgesConst3D_I8(dstTile, value, frame3DSize)); + } + else if (XAI_TILE3D_CHECK_TYPE(dstTile, XAI_S16) || XAI_TILE3D_CHECK_TYPE(dstTile, XAI_U16)) + { + return(xaiExtendEdgesConst3D_I16(dstTile, value, frame3DSize)); + } +#if (XCHAL_HAVE_VISION_HP_VFPU == 1) + else if (XAI_TILE3D_CHECK_TYPE(dstTile, XAI_F16)) + { + int16_t valueS16 = (int16_t) value; +#if defined(__XTENSA__) + xb_f16 valueF16; + memcpy(&valueF16, &valueS16, sizeof(int16_t)); +#else + xb_f16 valueF16 = *(xb_f16 *) (&valueS16); +#endif + return(xaiExtendEdgesConst3D_F16(dstTile, valueF16, frame3DSize)); + } +#endif +#if (XCHAL_HAVE_VISION_SP_VFPU == 1) + else if (XAI_TILE3D_CHECK_TYPE(dstTile, XAI_F32)) + { + 
int32_t valueS32 = (int32_t) value; + float valueF32; + memcpy(&valueF32, &valueS32, sizeof(int32_t)); + return(xaiExtendEdgesConst3D_F32(dstTile, valueF32, frame3DSize)); + } +#endif + return(XAI_ERR_NO_VARIANT); +} + +/*********************** xaiExtendEdges3D ********************************/ +/* Description : General API for ExtendEdges3D optimized implementation */ +/* Calls one of the ExtendEdges3D functions based */ +/* on the parameters */ +/* Inputs : pArray, frame3DSize */ +/* Outputs : XI Error Code */ +/* InOuts : Input Tile */ +/****************************************************************************/ +XAI_ERR_TYPE xaiExtendEdges3D(xai_pTile3D dstTile, + const xai_pArray pArray, + xai_size3D frame3DSize) +{ + if (!dstTile) + { + return(XAI_ERR_NULLARG); + } + + if (XAI_TILE3D_CHECK_TYPE(dstTile, XAI_S8) || XAI_TILE3D_CHECK_TYPE(dstTile, XAI_U8)) + { + return(xaiExtendEdges3D_I8(dstTile, pArray, frame3DSize)); + } + else if (XAI_TILE3D_CHECK_TYPE(dstTile, XAI_S16) || XAI_TILE3D_CHECK_TYPE(dstTile, XAI_U16)) + { + return(xaiExtendEdges3D_I16(dstTile, pArray, frame3DSize)); + } +#if (XCHAL_HAVE_VISION_HP_VFPU == 1) + else if (XAI_TILE3D_CHECK_TYPE(dstTile, XAI_F16)) + { + return(xaiExtendEdges3D_F16(dstTile, pArray, frame3DSize)); + } +#endif +#if (XCHAL_HAVE_VISION_SP_VFPU == 1) + else if (XAI_TILE3D_CHECK_TYPE(dstTile, XAI_F32)) + { + return(xaiExtendEdges3D_F32(dstTile, pArray, frame3DSize)); + } +#endif + return(XAI_ERR_NO_VARIANT); +} + +/************************** xaiCopyTile3D ***********************************/ +/* Description : P6 optimized implementation for copying the contents of a */ +/* 3D tile to another 3D tile. This function supports copying */ +/* of 8/16/32/64 bit input tile data based on data type of */ +/* tile data elements. copy_edge_extension flag is used to */ +/* control copy of edges. If edge sizes are different, then */ +/* minimum of input & output edge size number of elements is */ +/* copied from edges. 
*/ +/* Inputs : Input Tile data, copy_edge_extension, */ +/* Outputs : XI Error Code */ +/* InOuts : Output Tile */ +/* Assumptions : Active data size of input & output tiles are the same */ +/****************************************************************************/ + +XAI_ERR_TYPE xaiCopyTile3D(const xai_pTile3D inTile, + xai_pTile3D outTile, + xai_bool copy_edge_extension) +{ + /* Error Checks */ + XAI_ERROR_CHECKS() + { + XAI_CHECK_TILE3D(inTile); + XAI_CHECK_TILE3D(outTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile); + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTile); + XAI_CHECK_ERROR((((XAI_TILE3D_GET_ELEMENT_SIZE(inTile) == 1) || (XAI_TILE3D_GET_ELEMENT_SIZE(inTile) == 2)) || \ + (XAI_TILE3D_GET_ELEMENT_SIZE(inTile) == 4) || (XAI_TILE3D_GET_ELEMENT_SIZE(inTile) == 8)), XAI_ERR_DATATYPE, \ + "Element size of Input tile = %d, The argument of input tile has unsupported data type", \ + XAI_TILE3D_GET_ELEMENT_SIZE(inTile)); + XAI_CHECK_TILE3D_ELEMENT_SIZE_EQ(inTile, outTile); + XAI_CHECK_TILE3D_SIZE_EQ(inTile, outTile); + XAI_CHECK_ERROR(XAI_TILE3D_GET_DATA_ORDER(inTile) == XAI_TILE3D_GET_DATA_ORDER(outTile), XAI_ERR_BADARG, \ + "\nData Order of InputTile = %d, OutputTile = %d\nData Order of InputTile and OutputTile should be same", \ + XAI_TILE3D_GET_DATA_ORDER(inTile), XAI_TILE3D_GET_DATA_ORDER(outTile)); + } + + /* Getting parameters from the tile structures */ + /* Tile size across first dimension of input tile and output tile is scaled */ + /* based on input data type of tile data elements */ + + const int32_t element_size = XAI_TILE3D_GET_ELEMENT_SIZE(inTile); + const int32_t dim1Size = XAI_TILE3D_GET_DIM1(inTile) * element_size; + const int32_t inDim1Edge1 = XAI_TILE3D_GET_DIM1_EDGE1(inTile) * element_size; + const int32_t inDim1Edge2 = XAI_TILE3D_GET_DIM1_EDGE2(inTile) * element_size; + const int32_t outDim1Edge1 = XAI_TILE3D_GET_DIM1_EDGE1(outTile) * element_size; + const int32_t outDim1Edge2 = 
XAI_TILE3D_GET_DIM1_EDGE2(outTile) * element_size; + const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile) * element_size; + const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile) * element_size; + const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile) * element_size; + const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile) * element_size; + const int32_t dim2Size = XAI_TILE3D_GET_DIM2(inTile); + const int32_t dim3Size = XAI_TILE3D_GET_DIM3(inTile); + const int32_t inDim2Edge1 = XAI_TILE3D_GET_DIM2_EDGE1(inTile); + const int32_t inDim2Edge2 = XAI_TILE3D_GET_DIM2_EDGE2(inTile); + const int32_t inDim3Edge1 = XAI_TILE3D_GET_DIM3_EDGE1(inTile); + const int32_t inDim3Edge2 = XAI_TILE3D_GET_DIM3_EDGE2(inTile); + const int32_t outDim2Edge1 = XAI_TILE3D_GET_DIM2_EDGE1(outTile); + const int32_t outDim2Edge2 = XAI_TILE3D_GET_DIM2_EDGE2(outTile); + const int32_t outDim3Edge1 = XAI_TILE3D_GET_DIM3_EDGE1(outTile); + const int32_t outDim3Edge2 = XAI_TILE3D_GET_DIM3_EDGE2(outTile); + /* Vectorization for xaiCopyTile3D function is always done across the first dimension */ + int32_t vectorizationWidth = 2 * XCHAL_IVPN_SIMD_WIDTH; + int32_t vectorizationWidth2X = vectorizationWidth * 2; + int32_t vectorizationWidth3X = vectorizationWidth * 3; + int32_t vectorizationWidth4X = vectorizationWidth * 4; + + int8_t *pInput = (int8_t *) XAI_TILE3D_GET_DATA_PTR(inTile); + int8_t *pOutput = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile); + + int32_t z, x, y; + int32_t dim1CopySize = dim1Size; + int32_t dim2CopySize = dim2Size; + int32_t dim3CopySize = dim3Size; + int32_t dim1CopyEdge1Size; + int32_t dim2CopyEdge1Size; + int32_t dim3CopyEdge1Size; + int32_t dim1CopyEdge2Size; + int32_t dim2CopyEdge2Size; + int32_t dim3CopyEdge2Size; + int32_t maxLoopCount; + valign vaInData; + valign vaOutData = IVP_ZALIGN(); + xb_vec2Nx8* restrict pdvecIn; + xb_vec2Nx8* restrict pdvecOut; + xb_vec2Nx8 vecValue; + + /* If copy_edge_extension flag is enabled update input and 
output data pointer */ + /* and data copy size across all 3 dimensions. */ + + if (copy_edge_extension) + { + dim1CopyEdge1Size = XT_MIN(inDim1Edge1, outDim1Edge1); + dim2CopyEdge1Size = XT_MIN(inDim2Edge1, outDim2Edge1); + dim3CopyEdge1Size = XT_MIN(inDim3Edge1, outDim3Edge1); + dim1CopyEdge2Size = XT_MIN(inDim1Edge2, outDim1Edge2); + dim2CopyEdge2Size = XT_MIN(inDim2Edge2, outDim2Edge2); + dim3CopyEdge2Size = XT_MIN(inDim3Edge2, outDim3Edge2); + dim1CopySize = dim1Size + dim1CopyEdge1Size + dim1CopyEdge2Size; + dim2CopySize = dim2Size + dim2CopyEdge1Size + dim2CopyEdge2Size; + dim3CopySize = dim3Size + dim3CopyEdge1Size + dim3CopyEdge2Size; + pInput = &pInput[-dim1CopyEdge1Size + ((-dim2CopyEdge1Size) * inDataPitch1) \ + + ((-dim3CopyEdge1Size) * inDataPitch2)]; + pOutput = &pOutput[-dim1CopyEdge1Size + ((-dim2CopyEdge1Size) * outDataPitch1) \ + + ((-dim3CopyEdge1Size) * outDataPitch2)]; + } + + /******************************************************************************/ + /* The overall design approach is split into 2 parts */ + /* 1. When both the input and the output tile pitch are equal to the copy */ + /* size along the first dimension. */ + /* - If above condition holds good, memory location to be copied */ + /* from inTile to outTile is contiguous. Hence vectorization can be */ + /* utilized effectively */ + /* 2. When the input or the output tile pitch differs from the copy size. */ + /* - If above condition holds good, memory location to be copied */ + /* from inTile to outTile is not contiguous. 
In order to do */ + /* vectorization across first dimension, output data pointers */ + /* need to be updated based on output tile copy size and */ + /* output tile pitch */ + /******************************************************************************/ + + if ((inDataPitch1 == dim1CopySize) && (outDataPitch1 == dim1CopySize)) + { + /* Data to be copied exist in contiguous memory location with respect to */ + /* first dimension */ + + /* Initialize max loop counter */ + int32_t maxdim3LoopCount = dim3CopySize; + maxLoopCount = dim1CopySize * dim2CopySize; + + if ((inDataPitch2 == maxLoopCount) && (outDataPitch2 == maxLoopCount)) + { + /* Data to be filled exist in contiguous memory location with respect to */ + /* first and second dimension */ + + /* Update max loop counter */ + maxdim3LoopCount = 1; + maxLoopCount *= dim3CopySize; + } + for (z = 0; z < maxdim3LoopCount; z++) + { + /* initialize input and output data pointer */ + pdvecIn = (xb_vec2Nx8 *) (pInput + (z * inDataPitch2)); + pdvecOut = (xb_vec2Nx8 *) (pOutput + (z * outDataPitch2)); + vaInData = IVP_LA2NX8_PP(pdvecIn); + for (x = 0; x < maxLoopCount - vectorizationWidth; x += vectorizationWidth) + { + /* Read vector input data */ + IVP_LA2NX8_IP(vecValue, vaInData, pdvecIn); + /* Store vector output data */ + IVP_SA2NX8_IP(vecValue, vaOutData, pdvecOut); + } + + IVP_LAV2NX8_XP(vecValue, vaInData, pdvecIn, maxLoopCount - x); + IVP_SAV2NX8_XP(vecValue, vaOutData, pdvecOut, maxLoopCount - x); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + } + } + else + { + /* else block execute, if output tile pitch is greater than output tile copy size */ + /* or input tile pitch in not equal to output tile pitch */ + + for (z = 0; z < dim3CopySize; z++) /* Loop across dim3 */ + { + x = 0; + /* Loop across dimension 1 */ + + /* Condition check added to maximize vectorization across dimension 1*/ + /* Loop across dim1 */ + for (; x < (dim1CopySize - vectorizationWidth3X); x += vectorizationWidth4X) + { + /* initialize input 
and output data pointer */ + int8_t *pInput1 = pInput + x + (z * inDataPitch2); + int8_t *pOutput1 = pOutput + x + (z * outDataPitch2); + int32_t varLen = dim1CopySize - (x + vectorizationWidth3X); + + for (y = 0; y < dim2CopySize; y++) + { + pdvecIn = (xb_vec2Nx8 *) (pInput1 + (y * inDataPitch1)); + pdvecOut = (xb_vec2Nx8 *) (pOutput1 + (y * outDataPitch1)); + vaInData = IVP_LA2NX8_PP(pdvecIn); + + /* Read vector data from inTile and copy vector data to outTile */ + IVP_LA2NX8_IP(vecValue, vaInData, pdvecIn); + IVP_SA2NX8_IP(vecValue, vaOutData, pdvecOut); + IVP_LA2NX8_IP(vecValue, vaInData, pdvecIn); + IVP_SA2NX8_IP(vecValue, vaOutData, pdvecOut); + IVP_LA2NX8_IP(vecValue, vaInData, pdvecIn); + IVP_SA2NX8_IP(vecValue, vaOutData, pdvecOut); + IVP_LAV2NX8_XP(vecValue, vaInData, pdvecIn, varLen); + IVP_SAV2NX8_XP(vecValue, vaOutData, pdvecOut, varLen); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + } + } + if (x < (dim1CopySize - vectorizationWidth2X)) /* Loop unrolling across dim2 */ + { + /* initialize input and output data pointer */ + int8_t *pInput1 = pInput + x + (z * inDataPitch2); + int8_t *pOutput1 = pOutput + x + (z * outDataPitch2); + int32_t varLen = dim1CopySize - (x + vectorizationWidth2X); + for (y = 0; y < dim2CopySize; y++) + { + pdvecIn = (xb_vec2Nx8 *) (pInput1 + (y * inDataPitch1)); + pdvecOut = (xb_vec2Nx8 *) (pOutput1 + (y * outDataPitch1)); + vaInData = IVP_LA2NX8_PP(pdvecIn); + + /* Read vector data from inTile and copy vector data to outTile */ + IVP_LA2NX8_IP(vecValue, vaInData, pdvecIn); + IVP_SA2NX8_IP(vecValue, vaOutData, pdvecOut); + IVP_LA2NX8_IP(vecValue, vaInData, pdvecIn); + IVP_SA2NX8_IP(vecValue, vaOutData, pdvecOut); + IVP_LAV2NX8_XP(vecValue, vaInData, pdvecIn, varLen); + IVP_SAV2NX8_XP(vecValue, vaOutData, pdvecOut, varLen); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + } + } + else if (x < (dim1CopySize - vectorizationWidth)) + { + /* initialize input and output data pointer */ + int8_t *pInput1 = pInput + x + (z * inDataPitch2); + 
int8_t *pOutput1 = pOutput + x + (z * outDataPitch2); + int32_t varLen = dim1CopySize - (x + vectorizationWidth); + for (y = 0; y < dim2CopySize; y++) + { + pdvecIn = (xb_vec2Nx8 *) (pInput1 + (y * inDataPitch1)); + pdvecOut = (xb_vec2Nx8 *) (pOutput1 + (y * outDataPitch1)); + vaInData = IVP_LA2NX8_PP(pdvecIn); + + /* Read vector data from inTile and copy vector data to outTile */ + IVP_LA2NX8_IP(vecValue, vaInData, pdvecIn); + IVP_SA2NX8_IP(vecValue, vaOutData, pdvecOut); + IVP_LAV2NX8_XP(vecValue, vaInData, pdvecIn, varLen); + IVP_SAV2NX8_XP(vecValue, vaOutData, pdvecOut, varLen); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + } + } + else if (x < dim1CopySize) + { + /* initialize input and output data pointer */ + int8_t *pInput1 = pInput + x + (z * inDataPitch2); + int8_t *pOutput1 = pOutput + x + (z * outDataPitch2); + int32_t varLen = dim1CopySize - x; + for (y = 0; y < dim2CopySize; y++) + { + pdvecIn = (xb_vec2Nx8 *) (pInput1 + (y * inDataPitch1)); + pdvecOut = (xb_vec2Nx8 *) (pOutput1 + (y * outDataPitch1)); + vaInData = IVP_LA2NX8_PP(pdvecIn); + + /* Read vector data from inTile and copy vector data */ + IVP_LAV2NX8_XP(vecValue, vaInData, pdvecIn, varLen); + IVP_SAV2NX8_XP(vecValue, vaOutData, pdvecOut, varLen); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + } + } + } + } + return(XAI_ERROR_STATUS()); +} + +/************************ xaiUnsignedToSigned3D_U8S8 ******************************/ +/* Description : P6 optimized implementation for converting the tile data from */ +/* unsigned 8bit to signed 8bit. This function can operate */ +/* in-place. Applications needing this function to operate */ +/* in-place can provide the same Input and Output Tiles. 
*/ +/* Inputs : Input Tile */ +/* Outputs : XI Error Code */ +/* InOuts : Output Tile */ +/* Assumptions : InData is unsigned 8bit */ +/* Unsigned to Signed 8bit conversion not performed on tile edges */ +/*********************************************************************************/ +XAI_ERR_TYPE xaiUnsignedToSigned3D_U8S8(xai_pTile3D inTile, xai_pTile3D outTile) +{ + /* Error Checks */ + XAI_ERROR_CHECKS() + { + XAI_CHECK_TILE3D_U8(inTile); + XAI_CHECK_TILE3D_S8(outTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile); + XAI_CHECK_TILE3D_SIZE_EQ(inTile, outTile); + XAI_CHECK_ERROR(XAI_TILE3D_GET_DATA_ORDER(inTile) == XAI_TILE3D_GET_DATA_ORDER(outTile), XAI_ERR_BADARG, \ + "\nData Order of InputTile = %d, OutputTile = %d\nData Order of InputTile and OutputTile should be same", \ + XAI_TILE3D_GET_DATA_ORDER(inTile), XAI_TILE3D_GET_DATA_ORDER(outTile)); + } + + /* Getting parameters from the tile structures */ + const int32_t dim1Size = XAI_TILE3D_GET_DIM1(inTile); + const int32_t dim2Size = XAI_TILE3D_GET_DIM2(inTile); + const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile); + const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile); + const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile); + const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile); + const int32_t dim3Size = XAI_TILE3D_GET_DIM3(inTile); + + /* Input and Output Data Pointers */ + uint8_t *pInput = (uint8_t *) XAI_TILE3D_GET_DATA_PTR(inTile); + int8_t *pOutput = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile); + int32_t maxLoopCount; + + /* xaiUnsignedToSigned3D_U8S8 function support in-place unsigned to signed 8bit */ + /* conversion. In a such a scenario inTile and outTile will be overlapping. 
*/ + /* Hence restrict keyword is not used for input and output data pointers */ + xb_vec2Nx8U* restrict pdvecIn; + xb_vec2Nx8* restrict pdvecOut; + valign vaInData; + valign vaOutData = IVP_ZALIGN(); + xb_vec2Nx8U vecValue1, vecValue2, vecValue3, vecValue4; + xb_vec2Nx8 vecValueSigned1, vecValueSigned2, vecValueSigned3, vecValueSigned4; + const xb_vec2Nx8 signedCharMax = SCHAR_MAX; + + /* Vectorization for xaiUnsignedToSigned3D_U8S8 function */ + /* is always done across the first dimension */ + int32_t vectorizationWidth = 2 * XCHAL_IVPN_SIMD_WIDTH; + int32_t vectorizationWidth2X = 2 * vectorizationWidth; + int32_t vectorizationWidth3X = 3 * vectorizationWidth; + int32_t vectorizationWidth4X = 4 * vectorizationWidth; + int32_t x, y, z; + + /******************************************************************************/ + /* The overall design approach is split into 2 parts */ + /* 1. When input tile pitch is equal to input tile width and input tile pitch */ + /* is equal to output tile pitch */ + /* - If above condition holds good, data elements for which unsigned */ + /* 8 bit to signed 8 bit conversion need to done present in contiguous */ + /* memory location. Hence vectorization can be utilized effectively */ + /* */ + /* 2. When input tile pitch is not equal to input tile size or input tile */ + /* pitch is not equal to output tile pitch */ + /* - If above condition holds good, data elements for which unsigned */ + /* 8 bit to signed 8 bit conversion need to done exist in non-contiguous */ + /* memory location. 
In order to do vectorization across first dimension, */ + /* output data pointers need to be updated based on output tile size */ + /* and output tile pitch */ + /******************************************************************************/ + + if ((inDataPitch1 == dim1Size) && (outDataPitch1 == dim1Size)) + { + /******************************************************************************/ + /* Data exist in contiguous memory location with respect to first dimension */ + /******************************************************************************/ + + /* Initialize max loop counter */ + int32_t dim3MaxLoopCount = dim3Size; + maxLoopCount = dim1Size * dim2Size; + + /* Updated Loop count based on tile dimension configuration */ + if ((inDataPitch2 == maxLoopCount) && (outDataPitch2 == maxLoopCount)) + { + /**********************************************************************/ + /* Data exist in contiguous memory location with respect to first and */ + /* second dimension */ + /**********************************************************************/ + dim3MaxLoopCount = 1; /* Update max loop counter */ + maxLoopCount *= dim3Size; + } + for (z = 0; z < dim3MaxLoopCount; z++) + { + /* initialize input data pointer */ + pdvecIn = (xb_vec2Nx8U *) (pInput + (z * inDataPitch2)); + /* initialize output data pointer */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + (z * outDataPitch2)); + vaInData = IVP_LA2NX8U_PP(pdvecIn); + + for (x = 0; x < maxLoopCount - vectorizationWidth4X; x += vectorizationWidth4X) + { + /* Load Data */ + IVP_LA2NX8U_IP(vecValue1, vaInData, pdvecIn); + IVP_LA2NX8U_IP(vecValue2, vaInData, pdvecIn); + IVP_LA2NX8U_IP(vecValue3, vaInData, pdvecIn); + IVP_LA2NX8U_IP(vecValue4, vaInData, pdvecIn); + + /* Perform unsigned to signed conversion and rounding off operation */ + vecValue1 = IVP_AVGRU2NX8(vecValue1, 0); + vecValue2 = IVP_AVGRU2NX8(vecValue2, 0); + vecValue3 = IVP_AVGRU2NX8(vecValue3, 0); + vecValue4 = IVP_AVGRU2NX8(vecValue4, 0); + + /* Perform 
saturation of signed max value */ + vecValueSigned1 = IVP_MINU2NX8U(signedCharMax, vecValue1); + vecValueSigned2 = IVP_MINU2NX8U(signedCharMax, vecValue2); + vecValueSigned3 = IVP_MINU2NX8U(signedCharMax, vecValue3); + vecValueSigned4 = IVP_MINU2NX8U(signedCharMax, vecValue4); + + /* Store Data */ + IVP_SA2NX8_IP(vecValueSigned1, vaOutData, pdvecOut); + IVP_SA2NX8_IP(vecValueSigned2, vaOutData, pdvecOut); + IVP_SA2NX8_IP(vecValueSigned3, vaOutData, pdvecOut); + IVP_SA2NX8_IP(vecValueSigned4, vaOutData, pdvecOut); + } + /* Load remaining data */ + IVP_LAV2NX8U_XP(vecValue1, vaInData, pdvecIn, maxLoopCount - (x + vectorizationWidth3X)); + IVP_LAV2NX8U_XP(vecValue2, vaInData, pdvecIn, maxLoopCount - (x + vectorizationWidth2X)); + IVP_LAV2NX8U_XP(vecValue3, vaInData, pdvecIn, maxLoopCount - (x + vectorizationWidth)); + IVP_LAV2NX8U_XP(vecValue4, vaInData, pdvecIn, maxLoopCount - x); + + /* Perform unsigned to signed conversion and rounding off operation */ + vecValue1 = IVP_AVGRU2NX8(vecValue1, 0); + vecValue2 = IVP_AVGRU2NX8(vecValue2, 0); + vecValue3 = IVP_AVGRU2NX8(vecValue3, 0); + vecValue4 = IVP_AVGRU2NX8(vecValue4, 0); + + /* Perform saturation of signed max value */ + vecValueSigned1 = IVP_MINU2NX8U(signedCharMax, vecValue1); + vecValueSigned2 = IVP_MINU2NX8U(signedCharMax, vecValue2); + vecValueSigned3 = IVP_MINU2NX8U(signedCharMax, vecValue3); + vecValueSigned4 = IVP_MINU2NX8U(signedCharMax, vecValue4); + + /* Variable stores */ + IVP_SAV2NX8_XP(vecValueSigned1, vaOutData, pdvecOut, + maxLoopCount - (x + vectorizationWidth3X)); + IVP_SAV2NX8_XP(vecValueSigned2, vaOutData, pdvecOut, + maxLoopCount - (x + vectorizationWidth2X)); + IVP_SAV2NX8_XP(vecValueSigned3, vaOutData, pdvecOut, maxLoopCount - (x + vectorizationWidth)); + IVP_SAV2NX8_XP(vecValueSigned4, vaOutData, pdvecOut, maxLoopCount - x); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + } + } + else + { + /* else block is executed if input tile pitch is not equal to input tile width or input tile */ + /* pitch 
is not equal to output tile pitch */ + + for (z = 0; z < dim3Size; z++) /* Loop across dim3 */ + { + x = 0; + /* Loop across dimension 1 */ + /* Condition check added to maximize vectorization across dimension 1*/ + /* Loop across dim1 */ + for (; x < (dim1Size - 3 * vectorizationWidth); x += 4 * vectorizationWidth) + { + /* initialize input and output data pointer */ + uint8_t *pInput1 = pInput + x + (z * inDataPitch2); + int8_t *pOutput1 = pOutput + x + (z * outDataPitch2); + int32_t varLen = dim1Size - (x + 3 * vectorizationWidth); + + for (y = 0; y < dim2Size; y++) + { + pdvecIn = (xb_vec2Nx8U *) (pInput1 + (y * inDataPitch1)); + pdvecOut = (xb_vec2Nx8 *) (pOutput1 + (y * outDataPitch1)); + vaInData = IVP_LA2NX8U_PP(pdvecIn); + + /* Load Input Data */ + IVP_LA2NX8U_IP(vecValue1, vaInData, pdvecIn); + IVP_LA2NX8U_IP(vecValue2, vaInData, pdvecIn); + IVP_LA2NX8U_IP(vecValue3, vaInData, pdvecIn); + IVP_LAV2NX8U_XP(vecValue4, vaInData, pdvecIn, varLen); + + /* Perform unsigned to signed conversion and rounding off operation */ + vecValue1 = IVP_AVGRU2NX8(vecValue1, 0); + vecValue2 = IVP_AVGRU2NX8(vecValue2, 0); + vecValue3 = IVP_AVGRU2NX8(vecValue3, 0); + vecValue4 = IVP_AVGRU2NX8(vecValue4, 0); + + /* Perform saturation of signed max value */ + vecValueSigned1 = IVP_MINU2NX8U(signedCharMax, vecValue1); + vecValueSigned2 = IVP_MINU2NX8U(signedCharMax, vecValue2); + vecValueSigned3 = IVP_MINU2NX8U(signedCharMax, vecValue3); + vecValueSigned4 = IVP_MINU2NX8U(signedCharMax, vecValue4); + + /* Store */ + IVP_SA2NX8_IP(vecValueSigned1, vaOutData, pdvecOut); + IVP_SA2NX8_IP(vecValueSigned2, vaOutData, pdvecOut); + IVP_SA2NX8_IP(vecValueSigned3, vaOutData, pdvecOut); + IVP_SAV2NX8_XP(vecValueSigned4, vaOutData, pdvecOut, varLen); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + } + } + if (x < (dim1Size - 2 * vectorizationWidth)) /* Loop unrolling across dim2 */ + { + /* initialize input and output data pointer */ + uint8_t *pInput1 = pInput + x + (z * inDataPitch2); + int8_t 
*pOutput1 = pOutput + x + (z * outDataPitch2); + int32_t varLen = dim1Size - (x + 2 * vectorizationWidth); + + for (y = 0; y < dim2Size; y++) + { + pdvecIn = (xb_vec2Nx8U *) (pInput1 + (y * inDataPitch1)); + pdvecOut = (xb_vec2Nx8 *) (pOutput1 + (y * outDataPitch1)); + vaInData = IVP_LA2NX8U_PP(pdvecIn); + + /* Load Input Data */ + IVP_LA2NX8U_IP(vecValue1, vaInData, pdvecIn); + IVP_LA2NX8U_IP(vecValue2, vaInData, pdvecIn); + IVP_LAV2NX8U_XP(vecValue3, vaInData, pdvecIn, varLen); + + /* Perform unsigned to signed conversion and rounding off operation */ + vecValue1 = IVP_AVGRU2NX8(vecValue1, 0); + vecValue2 = IVP_AVGRU2NX8(vecValue2, 0); + vecValue3 = IVP_AVGRU2NX8(vecValue3, 0); + + /* Perform saturation of signed max value */ + vecValueSigned1 = IVP_MINU2NX8U(signedCharMax, vecValue1); + vecValueSigned2 = IVP_MINU2NX8U(signedCharMax, vecValue2); + vecValueSigned3 = IVP_MINU2NX8U(signedCharMax, vecValue3); + + /* Store */ + IVP_SA2NX8_IP(vecValueSigned1, vaOutData, pdvecOut); + IVP_SA2NX8_IP(vecValueSigned2, vaOutData, pdvecOut); + IVP_SAV2NX8_XP(vecValueSigned3, vaOutData, pdvecOut, varLen); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + } + } + else if (x < (dim1Size - vectorizationWidth)) + { + /* initialize input and output data pointer */ + uint8_t *pInput1 = pInput + x + (z * inDataPitch2); + int8_t *pOutput1 = pOutput + x + (z * outDataPitch2); + int32_t varLen = dim1Size - (x + vectorizationWidth); + + for (y = 0; y < dim2Size; y++) + { + pdvecIn = (xb_vec2Nx8U *) (pInput1 + (y * inDataPitch1)); + pdvecOut = (xb_vec2Nx8 *) (pOutput1 + (y * outDataPitch1)); + vaInData = IVP_LA2NX8U_PP(pdvecIn); + + /* Load Input Data */ + IVP_LA2NX8U_IP(vecValue1, vaInData, pdvecIn); + IVP_LAV2NX8U_XP(vecValue2, vaInData, pdvecIn, varLen); + + /* Perform unsigned to signed conversion and rounding off operation */ + vecValue1 = IVP_AVGRU2NX8(vecValue1, 0); + vecValue2 = IVP_AVGRU2NX8(vecValue2, 0); + + /* Perform saturation of signed max value */ + vecValueSigned1 = 
IVP_MINU2NX8U(signedCharMax, vecValue1); + vecValueSigned2 = IVP_MINU2NX8U(signedCharMax, vecValue2); + + /* Store */ + IVP_SA2NX8_IP(vecValueSigned1, vaOutData, pdvecOut); + IVP_SAV2NX8_XP(vecValueSigned2, vaOutData, pdvecOut, dim1Size - (x + vectorizationWidth)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + } + } + else if (x < dim1Size) + { + /* initialize input and output data pointer */ + uint8_t *pInput1 = pInput + x + (z * inDataPitch2); + int8_t *pOutput1 = pOutput + x + (z * outDataPitch2); + int32_t varLen = dim1Size - x; + + for (y = 0; y < dim2Size; y++) + { + pdvecIn = (xb_vec2Nx8U *) (pInput1 + (y * inDataPitch1)); + pdvecOut = (xb_vec2Nx8 *) (pOutput1 + (y * outDataPitch1)); + vaInData = IVP_LA2NX8U_PP(pdvecIn); + + /* Load Input Data */ + IVP_LAV2NX8U_XP(vecValue1, vaInData, pdvecIn, varLen); + + /* Perform unsigned to signed conversion and rounding off operation */ + vecValue1 = IVP_AVGRU2NX8(vecValue1, 0); + + /* Perform saturation of signed max value */ + vecValueSigned1 = IVP_MINU2NX8U(signedCharMax, vecValue1); + + /* Store */ + IVP_SAV2NX8_XP(vecValueSigned1, vaOutData, pdvecOut, varLen); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + } + } + } + } + return(XAI_ERROR_STATUS()); +} + +/********************* xaiDataConversion3D_S8S16 ************************/ +/* Description : P6 implementation converting a signed 8-bit 3D tile to a signed 16-bit 3D tile; every element is multiplied by scale and the product is packed to 16 bit with shift applied */ +/* Inputs : inTile (S8 tile), scale (uint16_t multiplier), shift (uint8_t, must be less than 24) */ +/* Outputs : XAI error code, as produced by XAI_ERROR_STATUS() */ +/* InOuts : outTile (S16 tile; must have the same size and data order as inTile and must not overlap it) */ +/* Assumptions : InData is signed 8bit */ +/***********************************************************************/ + +XAI_ERR_TYPE xaiDataConversion3D_S8S16(const xai_pTile3D inTile, + xai_pTile3D outTile, + const uint16_t scale, + const uint8_t shift) +{ + /* Error Checks: tile types, DRAM placement, equal sizes, no overlap, shift range, matching data order */ + XAI_ERROR_CHECKS() + { + XAI_CHECK_TILE3D_S8(inTile); + XAI_CHECK_TILE3D_S16(outTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile); + XAI_CHECK_TILE3D_SIZE_EQ(inTile, outTile); +
XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTile); + XAI_CHECK_ERROR(shift < 24, XAI_ERR_NORM, \ + "Shift = %hhu, value should be less than 24", shift); + XAI_CHECK_ERROR(XAI_TILE3D_GET_DATA_ORDER(inTile) == XAI_TILE3D_GET_DATA_ORDER(outTile), XAI_ERR_BADARG, \ + "\nData Order of InputTile = %d, OutputTile = %d\nData Order of InputTile and OutputTile should be same", \ + XAI_TILE3D_GET_DATA_ORDER(inTile), XAI_TILE3D_GET_DATA_ORDER(outTile)); + } + + /* Get Tile Parameters (sizes come from inTile; the size-equality check above covers outTile) */ + const int32_t dim1Size = XAI_TILE3D_GET_DIM1(inTile); + const int32_t dim2Size = XAI_TILE3D_GET_DIM2(inTile); + const int32_t dim3Size = XAI_TILE3D_GET_DIM3(inTile); + const int32_t inTilePitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile); + const int32_t inTilePitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile); + const int32_t outTilePitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile); + const int32_t outTilePitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile); + + valign vaOut = IVP_ZALIGN(); + + /* Get Data Pointers */ + int8_t *pInput = (int8_t *) XAI_TILE3D_GET_DATA_PTR(inTile); + int16_t *pOutput = (int16_t *) XAI_TILE3D_GET_DATA_PTR(outTile); + + /* Vectorization width: elements handled per vector (XCHAL_IVPN_SIMD_WIDTH) and its 2x/3x/4x multiples used by the unrolled paths */ + const int32_t vectorizationWidth = XCHAL_IVPN_SIMD_WIDTH; + const int32_t vectorizationWidth2X = vectorizationWidth * 2; + const int32_t vectorizationWidth3X = vectorizationWidth * 3; + const int32_t vectorizationWidth4X = vectorizationWidth * 4; + + /* loop variables: x runs along dim1, y along dim2, z along dim3 */ + int32_t x, y, z; + + /* input and output pointers */ + xb_vecNx8 * restrict pvecIn; + xb_vecNx16 * restrict pvecOut; + + /* input and output data vectors */ + xb_vecNx16 vecInData0, vecInData1, vecInData2, vecInData3; + xb_vecNx16 vecOut0, vecOut1, vecOut2, vecOut3; + + /******************************************************************************/ + /* The overall design approach is split into 2 parts */ + /* 1.
When both the input and the output dim1 pitch are equal to the dim1 */ + /* size (rows follow each other without a gap in both tiles) */ + /* - The elements to be converted from S8 to S16 are located in */ + /* contiguous memory, so one linear vector loop covers a whole plane, */ + /* or the whole tile when the dim2 pitches match as well. */ + /* */ + /* 2. When the input or the output dim1 pitch differs from the dim1 size */ + /* (pitch and width differ in at least one of the tiles) */ + /* - The elements to be converted from S8 to S16 are not contiguous. */ + /* Vectorization is done along the first dimension only, and the */ + /* input and output pointers are recomputed for every row from the */ + /* input and output tile pitches. */ + /* */ + /******************************************************************************/ + + if ((inTilePitch1 == dim1Size) && (outTilePitch1 == dim1Size)) + { + /******************************************************************************/ + /* Data is contiguous in memory with respect to the first dimension */ + /******************************************************************************/ + + /* input and output vectors */ + xb_vecNx16 vecInData; + xb_vecNx16 vecOut; + /* Default: one linear pass of dim1Size * dim2Size elements per dim3 plane */ + int32_t dim3MaxLoopCount = dim3Size; + int32_t maxLoopCount = dim1Size * dim2Size; + + /* Adjust the loop counts when the planes are stored back to back as well */ + if ((inTilePitch2 == maxLoopCount) && (outTilePitch2 == maxLoopCount)) + { + /**********************************************************************/ + /* Data is contiguous in memory with respect to the first and */ + /* second dimension */ + /**********************************************************************/ + + /* A single pass then covers the whole tile */ + dim3MaxLoopCount = 1; + maxLoopCount *= dim3Size; + } + for (z = 0; z < dim3MaxLoopCount; z++) + { + /* initialize input and output data pointer */ + pvecIn = (xb_vecNx8 *) (pInput +
(z * inTilePitch2)); + pvecOut = (xb_vecNx16 *) (pOutput + (z * outTilePitch2)); + valign vaInData = IVP_LANX8S_PP(pvecIn); + int32_t varlen; + + for (x = 0; x < maxLoopCount - vectorizationWidth; x += vectorizationWidth) + { + /* Load input data */ + IVP_LANX8S_IP(vecInData, vaInData, pvecIn); + + /* apply scale and shift to input data. + * multiplying with scale results in 32 way 48-bit + * data to which shift is applied, so final result is + * 32 way 16 bit. + */ + vecOut = IVP_PACKVRNX48(IVP_MULUSNX16((xb_vecNx16U) scale, vecInData), shift); + + /* store output data */ + IVP_SANX16_IP(vecOut, vaOut, pvecOut); + } + varlen = (maxLoopCount - x); /* elements not yet converted by the loop above */ + IVP_LANX8S_IP(vecInData, vaInData, pvecIn); /* same full-width load as in the loop; only varlen results are stored below */ + + /* apply scale and shift to input data. + * multiplying with scale results in 32 way 48-bit + * data to which shift is applied, so final result is + * 32 way 16 bit. + */ + vecOut = IVP_PACKVRNX48(IVP_MULUSNX16((xb_vecNx16U) scale, vecInData), shift); + + /* store the remaining results; NOTE(review): the length argument is varlen << 1, presumably a byte count for the 16-bit outputs - confirm */ + IVP_SAVNX16_XP(vecOut, vaOut, pvecOut, (varlen << 1)); + IVP_SAPOSNX16_FP(vaOut, pvecOut); + } + } + else + { + /* This branch runs when the input or the output dim1 pitch differs from the dim1 size, */ + /* i.e. 
rows are not contiguous in at least one of the tiles */ + + for (z = 0; z < dim3Size; z++) /* along 3rd dimension */ + { + x = 0; + /* Loop Unroll=4 along 1st dimension: three full vectors plus one variable-length store per row */ + for (; x < (dim1Size - vectorizationWidth3X); x += vectorizationWidth4X) + { + /* Initialize input and output data pointer */ + int8_t * pIn = &pInput[z * inTilePitch2 + x]; + int16_t *pOut = &pOutput[z * outTilePitch2 + x]; + int32_t varLen = dim1Size - (x + vectorizationWidth3X); + + for (y = 0; y < dim2Size; y++) /* along 2nd dimension */ + { + pvecIn = (xb_vecNx8 *) (pIn + (y * inTilePitch1)); + pvecOut = (xb_vecNx16 *) (pOut + (y * outTilePitch1)); + + valign vaInData = IVP_LANX8S_PP(pvecIn); + /* load input data */ + IVP_LANX8S_IP(vecInData0, vaInData, pvecIn); + IVP_LANX8S_IP(vecInData1, vaInData, pvecIn); +
IVP_LANX8S_IP(vecInData2, vaInData, pvecIn); + IVP_LANX8S_IP(vecInData3, vaInData, pvecIn); + + /* apply scale and shift to input data. + * multiplying with scale results in 32 way 48-bit + * data to which shift is applied, so final result is + * 32 way 16 bit. + */ + vecOut0 = IVP_PACKVRNX48(IVP_MULUSNX16((xb_vecNx16U) scale, vecInData0), shift); + + vecOut1 = IVP_PACKVRNX48(IVP_MULUSNX16((xb_vecNx16U) scale, vecInData1), shift); + + vecOut2 = IVP_PACKVRNX48(IVP_MULUSNX16((xb_vecNx16U) scale, vecInData2), shift); + + vecOut3 = IVP_PACKVRNX48(IVP_MULUSNX16((xb_vecNx16U) scale, vecInData3), shift); + + /* Store output data: three full vectors, then the variable-length fourth one */ + IVP_SANX16_IP(vecOut0, vaOut, pvecOut); + IVP_SANX16_IP(vecOut1, vaOut, pvecOut); + IVP_SANX16_IP(vecOut2, vaOut, pvecOut); + IVP_SAVNX16_XP(vecOut3, vaOut, pvecOut, (varLen << 1)); + IVP_SAPOSNX16_FP(vaOut, pvecOut); + } + } + if (x < (dim1Size - vectorizationWidth2X)) /* more than two vectors of the row remain: two full stores plus one variable-length store */ + { + /* Initialize input and output data pointer */ + int8_t * pIn = &pInput[z * inTilePitch2 + x]; + int16_t *pOut = &pOutput[z * outTilePitch2 + x]; + int32_t varLen = dim1Size - (x + vectorizationWidth2X); + + for (y = 0; y < dim2Size; y++) /* along 2nd dimension */ + { + pvecIn = (xb_vecNx8 *) (pIn + (y * inTilePitch1)); + pvecOut = (xb_vecNx16 *) (pOut + (y * outTilePitch1)); + valign vaInData = IVP_LANX8S_PP(pvecIn); + /* load input data */ + IVP_LANX8S_IP(vecInData0, vaInData, pvecIn); + IVP_LANX8S_IP(vecInData1, vaInData, pvecIn); + IVP_LANX8S_IP(vecInData2, vaInData, pvecIn); + + /* apply scale and shift to input data. + * multiplying with scale results in 32 way 48-bit + * data to which shift is applied, so final result is + * 32 way 16 bit.
+ */ + vecOut0 = IVP_PACKVRNX48(IVP_MULUSNX16((xb_vecNx16U) scale, vecInData0), shift); + + vecOut1 = IVP_PACKVRNX48(IVP_MULUSNX16((xb_vecNx16U) scale, vecInData1), shift); + + vecOut2 = IVP_PACKVRNX48(IVP_MULUSNX16((xb_vecNx16U) scale, vecInData2), shift); + + /* Store output data: two full vectors, then the variable-length third one */ + IVP_SANX16_IP(vecOut0, vaOut, pvecOut); + IVP_SANX16_IP(vecOut1, vaOut, pvecOut); + IVP_SAVNX16_XP(vecOut2, vaOut, pvecOut, (varLen << 1)); + IVP_SAPOSNX16_FP(vaOut, pvecOut); + } + } + else if (x < (dim1Size - vectorizationWidth)) /* more than one vector of the row remains: one full store plus one variable-length store */ + { + /* Initialize input and output data pointer */ + int8_t * pIn = &pInput[z * inTilePitch2 + x]; + int16_t *pOut = &pOutput[z * outTilePitch2 + x]; + int32_t varLen = dim1Size - (x + vectorizationWidth); + + for (y = 0; y < dim2Size; y++) /* along 2nd dimension */ + { + pvecIn = (xb_vecNx8 *) (pIn + (y * inTilePitch1)); + pvecOut = (xb_vecNx16 *) (pOut + (y * outTilePitch1)); + valign vaInData = IVP_LANX8S_PP(pvecIn); + + /* load input data */ + IVP_LANX8S_IP(vecInData0, vaInData, pvecIn); + IVP_LANX8S_IP(vecInData1, vaInData, pvecIn); + + /* apply scale and shift to input data. + * multiplying with scale results in 32 way 48-bit + * data to which shift is applied, so final result is + * 32 way 16 bit.
+ */ + vecOut0 = IVP_PACKVRNX48(IVP_MULUSNX16((xb_vecNx16U) scale, vecInData0), shift); + + vecOut1 = IVP_PACKVRNX48(IVP_MULUSNX16((xb_vecNx16U) scale, vecInData1), shift); + + /* Store output data: one full vector, then the variable-length second one */ + IVP_SANX16_IP(vecOut0, vaOut, pvecOut); + IVP_SAVNX16_XP(vecOut1, vaOut, pvecOut, (varLen << 1)); + IVP_SAPOSNX16_FP(vaOut, pvecOut); + } + } + else if (x < dim1Size) /* at most one vector of the row remains: a single variable-length store */ + { + /* Initialize input and output data pointer */ + int8_t * pIn = &pInput[z * inTilePitch2 + x]; + int16_t *pOut = &pOutput[z * outTilePitch2 + x]; + int32_t varLen = dim1Size - x; + + for (y = 0; y < dim2Size; y++) /* along 2nd dimension */ + { + pvecIn = (xb_vecNx8 *) (pIn + (y * inTilePitch1)); + pvecOut = (xb_vecNx16 *) (pOut + (y * outTilePitch1)); + valign vaInData = IVP_LANX8S_PP(pvecIn); + /* load input data */ + IVP_LANX8S_IP(vecInData0, vaInData, pvecIn); + + /* apply scale and shift to input data. + * multiplying with scale results in 32 way 48-bit + * data to which shift is applied, so final result is + * 32 way 16 bit.
+ */ + vecOut0 = IVP_PACKVRNX48(IVP_MULUSNX16((xb_vecNx16U) scale, vecInData0), shift); + + /* Store output data */ + IVP_SAVNX16_XP(vecOut0, vaOut, pvecOut, (varLen << 1)); + IVP_SAPOSNX16_FP(vaOut, pvecOut); + } + } + } + } + return(XAI_ERROR_STATUS()); +} + +/********************* xaiDataConversion3D_U8S8 ***********************/ +/* Description : P6 implementation converting an unsigned 8-bit 3D tile to a signed 8-bit 3D tile; every element is multiplied by scale, packed with shift applied and stored as 8 bit */ +/* Inputs : inTile (U8 tile), scale (uint16_t multiplier), shift (uint8_t, must be less than 24) */ +/* Outputs : XAI error code, as produced by XAI_ERROR_STATUS() */ +/* InOuts : outTile (S8 tile; must have the same size and data order as inTile) */ +/* Assumptions : InData is unsigned 8bit */ +/*********************************************************************/ +XAI_ERR_TYPE xaiDataConversion3D_U8S8(const xai_pTile3D inTile, + xai_pTile3D outTile, + const uint16_t scale, + const uint8_t shift) +{ + /* Error Checks. 
NOTE(review): unlike xaiDataConversion3D_S8S16 there is no XAI_CHECK_TILES3D_ARE_NOT_OVERLAP here - confirm whether in-place conversion is intended */ + XAI_ERROR_CHECKS() + { + XAI_CHECK_TILE3D_U8(inTile); + XAI_CHECK_TILE3D_S8(outTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile); + XAI_CHECK_TILE3D_SIZE_EQ(inTile, outTile); + XAI_CHECK_ERROR(shift < 24, XAI_ERR_NORM, \ + "Shift = %hhu, value should be less than 24", shift); + XAI_CHECK_ERROR(XAI_TILE3D_GET_DATA_ORDER(inTile) == XAI_TILE3D_GET_DATA_ORDER(outTile), XAI_ERR_BADARG, \ + "\nData Order of InputTile = %d, OutputTile = %d\nData Order of InputTile and OutputTile should be same", \ + XAI_TILE3D_GET_DATA_ORDER(inTile), XAI_TILE3D_GET_DATA_ORDER(outTile)); + } + /* Get Tile Parameters (sizes come from inTile; the size-equality check above covers outTile) */ + const int32_t dim1Size = XAI_TILE3D_GET_DIM1(inTile); + const int32_t dim2Size = XAI_TILE3D_GET_DIM2(inTile); + const int32_t dim3Size = XAI_TILE3D_GET_DIM3(inTile); + const int32_t inTilePitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile); + const int32_t inTilePitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile); + const int32_t outTilePitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile); + const int32_t outTilePitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile); + + valign vaOut = IVP_ZALIGN(); + + /* Get Data Pointers */ + uint8_t *pInput = (uint8_t *) XAI_TILE3D_GET_DATA_PTR(inTile); +
int8_t *pOutput = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile); + + /* Vectorization width: elements handled per vector (XCHAL_IVPN_SIMD_WIDTH) and its 2x/3x/4x multiples used by the unrolled paths */ + const int32_t vectorizationWidth = XCHAL_IVPN_SIMD_WIDTH; + const int32_t vectorizationWidth2X = vectorizationWidth * 2; + const int32_t vectorizationWidth3X = vectorizationWidth * 3; + const int32_t vectorizationWidth4X = vectorizationWidth * 4; + + /* loop variables: x runs along dim1, y along dim2, z along dim3 */ + int32_t x, y, z; + + /* input and output pointers. NOTE(review): both are restrict although this function performs no tile overlap check - confirm callers never pass overlapping tiles */ + xb_vecNx8U * restrict pvecIn; + xb_vecNx8 * restrict pvecOut; + + /* input and output data vectors */ + xb_vecNx16U vecInData0, vecInData1, vecInData2, vecInData3; + xb_vecNx16 vecOut0, vecOut1, vecOut2, vecOut3; + + /********************************************************************************/ + /* The overall design approach is split into 2 parts */ + /* 1. When both the input and the output dim1 pitch are equal to the dim1 */ + /* size (rows follow each other without a gap in both tiles) */ + /* - The elements to be converted from U8 to S8 are located in */ + /* contiguous memory, so one linear vector loop covers a whole plane, */ + /* or the whole tile when the dim2 pitches match as well. */ + /* */ + /* 2. When the input or the output dim1 pitch differs from the dim1 size */ + /* (pitch and width differ in at least one of the tiles) */ + /* - The elements to be converted from U8 to S8 are not contiguous. */ + /* Vectorization is done along the first dimension only, and the */ + /* input and output pointers are recomputed for every row from the */ + /* input and output tile pitches. 
*/ + /*
*/ + /********************************************************************************/ + if ((inTilePitch1 == dim1Size) && (outTilePitch1 == dim1Size)) + { + /******************************************************************************/ + /* Data is contiguous in memory with respect to the first dimension */ + /******************************************************************************/ + /* Input and output vectors */ + xb_vecNx16U vecInData; + xb_vecNx16 vecOut; + + /* Default: one linear pass of dim1Size * dim2Size elements per dim3 plane */ + int32_t dim3MaxLoopCount = dim3Size; + int32_t maxLoopCount = dim1Size * dim2Size; + + /* Adjust the loop counts when the planes are stored back to back as well */ + if ((inTilePitch2 == maxLoopCount) && (outTilePitch2 == maxLoopCount)) + { + /**********************************************************************/ + /* Data is contiguous in memory with respect to the first and */ + /* second dimension */ + /**********************************************************************/ + + /* A single pass then covers the whole tile */ + dim3MaxLoopCount = 1; + maxLoopCount *= dim3Size; + } + for (z = 0; z < dim3MaxLoopCount; z++) + { + /* initialize input and output data pointer */ + pvecIn = (xb_vecNx8U *) (pInput + (z * inTilePitch2)); + pvecOut = (xb_vecNx8 *) (pOutput + (z * outTilePitch2)); + valign vaInData = IVP_LANX8U_PP(pvecIn); + int32_t varlen; + + for (x = 0; x < maxLoopCount - vectorizationWidth; x += vectorizationWidth) + { + /* Load input data */ + IVP_LANX8U_IP(vecInData, vaInData, pvecIn); + + /* apply scale and shift to input data. + * multiplying with scale results in 32 way 48-bit + * data to which shift is applied; the 8-bit store below + * then narrows each result. NOTE(review): no explicit clamp to + * 0..SCHAR_MAX is visible in this function - confirm the narrowing behaviour.
+ */ + vecOut = IVP_PACKVRNX48(IVP_MULUUNX16((xb_vecNx16U) scale, vecInData), shift); + + /* store output data */ + IVP_SANX8S_IP(vecOut, vaOut, pvecOut); + } + varlen = (maxLoopCount - x); /* elements not yet converted by the loop above */ + IVP_LANX8U_IP(vecInData, vaInData, pvecIn); /* same full-width load as in the loop; only varlen results are stored below */ + + /* apply scale and shift to input data. + * multiplying with scale results in 32 way 48-bit + * data to which shift is applied; the 8-bit store below + * then narrows each result (see the NOTE(review) in the loop + * above regarding the missing explicit clamp). + */ + vecOut = IVP_PACKVRNX48(IVP_MULUUNX16((xb_vecNx16U) scale, vecInData), shift); + + /* store the remaining varlen results (8-bit outputs, so the length argument is varlen itself) */ + IVP_SAVNX8S_XP(vecOut, vaOut, pvecOut, varlen); + IVP_SAPOSNX8S_FP(vaOut, pvecOut); + } + } + else + { + /* This branch runs when the input or the output dim1 pitch differs from the dim1 size, */ + /* i.e. rows are not contiguous in at least one of the tiles */ + + for (z = 0; z < dim3Size; z++) /* along 3rd dimension */ + { + x = 0; + /* Loop Unroll=4 along 1st dimension: three full vectors plus one variable-length store per row */ + for (; x < (dim1Size - vectorizationWidth3X); x += vectorizationWidth4X) + { + /* Initialize input and output data pointer */ + uint8_t * pIn = &pInput[z * inTilePitch2 + x]; + int8_t *pOut = &pOutput[z * outTilePitch2 + x]; + int32_t varLen = dim1Size - (x + vectorizationWidth3X); + + for (y = 0; y < dim2Size; y++) /* along 2nd dimension */ + { + pvecIn = (xb_vecNx8U *) (pIn + (y * inTilePitch1)); + pvecOut = (xb_vecNx8 *) (pOut + (y * outTilePitch1)); + + valign vaInData = IVP_LANX8U_PP(pvecIn); + + /* load input data */ + IVP_LANX8U_IP(vecInData0, vaInData, pvecIn); + IVP_LANX8U_IP(vecInData1, vaInData, pvecIn); + IVP_LANX8U_IP(vecInData2, vaInData, pvecIn); + IVP_LANX8U_IP(vecInData3, vaInData, pvecIn); + + /* apply scale and shift to input data. 
+ * multiplying with scale results in 32 way 48-bit + * data to which shift is applied; the 8-bit stores below + * then narrow each result (no explicit clamp is visible + * here - NOTE(review): confirm).
+ */ + vecOut0 = IVP_PACKVRNX48(IVP_MULUUNX16((xb_vecNx16U) scale, vecInData0), shift); + + vecOut1 = IVP_PACKVRNX48(IVP_MULUUNX16((xb_vecNx16U) scale, vecInData1), shift); + + vecOut2 = IVP_PACKVRNX48(IVP_MULUUNX16((xb_vecNx16U) scale, vecInData2), shift); + + vecOut3 = IVP_PACKVRNX48(IVP_MULUUNX16((xb_vecNx16U) scale, vecInData3), shift); + + /* Store output data: three full vectors, then the variable-length fourth one */ + IVP_SANX8S_IP(vecOut0, vaOut, pvecOut); + IVP_SANX8S_IP(vecOut1, vaOut, pvecOut); + IVP_SANX8S_IP(vecOut2, vaOut, pvecOut); + IVP_SAVNX8S_XP(vecOut3, vaOut, pvecOut, varLen); + IVP_SAPOSNX8S_FP(vaOut, pvecOut); + } + } + if (x < (dim1Size - vectorizationWidth2X)) /* more than two vectors of the row remain: two full stores plus one variable-length store */ + { + /* Initialize input and output data pointer */ + uint8_t * pIn = &pInput[z * inTilePitch2 + x]; + int8_t *pOut = &pOutput[z * outTilePitch2 + x]; + int32_t varLen = dim1Size - (x + vectorizationWidth2X); + + for (y = 0; y < dim2Size; y++) /* along 2nd dimension */ + { + pvecIn = (xb_vecNx8U *) (pIn + (y * inTilePitch1)); + pvecOut = (xb_vecNx8 *) (pOut + (y * outTilePitch1)); + + valign vaInData = IVP_LANX8U_PP(pvecIn); + + /* load input data */ + IVP_LANX8U_IP(vecInData0, vaInData, pvecIn); + IVP_LANX8U_IP(vecInData1, vaInData, pvecIn); + IVP_LANX8U_IP(vecInData2, vaInData, pvecIn); + + /* apply scale and shift to input data. + * multiplying with scale results in 32 way 48-bit + * data to which shift is applied; the 8-bit stores below + * then narrow each result (no explicit clamp is visible + * here - NOTE(review): confirm).
+ */ + vecOut0 = IVP_PACKVRNX48(IVP_MULUUNX16((xb_vecNx16U) scale, vecInData0), shift); + + vecOut1 = IVP_PACKVRNX48(IVP_MULUUNX16((xb_vecNx16U) scale, vecInData1), shift); + + vecOut2 = IVP_PACKVRNX48(IVP_MULUUNX16((xb_vecNx16U) scale, vecInData2), shift); + + /* Store output data: two full vectors, then the variable-length third one */ + IVP_SANX8S_IP(vecOut0, vaOut, pvecOut); + IVP_SANX8S_IP(vecOut1, vaOut, pvecOut); + IVP_SAVNX8S_XP(vecOut2, vaOut, pvecOut, varLen); + IVP_SAPOSNX8S_FP(vaOut, pvecOut); + } + } + else if (x < (dim1Size - vectorizationWidth)) /* more than one vector of the row remains: one full store plus one variable-length store */ + { + /* Initialize input and output data pointer */ + uint8_t * pIn = &pInput[z * inTilePitch2 + x]; + int8_t *pOut = &pOutput[z * outTilePitch2 + x]; + int32_t varLen = dim1Size - (x + vectorizationWidth); + + for (y = 0; y < dim2Size; y++) /* along 2nd dimension */ + { + pvecIn = (xb_vecNx8U *) (pIn + (y * inTilePitch1)); + pvecOut = (xb_vecNx8 *) (pOut + (y * outTilePitch1)); + + valign vaInData = IVP_LANX8U_PP(pvecIn); + + /* load input data */ + IVP_LANX8U_IP(vecInData0, vaInData, pvecIn); + IVP_LANX8U_IP(vecInData1, vaInData, pvecIn); + + /* apply scale and shift to input data. + * multiplying with scale results in 32 way 48-bit + * data to which shift is applied; the 8-bit stores below + * then narrow each result (no explicit clamp is visible + * here - NOTE(review): confirm).
+ */ + vecOut0 = IVP_PACKVRNX48(IVP_MULUUNX16((xb_vecNx16U) scale, vecInData0), shift); + + vecOut1 = IVP_PACKVRNX48(IVP_MULUUNX16((xb_vecNx16U) scale, vecInData1), shift); + + /* Store output data: one full vector, then the variable-length second one */ + IVP_SANX8S_IP(vecOut0, vaOut, pvecOut); + IVP_SAVNX8S_XP(vecOut1, vaOut, pvecOut, varLen); + IVP_SAPOSNX8S_FP(vaOut, pvecOut); + } + } + else if (x < dim1Size) /* at most one vector of the row remains: a single variable-length store */ + { + /* Initialize input and output data pointer */ + uint8_t * pIn = &pInput[z * inTilePitch2 + x]; + int8_t *pOut = &pOutput[z * outTilePitch2 + x]; + int32_t varLen = dim1Size - x; + + for (y = 0; y < dim2Size; y++) /* along 2nd dimension */ + { + pvecIn = (xb_vecNx8U *) (pIn + (y * inTilePitch1)); + pvecOut = (xb_vecNx8 *) (pOut + (y * outTilePitch1)); + + valign vaInData = IVP_LANX8U_PP(pvecIn); + + /* load input data */ + IVP_LANX8U_IP(vecInData0, vaInData, pvecIn); + + /* apply scale and shift to input data. + * multiplying with scale results in 32 way 48-bit + * data to which shift is applied; the 8-bit store below + * then narrows each result (no explicit clamp is visible + * here - NOTE(review): confirm).
+ */ + vecOut0 = IVP_PACKVRNX48(IVP_MULUUNX16((xb_vecNx16U) scale, vecInData0), shift); + + /* Store output data */ + IVP_SAVNX8S_XP(vecOut0, vaOut, pvecOut, varLen); + IVP_SAPOSNX8S_FP(vaOut, pvecOut); + } + } + } + } + return(XAI_ERROR_STATUS()); +} + +/********************* xaiDataConversion3D_U8S16 ***********************/ +/* Description : P6 implementation for conversion from U8 to S16 */ +/* Inputs : Input Tile, scale, shift */ +/* Outputs : XI Error Code */ +/* InOuts : Output Tile */ +/* Assumptions : InData is unsigned 8bit */ +/**********************************************************************/ + +XAI_ERR_TYPE xaiDataConversion3D_U8S16(const xai_pTile3D inTile, + xai_pTile3D outTile, + const uint16_t scale, + const uint8_t shift) +{ + /* Error Checks */ + XAI_ERROR_CHECKS() + { + XAI_CHECK_TILE3D_U8(inTile); + XAI_CHECK_TILE3D_S16(outTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile); + XAI_CHECK_TILE3D_SIZE_EQ(inTile, outTile); + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTile); + XAI_CHECK_ERROR(shift < 24, XAI_ERR_NORM, \ + "Shift = %hhu, value should be less than 24", shift); + XAI_CHECK_ERROR(XAI_TILE3D_GET_DATA_ORDER(inTile) == XAI_TILE3D_GET_DATA_ORDER(outTile), XAI_ERR_BADARG, \ + "\nData Order of InputTile = %d, OutputTile = %d\nData Order of InputTile and OutputTile should be same", \ + XAI_TILE3D_GET_DATA_ORDER(inTile), XAI_TILE3D_GET_DATA_ORDER(outTile)); + } + + /* Get Tile Parameters */ + const int32_t dim1Size = XAI_TILE3D_GET_DIM1(inTile); + const int32_t dim2Size = XAI_TILE3D_GET_DIM2(inTile); + const int32_t dim3Size = XAI_TILE3D_GET_DIM3(inTile); + const int32_t inTilePitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile); + const int32_t inTilePitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile); + const int32_t outTilePitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile); + const int32_t outTilePitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile); + + valign vaOut = IVP_ZALIGN(); + + /* Get Data Pointers */ + uint8_t
*pInput = (uint8_t *) XAI_TILE3D_GET_DATA_PTR(inTile); + int16_t *pOutput = (int16_t *) XAI_TILE3D_GET_DATA_PTR(outTile); + + /* vectorization width */ + const int32_t vectorizationWidth = XCHAL_IVPN_SIMD_WIDTH; + const int32_t vectorizationWidth2X = vectorizationWidth * 2; + const int32_t vectorizationWidth3X = vectorizationWidth * 3; + const int32_t vectorizationWidth4X = vectorizationWidth * 4; + + /* loop variables */ + int32_t x, y, z; + + /* input and output pointers */ + xb_vecNx8U * restrict pvecIn; + xb_vecNx16 * restrict pvecOut; + + /* input and output data vectors */ + xb_vecNx16 vecInData0, vecInData1, vecInData2, vecInData3; + xb_vecNx16 vecOut0, vecOut1, vecOut2, vecOut3; + + /******************************************************************************/ + /* The overall design approach is split into 2 parts */ + /* 1. When input tile pitch is equal to input tile width and input tile pitch */ + /* is equal to output tile pitch */ + /* - If above condition holds good, data elements for which data */ + /* conversion from U8 bit to S16 bit need to done present in contiguous */ + /* memory location. Hence vectorization can be utilized effectively */ + /* */ + /* 2. When input tile pitch is not equal to input tile size or input tile */ + /* pitch is not equal to output tile pitch */ + /* - In this scenario, data elements for which data conversion from U8 bit */ + /* S16 bit need to done exist in non-contiguous memory location. */ + /* In order to do vectorization across first dimension, output data */ + /* pointers need to be updated based on output tile size and output tile */ + /* pitch. 
*/ + /******************************************************************************/ + + if ((inTilePitch1 == dim1Size) && (outTilePitch1 == dim1Size)) + { + /******************************************************************************/ + /* Data exist in contiguous memory location with respect to first dimension */ + /******************************************************************************/ + + /* input and output vectors */ + xb_vecNx16U vecInData; + xb_vecNx16 vecOut; + /* Initialize max loop counter */ + int32_t dim3MaxLoopCount = dim3Size; + int32_t maxLoopCount = dim1Size * dim2Size; + + /* Updated Loop count based on tile dimension configuration */ + if ((inTilePitch2 == maxLoopCount) && (outTilePitch2 == maxLoopCount)) + { + /**********************************************************************/ + /* Data exist in contiguous memory location with respect to first and */ + /* second dimension */ + /**********************************************************************/ + + /* Update max loop counter */ + dim3MaxLoopCount = 1; + maxLoopCount *= dim3Size; + } + for (z = 0; z < dim3MaxLoopCount; z++) + { + /* initialize input and output data pointer */ + pvecIn = (xb_vecNx8U *) (pInput + (z * inTilePitch2)); + pvecOut = (xb_vecNx16 *) (pOutput + (z * outTilePitch2)); + valign vaInData = IVP_LANX8U_PP(pvecIn); + int32_t varlen; + + for (x = 0; x < maxLoopCount - vectorizationWidth; x += vectorizationWidth) + { + /* Load input data */ + IVP_LANX8U_IP(vecInData, vaInData, pvecIn); + + /* apply scale and shift to input data. + * multiplying with scale results in 32 way 48-bit + * data to which shift is applied, so final result is + * 32 way 16 bit. + */ + vecOut = IVP_PACKVRNX48(IVP_MULUUNX16((xb_vecNx16U) scale, vecInData), shift); + + /* store output data */ + IVP_SANX16_IP(vecOut, vaOut, pvecOut); + } + varlen = (maxLoopCount - x); + IVP_LANX8U_IP(vecInData, vaInData, pvecIn); + + /* apply scale and shift to input data. 
+ * multiplying with scale results in 32 way 48-bit + * data to which shift is applied, so final result is + * 32 way 16 bit. + */ + vecOut = IVP_PACKVRNX48(IVP_MULUUNX16((xb_vecNx16U) scale, vecInData), shift); + + /* store output data */ + IVP_SAVNX16_XP(vecOut, vaOut, pvecOut, (varlen << 1)); + IVP_SAPOSNX16_FP(vaOut, pvecOut); + } + } + else + { + /* else block is executed if input tile pitch is not equal to input tile width or input tile */ + /* pitch is not equal to output tile pitch */ + + for (z = 0; z < dim3Size; z++) /* along 3rd dimension */ + { + x = 0; + /* Loop Unroll=4 along 1st dimension */ + for (; x < (dim1Size - vectorizationWidth3X); x += vectorizationWidth4X) + { + /* Initialize input and output data pointer */ + uint8_t * pIn = &pInput[z * inTilePitch2 + x]; + int16_t *pOut = &pOutput[z * outTilePitch2 + x]; + int32_t varLen = dim1Size - (x + vectorizationWidth3X); + + for (y = 0; y < dim2Size; y++) /* along 2nd dimension */ + { + pvecIn = (xb_vecNx8U *) (pIn + (y * inTilePitch1)); + pvecOut = (xb_vecNx16 *) (pOut + (y * outTilePitch1)); + + valign vaInData = IVP_LANX8U_PP(pvecIn); + /* load input data */ + IVP_LANX8U_IP(vecInData0, vaInData, pvecIn); + IVP_LANX8U_IP(vecInData1, vaInData, pvecIn); + IVP_LANX8U_IP(vecInData2, vaInData, pvecIn); + IVP_LANX8U_IP(vecInData3, vaInData, pvecIn); + + /* apply scale and shift to input data. + * multiplying with scale results in 32 way 48-bit + * data to which shift is applied, so final result is + * 32 way 16 bit. 
+ */ + vecOut0 = IVP_PACKVRNX48(IVP_MULUUNX16((xb_vecNx16U) scale, vecInData0), shift); + + vecOut1 = IVP_PACKVRNX48(IVP_MULUUNX16((xb_vecNx16U) scale, vecInData1), shift); + + vecOut2 = IVP_PACKVRNX48(IVP_MULUUNX16((xb_vecNx16U) scale, vecInData2), shift); + + vecOut3 = IVP_PACKVRNX48(IVP_MULUUNX16((xb_vecNx16U) scale, vecInData3), shift); + + /* Store output data */ + IVP_SANX16_IP(vecOut0, vaOut, pvecOut); + IVP_SANX16_IP(vecOut1, vaOut, pvecOut); + IVP_SANX16_IP(vecOut2, vaOut, pvecOut); + IVP_SAVNX16_XP(vecOut3, vaOut, pvecOut, (varLen << 1)); + IVP_SAPOSNX16_FP(vaOut, pvecOut); + } + } + if (x < (dim1Size - vectorizationWidth2X)) + { + /* Initialize input and output data pointer */ + uint8_t * pIn = &pInput[z * inTilePitch2 + x]; + int16_t *pOut = &pOutput[z * outTilePitch2 + x]; + int32_t varLen = dim1Size - (x + vectorizationWidth2X); + + for (y = 0; y < dim2Size; y++) /* along 2nd dimension */ + { + pvecIn = (xb_vecNx8U *) (pIn + (y * inTilePitch1)); + pvecOut = (xb_vecNx16 *) (pOut + (y * outTilePitch1)); + valign vaInData = IVP_LANX8U_PP(pvecIn); + /* load input data */ + IVP_LANX8U_IP(vecInData0, vaInData, pvecIn); + IVP_LANX8U_IP(vecInData1, vaInData, pvecIn); + IVP_LANX8U_IP(vecInData2, vaInData, pvecIn); + + /* apply scale and shift to input data. + * multiplying with scale results in 32 way 48-bit + * data to which shift is applied, so final result is + * 32 way 16 bit. 
+ */ + vecOut0 = IVP_PACKVRNX48(IVP_MULUUNX16((xb_vecNx16U) scale, vecInData0), shift); + + vecOut1 = IVP_PACKVRNX48(IVP_MULUUNX16((xb_vecNx16U) scale, vecInData1), shift); + + vecOut2 = IVP_PACKVRNX48(IVP_MULUUNX16((xb_vecNx16U) scale, vecInData2), shift); + + /* Store output data */ + IVP_SANX16_IP(vecOut0, vaOut, pvecOut); + IVP_SANX16_IP(vecOut1, vaOut, pvecOut); + IVP_SAVNX16_XP(vecOut2, vaOut, pvecOut, (varLen << 1)); + IVP_SAPOSNX16_FP(vaOut, pvecOut); + } + } + else if (x < (dim1Size - vectorizationWidth)) + { + /* Initialize input and output data pointer */ + uint8_t * pIn = &pInput[z * inTilePitch2 + x]; + int16_t *pOut = &pOutput[z * outTilePitch2 + x]; + int32_t varLen = dim1Size - (x + vectorizationWidth); + + for (y = 0; y < dim2Size; y++) /* along 2nd dimension */ + { + pvecIn = (xb_vecNx8U *) (pIn + (y * inTilePitch1)); + pvecOut = (xb_vecNx16 *) (pOut + (y * outTilePitch1)); + valign vaInData = IVP_LANX8U_PP(pvecIn); + + /* load input data */ + IVP_LANX8U_IP(vecInData0, vaInData, pvecIn); + IVP_LANX8U_IP(vecInData1, vaInData, pvecIn); + + /* apply scale and shift to input data. + * multiplying with scale results in 32 way 48-bit + * data to which shift is applied, so final result is + * 32 way 16 bit. 
+ */ + vecOut0 = IVP_PACKVRNX48(IVP_MULUUNX16((xb_vecNx16U) scale, vecInData0), shift); + + vecOut1 = IVP_PACKVRNX48(IVP_MULUUNX16((xb_vecNx16U) scale, vecInData1), shift); + + /* Store output data */ + IVP_SANX16_IP(vecOut0, vaOut, pvecOut); + IVP_SAVNX16_XP(vecOut1, vaOut, pvecOut, (varLen << 1)); + IVP_SAPOSNX16_FP(vaOut, pvecOut); + } + } + else if (x < dim1Size) + { + /* Initialize input and output data pointer */ + uint8_t * pIn = &pInput[z * inTilePitch2 + x]; + int16_t *pOut = &pOutput[z * outTilePitch2 + x]; + int32_t varLen = dim1Size - x; + + for (y = 0; y < dim2Size; y++) /* along 2nd dimension */ + { + pvecIn = (xb_vecNx8U *) (pIn + (y * inTilePitch1)); + pvecOut = (xb_vecNx16 *) (pOut + (y * outTilePitch1)); + valign vaInData = IVP_LANX8U_PP(pvecIn); + /* load input data */ + IVP_LANX8U_IP(vecInData0, vaInData, pvecIn); + + /* apply scale and shift to input data. + * multiplying with scale results in 32 way 48-bit + * data to which shift is applied, so final result is + * 32 way 16 bit. 
+ */ + vecOut0 = IVP_PACKVRNX48(IVP_MULUUNX16((xb_vecNx16U) scale, vecInData0), shift); + + /* Store output data */ + IVP_SAVNX16_XP(vecOut0, vaOut, pvecOut, (varLen << 1)); + IVP_SAPOSNX16_FP(vaOut, pvecOut); + } + } + } + } + return(XAI_ERROR_STATUS()); +} + +/********************* xaiDataConversion3D_U8U16 ***********************/ +/* Description : P6 implementation for conversion from U8 to U16 */ +/* Inputs : Input Tile, scale, shift */ +/* Outputs : XI Error Code */ +/* InOuts : Output Tile */ +/* Assumptions : InData is unsigned 8bit */ +/**********************************************************************/ + +XAI_ERR_TYPE xaiDataConversion3D_U8U16(const xai_pTile3D inTile, + xai_pTile3D outTile, + const uint16_t scale, + const uint8_t shift) +{ + /* Error Checks */ + XAI_ERROR_CHECKS() + { + XAI_CHECK_TILE3D_U8(inTile); + XAI_CHECK_TILE3D_U16(outTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile); + XAI_CHECK_TILE3D_SIZE_EQ(inTile, outTile); + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTile); + XAI_CHECK_ERROR(shift < 24, XAI_ERR_NORM, \ + "Shift = %hhu, value should be less than 24", shift); + XAI_CHECK_ERROR(XAI_TILE3D_GET_DATA_ORDER(inTile) == XAI_TILE3D_GET_DATA_ORDER(outTile), XAI_ERR_BADARG, \ + "\nData Order of InputTile = %d, OutputTile = %d\nData Order of InputTile and OutputTile should be same", \ + XAI_TILE3D_GET_DATA_ORDER(inTile), XAI_TILE3D_GET_DATA_ORDER(outTile)); + } + + /* Get Tile Parameters */ + const int32_t dim1Size = XAI_TILE3D_GET_DIM1(inTile); + const int32_t dim2Size = XAI_TILE3D_GET_DIM2(inTile); + const int32_t dim3Size = XAI_TILE3D_GET_DIM3(inTile); + const int32_t inTilePitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile); + const int32_t inTilePitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile); + const int32_t outTilePitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile); + const int32_t outTilePitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile); + + valign vaOut = IVP_ZALIGN(); + + /* Get Data Pointers */ + 
uint8_t *pInput = (uint8_t *) XAI_TILE3D_GET_DATA_PTR(inTile); + uint16_t *pOutput = (uint16_t *) XAI_TILE3D_GET_DATA_PTR(outTile); + + /* vectorization width */ + const int32_t vectorizationWidth = XCHAL_IVPN_SIMD_WIDTH; + const int32_t vectorizationWidth2X = vectorizationWidth * 2; + + /* loop variables */ + int32_t x, y, z; + + /* input and output pointers */ + xb_vecNx8U * restrict pvecIn; + xb_vecNx16U * restrict pvecOut; + + /* input and output data vectors */ + xb_vecNx16 vecInData0, vecInData1; + + /******************************************************************************/ + /* The overall design approach is split into 2 parts */ + /* 1. When input tile pitch is equal to input tile width and input tile pitch */ + /* is equal to output tile pitch */ + /* - If above condition holds good, data elements for which data */ + /* conversion from U8 bit to U16 bit need to done present in contiguous */ + /* memory location. Hence vectorization can be utilized effectively */ + /* */ + /* 2. When input tile pitch is not equal to input tile size or input tile */ + /* pitch is not equal to output tile pitch */ + /* - In this scenario, data elements for which data conversion from U8 bit */ + /* U16 bit need to done exist in non-contiguous memory location. */ + /* In order to do vectorization across first dimension, output data */ + /* pointers need to be updated based on output tile size and output tile */ + /* pitch. 
*/ + /******************************************************************************/ + + if ((inTilePitch1 == dim1Size) && (outTilePitch1 == dim1Size)) + { + /******************************************************************************/ + /* Data exist in contiguous memory location with respect to first dimension */ + /******************************************************************************/ + + /* input and output vectors */ + xb_vecNx16U vecInData; + /* Initialize max loop counter */ + int32_t dim3MaxLoopCount = dim3Size; + int32_t maxLoopCount = dim1Size * dim2Size; + + /* Updated Loop count based on tile dimension configuration */ + if ((inTilePitch2 == maxLoopCount) && (outTilePitch2 == maxLoopCount)) + { + /**********************************************************************/ + /* Data exist in contiguous memory location with respect to first and */ + /* second dimension */ + /**********************************************************************/ + + /* Update max loop counter */ + dim3MaxLoopCount = 1; + maxLoopCount *= dim3Size; + } + for (z = 0; z < dim3MaxLoopCount; z++) + { + /* initialize input and output data pointer */ + pvecIn = (xb_vecNx8U *) (pInput + (z * inTilePitch2)); + pvecOut = (xb_vecNx16U *) (pOutput + (z * outTilePitch2)); + valign vaInData = IVP_LANX8U_PP(pvecIn); + int32_t varlen; + + for (x = 0; x < maxLoopCount - vectorizationWidth; x += vectorizationWidth) + { + /* Load input data */ + IVP_LANX8U_IP(vecInData, vaInData, pvecIn); + + /* apply scale and shift to input data. + * multiplying with scale results in 32 way 48-bit + * data to which shift is applied, so final result is + * 32 way 16 bit. 
+ */ + xb_vecN_2x32v hvecEven = IVP_PACKVRNX48_0(IVP_MULUUNX16U((xb_vecNx16U) scale, vecInData), shift); + xb_vecN_2x32v hvecOdd = IVP_PACKVRNX48_1(IVP_MULUUNX16U((xb_vecNx16U) scale, vecInData), shift); + xb_vecNx16U vecOut = IVP_SELNX16I(IVP_MOVNX16_FROMN_2X32U(IVP_MINN_2X32(hvecOdd, USHRT_MAX)), \ + IVP_MOVNX16_FROMN_2X32U(IVP_MINN_2X32(hvecEven, USHRT_MAX)), IVP_SELI_INTERLEAVE_1_EVEN); + + /* store output data */ + IVP_SANX16U_IP(vecOut, vaOut, pvecOut); + } + varlen = (maxLoopCount - x); + IVP_LANX8U_IP(vecInData, vaInData, pvecIn); + + /* apply scale and shift to input data. + * multiplying with scale results in 32 way 48-bit + * data to which shift is applied, so final result is + * 32 way 16 bit. + */ + xb_vecN_2x32v hvecEven = IVP_PACKVRNX48_0(IVP_MULUUNX16U((xb_vecNx16U) scale, vecInData), shift); + xb_vecN_2x32v hvecOdd = IVP_PACKVRNX48_1(IVP_MULUUNX16U((xb_vecNx16U) scale, vecInData), shift); + xb_vecNx16U vecOut = IVP_SELNX16I(IVP_MOVNX16_FROMN_2X32U(IVP_MINN_2X32(hvecOdd, USHRT_MAX)), \ + IVP_MOVNX16_FROMN_2X32U(IVP_MINN_2X32(hvecEven, USHRT_MAX)), IVP_SELI_INTERLEAVE_1_EVEN); + + /* store output data */ + IVP_SAVNX16U_XP(vecOut, vaOut, pvecOut, (varlen << 1)); + IVP_SAPOSNX16U_FP(vaOut, pvecOut); + } + } + else + { + /* else block is executed if input tile pitch is not equal to input tile width or input tile */ + /* pitch is not equal to output tile pitch */ + + for (z = 0; z < dim3Size; z++) /* along 3rd dimension */ + { + x = 0; + + for (; x < (dim1Size - vectorizationWidth2X); x += vectorizationWidth2X) + { + /* Initialize input and output data pointer */ + uint8_t * pIn = &pInput[z * inTilePitch2 + x]; + uint16_t *pOut = &pOutput[z * outTilePitch2 + x]; + + for (y = 0; y < dim2Size; y++) /* along 2nd dimension */ + { + pvecIn = (xb_vecNx8U *) (pIn + (y * inTilePitch1)); + pvecOut = (xb_vecNx16U *) (pOut + (y * outTilePitch1)); + + valign vaInData = IVP_LANX8U_PP(pvecIn); + + /* load input data */ + IVP_LANX8U_IP(vecInData0, vaInData, pvecIn); + 
IVP_LANX8U_IP(vecInData1, vaInData, pvecIn); + + /* apply scale and shift to input data. + * multiplying with scale results in 32 way 48-bit + * data to which shift is applied, so final result is + * 32 way 16 bit. + */ + xb_vecN_2x32v hvecEven0 = IVP_PACKVRNX48_0(IVP_MULUUNX16U((xb_vecNx16U) scale, vecInData0), shift); + xb_vecN_2x32v hvecOdd0 = IVP_PACKVRNX48_1(IVP_MULUUNX16U((xb_vecNx16U) scale, vecInData0), shift); + xb_vecNx16U vecOut0 = IVP_SELNX16I(IVP_MOVNX16_FROMN_2X32U(IVP_MINN_2X32(hvecOdd0, USHRT_MAX)), \ + IVP_MOVNX16_FROMN_2X32U(IVP_MINN_2X32(hvecEven0, USHRT_MAX)), IVP_SELI_INTERLEAVE_1_EVEN); + + xb_vecN_2x32v hvecEven1 = IVP_PACKVRNX48_0(IVP_MULUUNX16U((xb_vecNx16U) scale, vecInData1), shift); + xb_vecN_2x32v hvecOdd1 = IVP_PACKVRNX48_1(IVP_MULUUNX16U((xb_vecNx16U) scale, vecInData1), shift); + xb_vecNx16U vecOut1 = IVP_SELNX16I(IVP_MOVNX16_FROMN_2X32U(IVP_MINN_2X32(hvecOdd1, USHRT_MAX)), \ + IVP_MOVNX16_FROMN_2X32U(IVP_MINN_2X32(hvecEven1, USHRT_MAX)), IVP_SELI_INTERLEAVE_1_EVEN); + + /* Store output data */ + IVP_SANX16U_IP(vecOut0, vaOut, pvecOut); + IVP_SANX16U_IP(vecOut1, vaOut, pvecOut); + IVP_SAPOSNX16U_FP(vaOut, pvecOut); + } + } + if (x < dim1Size) + { + /* Initialize input and output data pointer */ + uint8_t * pIn = &pInput[z * inTilePitch2 + x]; + uint16_t *pOut = &pOutput[z * outTilePitch2 + x]; + int32_t varLen = dim1Size - x; + + for (y = 0; y < dim2Size; y++) /* along 2nd dimension */ + { + pvecIn = (xb_vecNx8U *) (pIn + (y * inTilePitch1)); + pvecOut = (xb_vecNx16U *) (pOut + (y * outTilePitch1)); + valign vaInData = IVP_LANX8U_PP(pvecIn); + + /* load input data */ + IVP_LANX8U_IP(vecInData0, vaInData, pvecIn); + IVP_LAVNX8U_XP(vecInData1, vaInData, pvecIn, varLen - vectorizationWidth); + + /* apply scale and shift to input data. + * multiplying with scale results in 32 way 48-bit + * data to which shift is applied, so final result is + * 32 way 16 bit. 
+ */ + xb_vecN_2x32v hvecEven0 = IVP_PACKVRNX48_0(IVP_MULUUNX16U((xb_vecNx16U) scale, vecInData0), shift); + xb_vecN_2x32v hvecOdd0 = IVP_PACKVRNX48_1(IVP_MULUUNX16U((xb_vecNx16U) scale, vecInData0), shift); + xb_vecNx16U vecOut0 = IVP_SELNX16I(IVP_MOVNX16_FROMN_2X32U(IVP_MINN_2X32(hvecOdd0, USHRT_MAX)), \ + IVP_MOVNX16_FROMN_2X32U(IVP_MINN_2X32(hvecEven0, USHRT_MAX)), IVP_SELI_INTERLEAVE_1_EVEN); + + xb_vecN_2x32v hvecEven1 = IVP_PACKVRNX48_0(IVP_MULUUNX16U((xb_vecNx16U) scale, vecInData1), shift); + xb_vecN_2x32v hvecOdd1 = IVP_PACKVRNX48_1(IVP_MULUUNX16U((xb_vecNx16U) scale, vecInData1), shift); + xb_vecNx16U vecOut1 = IVP_SELNX16I(IVP_MOVNX16_FROMN_2X32U(IVP_MINN_2X32(hvecOdd1, USHRT_MAX)), \ + IVP_MOVNX16_FROMN_2X32U(IVP_MINN_2X32(hvecEven1, USHRT_MAX)), IVP_SELI_INTERLEAVE_1_EVEN); + + /* Store output data */ + IVP_SAVNX16U_XP(vecOut0, vaOut, pvecOut, (varLen << 1)); + IVP_SAVNX16U_XP(vecOut1, vaOut, pvecOut, ((varLen - vectorizationWidth) << 1)); + IVP_SAPOSNX16U_FP(vaOut, pvecOut); + } + } + } + } + return(XAI_ERROR_STATUS()); +} + +/********************* xaiDataConversion3D_S8U8 ***********************/ +/* Description : P6 implementation for conversion from S8 to U8 */ +/* Inputs : Input Tile, scale, shift */ +/* Outputs : XI Error Code */ +/* InOuts : Output Tile */ +/* Assumptions : InData is signed 8bit */ +/**********************************************************************/ + +XAI_ERR_TYPE xaiDataConversion3D_S8U8(const xai_pTile3D inTile, + xai_pTile3D outTile, + const uint16_t scale, + const uint8_t shift) +{ + /* Error Checks */ + XAI_ERROR_CHECKS() + { + XAI_CHECK_TILE3D_S8(inTile); + XAI_CHECK_TILE3D_U8(outTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile); + XAI_CHECK_TILE3D_SIZE_EQ(inTile, outTile); + XAI_CHECK_ERROR(shift < 24, XAI_ERR_NORM, \ + "Shift = %hhu, value should be less than 24", shift); + XAI_CHECK_ERROR(XAI_TILE3D_GET_DATA_ORDER(inTile) == XAI_TILE3D_GET_DATA_ORDER(outTile), 
XAI_ERR_BADARG, \ + "\nData Order of InputTile = %d, OutputTile = %d\nData Order of InputTile and OutputTile should be same", \ + XAI_TILE3D_GET_DATA_ORDER(inTile), XAI_TILE3D_GET_DATA_ORDER(outTile)); + } + /* Get Tile Parameters */ + const int32_t dim1Size = XAI_TILE3D_GET_DIM1(inTile); + const int32_t dim2Size = XAI_TILE3D_GET_DIM2(inTile); + const int32_t dim3Size = XAI_TILE3D_GET_DIM3(inTile); + const int32_t inTilePitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile); + const int32_t inTilePitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile); + const int32_t outTilePitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile); + const int32_t outTilePitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile); + const int16_t minLim = 0; + const int16_t maxLim = UCHAR_MAX; + + valign vaOut = IVP_ZALIGN(); + + /* Get Data Pointers */ + int8_t *pInput = (int8_t *) XAI_TILE3D_GET_DATA_PTR(inTile); + uint8_t *pOutput = (uint8_t *) XAI_TILE3D_GET_DATA_PTR(outTile); + + /* vectorization width */ + const int32_t vectorizationWidth = XCHAL_IVPN_SIMD_WIDTH; + const int32_t vectorizationWidth2X = vectorizationWidth * 2; + const int32_t vectorizationWidth3X = vectorizationWidth * 3; + const int32_t vectorizationWidth4X = vectorizationWidth * 4; + + /* loop variables */ + int32_t x, y, z; + + /* input and output pointers */ + xb_vecNx8 * restrict pvecIn; + xb_vecNx8U * restrict pvecOut; + + /* input and output data vectors */ + xb_vecNx16 vecInData0, vecInData1, vecInData2, vecInData3; + xb_vecNx16U vecOut0, vecOut1, vecOut2, vecOut3; + + /********************************************************************************/ + /* The overall design approach is split into 2 parts */ + /* 1. When input tile pitch is equal to input tile width and input tile pitch */ + /* is equal to output tile pitch */ + /* - If above condition holds good, data elements for which data */ + /* conversion from S8 bit to U8 bit need to done is present in contiguous */ + /* memory location. 
Hence vectorization can be utilized effectively */ + /* */ + /* 2. When input tile pitch is not equal to input tile size or input tile */ + /* pitch is not equal to output tile pitch */ + /* - In this scenario, data elements for which data conversion from U8 bit */ + /* S8 bit need to done exist in non-contiguous memory location. */ + /* In order to do vectorization across first dimension, output data */ + /* pointers need to be updated based on output tile size and output tile */ + /* pitch. */ + /********************************************************************************/ + if ((inTilePitch1 == dim1Size) && (outTilePitch1 == dim1Size)) + { + /******************************************************************************/ + /* Data exist in contiguous memory location with respect to first dimension */ + /******************************************************************************/ + /*Input and Output vectors*/ + xb_vecNx16 vecInData; + xb_vecNx16 vecOut; + + /* Initialize max loop counter */ + int32_t dim3MaxLoopCount = dim3Size; + int32_t maxLoopCount = dim1Size * dim2Size; + + /* Updated Loop count based on tile dimension configuration */ + if ((inTilePitch2 == maxLoopCount) && (outTilePitch2 == maxLoopCount)) + { + /**********************************************************************/ + /* Data exist in contiguous memory location with respect to first and */ + /* second dimension */ + /**********************************************************************/ + + /* Update max loop counter */ + dim3MaxLoopCount = 1; + maxLoopCount *= dim3Size; + } + for (z = 0; z < dim3MaxLoopCount; z++) + { + /* initialize input and output data pointer */ + pvecIn = (xb_vecNx8 *) (pInput + (z * inTilePitch2)); + pvecOut = (xb_vecNx8U *) (pOutput + (z * outTilePitch2)); + + valign vaInData = IVP_LANX8S_PP(pvecIn); + int32_t varlen; + + for (x = 0; x < maxLoopCount - vectorizationWidth; x += vectorizationWidth) + { + /* Load input data */ + IVP_LANX8S_IP(vecInData, 
vaInData, pvecIn); + + /* apply scale and shift to input data. + * multiplying with scale results in 32 way 48-bit + * data to which shift is applied and data is truncated + * in the 8 bit range 0 to UCHAR_MAX. So the final result + * is 32-way, 8-bit. + */ + vecOut = IVP_PACKVRNX48(IVP_MULUSNX16((xb_vecNx16U) scale, vecInData), shift); + vecOut = IVP_MAXNX16(IVP_MINNX16(vecOut, (xb_vecNx16) maxLim), (xb_vecNx16) minLim); + + /* store output data */ + IVP_SANX8U_IP(vecOut, vaOut, pvecOut); + } + varlen = (maxLoopCount - x); + IVP_LANX8S_IP(vecInData, vaInData, pvecIn); + + /* apply scale and shift to input data. + * multiplying with scale results in 32 way 48-bit + * data to which shift is applied and data is truncated + * in the 8 bit range 0 to UCHAR_MAX. So the final result + * is 32-way, 8-bit. + */ + vecOut = IVP_PACKVRNX48(IVP_MULUSNX16((xb_vecNx16U) scale, vecInData), shift); + vecOut = IVP_MAXNX16(IVP_MINNX16(vecOut, (xb_vecNx16) maxLim), (xb_vecNx16) minLim); + + /* store output data */ + IVP_SAVNX8U_XP(vecOut, vaOut, pvecOut, varlen); + IVP_SAPOSNX8U_FP(vaOut, pvecOut); + } + } + else + { + /* else block is executed if input tile pitch is not equal to input tile width or input tile */ + /* pitch is not equal to output tile pitch */ + for (z = 0; z < dim3Size; z++) /* along 3rd dimension */ + { + x = 0; + /* Loop Unroll=4 along 1st dimension */ + for (; x < (dim1Size - vectorizationWidth3X); x += vectorizationWidth4X) + { + /* Initialize input and output data pointer */ + int8_t * pIn = &pInput[z * inTilePitch2 + x]; + uint8_t *pOut = &pOutput[z * outTilePitch2 + x]; + int32_t varLen = dim1Size - (x + vectorizationWidth3X); + + for (y = 0; y < dim2Size; y++) /* along 2nd dimension */ + { + pvecIn = (xb_vecNx8 *) (pIn + (y * inTilePitch1)); + pvecOut = (xb_vecNx8U *) (pOut + (y * outTilePitch1)); + + valign vaInData = IVP_LANX8S_PP(pvecIn); + + /* load input data */ + IVP_LANX8S_IP(vecInData0, vaInData, pvecIn); + IVP_LANX8S_IP(vecInData1, vaInData, 
pvecIn); + IVP_LANX8S_IP(vecInData2, vaInData, pvecIn); + IVP_LANX8S_IP(vecInData3, vaInData, pvecIn); + + /* apply scale and shift to input data. + * multiplying with scale results in 32 way 48-bit + * data to which shift is applied and data is truncated + * in the 8 bit range 0 to UCHAR_MAX. So the final result + * is 32-way, 8-bit. + */ + vecOut0 = IVP_PACKVRNX48(IVP_MULUSNX16((xb_vecNx16U) scale, vecInData0), shift); + vecOut0 = IVP_MAXNX16(IVP_MINNX16(vecOut0, (xb_vecNx16) maxLim), (xb_vecNx16) minLim); + + vecOut1 = IVP_PACKVRNX48(IVP_MULUSNX16((xb_vecNx16U) scale, vecInData1), shift); + vecOut1 = IVP_MAXNX16(IVP_MINNX16(vecOut1, (xb_vecNx16) maxLim), (xb_vecNx16) minLim); + + vecOut2 = IVP_PACKVRNX48(IVP_MULUSNX16((xb_vecNx16U) scale, vecInData2), shift); + vecOut2 = IVP_MAXNX16(IVP_MINNX16(vecOut2, (xb_vecNx16) maxLim), (xb_vecNx16) minLim); + + vecOut3 = IVP_PACKVRNX48(IVP_MULUSNX16((xb_vecNx16U) scale, vecInData3), shift); + vecOut3 = IVP_MAXNX16(IVP_MINNX16(vecOut3, (xb_vecNx16) maxLim), (xb_vecNx16) minLim); + + /* Store output data */ + IVP_SANX8U_IP(vecOut0, vaOut, pvecOut); + IVP_SANX8U_IP(vecOut1, vaOut, pvecOut); + IVP_SANX8U_IP(vecOut2, vaOut, pvecOut); + IVP_SAVNX8U_XP(vecOut3, vaOut, pvecOut, varLen); + IVP_SAPOSNX8U_FP(vaOut, pvecOut); + } + } + if (x < (dim1Size - vectorizationWidth2X)) + { + /* Initialize input and output data pointer */ + int8_t * pIn = &pInput[z * inTilePitch2 + x]; + uint8_t *pOut = &pOutput[z * outTilePitch2 + x]; + int32_t varLen = dim1Size - (x + vectorizationWidth2X); + + for (y = 0; y < dim2Size; y++) /* along 2nd dimension */ + { + pvecIn = (xb_vecNx8 *) (pIn + (y * inTilePitch1)); + pvecOut = (xb_vecNx8U *) (pOut + (y * outTilePitch1)); + + valign vaInData = IVP_LANX8S_PP(pvecIn); + + /* load input data */ + IVP_LANX8S_IP(vecInData0, vaInData, pvecIn); + IVP_LANX8S_IP(vecInData1, vaInData, pvecIn); + IVP_LANX8S_IP(vecInData2, vaInData, pvecIn); + + /* apply scale and shift to input data. 
+ * multiplying with scale results in 32 way 48-bit + * data to which shift is applied and data is truncated + * in the 8 bit range 0 to UCHAR_MAX. So the final result + * is 32-way, 8-bit. + */ + vecOut0 = IVP_PACKVRNX48(IVP_MULUSNX16((xb_vecNx16U) scale, vecInData0), shift); + vecOut0 = IVP_MAXNX16(IVP_MINNX16(vecOut0, (xb_vecNx16) maxLim), (xb_vecNx16) minLim); + + vecOut1 = IVP_PACKVRNX48(IVP_MULUSNX16((xb_vecNx16U) scale, vecInData1), shift); + vecOut1 = IVP_MAXNX16(IVP_MINNX16(vecOut1, (xb_vecNx16) maxLim), (xb_vecNx16) minLim); + + vecOut2 = IVP_PACKVRNX48(IVP_MULUSNX16((xb_vecNx16U) scale, vecInData2), shift); + vecOut2 = IVP_MAXNX16(IVP_MINNX16(vecOut2, (xb_vecNx16) maxLim), (xb_vecNx16) minLim); + + /* Store output data */ + IVP_SANX8U_IP(vecOut0, vaOut, pvecOut); + IVP_SANX8U_IP(vecOut1, vaOut, pvecOut); + IVP_SAVNX8U_XP(vecOut2, vaOut, pvecOut, varLen); + IVP_SAPOSNX8U_FP(vaOut, pvecOut); + } + } + else if (x < (dim1Size - vectorizationWidth)) + { + /* Initialize input and output data pointer */ + int8_t * pIn = &pInput[z * inTilePitch2 + x]; + uint8_t *pOut = &pOutput[z * outTilePitch2 + x]; + int32_t varLen = dim1Size - (x + vectorizationWidth); + + for (y = 0; y < dim2Size; y++) /* along 2nd dimension */ + { + pvecIn = (xb_vecNx8 *) (pIn + (y * inTilePitch1)); + pvecOut = (xb_vecNx8U *) (pOut + (y * outTilePitch1)); + + valign vaInData = IVP_LANX8S_PP(pvecIn); + + /* load input data */ + IVP_LANX8S_IP(vecInData0, vaInData, pvecIn); + IVP_LANX8S_IP(vecInData1, vaInData, pvecIn); + + /* apply scale and shift to input data. + * multiplying with scale results in 32 way 48-bit + * data to which shift is applied and data is truncated + * in the 8 bit range 0 to UCHAR_MAX. So the final result + * is 32-way, 8-bit. 
+ */ + vecOut0 = IVP_PACKVRNX48(IVP_MULUSNX16((xb_vecNx16U) scale, vecInData0), shift); + vecOut0 = IVP_MAXNX16(IVP_MINNX16(vecOut0, (xb_vecNx16) maxLim), (xb_vecNx16) minLim); + + vecOut1 = IVP_PACKVRNX48(IVP_MULUSNX16((xb_vecNx16U) scale, vecInData1), shift); + vecOut1 = IVP_MAXNX16(IVP_MINNX16(vecOut1, (xb_vecNx16) maxLim), (xb_vecNx16) minLim); + + /* Store output data */ + IVP_SANX8U_IP(vecOut0, vaOut, pvecOut); + IVP_SAVNX8U_XP(vecOut1, vaOut, pvecOut, varLen); + IVP_SAPOSNX8U_FP(vaOut, pvecOut); + } + } + else if (x < dim1Size) + { + /* Initialize input and output data pointer */ + int8_t * pIn = &pInput[z * inTilePitch2 + x]; + uint8_t *pOut = &pOutput[z * outTilePitch2 + x]; + int32_t varLen = dim1Size - x; + + for (y = 0; y < dim2Size; y++) /* along 2nd dimension */ + { + pvecIn = (xb_vecNx8 *) (pIn + (y * inTilePitch1)); + pvecOut = (xb_vecNx8U *) (pOut + (y * outTilePitch1)); + + valign vaInData = IVP_LANX8S_PP(pvecIn); + + /* load input data */ + IVP_LANX8S_IP(vecInData0, vaInData, pvecIn); + + /* apply scale and shift to input data. + * multiplying with scale results in 32 way 48-bit + * data to shift is applied and data is truncated + * in the 8 bit range 0 to UCHAR_MAX. So the final result + * is 32-way, 8-bit. 
+ */ + vecOut0 = IVP_PACKVRNX48(IVP_MULUSNX16((xb_vecNx16U) scale, vecInData0), shift); + vecOut0 = IVP_MAXNX16(IVP_MINNX16(vecOut0, (xb_vecNx16) maxLim), (xb_vecNx16) minLim); + + /* Store output data */ + IVP_SAVNX8U_XP(vecOut0, vaOut, pvecOut, varLen); + IVP_SAPOSNX8U_FP(vaOut, pvecOut); + } + } + } + } + return(XAI_ERROR_STATUS()); +} + +/********************* xaiDataConversion3D_S16 *****************************/ +/* Description : P6 implementation for conversion S16 to S16: each input */ +/* element is multiplied by scale, then shifted by shift */ +/* Inputs : Input Tile, scale (unsigned 16 bit), shift (must be < 32) */ +/* Outputs : XI Error Code */ +/* InOuts : Output Tile */ +/* Assumptions : InData is signed 16bit */ +/**************************************************************************/ + +XAI_ERR_TYPE xaiDataConversion3D_S16(const xai_pTile3D inTile, + xai_pTile3D outTile, + const uint16_t scale, + const uint8_t shift) +{ + /* Error Checks: both tiles S16, equal sizes, same data order, shift < 32 */ + XAI_ERROR_CHECKS() + { + XAI_CHECK_TILE3D_S16(inTile); + XAI_CHECK_TILE3D_S16(outTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile); + XAI_CHECK_TILE3D_SIZE_EQ(inTile, outTile); + XAI_CHECK_ERROR(shift < 32, XAI_ERR_NORM, \ + "Shift = %hhu, value should be less than 32", shift); + XAI_CHECK_ERROR(XAI_TILE3D_GET_DATA_ORDER(inTile) == XAI_TILE3D_GET_DATA_ORDER(outTile), XAI_ERR_BADARG, \ + "\nData Order of InputTile = %d, OutputTile = %d\nData Order of InputTile and OutputTile should be same", \ + XAI_TILE3D_GET_DATA_ORDER(inTile), XAI_TILE3D_GET_DATA_ORDER(outTile)); + } + + /* Get Tile Parameters (dimensions are read from the input tile, pitches from both tiles) */ + const int32_t dim1Size = XAI_TILE3D_GET_DIM1(inTile); + const int32_t dim2Size = XAI_TILE3D_GET_DIM2(inTile); + const int32_t dim3Size = XAI_TILE3D_GET_DIM3(inTile); + const int32_t inTilePitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile); + const int32_t inTilePitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile); + const int32_t outTilePitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile); + const int32_t outTilePitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile); +
+ /* Get Data Pointers */ + int16_t *pInput = (int16_t *) XAI_TILE3D_GET_DATA_PTR(inTile); + int16_t *pOutput = (int16_t *) XAI_TILE3D_GET_DATA_PTR(outTile); + valign vaOut = IVP_ZALIGN(); /* output alignment register, shared by every aligning store in this function */ + + /* vectorization width and the multiples used by the unrolled loops */ + const int32_t vectorizationWidth = XCHAL_IVPN_SIMD_WIDTH; + const int32_t vectorizationWidth2X = vectorizationWidth * 2; + const int32_t vectorizationWidth3X = vectorizationWidth * 3; + const int32_t vectorizationWidth4X = vectorizationWidth * 4; + + /* loop variables */ + int32_t x, y, z; + + /* input and output pointers */ + xb_vecNx16 * restrict pvecIn; + xb_vecNx16 * restrict pvecOut; + + /******************************************************************************/ + /* The overall design approach is split into 2 parts */ + /* 1. When input tile pitch is equal to input tile width and input tile pitch */ + /* is equal to output tile pitch */ + /* - If the above condition holds, the data elements that need to be */ + /* converted from signed 16 bit to S16 are stored in contiguous */ + /* memory locations. Hence vectorization can be utilized */ + /* effectively */ + /* */ + /* 2. When input tile pitch is not equal to input tile size or input tile */ + /* pitch is not equal to output tile pitch */ + /* - In this scenario, the data elements that need to be converted from */ + /* signed 16 bit to S16 are stored in non-contiguous memory */ + /* location. 
In order to do vectorization across first dimension, */ + /* output data pointers need to be updated based on output tile size */ + /* and output tile pitch */ + /******************************************************************************/ + + if ((inTilePitch1 == dim1Size) && (outTilePitch1 == dim1Size)) + { + /******************************************************************************/ + /* Data exist in contiguous memory location with respect to first dimension */ + /******************************************************************************/ + + /* input data vectors */ + xb_vecNx16 vecInData; + /* Initialize max loop counter */ + int32_t dim3MaxLoopCount = dim3Size; + int32_t maxLoopCount = dim1Size * dim2Size; /* elements in one dim1 x dim2 plane */ + + /* Updated Loop count based on tile dimension configuration */ + if ((inTilePitch2 == maxLoopCount) && (outTilePitch2 == maxLoopCount)) + { + /**********************************************************************/ + /* Data exist in contiguous memory location with respect to first and */ + /* second dimension */ + /**********************************************************************/ + + /* Update max loop counter */ + dim3MaxLoopCount = 1; + maxLoopCount *= dim3Size; /* the whole tile is processed as one contiguous run */ + } + for (z = 0; z < dim3MaxLoopCount; z++) + { + /* initialize input and output data pointer */ + pvecIn = (xb_vecNx16 *) (pInput + (z * inTilePitch2)); + pvecOut = (xb_vecNx16 *) (pOutput + (z * outTilePitch2)); + + valign vaInData = IVP_LANX16_PP(pvecIn); + xb_vecNx16 vecOut; + + for (x = 0; x < maxLoopCount - vectorizationWidth; x += vectorizationWidth) /* full vectors only; the last vector is handled after the loop */ + { + /* load input data */ + IVP_LANX16_IP(vecInData, vaInData, pvecIn); + + /* apply scale and shift to input data. + * multiplying with scale results in 32 way 48-bit + * data to which shift is applied, so final result is + * 32 way 16 bit. 
+ */ + vecOut = IVP_PACKVRNX48(IVP_MULUSNX16((xb_vecNx16U) scale, vecInData), shift); + + /* store output data */ + IVP_SANX16_IP(vecOut, vaOut, pvecOut); + } + int32_t varLen = (maxLoopCount - x); /* elements left for the final vector */ + /* load input data */ + IVP_LANX16_IP(vecInData, vaInData, pvecIn); + + /* apply scale and shift to input data. + * multiplying with scale results in 32 way 48-bit + * data to which shift is applied, so final result is + * 32 way 16 bit. + */ + vecOut = IVP_PACKVRNX48(IVP_MULUSNX16((xb_vecNx16U) scale, vecInData), shift); + + /* store output data */ + IVP_SAVNX16_XP(vecOut, vaOut, pvecOut, (varLen << 1)); /* variable-length store; length is varLen elements * 2 bytes */ + IVP_SAPOSNX16_FP(vaOut, pvecOut); /* presumably flushes the bytes still held in vaOut - confirm against the ISA reference */ + } + } + else + { + /* else block is executed if input tile pitch is not equal to input tile width or input tile */ + /* pitch is not equal to output tile pitch */ + + for (z = 0; z < dim3Size; z++) /* along 3rd dimension */ + { + x = 0; + /* Loop Unroll=4 along 1st dimension */ + for (; x < (dim1Size - vectorizationWidth3X); x += vectorizationWidth4X) + { + /* Initialize input and output data pointer */ + int16_t * pIn = &pInput[z * inTilePitch2 + x]; + int16_t *pOut = &pOutput[z * outTilePitch2 + x]; + int32_t varLen = dim1Size - (x + vectorizationWidth3X); /* elements available for the 4th vector; can exceed one vector, so the variable store presumably caps it - TODO confirm */ + + for (y = 0; y < dim2Size; y++) /* along 2nd dimension */ + { + /* input and output data vectors */ + xb_vecNx16 vecInData0, vecInData1, vecInData2, vecInData3; + xb_vecNx16 vecOut0, vecOut1, vecOut2, vecOut3; + + pvecIn = (xb_vecNx16 *) (pIn + (y * inTilePitch1)); + pvecOut = (xb_vecNx16 *) (pOut + (y * outTilePitch1)); + valign vaInData = IVP_LANX16_PP(pvecIn); + /* load input data */ + IVP_LANX16_IP(vecInData0, vaInData, pvecIn); + IVP_LANX16_IP(vecInData1, vaInData, pvecIn); + IVP_LANX16_IP(vecInData2, vaInData, pvecIn); + IVP_LANX16_IP(vecInData3, vaInData, pvecIn); + + /* apply scale and shift to input data. + * multiplying with scale results in 32 way 48-bit + * data to which shift is applied, so final result is + * 32 way 16 bit. 
+ */ + vecOut0 = IVP_PACKVRNX48(IVP_MULUSNX16((xb_vecNx16U) scale, vecInData0), shift); + vecOut1 = IVP_PACKVRNX48(IVP_MULUSNX16((xb_vecNx16U) scale, vecInData1), shift); + vecOut2 = IVP_PACKVRNX48(IVP_MULUSNX16((xb_vecNx16U) scale, vecInData2), shift); + vecOut3 = IVP_PACKVRNX48(IVP_MULUSNX16((xb_vecNx16U) scale, vecInData3), shift); + + /* Store output data */ + IVP_SANX16_IP(vecOut0, vaOut, pvecOut); + IVP_SANX16_IP(vecOut1, vaOut, pvecOut); + IVP_SANX16_IP(vecOut2, vaOut, pvecOut); + IVP_SAVNX16_XP(vecOut3, vaOut, pvecOut, (varLen << 1)); + IVP_SAPOSNX16_FP(vaOut, pvecOut); + } + } + if (x < (dim1Size - vectorizationWidth2X)) /* tail: more than 2 vectors remain -> 2 full stores + one of varLen elements */ + { + /* Initialize input and output data pointer */ + int16_t * pIn = &pInput[z * inTilePitch2 + x]; + int16_t *pOut = &pOutput[z * outTilePitch2 + x]; + int32_t varLen = dim1Size - (x + vectorizationWidth2X); + + for (y = 0; y < dim2Size; y++) /* along 2nd dimension */ + { + /* input and output data vectors */ + xb_vecNx16 vecInData0, vecInData1, vecInData2; + xb_vecNx16 vecOut0, vecOut1, vecOut2; + + pvecIn = (xb_vecNx16 *) (pIn + (y * inTilePitch1)); + pvecOut = (xb_vecNx16 *) (pOut + (y * outTilePitch1)); + valign vaInData = IVP_LANX16_PP(pvecIn); + + /* load input data */ + IVP_LANX16_IP(vecInData0, vaInData, pvecIn); + IVP_LANX16_IP(vecInData1, vaInData, pvecIn); + IVP_LANX16_IP(vecInData2, vaInData, pvecIn); + + /* apply scale and shift to input data. + * multiplying with scale results in 32 way 48-bit + * data to which shift is applied, so final result is + * 32 way 16 bit. 
+ */ + vecOut0 = IVP_PACKVRNX48(IVP_MULUSNX16((xb_vecNx16U) scale, vecInData0), shift); + vecOut1 = IVP_PACKVRNX48(IVP_MULUSNX16((xb_vecNx16U) scale, vecInData1), shift); + vecOut2 = IVP_PACKVRNX48(IVP_MULUSNX16((xb_vecNx16U) scale, vecInData2), shift); + + /* Store output data */ + IVP_SANX16_IP(vecOut0, vaOut, pvecOut); + IVP_SANX16_IP(vecOut1, vaOut, pvecOut); + IVP_SAVNX16_XP(vecOut2, vaOut, pvecOut, (varLen << 1)); + IVP_SAPOSNX16_FP(vaOut, pvecOut); + } + } + else if (x < (dim1Size - vectorizationWidth)) /* tail: more than 1 vector remains -> 1 full store + one of varLen elements */ + { + /* Initialize input and output data pointer */ + int16_t * pIn = &pInput[z * inTilePitch2 + x]; + int16_t *pOut = &pOutput[z * outTilePitch2 + x]; + int32_t varLen = dim1Size - (x + vectorizationWidth); + + for (y = 0; y < dim2Size; y++) /* along 2nd dimension */ + { + /* input and output data vectors */ + xb_vecNx16 vecInData0, vecInData1; + xb_vecNx16 vecOut0, vecOut1; + + pvecIn = (xb_vecNx16 *) (pIn + (y * inTilePitch1)); + pvecOut = (xb_vecNx16 *) (pOut + (y * outTilePitch1)); + valign vaInData = IVP_LANX16_PP(pvecIn); + + /* load input data */ + IVP_LANX16_IP(vecInData0, vaInData, pvecIn); + IVP_LANX16_IP(vecInData1, vaInData, pvecIn); + + /* apply scale and shift to input data. + * multiplying with scale results in 32 way 48-bit + * data to which shift is applied, so final result is + * 32 way 16 bit. 
+ */ + vecOut0 = IVP_PACKVRNX48(IVP_MULUSNX16((xb_vecNx16U) scale, vecInData0), shift); + vecOut1 = IVP_PACKVRNX48(IVP_MULUSNX16((xb_vecNx16U) scale, vecInData1), shift); + + /* Store output data */ + IVP_SANX16_IP(vecOut0, vaOut, pvecOut); + IVP_SAVNX16_XP(vecOut1, vaOut, pvecOut, (varLen << 1)); + IVP_SAPOSNX16_FP(vaOut, pvecOut); + } + } + else if (x < dim1Size) /* tail: at most one vector remains -> a single store of varLen elements */ + { + /* Initialize input and output data pointer */ + int16_t * pIn = &pInput[z * inTilePitch2 + x]; + int16_t *pOut = &pOutput[z * outTilePitch2 + x]; + int32_t varLen = dim1Size - x; + + for (y = 0; y < dim2Size; y++) /* along 2nd dimension */ + { + /* input and output data vectors */ + xb_vecNx16 vecInData0; + xb_vecNx16 vecOut0; + + pvecIn = (xb_vecNx16 *) (pIn + (y * inTilePitch1)); + pvecOut = (xb_vecNx16 *) (pOut + (y * outTilePitch1)); + valign vaInData = IVP_LANX16_PP(pvecIn); + + /* load input data */ + IVP_LANX16_IP(vecInData0, vaInData, pvecIn); + + /* apply scale and shift to input data. + * multiplying with scale results in 32 way 48-bit + * data to which shift is applied, so final result is + * 32 way 16 bit. 
+ */ + vecOut0 = IVP_PACKVRNX48(IVP_MULUSNX16((xb_vecNx16U) scale, vecInData0), shift); + + /* Store output data */ + IVP_SAVNX16_XP(vecOut0, vaOut, pvecOut, (varLen << 1)); + IVP_SAPOSNX16_FP(vaOut, pvecOut); + } + } + } + } + return(XAI_ERROR_STATUS()); +} + +/********************* xaiDataConversion3D_S16I32 *****************************/ +/* Description : P6 implementation for conversion S16 to I32 (S32 or U32, */ +/* depending on Output Tile type) */ +/* Inputs : Input Tile, scale, shift */ +/* Outputs : XI Error Code */ +/* InOuts : Output Tile */ +/* Assumptions : InData is signed 16bit */ +/**************************************************************************/ + +XAI_ERR_TYPE xaiDataConversion3D_S16I32(const xai_pTile3D inTile, + xai_pTile3D outTile, + const uint16_t scale, + const uint8_t shift) +{ + /* Error Checks */ + XAI_ERROR_CHECKS() + { + XAI_CHECK_TILE3D_S16(inTile); + XAI_CHECK_TILE3D_I32(outTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile); + XAI_CHECK_TILE3D_SIZE_EQ(inTile, outTile); + XAI_CHECK_ERROR(shift < 32, XAI_ERR_NORM, \ + "Shift = %hhu, value should be less than 32", shift); + XAI_CHECK_ERROR(XAI_TILE3D_GET_DATA_ORDER(inTile) == XAI_TILE3D_GET_DATA_ORDER(outTile), XAI_ERR_BADARG, \ + "\nData Order of InputTile = %d, OutputTile = %d\nData Order of InputTile and OutputTile should be same", \ + XAI_TILE3D_GET_DATA_ORDER(inTile), XAI_TILE3D_GET_DATA_ORDER(outTile)); + } + + /* Get Tile Parameters */ + const int32_t dim1Size = XAI_TILE3D_GET_DIM1(inTile); + const int32_t dim2Size = XAI_TILE3D_GET_DIM2(inTile); + const int32_t dim3Size = XAI_TILE3D_GET_DIM3(inTile); + const int32_t inTilePitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile); + const int32_t inTilePitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile); + const int32_t outTilePitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile); + const int32_t outTilePitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile); + + /* Get Data Pointers */ + int16_t *pInput = (int16_t *) 
XAI_TILE3D_GET_DATA_PTR(inTile); + int32_t *pOutput = (int32_t *) XAI_TILE3D_GET_DATA_PTR(outTile); + valign vaOut = IVP_ZALIGN(); /* output alignment register, shared by every aligning store in this function */ + + /* vectorization width (in 16-bit input elements) and the multiples used by the unrolled loops */ + const int32_t vectorizationWidth = XCHAL_IVPN_SIMD_WIDTH; + const int32_t vectorizationWidth2X = vectorizationWidth * 2; + const int32_t vectorizationWidth3X = vectorizationWidth * 3; + const int32_t vectorizationWidth4X = vectorizationWidth * 4; + + const int32_t minLim = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S32)) ? INT_MIN : 0; /* lower clamp applied to every result via MAX: INT_MIN (no effect) for S32 output, otherwise 0 so negative results become 0 */ + + /* loop variables */ + int32_t x, y, z; + + /* input and output pointers */ + xb_vecNx16 * restrict pvecIn; + xb_vecN_2x32v * restrict pvecOut; + + /******************************************************************************/ + /* The overall design approach is split into 2 parts */ + /* 1. When input tile pitch is equal to input tile width and input tile pitch */ + /* is equal to output tile pitch */ + /* - If the above condition holds, the data elements that need to be */ + /* converted from signed 16 bit to I32 are stored in contiguous */ + /* memory locations. Hence vectorization can be utilized */ + /* effectively */ + /* */ + /* 2. When input tile pitch is not equal to input tile size or input tile */ + /* pitch is not equal to output tile pitch */ + /* - In this scenario, the data elements that need to be converted from */ + /* signed 16 bit to I32 are stored in non-contiguous memory */ + /* location. 
In order to do vectorization across first dimension, */ + /* output data pointers need to be updated based on output tile size */ + /* and output tile pitch */ + /******************************************************************************/ + + if ((inTilePitch1 == dim1Size) && (outTilePitch1 == dim1Size)) + { + /******************************************************************************/ + /* Data exist in contiguous memory location with respect to first dimension */ + /******************************************************************************/ + + /* input data vectors */ + xb_vecNx16 vecInData; + /* Initialize max loop counter */ + int32_t dim3MaxLoopCount = dim3Size; + int32_t maxLoopCount = dim1Size * dim2Size; /* elements in one dim1 x dim2 plane */ + + /* Updated Loop count based on tile dimension configuration */ + if ((inTilePitch2 == maxLoopCount) && (outTilePitch2 == maxLoopCount)) + { + /**********************************************************************/ + /* Data exist in contiguous memory location with respect to first and */ + /* second dimension */ + /**********************************************************************/ + + /* Update max loop counter */ + dim3MaxLoopCount = 1; + maxLoopCount *= dim3Size; /* the whole tile is processed as one contiguous run */ + } + for (z = 0; z < dim3MaxLoopCount; z++) + { + /* initialize input and output data pointer */ + pvecIn = (xb_vecNx16 *) (pInput + (z * inTilePitch2)); + pvecOut = (xb_vecN_2x32v *) (pOutput + (z * outTilePitch2)); + + valign vaInData = IVP_LANX16_PP(pvecIn); + xb_vecN_2x32v vecOutL, vecOutH; /* one input vector yields two 32-bit output vectors; L is stored first, then H */ + + for (x = 0; x < maxLoopCount - vectorizationWidth; x += vectorizationWidth) /* full vectors only; the last vector is handled after the loop */ + { + /* load input data */ + IVP_LANX16_IP(vecInData, vaInData, pvecIn); + + xb_vecNx48 vecOutIntm1 = IVP_MULUSNX16((xb_vecNx16U) scale, vecInData); /* scale * input as wide products */ + xb_vecN_2x64w vecOutIntm2 = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecOutIntm1), IVP_CVT64SNX48LL(vecOutIntm1)); + vecOutL = IVP_PACKVRN_2X64W(vecOutIntm2, shift); + vecOutL = IVP_MAXN_2X32(vecOutL, (xb_vecN_2x32v) minLim); + + vecOutIntm2 = 
IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecOutIntm1), IVP_CVT64SNX48HL(vecOutIntm1)); + vecOutH = IVP_PACKVRN_2X64W(vecOutIntm2, shift); + vecOutH = IVP_MAXN_2X32(vecOutH, (xb_vecN_2x32v) minLim); + /* store output data */ + IVP_SAN_2X32_IP(vecOutL, vaOut, pvecOut); + IVP_SAN_2X32_IP(vecOutH, vaOut, pvecOut); + } + int32_t varLen = (maxLoopCount - x); /* elements left for the final input vector */ + /* load input data */ + IVP_LANX16_IP(vecInData, vaInData, pvecIn); + + xb_vecNx48 vecOutIntm1 = IVP_MULUSNX16((xb_vecNx16U) scale, vecInData); + xb_vecN_2x64w vecOutIntm2 = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecOutIntm1), IVP_CVT64SNX48LL(vecOutIntm1)); + vecOutL = IVP_PACKVRN_2X64W(vecOutIntm2, shift); + vecOutL = IVP_MAXN_2X32(vecOutL, (xb_vecN_2x32v) minLim); + + vecOutIntm2 = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecOutIntm1), IVP_CVT64SNX48HL(vecOutIntm1)); + vecOutH = IVP_PACKVRN_2X64W(vecOutIntm2, shift); + vecOutH = IVP_MAXN_2X32(vecOutH, (xb_vecN_2x32v) minLim); + + /* store output data */ + IVP_SAVN_2X32_XP(vecOutL, vaOut, pvecOut, (varLen << 2)); /* variable-length store; length is varLen elements * 4 bytes */ + IVP_SAVN_2X32_XP(vecOutH, vaOut, pvecOut, (varLen << 2) - (XCHAL_IVPN_SIMD_WIDTH << 1)); /* bytes left after the first output vector (SIMD_WIDTH/2 lanes * 4 bytes); presumably a non-positive length stores nothing - TODO confirm */ + IVP_SAPOSN_2X32_FP(vaOut, pvecOut); /* presumably flushes the bytes still held in vaOut - confirm against the ISA reference */ + } + } + else + { + /* else block is executed if input tile pitch is not equal to input tile width or input tile */ + /* pitch is not equal to output tile pitch */ + + for (z = 0; z < dim3Size; z++) /* along 3rd dimension */ + { + x = 0; + /* Loop Unroll=4 along 1st dimension */ + for (; x < (dim1Size - vectorizationWidth3X); x += vectorizationWidth4X) + { + /* Initialize input and output data pointer */ + int16_t * pIn = &pInput[z * inTilePitch2 + x]; + int32_t *pOut = &pOutput[z * outTilePitch2 + x]; + int32_t varLen = dim1Size - (x + vectorizationWidth3X); /* elements available for the 4th input vector; can exceed one vector, so the variable store presumably caps it - TODO confirm */ + + for (y = 0; y < dim2Size; y++) /* along 2nd dimension */ + { + /* input and output data vectors */ + xb_vecNx16 vecInData0, vecInData1, vecInData2, vecInData3; + xb_vecN_2x32v vecOut0L, vecOut0H, vecOut1L, vecOut1H, vecOut2L, vecOut2H, vecOut3L, vecOut3H; + + pvecIn = (xb_vecNx16 *) 
(pIn + (y * inTilePitch1)); + pvecOut = (xb_vecN_2x32v *) (pOut + (y * outTilePitch1)); + valign vaInData = IVP_LANX16_PP(pvecIn); + /* load input data */ + IVP_LANX16_IP(vecInData0, vaInData, pvecIn); + IVP_LANX16_IP(vecInData1, vaInData, pvecIn); + IVP_LANX16_IP(vecInData2, vaInData, pvecIn); + IVP_LANX16_IP(vecInData3, vaInData, pvecIn); + + + xb_vecNx48 vecOutIntm1 = IVP_MULUSNX16((xb_vecNx16U) scale, vecInData0); + xb_vecNx48 vecOutIntm2 = IVP_MULUSNX16((xb_vecNx16U) scale, vecInData1); + xb_vecNx48 vecOutIntm3 = IVP_MULUSNX16((xb_vecNx16U) scale, vecInData2); + xb_vecNx48 vecOutIntm4 = IVP_MULUSNX16((xb_vecNx16U) scale, vecInData3); + + xb_vecN_2x64w vecOutIntm = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecOutIntm1), IVP_CVT64SNX48LL(vecOutIntm1)); + vecOut0L = IVP_PACKVRN_2X64W(vecOutIntm, shift); + vecOut0L = IVP_MAXN_2X32(vecOut0L, (xb_vecN_2x32v) minLim); + vecOutIntm = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecOutIntm1), IVP_CVT64SNX48HL(vecOutIntm1)); + vecOut0H = IVP_PACKVRN_2X64W(vecOutIntm, shift); + vecOut0H = IVP_MAXN_2X32(vecOut0H, (xb_vecN_2x32v) minLim); + + vecOutIntm = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecOutIntm2), IVP_CVT64SNX48LL(vecOutIntm2)); + vecOut1L = IVP_PACKVRN_2X64W(vecOutIntm, shift); + vecOut1L = IVP_MAXN_2X32(vecOut1L, (xb_vecN_2x32v) minLim); + vecOutIntm = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecOutIntm2), IVP_CVT64SNX48HL(vecOutIntm2)); + vecOut1H = IVP_PACKVRN_2X64W(vecOutIntm, shift); + vecOut1H = IVP_MAXN_2X32(vecOut1H, (xb_vecN_2x32v) minLim); + + vecOutIntm = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecOutIntm3), IVP_CVT64SNX48LL(vecOutIntm3)); + vecOut2L = IVP_PACKVRN_2X64W(vecOutIntm, shift); + vecOut2L = IVP_MAXN_2X32(vecOut2L, (xb_vecN_2x32v) minLim); + vecOutIntm = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecOutIntm3), IVP_CVT64SNX48HL(vecOutIntm3)); + vecOut2H = IVP_PACKVRN_2X64W(vecOutIntm, shift); + vecOut2H = IVP_MAXN_2X32(vecOut2H, (xb_vecN_2x32v) minLim); + + vecOutIntm = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecOutIntm4), 
IVP_CVT64SNX48LL(vecOutIntm4)); + vecOut3L = IVP_PACKVRN_2X64W(vecOutIntm, shift); + vecOut3L = IVP_MAXN_2X32(vecOut3L, (xb_vecN_2x32v) minLim); + vecOutIntm = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecOutIntm4), IVP_CVT64SNX48HL(vecOutIntm4)); + vecOut3H = IVP_PACKVRN_2X64W(vecOutIntm, shift); + vecOut3H = IVP_MAXN_2X32(vecOut3H, (xb_vecN_2x32v) minLim); + + /* Store output data */ + IVP_SAN_2X32_IP(vecOut0L, vaOut, pvecOut); + IVP_SAN_2X32_IP(vecOut0H, vaOut, pvecOut); + IVP_SAN_2X32_IP(vecOut1L, vaOut, pvecOut); + IVP_SAN_2X32_IP(vecOut1H, vaOut, pvecOut); + IVP_SAN_2X32_IP(vecOut2L, vaOut, pvecOut); + IVP_SAN_2X32_IP(vecOut2H, vaOut, pvecOut); + IVP_SAVN_2X32_XP(vecOut3L, vaOut, pvecOut, (varLen << 2)); + IVP_SAVN_2X32_XP(vecOut3H, vaOut, pvecOut, (varLen << 2) - (XCHAL_IVPN_SIMD_WIDTH << 1)); + IVP_SAPOSN_2X32_FP(vaOut, pvecOut); + } + } + if (x < (dim1Size - vectorizationWidth2X)) /* tail: more than 2 input vectors remain -> 2 full + one of varLen elements */ + { + /* Initialize input and output data pointer */ + int16_t * pIn = &pInput[z * inTilePitch2 + x]; + int32_t *pOut = &pOutput[z * outTilePitch2 + x]; + int32_t varLen = dim1Size - (x + vectorizationWidth2X); + + for (y = 0; y < dim2Size; y++) /* along 2nd dimension */ + { + /* input and output data vectors */ + xb_vecNx16 vecInData0, vecInData1, vecInData2; + xb_vecN_2x32v vecOut0L, vecOut0H, vecOut1L, vecOut1H, vecOut2L, vecOut2H; + + pvecIn = (xb_vecNx16 *) (pIn + (y * inTilePitch1)); + pvecOut = (xb_vecN_2x32v *) (pOut + (y * outTilePitch1)); + valign vaInData = IVP_LANX16_PP(pvecIn); + + /* load input data */ + IVP_LANX16_IP(vecInData0, vaInData, pvecIn); + IVP_LANX16_IP(vecInData1, vaInData, pvecIn); + IVP_LANX16_IP(vecInData2, vaInData, pvecIn); + + xb_vecNx48 vecOutIntm1 = IVP_MULUSNX16((xb_vecNx16U) scale, vecInData0); + xb_vecNx48 vecOutIntm2 = IVP_MULUSNX16((xb_vecNx16U) scale, vecInData1); + xb_vecNx48 vecOutIntm3 = IVP_MULUSNX16((xb_vecNx16U) scale, vecInData2); + + xb_vecN_2x64w vecOutIntm = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecOutIntm1), IVP_CVT64SNX48LL(vecOutIntm1)); + 
vecOut0L = IVP_PACKVRN_2X64W(vecOutIntm, shift); + vecOut0L = IVP_MAXN_2X32(vecOut0L, (xb_vecN_2x32v) minLim); + vecOutIntm = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecOutIntm1), IVP_CVT64SNX48HL(vecOutIntm1)); + vecOut0H = IVP_PACKVRN_2X64W(vecOutIntm, shift); + vecOut0H = IVP_MAXN_2X32(vecOut0H, (xb_vecN_2x32v) minLim); + + vecOutIntm = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecOutIntm2), IVP_CVT64SNX48LL(vecOutIntm2)); + vecOut1L = IVP_PACKVRN_2X64W(vecOutIntm, shift); + vecOut1L = IVP_MAXN_2X32(vecOut1L, (xb_vecN_2x32v) minLim); + vecOutIntm = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecOutIntm2), IVP_CVT64SNX48HL(vecOutIntm2)); + vecOut1H = IVP_PACKVRN_2X64W(vecOutIntm, shift); + vecOut1H = IVP_MAXN_2X32(vecOut1H, (xb_vecN_2x32v) minLim); + + vecOutIntm = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecOutIntm3), IVP_CVT64SNX48LL(vecOutIntm3)); + vecOut2L = IVP_PACKVRN_2X64W(vecOutIntm, shift); + vecOut2L = IVP_MAXN_2X32(vecOut2L, (xb_vecN_2x32v) minLim); + vecOutIntm = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecOutIntm3), IVP_CVT64SNX48HL(vecOutIntm3)); + vecOut2H = IVP_PACKVRN_2X64W(vecOutIntm, shift); + vecOut2H = IVP_MAXN_2X32(vecOut2H, (xb_vecN_2x32v) minLim); + + /* Store output data */ + IVP_SAN_2X32_IP(vecOut0L, vaOut, pvecOut); + IVP_SAN_2X32_IP(vecOut0H, vaOut, pvecOut); + IVP_SAN_2X32_IP(vecOut1L, vaOut, pvecOut); + IVP_SAN_2X32_IP(vecOut1H, vaOut, pvecOut); + IVP_SAVN_2X32_XP(vecOut2L, vaOut, pvecOut, (varLen << 2)); + IVP_SAVN_2X32_XP(vecOut2H, vaOut, pvecOut, (varLen << 2) - (XCHAL_IVPN_SIMD_WIDTH << 1)); + IVP_SAPOSN_2X32_FP(vaOut, pvecOut); + } + } + else if (x < (dim1Size - vectorizationWidth)) /* tail: more than 1 input vector remains -> 1 full + one of varLen elements */ + { + /* Initialize input and output data pointer */ + int16_t * pIn = &pInput[z * inTilePitch2 + x]; + int32_t *pOut = &pOutput[z * outTilePitch2 + x]; + int32_t varLen = dim1Size - (x + vectorizationWidth); + + for (y = 0; y < dim2Size; y++) /* along 2nd dimension */ + { + /* input and output data vectors */ + xb_vecNx16 vecInData0, vecInData1; + xb_vecN_2x32v vecOut0L, vecOut0H, 
vecOut1L, vecOut1H; + + pvecIn = (xb_vecNx16 *) (pIn + (y * inTilePitch1)); + pvecOut = (xb_vecN_2x32v *) (pOut + (y * outTilePitch1)); + valign vaInData = IVP_LANX16_PP(pvecIn); + + /* load input data */ + IVP_LANX16_IP(vecInData0, vaInData, pvecIn); + IVP_LANX16_IP(vecInData1, vaInData, pvecIn); + + + xb_vecNx48 vecOutIntm1 = IVP_MULUSNX16((xb_vecNx16U) scale, vecInData0); + xb_vecNx48 vecOutIntm2 = IVP_MULUSNX16((xb_vecNx16U) scale, vecInData1); + + xb_vecN_2x64w vecOutIntm = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecOutIntm1), IVP_CVT64SNX48LL(vecOutIntm1)); + vecOut0L = IVP_PACKVRN_2X64W(vecOutIntm, shift); + vecOut0L = IVP_MAXN_2X32(vecOut0L, (xb_vecN_2x32v) minLim); + vecOutIntm = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecOutIntm1), IVP_CVT64SNX48HL(vecOutIntm1)); + vecOut0H = IVP_PACKVRN_2X64W(vecOutIntm, shift); + vecOut0H = IVP_MAXN_2X32(vecOut0H, (xb_vecN_2x32v) minLim); + + vecOutIntm = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecOutIntm2), IVP_CVT64SNX48LL(vecOutIntm2)); + vecOut1L = IVP_PACKVRN_2X64W(vecOutIntm, shift); + vecOut1L = IVP_MAXN_2X32(vecOut1L, (xb_vecN_2x32v) minLim); + vecOutIntm = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecOutIntm2), IVP_CVT64SNX48HL(vecOutIntm2)); + vecOut1H = IVP_PACKVRN_2X64W(vecOutIntm, shift); + vecOut1H = IVP_MAXN_2X32(vecOut1H, (xb_vecN_2x32v) minLim); + + /* Store output data */ + IVP_SAN_2X32_IP(vecOut0L, vaOut, pvecOut); + IVP_SAN_2X32_IP(vecOut0H, vaOut, pvecOut); + IVP_SAVN_2X32_XP(vecOut1L, vaOut, pvecOut, (varLen << 2)); + IVP_SAVN_2X32_XP(vecOut1H, vaOut, pvecOut, (varLen << 2) - (XCHAL_IVPN_SIMD_WIDTH << 1)); + IVP_SAPOSN_2X32_FP(vaOut, pvecOut); + } + } + else if (x < dim1Size) /* tail: at most one input vector remains -> only variable-length stores */ + { + /* Initialize input and output data pointer */ + int16_t * pIn = &pInput[z * inTilePitch2 + x]; + int32_t *pOut = &pOutput[z * outTilePitch2 + x]; + int32_t varLen = dim1Size - x; + + for (y = 0; y < dim2Size; y++) /* along 2nd dimension */ + { + /* input and output data vectors */ + xb_vecNx16 vecInData0; + xb_vecN_2x32v vecOut0L, vecOut0H; + + 
pvecIn = (xb_vecNx16 *) (pIn + (y * inTilePitch1)); + pvecOut = (xb_vecN_2x32v *) (pOut + (y * outTilePitch1)); + valign vaInData = IVP_LANX16_PP(pvecIn); + + /* load input data */ + IVP_LANX16_IP(vecInData0, vaInData, pvecIn); + + xb_vecNx48 vecOutIntm1 = IVP_MULUSNX16((xb_vecNx16U) scale, vecInData0); + + xb_vecN_2x64w vecOutIntm = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecOutIntm1), IVP_CVT64SNX48LL(vecOutIntm1)); + vecOut0L = IVP_PACKVRN_2X64W(vecOutIntm, shift); + vecOut0L = IVP_MAXN_2X32(vecOut0L, (xb_vecN_2x32v) minLim); + vecOutIntm = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecOutIntm1), IVP_CVT64SNX48HL(vecOutIntm1)); + vecOut0H = IVP_PACKVRN_2X64W(vecOutIntm, shift); + vecOut0H = IVP_MAXN_2X32(vecOut0H, (xb_vecN_2x32v) minLim); + + /* Store output data */ + IVP_SAVN_2X32_XP(vecOut0L, vaOut, pvecOut, (varLen << 2)); + IVP_SAVN_2X32_XP(vecOut0H, vaOut, pvecOut, (varLen << 2) - (XCHAL_IVPN_SIMD_WIDTH << 1)); + IVP_SAPOSN_2X32_FP(vaOut, pvecOut); + } + } + } + } + return(XAI_ERROR_STATUS()); +} + +/********************* xaiDataConversion3D_U16I32 *****************************/ +/* Description : P6 implementation for conversion U16 to I32 */ +/* depending on Output Tile type */ +/* Inputs : Input Tile, scale, shift */ +/* Outputs : XI Error Code */ +/* InOuts : Output Tile */ +/* Assumptions : InData is un-signed 16bit */ +/**************************************************************************/ + +XAI_ERR_TYPE xaiDataConversion3D_U16I32(const xai_pTile3D inTile, + xai_pTile3D outTile, + const uint16_t scale, + const uint8_t shift) +{ + /* Error Checks */ + XAI_ERROR_CHECKS() + { + XAI_CHECK_TILE3D_U16(inTile); + XAI_CHECK_TILE3D_I32(outTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile); + XAI_CHECK_TILE3D_SIZE_EQ(inTile, outTile); + XAI_CHECK_ERROR(shift < 32, XAI_ERR_NORM, \ + "Shift = %hhu, value should be less than 32", shift); + XAI_CHECK_ERROR(XAI_TILE3D_GET_DATA_ORDER(inTile) == XAI_TILE3D_GET_DATA_ORDER(outTile), 
XAI_ERR_BADARG, \ + "\nData Order of InputTile = %d, OutputTile = %d\nData Order of InputTile and OutputTile should be same", \ + XAI_TILE3D_GET_DATA_ORDER(inTile), XAI_TILE3D_GET_DATA_ORDER(outTile)); + } + + /* Get Tile Parameters */ + const int32_t dim1Size = XAI_TILE3D_GET_DIM1(inTile); + const int32_t dim2Size = XAI_TILE3D_GET_DIM2(inTile); + const int32_t dim3Size = XAI_TILE3D_GET_DIM3(inTile); + const int32_t inTilePitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile); + const int32_t inTilePitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile); + const int32_t outTilePitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile); + const int32_t outTilePitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile); + + /* Get Data Pointers */ + uint16_t *pInput = (uint16_t *) XAI_TILE3D_GET_DATA_PTR(inTile); + int32_t *pOutput = (int32_t *) XAI_TILE3D_GET_DATA_PTR(outTile); + valign vaOut = IVP_ZALIGN(); + + /* vectorization width */ + const int32_t vectorizationWidth = XCHAL_IVPN_SIMD_WIDTH; + const int32_t vectorizationWidth2X = vectorizationWidth * 2; + const int32_t vectorizationWidth3X = vectorizationWidth * 3; + const int32_t vectorizationWidth4X = vectorizationWidth * 4; + const uint32_t rndVal = (1 << (shift - 1)); + const int32_t minLim = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S32)) ? 0 : INT_MIN; + /******************************************************************************************************/ + /*usage of minLim */ + /*U16 x U16 = U32 - result is in U32. 
We have two output variants S32 and U32 */ + /*For S32 output we need to clamp i.e.,(MIN(res,INT_MAX)) result using S32_MAX */ + /*For U32 output we need to clamp i.e., (MIN(res,UINT_MAX)) result using U32_MAX */ + /*PACK ISA available (IVP_PACKVRN_2X64W) will clamp the result to S32 range only */ + /*one option to implement this is to write two APIs with change only in clamping operation - */ + /*Note : we don't prefer using an if inside loop */ + /*To avoid above condition below code uses a hack - Final res is in S32 container so - */ + /* U32 to S32 can be done by MAX(0,res) and U32 to U32 can be done by MAX(INT_MIN,res) */ + /* MAX(0,res) will work because all values above S32_MAX will be interpretted as < 0 in S32 container */ + /******************************************************************************************************/ + + /* loop variables */ + int32_t x, y, z; + + /* input and output pointers */ + xb_vecNx16U * restrict pvecIn; + xb_vecN_2x32v * restrict pvecOut; + + /******************************************************************************/ + /* The overall design approach is split into 2 parts */ + /* 1. When input tile pitch is equal to input tile width and input tile pitch */ + /* is equal to output tile pitch */ + /* - If above condition holds good, data elements for which data */ + /* conversion from unsigned 16 bit to I32 bit need to done present in */ + /* in contiguous memory location. Hence vectorization can be utilized */ + /* effectively */ + /* */ + /* 2. When input tile pitch is not equal to input tile size or input tile */ + /* pitch is not equal to output tile pitch */ + /* - In this scenario, data elements for which data conversion from unsigned */ + /* 16 bit to I32 bit need to done exist in non-contiguous memory */ + /* location. 
In order to do vectorization across first dimension, */ + /* output data pointers need to be updated based on output tile size */ + /* and output tile pitch */ + /******************************************************************************/ + + if ((inTilePitch1 == dim1Size) && (outTilePitch1 == dim1Size)) + { + /******************************************************************************/ + /* Data exist in contiguous memory location with respect to first dimension */ + /******************************************************************************/ + + /* input data vectors */ + xb_vecNx16U vecInData; + /* Initialize max loop counter */ + int32_t dim3MaxLoopCount = dim3Size; + int32_t maxLoopCount = dim1Size * dim2Size; + + /* Updated Loop count based on tile dimension configuration */ + if ((inTilePitch2 == maxLoopCount) && (outTilePitch2 == maxLoopCount)) + { + /**********************************************************************/ + /* Data exist in contiguous memory location with respect to first and */ + /* second dimension */ + /**********************************************************************/ + + /* Update max loop counter */ + dim3MaxLoopCount = 1; + maxLoopCount *= dim3Size; + } + for (z = 0; z < dim3MaxLoopCount; z++) + { + /* initialize input and output data pointer */ + pvecIn = (xb_vecNx16U *) (pInput + (z * inTilePitch2)); + pvecOut = (xb_vecN_2x32v *) (pOutput + (z * outTilePitch2)); + + valign vaInData = IVP_LANX16U_PP(pvecIn); + xb_vecN_2x32v vecOutL, vecOutH; + + for (x = 0; x < maxLoopCount - vectorizationWidth; x += vectorizationWidth) + { + /* load input data */ + IVP_LANX16_IP(vecInData, vaInData, pvecIn); + + xb_vecNx48 vecOutIntm1 = IVP_MULUUNX16((xb_vecNx16U) scale, vecInData); + + xb_vecN_2x64w vecOutIntm2 = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecOutIntm1), IVP_CVT64SNX48LL(vecOutIntm1)); + IVP_MULUUAN_2X16X32_0(vecOutIntm2, (xb_vecNx16U) 1, (xb_vecN_2x32Uv) rndVal); + vecOutL = IVP_PACKVRNRN_2X64W(vecOutIntm2, shift); + vecOutL 
= IVP_MAXN_2X32(vecOutL, (xb_vecN_2x32v) minLim); + + vecOutIntm2 = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecOutIntm1), IVP_CVT64SNX48HL(vecOutIntm1)); + IVP_MULUUAN_2X16X32_0(vecOutIntm2, (xb_vecNx16U) 1, (xb_vecN_2x32Uv) rndVal); + vecOutH = IVP_PACKVRNRN_2X64W(vecOutIntm2, shift); + vecOutH = IVP_MAXN_2X32(vecOutH, (xb_vecN_2x32v) minLim); + /* store output data */ + IVP_SAN_2X32_IP(vecOutL, vaOut, pvecOut); + IVP_SAN_2X32_IP(vecOutH, vaOut, pvecOut); + } + int32_t varLen = (maxLoopCount - x); + /* load input data */ + IVP_LANX16_IP(vecInData, vaInData, pvecIn); + + xb_vecNx48 vecOutIntm1 = IVP_MULUUNX16((xb_vecNx16U) scale, vecInData); + xb_vecN_2x64w vecOutIntm2 = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecOutIntm1), IVP_CVT64SNX48LL(vecOutIntm1)); + IVP_MULUUAN_2X16X32_0(vecOutIntm2, (xb_vecNx16U) 1, (xb_vecN_2x32Uv) rndVal); + vecOutL = IVP_PACKVRNRN_2X64W(vecOutIntm2, shift); + vecOutL = IVP_MAXN_2X32(vecOutL, (xb_vecN_2x32v) minLim); + + + vecOutIntm2 = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecOutIntm1), IVP_CVT64SNX48HL(vecOutIntm1)); + IVP_MULUUAN_2X16X32_0(vecOutIntm2, (xb_vecNx16U) 1, (xb_vecN_2x32Uv) rndVal); + vecOutH = IVP_PACKVRNRN_2X64W(vecOutIntm2, shift); + vecOutH = IVP_MAXN_2X32(vecOutH, (xb_vecN_2x32v) minLim); + + /* store output data */ + IVP_SAVN_2X32_XP(vecOutL, vaOut, pvecOut, (varLen << 2)); + IVP_SAVN_2X32_XP(vecOutH, vaOut, pvecOut, (varLen << 2) - (XCHAL_IVPN_SIMD_WIDTH << 1)); + IVP_SAPOSN_2X32_FP(vaOut, pvecOut); + } + } + else + { + /* else block is executed if input tile pitch is not equal to input tile width or input tile */ + /* pitch is not equal to output tile pitch */ + + for (z = 0; z < dim3Size; z++) /* along 3rd dimension */ + { + x = 0; + /* Loop Unroll=4 along 1st dimension */ + for (; x < (dim1Size - vectorizationWidth3X); x += vectorizationWidth4X) + { + /* Initialize input and output data pointer */ + uint16_t * pIn = &pInput[z * inTilePitch2 + x]; + int32_t *pOut = &pOutput[z * outTilePitch2 + x]; + int32_t varLen = dim1Size - (x 
+ vectorizationWidth3X); + + for (y = 0; y < dim2Size; y++) /* along 2nd dimension */ + { + /* input and output data vectors */ + xb_vecNx16 vecInData0, vecInData1, vecInData2, vecInData3; + xb_vecN_2x32v vecOut0L, vecOut0H, vecOut1L, vecOut1H, vecOut2L, vecOut2H, vecOut3L, vecOut3H; + + pvecIn = (xb_vecNx16U *) (pIn + (y * inTilePitch1)); + pvecOut = (xb_vecN_2x32v *) (pOut + (y * outTilePitch1)); + valign vaInData = IVP_LANX16U_PP(pvecIn); + /* load input data */ + IVP_LANX16_IP(vecInData0, vaInData, pvecIn); + IVP_LANX16_IP(vecInData1, vaInData, pvecIn); + IVP_LANX16_IP(vecInData2, vaInData, pvecIn); + IVP_LANX16_IP(vecInData3, vaInData, pvecIn); + + + xb_vecNx48 vecOutIntm1 = IVP_MULUUNX16((xb_vecNx16U) scale, vecInData0); + xb_vecNx48 vecOutIntm2 = IVP_MULUUNX16((xb_vecNx16U) scale, vecInData1); + xb_vecNx48 vecOutIntm3 = IVP_MULUUNX16((xb_vecNx16U) scale, vecInData2); + xb_vecNx48 vecOutIntm4 = IVP_MULUUNX16((xb_vecNx16U) scale, vecInData3); + + xb_vecN_2x64w vecOutIntm = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecOutIntm1), IVP_CVT64SNX48LL(vecOutIntm1)); + IVP_MULUUAN_2X16X32_0(vecOutIntm, (xb_vecNx16U) 1, (xb_vecN_2x32Uv) rndVal); + vecOut0L = IVP_PACKVRNRN_2X64W(vecOutIntm, shift); + vecOut0L = IVP_MAXN_2X32(vecOut0L, (xb_vecN_2x32v) minLim); + vecOutIntm = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecOutIntm1), IVP_CVT64SNX48HL(vecOutIntm1)); + IVP_MULUUAN_2X16X32_0(vecOutIntm, (xb_vecNx16U) 1, (xb_vecN_2x32Uv) rndVal); + vecOut0H = IVP_PACKVRNRN_2X64W(vecOutIntm, shift); + vecOut0H = IVP_MAXN_2X32(vecOut0H, (xb_vecN_2x32v) minLim); + + vecOutIntm = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecOutIntm2), IVP_CVT64SNX48LL(vecOutIntm2)); + IVP_MULUUAN_2X16X32_0(vecOutIntm, (xb_vecNx16U) 1, (xb_vecN_2x32Uv) rndVal); + vecOut1L = IVP_PACKVRNRN_2X64W(vecOutIntm, shift); + vecOut1L = IVP_MAXN_2X32(vecOut1L, (xb_vecN_2x32v) minLim); + vecOutIntm = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecOutIntm2), IVP_CVT64SNX48HL(vecOutIntm2)); + IVP_MULUUAN_2X16X32_0(vecOutIntm, (xb_vecNx16U) 1, 
(xb_vecN_2x32Uv) rndVal); + vecOut1H = IVP_PACKVRNRN_2X64W(vecOutIntm, shift); + vecOut1H = IVP_MAXN_2X32(vecOut1H, (xb_vecN_2x32v) minLim); + + vecOutIntm = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecOutIntm3), IVP_CVT64SNX48LL(vecOutIntm3)); + IVP_MULUUAN_2X16X32_0(vecOutIntm, (xb_vecNx16U) 1, (xb_vecN_2x32Uv) rndVal); + vecOut2L = IVP_PACKVRNRN_2X64W(vecOutIntm, shift); + vecOut2L = IVP_MAXN_2X32(vecOut2L, (xb_vecN_2x32v) minLim); + vecOutIntm = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecOutIntm3), IVP_CVT64SNX48HL(vecOutIntm3)); + IVP_MULUUAN_2X16X32_0(vecOutIntm, (xb_vecNx16U) 1, (xb_vecN_2x32Uv) rndVal); + vecOut2H = IVP_PACKVRNRN_2X64W(vecOutIntm, shift); + vecOut2H = IVP_MAXN_2X32(vecOut2H, (xb_vecN_2x32v) minLim); + + vecOutIntm = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecOutIntm4), IVP_CVT64SNX48LL(vecOutIntm4)); + IVP_MULUUAN_2X16X32_0(vecOutIntm, (xb_vecNx16U) 1, (xb_vecN_2x32Uv) rndVal); + vecOut3L = IVP_PACKVRNRN_2X64W(vecOutIntm, shift); + vecOut3L = IVP_MAXN_2X32(vecOut3L, (xb_vecN_2x32v) minLim); + vecOutIntm = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecOutIntm4), IVP_CVT64SNX48HL(vecOutIntm4)); + IVP_MULUUAN_2X16X32_0(vecOutIntm, (xb_vecNx16U) 1, (xb_vecN_2x32Uv) rndVal); + vecOut3H = IVP_PACKVRNRN_2X64W(vecOutIntm, shift); + vecOut3H = IVP_MAXN_2X32(vecOut3H, (xb_vecN_2x32v) minLim); + + /* Store output data */ + IVP_SAN_2X32_IP(vecOut0L, vaOut, pvecOut); + IVP_SAN_2X32_IP(vecOut0H, vaOut, pvecOut); + IVP_SAN_2X32_IP(vecOut1L, vaOut, pvecOut); + IVP_SAN_2X32_IP(vecOut1H, vaOut, pvecOut); + IVP_SAN_2X32_IP(vecOut2L, vaOut, pvecOut); + IVP_SAN_2X32_IP(vecOut2H, vaOut, pvecOut); + IVP_SAVN_2X32_XP(vecOut3L, vaOut, pvecOut, (varLen << 2)); + IVP_SAVN_2X32_XP(vecOut3H, vaOut, pvecOut, (varLen << 2) - (XCHAL_IVPN_SIMD_WIDTH << 1)); + IVP_SAPOSN_2X32_FP(vaOut, pvecOut); + } + } + if (x < (dim1Size - vectorizationWidth2X)) + { + /* Initialize input and output data pointer */ + uint16_t * pIn = &pInput[z * inTilePitch2 + x]; + int32_t *pOut = &pOutput[z * outTilePitch2 + x]; + 
int32_t varLen = dim1Size - (x + vectorizationWidth2X); + + for (y = 0; y < dim2Size; y++) /* along 2nd dimension */ + { + /* input and output data vectors */ + xb_vecNx16 vecInData0, vecInData1, vecInData2; + xb_vecN_2x32v vecOut0L, vecOut0H, vecOut1L, vecOut1H, vecOut2L, vecOut2H; + + pvecIn = (xb_vecNx16U *) (pIn + (y * inTilePitch1)); + pvecOut = (xb_vecN_2x32v *) (pOut + (y * outTilePitch1)); + valign vaInData = IVP_LANX16U_PP(pvecIn); + + /* load input data */ + IVP_LANX16_IP(vecInData0, vaInData, pvecIn); + IVP_LANX16_IP(vecInData1, vaInData, pvecIn); + IVP_LANX16_IP(vecInData2, vaInData, pvecIn); + + xb_vecNx48 vecOutIntm1 = IVP_MULUUNX16((xb_vecNx16U) scale, vecInData0); + xb_vecNx48 vecOutIntm2 = IVP_MULUUNX16((xb_vecNx16U) scale, vecInData1); + xb_vecNx48 vecOutIntm3 = IVP_MULUUNX16((xb_vecNx16U) scale, vecInData2); + + xb_vecN_2x64w vecOutIntm = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecOutIntm1), IVP_CVT64SNX48LL(vecOutIntm1)); + IVP_MULUUAN_2X16X32_0(vecOutIntm, (xb_vecNx16U) 1, (xb_vecN_2x32Uv) rndVal); + vecOut0L = IVP_PACKVRNRN_2X64W(vecOutIntm, shift); + vecOut0L = IVP_MAXN_2X32(vecOut0L, (xb_vecN_2x32v) minLim); + vecOutIntm = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecOutIntm1), IVP_CVT64SNX48HL(vecOutIntm1)); + IVP_MULUUAN_2X16X32_0(vecOutIntm, (xb_vecNx16U) 1, (xb_vecN_2x32Uv) rndVal); + vecOut0H = IVP_PACKVRNRN_2X64W(vecOutIntm, shift); + vecOut0H = IVP_MAXN_2X32(vecOut0H, (xb_vecN_2x32v) minLim); + + vecOutIntm = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecOutIntm2), IVP_CVT64SNX48LL(vecOutIntm2)); + IVP_MULUUAN_2X16X32_0(vecOutIntm, (xb_vecNx16U) 1, (xb_vecN_2x32Uv) rndVal); + vecOut1L = IVP_PACKVRNRN_2X64W(vecOutIntm, shift); + vecOut1L = IVP_MAXN_2X32(vecOut1L, (xb_vecN_2x32v) minLim); + vecOutIntm = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecOutIntm2), IVP_CVT64SNX48HL(vecOutIntm2)); + IVP_MULUUAN_2X16X32_0(vecOutIntm, (xb_vecNx16U) 1, (xb_vecN_2x32Uv) rndVal); + vecOut1H = IVP_PACKVRNRN_2X64W(vecOutIntm, shift); + vecOut1H = IVP_MAXN_2X32(vecOut1H, 
(xb_vecN_2x32v) minLim); + + vecOutIntm = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecOutIntm3), IVP_CVT64SNX48LL(vecOutIntm3)); + IVP_MULUUAN_2X16X32_0(vecOutIntm, (xb_vecNx16U) 1, (xb_vecN_2x32Uv) rndVal); + vecOut2L = IVP_PACKVRNRN_2X64W(vecOutIntm, shift); + vecOut2L = IVP_MAXN_2X32(vecOut2L, (xb_vecN_2x32v) minLim); + vecOutIntm = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecOutIntm3), IVP_CVT64SNX48HL(vecOutIntm3)); + IVP_MULUUAN_2X16X32_0(vecOutIntm, (xb_vecNx16U) 1, (xb_vecN_2x32Uv) rndVal); + vecOut2H = IVP_PACKVRNRN_2X64W(vecOutIntm, shift); + vecOut2H = IVP_MAXN_2X32(vecOut2H, (xb_vecN_2x32v) minLim); + + /* Store output data */ + IVP_SAN_2X32_IP(vecOut0L, vaOut, pvecOut); + IVP_SAN_2X32_IP(vecOut0H, vaOut, pvecOut); + IVP_SAN_2X32_IP(vecOut1L, vaOut, pvecOut); + IVP_SAN_2X32_IP(vecOut1H, vaOut, pvecOut); + IVP_SAVN_2X32_XP(vecOut2L, vaOut, pvecOut, (varLen << 2)); + IVP_SAVN_2X32_XP(vecOut2H, vaOut, pvecOut, (varLen << 2) - (XCHAL_IVPN_SIMD_WIDTH << 1)); + IVP_SAPOSN_2X32_FP(vaOut, pvecOut); + } + } + else if (x < (dim1Size - vectorizationWidth)) + { + /* Initialize input and output data pointer */ + uint16_t * pIn = &pInput[z * inTilePitch2 + x]; + int32_t *pOut = &pOutput[z * outTilePitch2 + x]; + int32_t varLen = dim1Size - (x + vectorizationWidth); + + for (y = 0; y < dim2Size; y++) /* along 2nd dimension */ + { + /* input and output data vectors */ + xb_vecNx16 vecInData0, vecInData1; + xb_vecN_2x32v vecOut0L, vecOut0H, vecOut1L, vecOut1H; + + pvecIn = (xb_vecNx16U *) (pIn + (y * inTilePitch1)); + pvecOut = (xb_vecN_2x32v *) (pOut + (y * outTilePitch1)); + valign vaInData = IVP_LANX16U_PP(pvecIn); + + /* load input data */ + IVP_LANX16_IP(vecInData0, vaInData, pvecIn); + IVP_LANX16_IP(vecInData1, vaInData, pvecIn); + + + xb_vecNx48 vecOutIntm1 = IVP_MULUUNX16((xb_vecNx16U) scale, vecInData0); + xb_vecNx48 vecOutIntm2 = IVP_MULUUNX16((xb_vecNx16U) scale, vecInData1); + + xb_vecN_2x64w vecOutIntm = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecOutIntm1), 
IVP_CVT64SNX48LL(vecOutIntm1)); + IVP_MULUUAN_2X16X32_0(vecOutIntm, (xb_vecNx16U) 1, (xb_vecN_2x32Uv) rndVal); + vecOut0L = IVP_PACKVRNRN_2X64W(vecOutIntm, shift); + vecOut0L = IVP_MAXN_2X32(vecOut0L, (xb_vecN_2x32v) minLim); + vecOutIntm = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecOutIntm1), IVP_CVT64SNX48HL(vecOutIntm1)); + IVP_MULUUAN_2X16X32_0(vecOutIntm, (xb_vecNx16U) 1, (xb_vecN_2x32Uv) rndVal); + vecOut0H = IVP_PACKVRNRN_2X64W(vecOutIntm, shift); + vecOut0H = IVP_MAXN_2X32(vecOut0H, (xb_vecN_2x32v) minLim); + + vecOutIntm = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecOutIntm2), IVP_CVT64SNX48LL(vecOutIntm2)); + IVP_MULUUAN_2X16X32_0(vecOutIntm, (xb_vecNx16U) 1, (xb_vecN_2x32Uv) rndVal); + vecOut1L = IVP_PACKVRNRN_2X64W(vecOutIntm, shift); + vecOut1L = IVP_MAXN_2X32(vecOut1L, (xb_vecN_2x32v) minLim); + vecOutIntm = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecOutIntm2), IVP_CVT64SNX48HL(vecOutIntm2)); + IVP_MULUUAN_2X16X32_0(vecOutIntm, (xb_vecNx16U) 1, (xb_vecN_2x32Uv) rndVal); + vecOut1H = IVP_PACKVRNRN_2X64W(vecOutIntm, shift); + vecOut1H = IVP_MAXN_2X32(vecOut1H, (xb_vecN_2x32v) minLim); + + /* Store output data */ + IVP_SAN_2X32_IP(vecOut0L, vaOut, pvecOut); + IVP_SAN_2X32_IP(vecOut0H, vaOut, pvecOut); + IVP_SAVN_2X32_XP(vecOut1L, vaOut, pvecOut, (varLen << 2)); + IVP_SAVN_2X32_XP(vecOut1H, vaOut, pvecOut, (varLen << 2) - (XCHAL_IVPN_SIMD_WIDTH << 1)); + IVP_SAPOSN_2X32_FP(vaOut, pvecOut); + } + } + else if (x < dim1Size) + { + /* Initialize input and output data pointer */ + uint16_t * pIn = &pInput[z * inTilePitch2 + x]; + int32_t *pOut = &pOutput[z * outTilePitch2 + x]; + int32_t varLen = dim1Size - x; + + for (y = 0; y < dim2Size; y++) /* along 2nd dimension */ + { + /* input and output data vectors */ + xb_vecNx16 vecInData0; + xb_vecN_2x32v vecOut0L, vecOut0H; + + pvecIn = (xb_vecNx16U *) (pIn + (y * inTilePitch1)); + pvecOut = (xb_vecN_2x32v *) (pOut + (y * outTilePitch1)); + valign vaInData = IVP_LANX16U_PP(pvecIn); + + /* load input data */ + 
IVP_LANX16_IP(vecInData0, vaInData, pvecIn); + + xb_vecNx48 vecOutIntm1 = IVP_MULUUNX16((xb_vecNx16U) scale, vecInData0); + + xb_vecN_2x64w vecOutIntm = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecOutIntm1), IVP_CVT64SNX48LL(vecOutIntm1)); + IVP_MULUUAN_2X16X32_0(vecOutIntm, (xb_vecNx16U) 1, (xb_vecN_2x32Uv) rndVal); + vecOut0L = IVP_PACKVRNRN_2X64W(vecOutIntm, shift); + vecOut0L = IVP_MAXN_2X32(vecOut0L, (xb_vecN_2x32v) minLim); + vecOutIntm = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecOutIntm1), IVP_CVT64SNX48HL(vecOutIntm1)); + IVP_MULUUAN_2X16X32_0(vecOutIntm, (xb_vecNx16U) 1, (xb_vecN_2x32Uv) rndVal); + vecOut0H = IVP_PACKVRNRN_2X64W(vecOutIntm, shift); + vecOut0H = IVP_MAXN_2X32(vecOut0H, (xb_vecN_2x32v) minLim); + + /* Store output data */ + IVP_SAVN_2X32_XP(vecOut0L, vaOut, pvecOut, (varLen << 2)); + IVP_SAVN_2X32_XP(vecOut0H, vaOut, pvecOut, (varLen << 2) - (XCHAL_IVPN_SIMD_WIDTH << 1)); + IVP_SAPOSN_2X32_FP(vaOut, pvecOut); + } + } + } + } + return(XAI_ERROR_STATUS()); +} + +/********************* xaiDataConversion3D_U16S16 **************************/ +/* Description : P6 implementation for conversion U16 to S16 */ +/* depending on Output Tile type */ +/* Inputs : Input Tile, scale, shift */ +/* Outputs : XI Error Code */ +/* InOuts : Output Tile */ +/* Assumptions : InData is unsigned 16bit */ +/**************************************************************************/ + +XAI_ERR_TYPE xaiDataConversion3D_U16S16(const xai_pTile3D inTile, + xai_pTile3D outTile, + const uint16_t scale, + const uint8_t shift) +{ + /* Error Checks */ + XAI_ERROR_CHECKS() + { + XAI_CHECK_TILE3D_U16(inTile); + XAI_CHECK_TILE3D_S16(outTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile); + XAI_CHECK_TILE3D_SIZE_EQ(inTile, outTile); + XAI_CHECK_ERROR(shift < 32, XAI_ERR_NORM, \ + "Shift = %hhu, value should be less than 32", shift); + XAI_CHECK_ERROR(XAI_TILE3D_GET_DATA_ORDER(inTile) == XAI_TILE3D_GET_DATA_ORDER(outTile), XAI_ERR_BADARG, \ + 
"\nData Order of InputTile = %d, OutputTile = %d\nData Order of InputTile and OutputTile should be same", \ + XAI_TILE3D_GET_DATA_ORDER(inTile), XAI_TILE3D_GET_DATA_ORDER(outTile)); + } + + /* Get Tile Parameters */ + const int32_t dim1Size = XAI_TILE3D_GET_DIM1(inTile); + const int32_t dim2Size = XAI_TILE3D_GET_DIM2(inTile); + const int32_t dim3Size = XAI_TILE3D_GET_DIM3(inTile); + const int32_t inTilePitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile); + const int32_t inTilePitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile); + const int32_t outTilePitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile); + const int32_t outTilePitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile); + + /* Get Data Pointers */ + uint16_t *pInput = (uint16_t *) XAI_TILE3D_GET_DATA_PTR(inTile); + int16_t *pOutput = (int16_t *) XAI_TILE3D_GET_DATA_PTR(outTile); + valign vaOut = IVP_ZALIGN(); + + /* vectorization width */ + const int32_t vectorizationWidth = XCHAL_IVPN_SIMD_WIDTH; + const int32_t vectorizationWidth2X = vectorizationWidth * 2; + const int32_t vectorizationWidth3X = vectorizationWidth * 3; + const int32_t vectorizationWidth4X = vectorizationWidth * 4; + + /* loop variables */ + int32_t x, y, z; + + /* input and output pointers */ + xb_vecNx16U * restrict pvecIn; + xb_vecNx16 * restrict pvecOut; + + /********************************************************************************/ + /* The overall design approach is split into 2 parts */ + /* 1. When input tile pitch is equal to input tile width and input tile pitch */ + /* is equal to output tile pitch */ + /* - If above condition holds good, data elements for which data */ + /* conversion from unsigned 16 bit to S16 bit need to done present in */ + /* in contiguous memory location. Hence vectorization can be utilized */ + /* effectively */ + /* */ + /* 2. 
When input tile pitch is not equal to input tile size or input tile */ + /* pitch is not equal to output tile pitch */ + /* - In this scenario, data elements for which data conversion from unsigned */ + /* 16 bit to S16 bit need to done exist in non-contiguous memory */ + /* location. In order to do vectorization across first dimension, */ + /* output data pointers need to be updated based on output tile size */ + /* and output tile pitch */ + /********************************************************************************/ + + if ((inTilePitch1 == dim1Size) && (outTilePitch1 == dim1Size)) + { + /******************************************************************************/ + /* Data exist in contiguous memory location with respect to first dimension */ + /******************************************************************************/ + + /* input data vectors */ + xb_vecNx16U vecInData; + /* Initialize max loop counter */ + int32_t dim3MaxLoopCount = dim3Size; + int32_t maxLoopCount = dim1Size * dim2Size; + + /* Updated Loop count based on tile dimension configuration */ + if ((inTilePitch2 == maxLoopCount) && (outTilePitch2 == maxLoopCount)) + { + /**********************************************************************/ + /* Data exist in contiguous memory location with respect to first and */ + /* second dimension */ + /**********************************************************************/ + + /* Update max loop counter */ + dim3MaxLoopCount = 1; + maxLoopCount *= dim3Size; + } + for (z = 0; z < dim3MaxLoopCount; z++) + { + /* initialize input and output data pointer */ + pvecIn = (xb_vecNx16U *) (pInput + (z * inTilePitch2)); + pvecOut = (xb_vecNx16 *) (pOutput + (z * outTilePitch2)); + + valign vaInData = IVP_LANX16U_PP(pvecIn); + xb_vecNx16 vecOut; + + for (x = 0; x < maxLoopCount - vectorizationWidth; x += vectorizationWidth) + { + /* load input data */ + IVP_LANX16U_IP(vecInData, vaInData, pvecIn); + + /* apply scale and shift to input data. 
+ * multiplying with scale results in 32 way 48-bit + * data to which shift is applied, so final result is + * 32 way 16 bit. + */ + vecOut = IVP_PACKVRNX48(IVP_MULUUNX16((xb_vecNx16U) scale, vecInData), shift); + + /* store output data */ + IVP_SANX16_IP(vecOut, vaOut, pvecOut); + } + int32_t varLen = (maxLoopCount - x); + /* load input data */ + IVP_LANX16U_IP(vecInData, vaInData, pvecIn); + + /* apply scale and shift to input data. + * multiplying with scale results in 32 way 48-bit + * data to which shift is applied, so final result is + * 32 way 16 bit. + */ + vecOut = IVP_PACKVRNX48(IVP_MULUUNX16((xb_vecNx16U) scale, vecInData), shift); + + /* store output data */ + IVP_SAVNX16_XP(vecOut, vaOut, pvecOut, (varLen << 1)); + IVP_SAPOSNX16_FP(vaOut, pvecOut); + } + } + else + { + /* else block is executed if input tile pitch is not equal to input tile width or input tile */ + /* pitch is not equal to output tile pitch */ + + for (z = 0; z < dim3Size; z++) /* along 3rd dimension */ + { + x = 0; + /* Loop Unroll=4 along 1st dimension */ + for (; x < (dim1Size - vectorizationWidth3X); x += vectorizationWidth4X) + { + /* Initialize input and output data pointer */ + uint16_t * pIn = &pInput[z * inTilePitch2 + x]; + int16_t *pOut = &pOutput[z * outTilePitch2 + x]; + int32_t varLen = dim1Size - (x + vectorizationWidth3X); + + for (y = 0; y < dim2Size; y++) /* along 2nd dimension */ + { + /* input and output data vectors */ + xb_vecNx16 vecInData0, vecInData1, vecInData2, vecInData3; + xb_vecNx16 vecOut0, vecOut1, vecOut2, vecOut3; + + pvecIn = (xb_vecNx16U *) (pIn + (y * inTilePitch1)); + pvecOut = (xb_vecNx16 *) (pOut + (y * outTilePitch1)); + valign vaInData = IVP_LANX16U_PP(pvecIn); + /* load input data */ + IVP_LANX16U_IP(vecInData0, vaInData, pvecIn); + IVP_LANX16U_IP(vecInData1, vaInData, pvecIn); + IVP_LANX16U_IP(vecInData2, vaInData, pvecIn); + IVP_LANX16U_IP(vecInData3, vaInData, pvecIn); + + /* apply scale and shift to input data. 
+ * multiplying with scale results in 32 way 48-bit + * data to which shift is applied, so final result is + * 32 way 16 bit. + */ + vecOut0 = IVP_PACKVRNX48(IVP_MULUUNX16((xb_vecNx16U) scale, vecInData0), shift); + vecOut1 = IVP_PACKVRNX48(IVP_MULUUNX16((xb_vecNx16U) scale, vecInData1), shift); + vecOut2 = IVP_PACKVRNX48(IVP_MULUUNX16((xb_vecNx16U) scale, vecInData2), shift); + vecOut3 = IVP_PACKVRNX48(IVP_MULUUNX16((xb_vecNx16U) scale, vecInData3), shift); + + /* Store output data */ + IVP_SANX16_IP(vecOut0, vaOut, pvecOut); + IVP_SANX16_IP(vecOut1, vaOut, pvecOut); + IVP_SANX16_IP(vecOut2, vaOut, pvecOut); + IVP_SAVNX16_XP(vecOut3, vaOut, pvecOut, (varLen << 1)); + IVP_SAPOSNX16_FP(vaOut, pvecOut); + } + } + if (x < (dim1Size - vectorizationWidth2X)) + { + /* Initialize input and output data pointer */ + uint16_t * pIn = &pInput[z * inTilePitch2 + x]; + int16_t *pOut = &pOutput[z * outTilePitch2 + x]; + int32_t varLen = dim1Size - (x + vectorizationWidth2X); + + for (y = 0; y < dim2Size; y++) /* along 2nd dimension */ + { + /* input and output data vectors */ + xb_vecNx16U vecInData0, vecInData1, vecInData2; + xb_vecNx16 vecOut0, vecOut1, vecOut2; + + pvecIn = (xb_vecNx16U *) (pIn + (y * inTilePitch1)); + pvecOut = (xb_vecNx16 *) (pOut + (y * outTilePitch1)); + valign vaInData = IVP_LANX16U_PP(pvecIn); + + /* load input data */ + IVP_LANX16U_IP(vecInData0, vaInData, pvecIn); + IVP_LANX16U_IP(vecInData1, vaInData, pvecIn); + IVP_LANX16U_IP(vecInData2, vaInData, pvecIn); + + /* apply scale and shift to input data. + * multiplying with scale results in 32 way 48-bit + * data to which shift is applied, so final result is + * 32 way 16 bit. 
+ */ + vecOut0 = IVP_PACKVRNX48(IVP_MULUUNX16((xb_vecNx16U) scale, vecInData0), shift); + vecOut1 = IVP_PACKVRNX48(IVP_MULUUNX16((xb_vecNx16U) scale, vecInData1), shift); + vecOut2 = IVP_PACKVRNX48(IVP_MULUUNX16((xb_vecNx16U) scale, vecInData2), shift); + + /* Store output data */ + IVP_SANX16_IP(vecOut0, vaOut, pvecOut); + IVP_SANX16_IP(vecOut1, vaOut, pvecOut); + IVP_SAVNX16_XP(vecOut2, vaOut, pvecOut, (varLen << 1)); + IVP_SAPOSNX16_FP(vaOut, pvecOut); + } + } + else if (x < (dim1Size - vectorizationWidth)) + { + /* Initialize input and output data pointer */ + uint16_t * pIn = &pInput[z * inTilePitch2 + x]; + int16_t *pOut = &pOutput[z * outTilePitch2 + x]; + int32_t varLen = dim1Size - (x + vectorizationWidth); + + for (y = 0; y < dim2Size; y++) /* along 2nd dimension */ + { + /* input and output data vectors */ + xb_vecNx16U vecInData0, vecInData1; + xb_vecNx16 vecOut0, vecOut1; + + pvecIn = (xb_vecNx16U *) (pIn + (y * inTilePitch1)); + pvecOut = (xb_vecNx16 *) (pOut + (y * outTilePitch1)); + valign vaInData = IVP_LANX16U_PP(pvecIn); + + /* load input data */ + IVP_LANX16_IP(vecInData0, vaInData, pvecIn); + IVP_LANX16_IP(vecInData1, vaInData, pvecIn); + + /* apply scale and shift to input data. + * multiplying with scale results in 32 way 48-bit + * data to which shift is applied, so final result is + * 32 way 16 bit. 
+ */ + vecOut0 = IVP_PACKVRNX48(IVP_MULUUNX16((xb_vecNx16U) scale, vecInData0), shift); + vecOut1 = IVP_PACKVRNX48(IVP_MULUUNX16((xb_vecNx16U) scale, vecInData1), shift); + + /* Store output data */ + IVP_SANX16_IP(vecOut0, vaOut, pvecOut); + IVP_SAVNX16_XP(vecOut1, vaOut, pvecOut, (varLen << 1)); + IVP_SAPOSNX16_FP(vaOut, pvecOut); + } + } + else if (x < dim1Size) + { + /* Initialize input and output data pointer */ + uint16_t * pIn = &pInput[z * inTilePitch2 + x]; + int16_t *pOut = &pOutput[z * outTilePitch2 + x]; + int32_t varLen = dim1Size - x; + + for (y = 0; y < dim2Size; y++) /* along 2nd dimension */ + { + /* input and output data vectors */ + xb_vecNx16 vecInData0; + xb_vecNx16 vecOut0; + + pvecIn = (xb_vecNx16U *) (pIn + (y * inTilePitch1)); + pvecOut = (xb_vecNx16 *) (pOut + (y * outTilePitch1)); + valign vaInData = IVP_LANX16U_PP(pvecIn); + + /* load input data */ + IVP_LANX16_IP(vecInData0, vaInData, pvecIn); + + /* apply scale and shift to input data. + * multiplying with scale results in 32 way 48-bit + * data to which shift is applied, so final result is + * 32 way 16 bit. 
+ */ + vecOut0 = IVP_PACKVRNX48(IVP_MULUUNX16((xb_vecNx16U) scale, vecInData0), shift); + + /* Store output data */ + IVP_SAVNX16_XP(vecOut0, vaOut, pvecOut, (varLen << 1)); + IVP_SAPOSNX16_FP(vaOut, pvecOut); + } + } + } + } + return(XAI_ERROR_STATUS()); +} + +/********************* xaiDataConversion3D_S16U16 **************************/ +/* Description : P6 implementation for conversion S16 to U16 */ +/* depending on Output Tile type */ +/* Inputs : Input Tile, scale, shift */ +/* Outputs : XI Error Code */ +/* InOuts : Output Tile */ +/* Assumptions : InData is unsigned 16bit */ +/**************************************************************************/ + +XAI_ERR_TYPE xaiDataConversion3D_S16U16(const xai_pTile3D inTile, + xai_pTile3D outTile, + const uint16_t scale, + const uint8_t shift) +{ + /* Error Checks */ + XAI_ERROR_CHECKS() + { + XAI_CHECK_TILE3D_S16(inTile); + XAI_CHECK_TILE3D_U16(outTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile); + XAI_CHECK_TILE3D_SIZE_EQ(inTile, outTile); + XAI_CHECK_ERROR(shift < 32, XAI_ERR_NORM, \ + "Shift = %hhu, value should be less than 32", shift); + XAI_CHECK_ERROR(XAI_TILE3D_GET_DATA_ORDER(inTile) == XAI_TILE3D_GET_DATA_ORDER(outTile), XAI_ERR_BADARG, \ + "\nData Order of InputTile = %d, OutputTile = %d\nData Order of InputTile and OutputTile should be same", \ + XAI_TILE3D_GET_DATA_ORDER(inTile), XAI_TILE3D_GET_DATA_ORDER(outTile)); + } + + /* Get Tile Parameters */ + const int32_t dim1Size = XAI_TILE3D_GET_DIM1(inTile); + const int32_t dim2Size = XAI_TILE3D_GET_DIM2(inTile); + const int32_t dim3Size = XAI_TILE3D_GET_DIM3(inTile); + const int32_t inTilePitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile); + const int32_t inTilePitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile); + const int32_t outTilePitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile); + const int32_t outTilePitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile); + + /* Get Data Pointers */ + int16_t *pInput = (int16_t *) 
XAI_TILE3D_GET_DATA_PTR(inTile); + uint16_t *pOutput = (uint16_t *) XAI_TILE3D_GET_DATA_PTR(outTile); + valign vaOut = IVP_ZALIGN(); + + /* vectorization width */ + const int32_t vectorizationWidth = XCHAL_IVPN_SIMD_WIDTH; + const int32_t vectorizationWidth2X = vectorizationWidth * 2; + const int32_t vectorizationWidth3X = vectorizationWidth * 3; + const int32_t vectorizationWidth4X = vectorizationWidth * 4; + /* loop variables */ + int32_t x, y, z; + + /* input and output pointers */ + xb_vecNx16 * restrict pvecIn; + xb_vecNx16U * restrict pvecOut; + + /********************************************************************************/ + /* The overall design approach is split into 2 parts */ + /* 1. When input tile pitch is equal to input tile width and input tile pitch */ + /* is equal to output tile pitch */ + /* - If above condition holds good, data elements for which data */ + /* conversion from unsigned 16 bit to S16 bit need to done present in */ + /* in contiguous memory location. Hence vectorization can be utilized */ + /* effectively */ + /* */ + /* 2. When input tile pitch is not equal to input tile size or input tile */ + /* pitch is not equal to output tile pitch */ + /* - In this scenario, data elements for which data conversion from unsigned */ + /* 16 bit to S16 bit need to done exist in non-contiguous memory */ + /* location. 
In order to do vectorization across first dimension, */ + /* output data pointers need to be updated based on output tile size */ + /* and output tile pitch */ + /********************************************************************************/ + + if ((inTilePitch1 == dim1Size) && (outTilePitch1 == dim1Size)) + { + /******************************************************************************/ + /* Data exist in contiguous memory location with respect to first dimension */ + /******************************************************************************/ + + /* input data vectors */ + xb_vecNx16 vecInData; + /* Initialize max loop counter */ + int32_t dim3MaxLoopCount = dim3Size; + int32_t maxLoopCount = dim1Size * dim2Size; + + /* Updated Loop count based on tile dimension configuration */ + if ((inTilePitch2 == maxLoopCount) && (outTilePitch2 == maxLoopCount)) + { + /**********************************************************************/ + /* Data exist in contiguous memory location with respect to first and */ + /* second dimension */ + /**********************************************************************/ + + /* Update max loop counter */ + dim3MaxLoopCount = 1; + maxLoopCount *= dim3Size; + } + for (z = 0; z < dim3MaxLoopCount; z++) + { + /* initialize input and output data pointer */ + pvecIn = (xb_vecNx16 *) (pInput + (z * inTilePitch2)); + pvecOut = (xb_vecNx16U *) (pOutput + (z * outTilePitch2)); + + valign vaInData = IVP_LANX16_PP(pvecIn); + xb_vecNx16 vecOut; + + for (x = 0; x < maxLoopCount - vectorizationWidth; x += vectorizationWidth) + { + /* load input data */ + IVP_LANX16_IP(vecInData, vaInData, pvecIn); + + /* apply scale and shift to input data. + * multiplying with scale results in 32 way 48-bit + * data to which shift is applied, so final result is + * 32 way 16 bit. 
+ */ + PACK_ROUND_U16(vecOut, vecInData, scale, shift); + /* store output data */ + IVP_SANX16U_IP(vecOut, vaOut, pvecOut); + } + int32_t varLen = (maxLoopCount - x); + /* load input data */ + IVP_LANX16_IP(vecInData, vaInData, pvecIn); + + /* apply scale and shift to input data. + * multiplying with scale results in 32 way 48-bit + * data to which shift is applied, so final result is + * 32 way 16 bit. + */ + PACK_ROUND_U16(vecOut, vecInData, scale, shift); + /* store output data */ + IVP_SAVNX16U_XP(vecOut, vaOut, pvecOut, (varLen << 1)); + IVP_SAPOSNX16U_FP(vaOut, pvecOut); + } + } + else + { + /* else block is executed if input tile pitch is not equal to input tile width or input tile */ + /* pitch is not equal to output tile pitch */ + + for (z = 0; z < dim3Size; z++) /* along 3rd dimension */ + { + x = 0; + /* Loop Unroll=4 along 1st dimension */ + for (; x < (dim1Size - vectorizationWidth3X); x += vectorizationWidth4X) + { + /* Initialize input and output data pointer */ + int16_t * pIn = &pInput[z * inTilePitch2 + x]; + uint16_t *pOut = &pOutput[z * outTilePitch2 + x]; + int32_t varLen = dim1Size - (x + vectorizationWidth3X); + + for (y = 0; y < dim2Size; y++) /* along 2nd dimension */ + { + /* input and output data vectors */ + xb_vecNx16 vecInData0, vecInData1, vecInData2, vecInData3; + xb_vecNx16U vecOut0, vecOut1, vecOut2, vecOut3; + + pvecIn = (xb_vecNx16 *) (pIn + (y * inTilePitch1)); + pvecOut = (xb_vecNx16U *) (pOut + (y * outTilePitch1)); + valign vaInData = IVP_LANX16_PP(pvecIn); + /* load input data */ + IVP_LANX16_IP(vecInData0, vaInData, pvecIn); + IVP_LANX16_IP(vecInData1, vaInData, pvecIn); + IVP_LANX16_IP(vecInData2, vaInData, pvecIn); + IVP_LANX16_IP(vecInData3, vaInData, pvecIn); + + /* apply scale and shift to input data. + * multiplying with scale results in 32 way 48-bit + * data to which shift is applied, so final result is + * 32 way 16 bit. 
+ */ + + PACK_ROUND_U16(vecOut0, vecInData0, scale, shift); + PACK_ROUND_U16(vecOut1, vecInData1, scale, shift); + PACK_ROUND_U16(vecOut2, vecInData2, scale, shift); + PACK_ROUND_U16(vecOut3, vecInData3, scale, shift); + + /* Store output data */ + IVP_SANX16U_IP(vecOut0, vaOut, pvecOut); + IVP_SANX16U_IP(vecOut1, vaOut, pvecOut); + IVP_SANX16U_IP(vecOut2, vaOut, pvecOut); + IVP_SAVNX16U_XP(vecOut3, vaOut, pvecOut, (varLen << 1)); + IVP_SAPOSNX16U_FP(vaOut, pvecOut); + } + } + if (x < (dim1Size - vectorizationWidth2X)) + { + /* Initialize input and output data pointer */ + int16_t * pIn = &pInput[z * inTilePitch2 + x]; + uint16_t *pOut = &pOutput[z * outTilePitch2 + x]; + int32_t varLen = dim1Size - (x + vectorizationWidth2X); + + for (y = 0; y < dim2Size; y++) /* along 2nd dimension */ + { + /* input and output data vectors */ + xb_vecNx16 vecInData0, vecInData1, vecInData2; + xb_vecNx16U vecOut0, vecOut1, vecOut2; + + pvecIn = (xb_vecNx16 *) (pIn + (y * inTilePitch1)); + pvecOut = (xb_vecNx16U *) (pOut + (y * outTilePitch1)); + valign vaInData = IVP_LANX16_PP(pvecIn); + + /* load input data */ + IVP_LANX16_IP(vecInData0, vaInData, pvecIn); + IVP_LANX16_IP(vecInData1, vaInData, pvecIn); + IVP_LANX16_IP(vecInData2, vaInData, pvecIn); + + /* apply scale and shift to input data. + * multiplying with scale results in 32 way 48-bit + * data to which shift is applied, so final result is + * 32 way 16 bit. 
+ */ + PACK_ROUND_U16(vecOut0, vecInData0, scale, shift); + PACK_ROUND_U16(vecOut1, vecInData1, scale, shift); + PACK_ROUND_U16(vecOut2, vecInData2, scale, shift); + + /* Store output data */ + IVP_SANX16U_IP(vecOut0, vaOut, pvecOut); + IVP_SANX16U_IP(vecOut1, vaOut, pvecOut); + IVP_SAVNX16U_XP(vecOut2, vaOut, pvecOut, (varLen << 1)); + IVP_SAPOSNX16U_FP(vaOut, pvecOut); + } + } + else if (x < (dim1Size - vectorizationWidth)) + { + /* Initialize input and output data pointer */ + int16_t * pIn = &pInput[z * inTilePitch2 + x]; + uint16_t *pOut = &pOutput[z * outTilePitch2 + x]; + int32_t varLen = dim1Size - (x + vectorizationWidth); + + for (y = 0; y < dim2Size; y++) /* along 2nd dimension */ + { + /* input and output data vectors */ + xb_vecNx16 vecInData0, vecInData1; + xb_vecNx16U vecOut0, vecOut1; + + pvecIn = (xb_vecNx16 *) (pIn + (y * inTilePitch1)); + pvecOut = (xb_vecNx16U *) (pOut + (y * outTilePitch1)); + valign vaInData = IVP_LANX16_PP(pvecIn); + + /* load input data */ + IVP_LANX16_IP(vecInData0, vaInData, pvecIn); + IVP_LANX16_IP(vecInData1, vaInData, pvecIn); + + /* apply scale and shift to input data. + * multiplying with scale results in 32 way 48-bit + * data to which shift is applied, so final result is + * 32 way 16 bit. 
+ */ + PACK_ROUND_U16(vecOut0, vecInData0, scale, shift); + PACK_ROUND_U16(vecOut1, vecInData1, scale, shift); + + /* Store output data */ + IVP_SANX16U_IP(vecOut0, vaOut, pvecOut); + IVP_SAVNX16U_XP(vecOut1, vaOut, pvecOut, (varLen << 1)); + IVP_SAPOSNX16U_FP(vaOut, pvecOut); + } + } + else if (x < dim1Size) + { + /* Initialize input and output data pointer */ + int16_t * pIn = &pInput[z * inTilePitch2 + x]; + uint16_t *pOut = &pOutput[z * outTilePitch2 + x]; + int32_t varLen = dim1Size - x; + + for (y = 0; y < dim2Size; y++) /* along 2nd dimension */ + { + /* input and output data vectors */ + xb_vecNx16 vecInData0; + xb_vecNx16U vecOut0; + + pvecIn = (xb_vecNx16 *) (pIn + (y * inTilePitch1)); + pvecOut = (xb_vecNx16U *) (pOut + (y * outTilePitch1)); + valign vaInData = IVP_LANX16_PP(pvecIn); + + /* load input data */ + IVP_LANX16_IP(vecInData0, vaInData, pvecIn); + + /* apply scale and shift to input data. + * multiplying with scale results in 32 way 48-bit + * data to which shift is applied, so final result is + * 32 way 16 bit. 
+ */ + PACK_ROUND_U16(vecOut0, vecInData0, scale, shift); + /* Store output data */ + IVP_SAVNX16U_XP(vecOut0, vaOut, pvecOut, (varLen << 1)); + IVP_SAPOSNX16U_FP(vaOut, pvecOut); + } + } + } + } + return(XAI_ERROR_STATUS()); +} + +/********************* xaiDataConversion3D_S8I64 *****************************/ +/* Description : P6 implementation for conversion S8 to I64: each element is multiplied by scale, rounded and shifted right by shift, then widened to 64 bit */ +/* The lower limit applied before widening depends on Output Tile type: INT_MIN for XAI_S64, otherwise 0 */ +/* Inputs : Input Tile (S8), scale (unsigned 16bit multiplier), shift (checked to be less than 24) */ +/* Outputs : XAI Error Code, as produced by XAI_ERROR_STATUS() */ +/* InOuts : Output Tile (I64; checked to have the same size and data order as the Input Tile and not to overlap it) */ +/* Assumptions : InData is signed 8bit */ +/**************************************************************************/ +XAI_ERR_TYPE xaiDataConversion3D_S8I64(const xai_pTile3D inTile, + xai_pTile3D outTile, + const uint16_t scale, + const uint8_t shift) +{ + XAI_ERROR_CHECKS() + { + /* Argument validation, as named by the macros and messages below: tile types, DRAM boundary, equal sizes, no overlap, shift range, matching data order */ XAI_CHECK_TILE3D_S8(inTile); + XAI_CHECK_TILE3D_I64(outTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile); + XAI_CHECK_TILE3D_SIZE_EQ(inTile, outTile); + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTile); + XAI_CHECK_ERROR(shift < 24, XAI_ERR_NORM, \ + "Shift = %hhu, value should be less than 24", shift); + XAI_CHECK_ERROR(XAI_TILE3D_GET_DATA_ORDER(inTile) == XAI_TILE3D_GET_DATA_ORDER(outTile), XAI_ERR_BADARG, \ + "\nData Order of InputTile = %d, OutputTile = %d\nData Order of InputTile and OutputTile should be same", \ + XAI_TILE3D_GET_DATA_ORDER(inTile), XAI_TILE3D_GET_DATA_ORDER(outTile)); + } + + /* Tile sizes are read from inTile only; the pitches are read separately for inTile and outTile */ const int32_t dim1Size = XAI_TILE3D_GET_DIM1(inTile); + const int32_t dim2Size = XAI_TILE3D_GET_DIM2(inTile); + const int32_t dim3Size = XAI_TILE3D_GET_DIM3(inTile); + const int32_t inTilePitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile); + const int32_t inTilePitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile); + const int32_t outTilePitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile); + const int32_t outTilePitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile); + valign vaOut = IVP_ZALIGN(); + int8_t *pInput = (int8_t *) XAI_TILE3D_GET_DATA_PTR(inTile); + int64_t 
*pOutput = (int64_t *) XAI_TILE3D_GET_DATA_PTR(outTile); + const int32_t vectorizationWidth = XCHAL_IVPN_SIMD_WIDTH; + const int32_t vectorizationWidth2X = vectorizationWidth * 2; + const int32_t vectorizationWidth3X = vectorizationWidth * 3; + const int32_t vectorizationWidth4X = vectorizationWidth * 4; + const int32_t minLim = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S64)) ? INT_MIN : 0; + //S8 x U16 product, rounded and shifted back to S32 (the vecOutTemp* vectors). + //Even though the output type is S64, the data is within S32 range so INT_MIN is sufficient as the lower limit; 0 is used for any other output type. + int32_t x, y, z; + xb_vecNx8 *restrict pvecIn; + xb_vecN_2x64w *restrict pvecOut; + + + /******************************************************************************/ + /* The overall design approach is split into 2 parts */ + /* 1. When the dim1 pitch of both the input tile and the output tile is equal */ + /* to the dim1 size (the condition tested below) */ + /* - If the above condition holds, the data elements to be converted */ + /* from S8 to I64 are present in contiguous memory locations along */ + /* the first dimension. Hence vectorization can be utilized effectively */ + /* */ + /* 2. When the dim1 pitch of the input tile or of the output tile is not */ + /* equal to the dim1 size */ + /* - In this scenario, the data elements to be converted from S8 to I64 */ + /* exist in non-contiguous memory locations. */ + /* In order to do vectorization across the first dimension, the input and */ + /* output data pointers are recomputed for every row from the row index */ + /* and the dim1 pitch. 
*/ + /******************************************************************************/ + if ((inTilePitch1 == dim1Size) && (outTilePitch1 == dim1Size)) + { + /******************************************************************************/ + /* Data exist in contiguous memory location with respect to first dimension */ + /******************************************************************************/ + + /* input and output vectors */ + xb_vecNx16 vecInData; + int32_t dim3MaxLoopCount = dim3Size; + int32_t maxLoopCount = dim1Size * dim2Size; + /* When the dim2 pitch of both tiles equals dim1Size * dim2Size the whole tile is one contiguous run: do a single pass over all dim3Size planes */ if ((inTilePitch2 == maxLoopCount) && (outTilePitch2 == maxLoopCount)) + { + dim3MaxLoopCount = 1; + maxLoopCount *= dim3Size; + } + for (z = 0; z < dim3MaxLoopCount; z++) + { + xb_vecN_2x32v vecOutTempL, vecOutTempH; + xb_vecN_2x64w vecOutL, vecOutH; + pvecIn = (xb_vecNx8 *) (pInput + (z * inTilePitch2)); + pvecOut = (xb_vecN_2x64w *) (pOutput + (z * outTilePitch2)); + valign vaInData = IVP_LANX8S_PP(pvecIn); + int32_t varlen; + /* Main loop: full vectors only; at most vectorizationWidth elements are left for the variable length tail after the loop */ for (x = 0; x < maxLoopCount - vectorizationWidth; x += vectorizationWidth) + { + /* Load, multiply by scale, then for each half (L/H): shift by shift into a 32 bit vector, limit from below with minLim, and widen to 64 bit through a multiply by 1 (intrinsic semantics assumed from their names - TODO confirm against the ISA reference) */ IVP_LANX8S_IP(vecInData, vaInData, pvecIn); + xb_vecNx48 vecIntRes = IVP_MULUSNX16((xb_vecNx16U) scale, vecInData); + xb_vecN_2x64w vecOutIntm1 = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecIntRes), IVP_CVT64SNX48LL(vecIntRes)); + vecOutTempL = IVP_PACKVRN_2X64W(vecOutIntm1, shift); + vecOutTempL = IVP_MAXN_2X32(vecOutTempL, (xb_vecN_2x32v) minLim); + vecOutL = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempL); + xb_vecN_2x64w vecOutIntm2 = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecIntRes), IVP_CVT64SNX48HL(vecIntRes)); + vecOutTempH = IVP_PACKVRN_2X64W(vecOutIntm2, shift); + vecOutTempH = IVP_MAXN_2X32(vecOutTempH, (xb_vecN_2x32v) minLim); + vecOutH = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempH); + IVP_SAN_2X64W_IP(vecOutL, vaOut, pvecOut); + IVP_SAN_2X64W_IP(vecOutH, vaOut, pvecOut); + } + + /* Tail: the same computation for the remaining varlen elements, stored with explicit byte counts */ varlen = (maxLoopCount - x); + IVP_LANX8S_IP(vecInData, vaInData, pvecIn); + xb_vecNx48 vecIntRes = IVP_MULUSNX16((xb_vecNx16U) scale, 
vecInData); + xb_vecN_2x64w vecOutIntm1 = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecIntRes), IVP_CVT64SNX48LL(vecIntRes)); + vecOutTempL = IVP_PACKVRN_2X64W(vecOutIntm1, shift); + vecOutTempL = IVP_MAXN_2X32(vecOutTempL, (xb_vecN_2x32v) minLim); + vecOutL = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempL); + xb_vecN_2x64w vecOutIntm2 = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecIntRes), IVP_CVT64SNX48HL(vecIntRes)); + vecOutTempH = IVP_PACKVRN_2X64W(vecOutIntm2, shift); + vecOutTempH = IVP_MAXN_2X32(vecOutTempH, (xb_vecN_2x32v) minLim); + vecOutH = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempH); + /* Byte counts: varlen int64_t elements are (varlen << 3) bytes; the second store is given that count minus (XCHAL_IVPN_SIMD_WIDTH << 2) bytes, presumably the size of the first half vector, and is assumed to write nothing for a non-positive count - TODO confirm */ IVP_SAVN_2X64W_XP(vecOutL, vaOut, pvecOut, (varlen << 3)); + IVP_SAVN_2X64W_XP(vecOutH, vaOut, pvecOut, ((varlen << 3) - (XCHAL_IVPN_SIMD_WIDTH << 2))); + IVP_SAPOSN_2X64W_FP(vaOut, pvecOut); + } + } + else + { + /* Non-contiguous rows: dim1 is walked in chunks of up to four vectors and, for every chunk, all dim2Size rows are processed with pointers rebuilt from the pitches */ xb_vecNx16 vecInData0, vecInData1, vecInData2, vecInData3; + xb_vecN_2x64w vecOut0L, vecOut0H, vecOut1L, vecOut1H, vecOut2L, vecOut2H, vecOut3L, vecOut3H; + xb_vecN_2x32v vecOutTempL, vecOutTempH; + for (z = 0; z < dim3Size; z++) /* along 3rd dimension */ + { + x = 0; + /* Loop Unroll=4 along 1st dimension; the fourth vector of each chunk is stored with a variable length of varLen elements */ for (; x < (dim1Size - vectorizationWidth3X); x += vectorizationWidth4X) + { + int8_t * pIn = &pInput[z * inTilePitch2 + x]; + int64_t *pOut = &pOutput[z * outTilePitch2 + x]; + int32_t varLen = dim1Size - (x + vectorizationWidth3X); + for (y = 0; y < dim2Size; y++) /* along 2nd dimension */ + { + pvecIn = (xb_vecNx8 *) (pIn + (y * inTilePitch1)); + pvecOut = (xb_vecN_2x64w *) (pOut + (y * outTilePitch1)); + valign vaInData = IVP_LANX8S_PP(pvecIn); + IVP_LANX8S_IP(vecInData0, vaInData, pvecIn); + IVP_LANX8S_IP(vecInData1, vaInData, pvecIn); + IVP_LANX8S_IP(vecInData2, vaInData, pvecIn); + IVP_LANX8S_IP(vecInData3, vaInData, pvecIn); + xb_vecNx48 vecIntRes0 = IVP_MULUSNX16((xb_vecNx16U) scale, vecInData0); + xb_vecNx48 vecIntRes1 = IVP_MULUSNX16((xb_vecNx16U) scale, vecInData1); + xb_vecNx48 vecIntRes2 = IVP_MULUSNX16((xb_vecNx16U) scale, vecInData2); + xb_vecNx48 vecIntRes3 = 
IVP_MULUSNX16((xb_vecNx16U) scale, vecInData3); + vecOut0L = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecIntRes0), IVP_CVT64SNX48LL(vecIntRes0)); + vecOut0H = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecIntRes0), IVP_CVT64SNX48HL(vecIntRes0)); + vecOut1L = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecIntRes1), IVP_CVT64SNX48LL(vecIntRes1)); + vecOut1H = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecIntRes1), IVP_CVT64SNX48HL(vecIntRes1)); + vecOut2L = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecIntRes2), IVP_CVT64SNX48LL(vecIntRes2)); + vecOut2H = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecIntRes2), IVP_CVT64SNX48HL(vecIntRes2)); + vecOut3L = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecIntRes3), IVP_CVT64SNX48LL(vecIntRes3)); + vecOut3H = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecIntRes3), IVP_CVT64SNX48HL(vecIntRes3)); + vecOutTempL = IVP_PACKVRN_2X64W(vecOut0L, shift); + vecOutTempL = IVP_MAXN_2X32(vecOutTempL, (xb_vecN_2x32v) minLim); + vecOut0L = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempL); + vecOutTempH = IVP_PACKVRN_2X64W(vecOut0H, shift); + vecOutTempH = IVP_MAXN_2X32(vecOutTempH, (xb_vecN_2x32v) minLim); + vecOut0H = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempH); + vecOutTempL = IVP_PACKVRN_2X64W(vecOut1L, shift); + vecOutTempL = IVP_MAXN_2X32(vecOutTempL, (xb_vecN_2x32v) minLim); + vecOut1L = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempL); + vecOutTempH = IVP_PACKVRN_2X64W(vecOut1H, shift); + vecOutTempH = IVP_MAXN_2X32(vecOutTempH, (xb_vecN_2x32v) minLim); + vecOut1H = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempH); + vecOutTempL = IVP_PACKVRN_2X64W(vecOut2L, shift); + vecOutTempL = IVP_MAXN_2X32(vecOutTempL, (xb_vecN_2x32v) minLim); + vecOut2L = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempL); + vecOutTempH = IVP_PACKVRN_2X64W(vecOut2H, shift); + vecOutTempH = IVP_MAXN_2X32(vecOutTempH, (xb_vecN_2x32v) minLim); + vecOut2H = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempH); + vecOutTempL = IVP_PACKVRN_2X64W(vecOut3L, shift); + vecOutTempL = IVP_MAXN_2X32(vecOutTempL, (xb_vecN_2x32v) 
minLim); + vecOut3L = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempL); + vecOutTempH = IVP_PACKVRN_2X64W(vecOut3H, shift); + vecOutTempH = IVP_MAXN_2X32(vecOutTempH, (xb_vecN_2x32v) minLim); + vecOut3H = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempH); + IVP_SAN_2X64W_IP(vecOut0L, vaOut, pvecOut); + IVP_SAN_2X64W_IP(vecOut0H, vaOut, pvecOut); + IVP_SAN_2X64W_IP(vecOut1L, vaOut, pvecOut); + IVP_SAN_2X64W_IP(vecOut1H, vaOut, pvecOut); + IVP_SAN_2X64W_IP(vecOut2L, vaOut, pvecOut); + IVP_SAN_2X64W_IP(vecOut2H, vaOut, pvecOut); + IVP_SAVN_2X64W_XP(vecOut3L, vaOut, pvecOut, (varLen << 3)); + IVP_SAVN_2X64W_XP(vecOut3H, vaOut, pvecOut, ((varLen << 3) - (XCHAL_IVPN_SIMD_WIDTH << 2))); + IVP_SAPOSN_2X64W_FP(vaOut, pvecOut); + } + } + /* Remainder of dim1 after the unrolled loop, case 1: three vectors per row, the third stored with variable length */ if (x < (dim1Size - vectorizationWidth2X)) + { + int8_t * pIn = &pInput[z * inTilePitch2 + x]; + int64_t *pOut = &pOutput[z * outTilePitch2 + x]; + int32_t varLen = dim1Size - (x + vectorizationWidth2X); + for (y = 0; y < dim2Size; y++) /* along 2nd dimension */ + { + pvecIn = (xb_vecNx8 *) (pIn + (y * inTilePitch1)); + pvecOut = (xb_vecN_2x64w *) (pOut + (y * outTilePitch1)); + valign vaInData = IVP_LANX8S_PP(pvecIn); + IVP_LANX8S_IP(vecInData0, vaInData, pvecIn); + IVP_LANX8S_IP(vecInData1, vaInData, pvecIn); + IVP_LANX8S_IP(vecInData2, vaInData, pvecIn); + xb_vecNx48 vecIntRes0 = IVP_MULUSNX16((xb_vecNx16U) scale, vecInData0); + xb_vecNx48 vecIntRes1 = IVP_MULUSNX16((xb_vecNx16U) scale, vecInData1); + xb_vecNx48 vecIntRes2 = IVP_MULUSNX16((xb_vecNx16U) scale, vecInData2); + vecOut0L = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecIntRes0), IVP_CVT64SNX48LL(vecIntRes0)); + vecOut0H = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecIntRes0), IVP_CVT64SNX48HL(vecIntRes0)); + vecOut1L = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecIntRes1), IVP_CVT64SNX48LL(vecIntRes1)); + vecOut1H = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecIntRes1), IVP_CVT64SNX48HL(vecIntRes1)); + vecOut2L = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecIntRes2), IVP_CVT64SNX48LL(vecIntRes2)); + vecOut2H = 
IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecIntRes2), IVP_CVT64SNX48HL(vecIntRes2)); + vecOutTempL = IVP_PACKVRN_2X64W(vecOut0L, shift); + vecOutTempL = IVP_MAXN_2X32(vecOutTempL, (xb_vecN_2x32v) minLim); + vecOut0L = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempL); + vecOutTempH = IVP_PACKVRN_2X64W(vecOut0H, shift); + vecOutTempH = IVP_MAXN_2X32(vecOutTempH, (xb_vecN_2x32v) minLim); + vecOut0H = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempH); + vecOutTempL = IVP_PACKVRN_2X64W(vecOut1L, shift); + vecOutTempL = IVP_MAXN_2X32(vecOutTempL, (xb_vecN_2x32v) minLim); + vecOut1L = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempL); + vecOutTempH = IVP_PACKVRN_2X64W(vecOut1H, shift); + vecOutTempH = IVP_MAXN_2X32(vecOutTempH, (xb_vecN_2x32v) minLim); + vecOut1H = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempH); + vecOutTempL = IVP_PACKVRN_2X64W(vecOut2L, shift); + vecOutTempL = IVP_MAXN_2X32(vecOutTempL, (xb_vecN_2x32v) minLim); + vecOut2L = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempL); + vecOutTempH = IVP_PACKVRN_2X64W(vecOut2H, shift); + vecOutTempH = IVP_MAXN_2X32(vecOutTempH, (xb_vecN_2x32v) minLim); + vecOut2H = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempH); + IVP_SAN_2X64W_IP(vecOut0L, vaOut, pvecOut); + IVP_SAN_2X64W_IP(vecOut0H, vaOut, pvecOut); + IVP_SAN_2X64W_IP(vecOut1L, vaOut, pvecOut); + IVP_SAN_2X64W_IP(vecOut1H, vaOut, pvecOut); + IVP_SAVN_2X64W_XP(vecOut2L, vaOut, pvecOut, (varLen << 3)); + IVP_SAVN_2X64W_XP(vecOut2H, vaOut, pvecOut, ((varLen << 3) - (XCHAL_IVPN_SIMD_WIDTH << 2))); + IVP_SAPOSN_2X64W_FP(vaOut, pvecOut); + } + } + /* Remainder case 2: two vectors per row, the second stored with variable length */ else if (x < (dim1Size - vectorizationWidth)) + { + int8_t * pIn = &pInput[z * inTilePitch2 + x]; + int64_t *pOut = &pOutput[z * outTilePitch2 + x]; + int32_t varLen = dim1Size - (x + vectorizationWidth); + for (y = 0; y < dim2Size; y++) /* along 2nd dimension */ + { + pvecIn = (xb_vecNx8 *) (pIn + (y * inTilePitch1)); + pvecOut = (xb_vecN_2x64w *) (pOut + (y * outTilePitch1)); + valign vaInData = 
IVP_LANX8S_PP(pvecIn); + IVP_LANX8S_IP(vecInData0, vaInData, pvecIn); + IVP_LANX8S_IP(vecInData1, vaInData, pvecIn); + xb_vecNx48 vecIntRes0 = IVP_MULUSNX16((xb_vecNx16U) scale, vecInData0); + xb_vecNx48 vecIntRes1 = IVP_MULUSNX16((xb_vecNx16U) scale, vecInData1); + vecOut0L = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecIntRes0), IVP_CVT64SNX48LL(vecIntRes0)); + vecOut0H = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecIntRes0), IVP_CVT64SNX48HL(vecIntRes0)); + vecOut1L = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecIntRes1), IVP_CVT64SNX48LL(vecIntRes1)); + vecOut1H = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecIntRes1), IVP_CVT64SNX48HL(vecIntRes1)); + vecOutTempL = IVP_PACKVRN_2X64W(vecOut0L, shift); + vecOutTempL = IVP_MAXN_2X32(vecOutTempL, (xb_vecN_2x32v) minLim); + vecOut0L = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempL); + vecOutTempH = IVP_PACKVRN_2X64W(vecOut0H, shift); + vecOutTempH = IVP_MAXN_2X32(vecOutTempH, (xb_vecN_2x32v) minLim); + vecOut0H = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempH); + vecOutTempL = IVP_PACKVRN_2X64W(vecOut1L, shift); + vecOutTempL = IVP_MAXN_2X32(vecOutTempL, (xb_vecN_2x32v) minLim); + vecOut1L = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempL); + vecOutTempH = IVP_PACKVRN_2X64W(vecOut1H, shift); + vecOutTempH = IVP_MAXN_2X32(vecOutTempH, (xb_vecN_2x32v) minLim); + vecOut1H = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempH); + IVP_SAN_2X64W_IP(vecOut0L, vaOut, pvecOut); + IVP_SAN_2X64W_IP(vecOut0H, vaOut, pvecOut); + IVP_SAVN_2X64W_XP(vecOut1L, vaOut, pvecOut, (varLen << 3)); + IVP_SAVN_2X64W_XP(vecOut1H, vaOut, pvecOut, ((varLen << 3) - (XCHAL_IVPN_SIMD_WIDTH << 2))); + IVP_SAPOSN_2X64W_FP(vaOut, pvecOut); + } + } + /* Remainder case 3: a single variable length vector per row */ else if (x < dim1Size) + { + int8_t * pIn = &pInput[z * inTilePitch2 + x]; + int64_t *pOut = &pOutput[z * outTilePitch2 + x]; + int32_t varLen = dim1Size - x; + for (y = 0; y < dim2Size; y++) /* along 2nd dimension */ + { + pvecIn = (xb_vecNx8 *) (pIn + (y * inTilePitch1)); + pvecOut = (xb_vecN_2x64w *) (pOut + (y * 
outTilePitch1)); + valign vaInData = IVP_LANX8S_PP(pvecIn); + IVP_LANX8S_IP(vecInData0, vaInData, pvecIn); + xb_vecNx48 vecIntRes0 = IVP_MULUSNX16((xb_vecNx16U) scale, vecInData0); + vecOut0L = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecIntRes0), IVP_CVT64SNX48LL(vecIntRes0)); + vecOut0H = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecIntRes0), IVP_CVT64SNX48HL(vecIntRes0)); + vecOutTempL = IVP_PACKVRN_2X64W(vecOut0L, shift); + vecOutTempL = IVP_MAXN_2X32(vecOutTempL, (xb_vecN_2x32v) minLim); + vecOut0L = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempL); + vecOutTempH = IVP_PACKVRN_2X64W(vecOut0H, shift); + vecOutTempH = IVP_MAXN_2X32(vecOutTempH, (xb_vecN_2x32v) minLim); + vecOut0H = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempH); + IVP_SAVN_2X64W_XP(vecOut0L, vaOut, pvecOut, (varLen << 3)); + IVP_SAVN_2X64W_XP(vecOut0H, vaOut, pvecOut, ((varLen << 3) - (XCHAL_IVPN_SIMD_WIDTH << 2))); + IVP_SAPOSN_2X64W_FP(vaOut, pvecOut); + } + } + } + } + return(XAI_ERROR_STATUS()); +} + +/********************* xaiDataConversion3D_U8I64 *****************************/ +/* Description : P6 implementation for conversion U8 to I64 */ +/* depending on Output Tile type */ +/* Inputs : Input Tile, scale, shift */ +/* Outputs : XI Error Code */ +/* InOuts : Output Tile */ +/* Assumptions : InData is unsigned 8bit */ +/**************************************************************************/ +XAI_ERR_TYPE xaiDataConversion3D_U8I64(const xai_pTile3D inTile, + xai_pTile3D outTile, + const uint16_t scale, + const uint8_t shift) +{ + XAI_ERROR_CHECKS() + { + XAI_CHECK_TILE3D_U8(inTile); + XAI_CHECK_TILE3D_I64(outTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile); + XAI_CHECK_TILE3D_SIZE_EQ(inTile, outTile); + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTile); + XAI_CHECK_ERROR(shift < 24, XAI_ERR_NORM, \ + "Shift = %hhu, value should be less than 24", shift); + XAI_CHECK_ERROR(XAI_TILE3D_GET_DATA_ORDER(inTile) == XAI_TILE3D_GET_DATA_ORDER(outTile), 
XAI_ERR_BADARG, \ + "\nData Order of InputTile = %d, OutputTile = %d\nData Order of InputTile and OutputTile should be same", \ + XAI_TILE3D_GET_DATA_ORDER(inTile), XAI_TILE3D_GET_DATA_ORDER(outTile)); + } + /* Get Tile Parameters: sizes are read from inTile only, pitches separately for inTile and outTile */ const int32_t dim1Size = XAI_TILE3D_GET_DIM1(inTile); + const int32_t dim2Size = XAI_TILE3D_GET_DIM2(inTile); + const int32_t dim3Size = XAI_TILE3D_GET_DIM3(inTile); + const int32_t inTilePitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile); + const int32_t inTilePitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile); + const int32_t outTilePitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile); + const int32_t outTilePitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile); + valign vaOut = IVP_ZALIGN(); + + /* Get Data Pointers */ + uint8_t *pInput = (uint8_t *) XAI_TILE3D_GET_DATA_PTR(inTile); + int64_t *pOutput = (int64_t *) XAI_TILE3D_GET_DATA_PTR(outTile); + + /* vectorization width */ + const int32_t vectorizationWidth = XCHAL_IVPN_SIMD_WIDTH; + const int32_t vectorizationWidth2X = vectorizationWidth * 2; + const int32_t vectorizationWidth3X = vectorizationWidth * 3; + const int32_t vectorizationWidth4X = vectorizationWidth * 4; + + /* loop variables */ + int32_t x, y, z; + + /* input and output pointers */ + xb_vecNx8U *restrict pvecIn; + xb_vecN_2x64w *restrict pvecOut; + + + /******************************************************************************/ + /* The overall design approach is split into 2 parts */ + /* 1. When the dim1 pitch of both the input tile and the output tile is equal */ + /* to the dim1 size (the condition tested below) */ + /* - If the above condition holds, the data elements to be converted */ + /* from U8 to I64 are present in contiguous memory locations along */ + /* the first dimension. Hence vectorization can be utilized effectively */ + /* */ + /* 2. 
When the dim1 pitch of the input tile or of the output tile is not */ + /* equal to the dim1 size */ + /* - In this scenario, the data elements to be converted from U8 to I64 */ + /* exist in non-contiguous memory locations. */ + /* In order to do vectorization across the first dimension, the input and */ + /* output data pointers are recomputed for every row from the row index */ + /* and the dim1 pitch. */ + /******************************************************************************/ + if ((inTilePitch1 == dim1Size) && (outTilePitch1 == dim1Size)) + { + /******************************************************************************/ + /* Data exist in contiguous memory location with respect to first dimension */ + /******************************************************************************/ + + /* input and output vectors */ + xb_vecNx16U vecInData; + + /* Initialize max loop counter */ + int32_t dim3MaxLoopCount = dim3Size; + int32_t maxLoopCount = dim1Size * dim2Size; + + /* Updated Loop count based on tile dimension configuration */ + if ((inTilePitch2 == maxLoopCount) && (outTilePitch2 == maxLoopCount)) + { + /**********************************************************************/ + /* Data exist in contiguous memory location with respect to first and */ + /* second dimension */ + /**********************************************************************/ + + /* Update max loop counter */ + dim3MaxLoopCount = 1; + maxLoopCount *= dim3Size; + } + for (z = 0; z < dim3MaxLoopCount; z++) + { + xb_vecN_2x32v vecOutTempL, vecOutTempH; + xb_vecN_2x64w vecOutL, vecOutH; + /* initialize input and output data pointer */ + pvecIn = (xb_vecNx8U *) (pInput + (z * inTilePitch2)); + pvecOut = (xb_vecN_2x64w *) (pOutput + (z * outTilePitch2)); + valign vaInData = IVP_LANX8U_PP(pvecIn); + int32_t varlen; + + /* Main loop: full vectors only; at most vectorizationWidth elements are left for the variable length tail after the loop */ for (x = 0; x < maxLoopCount - vectorizationWidth; x += vectorizationWidth) + { + /* Load input data */ + IVP_LANX8U_IP(vecInData, vaInData, 
pvecIn); + + /* No lower limit is applied in this function (no minLim and no IVP_MAXN_2X32 step): both multiply operands are unsigned, so the product is presumably never negative - TODO confirm */ xb_vecNx48 vecIntRes = IVP_MULUUNX16((xb_vecNx16U) scale, vecInData); + xb_vecN_2x64w vecOutIntm1 = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecIntRes), IVP_CVT64SNX48LL(vecIntRes)); + vecOutTempL = IVP_PACKVRN_2X64W(vecOutIntm1, shift); + //sign extending to 64bit + vecOutL = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempL); + + xb_vecN_2x64w vecOutIntm2 = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecIntRes), IVP_CVT64SNX48HL(vecIntRes)); + vecOutTempH = IVP_PACKVRN_2X64W(vecOutIntm2, shift); + //sign extending to 64bit + vecOutH = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempH); + + IVP_SAN_2X64W_IP(vecOutL, vaOut, pvecOut); + IVP_SAN_2X64W_IP(vecOutH, vaOut, pvecOut); + } + /* Tail: the same computation for the remaining varlen elements, stored with explicit byte counts */ varlen = (maxLoopCount - x); + IVP_LANX8U_IP(vecInData, vaInData, pvecIn); + + xb_vecNx48 vecIntRes = IVP_MULUUNX16((xb_vecNx16U) scale, vecInData); + xb_vecN_2x64w vecOutIntm1 = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecIntRes), IVP_CVT64SNX48LL(vecIntRes)); + vecOutTempL = IVP_PACKVRN_2X64W(vecOutIntm1, shift); + //sign extending to 64bit + vecOutL = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempL); + + xb_vecN_2x64w vecOutIntm2 = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecIntRes), IVP_CVT64SNX48HL(vecIntRes)); + vecOutTempH = IVP_PACKVRN_2X64W(vecOutIntm2, shift); + vecOutH = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempH); + + /* store output data: varlen int64_t elements are (varlen << 3) bytes; the second store is given that count minus (XCHAL_IVPN_SIMD_WIDTH << 2) bytes, presumably the size of the first half vector - TODO confirm */ + IVP_SAVN_2X64W_XP(vecOutL, vaOut, pvecOut, (varlen << 3)); + IVP_SAVN_2X64W_XP(vecOutH, vaOut, pvecOut, ((varlen << 3) - (XCHAL_IVPN_SIMD_WIDTH << 2))); + IVP_SAPOSN_2X64W_FP(vaOut, pvecOut); + } + } + else + { + /* else block is executed if the dim1 pitch of the input tile or of the output tile is not equal to the tile width (dim1Size). */ + /* NOTE(review): the input vectors below are declared as signed xb_vecNx16 although they are filled by IVP_LANX8U_IP and passed to IVP_MULUUNX16, while the contiguous path above uses xb_vecNx16U - confirm this is intentional */ + xb_vecNx16 vecInData0, vecInData1, vecInData2, vecInData3; + xb_vecN_2x64w vecOut0L, vecOut0H, vecOut1L, vecOut1H, vecOut2L, vecOut2H, vecOut3L, vecOut3H; + xb_vecN_2x32v vecOutTempL, vecOutTempH; + + for (z = 0; z < dim3Size; z++) /* along 3rd dimension */ + { + x = 0; + /* Loop Unroll=4 
along 1st dimension; the fourth vector of each chunk is stored with a variable length of varLen elements */ + for (; x < (dim1Size - vectorizationWidth3X); x += vectorizationWidth4X) + { + /* Initialize input and output data pointer */ + uint8_t * pIn = &pInput[z * inTilePitch2 + x]; + int64_t *pOut = &pOutput[z * outTilePitch2 + x]; + int32_t varLen = dim1Size - (x + vectorizationWidth3X); + + for (y = 0; y < dim2Size; y++) /* along 2nd dimension */ + { + pvecIn = (xb_vecNx8U *) (pIn + (y * inTilePitch1)); + pvecOut = (xb_vecN_2x64w *) (pOut + (y * outTilePitch1)); + + valign vaInData = IVP_LANX8U_PP(pvecIn); + /* load input data */ + IVP_LANX8U_IP(vecInData0, vaInData, pvecIn); + IVP_LANX8U_IP(vecInData1, vaInData, pvecIn); + IVP_LANX8U_IP(vecInData2, vaInData, pvecIn); + IVP_LANX8U_IP(vecInData3, vaInData, pvecIn); + + xb_vecNx48 vecIntRes0 = IVP_MULUUNX16((xb_vecNx16U) scale, vecInData0); + xb_vecNx48 vecIntRes1 = IVP_MULUUNX16((xb_vecNx16U) scale, vecInData1); + xb_vecNx48 vecIntRes2 = IVP_MULUUNX16((xb_vecNx16U) scale, vecInData2); + xb_vecNx48 vecIntRes3 = IVP_MULUUNX16((xb_vecNx16U) scale, vecInData3); + + vecOut0L = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecIntRes0), IVP_CVT64SNX48LL(vecIntRes0)); + vecOut0H = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecIntRes0), IVP_CVT64SNX48HL(vecIntRes0)); + vecOut1L = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecIntRes1), IVP_CVT64SNX48LL(vecIntRes1)); + vecOut1H = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecIntRes1), IVP_CVT64SNX48HL(vecIntRes1)); + vecOut2L = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecIntRes2), IVP_CVT64SNX48LL(vecIntRes2)); + vecOut2H = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecIntRes2), IVP_CVT64SNX48HL(vecIntRes2)); + vecOut3L = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecIntRes3), IVP_CVT64SNX48LL(vecIntRes3)); + vecOut3H = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecIntRes3), IVP_CVT64SNX48HL(vecIntRes3)); + + vecOutTempL = IVP_PACKVRN_2X64W(vecOut0L, shift); + //sign extending to 64bit + vecOut0L = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempL); + vecOutTempH = IVP_PACKVRN_2X64W(vecOut0H, shift); + //sign extending to 64bit 
+ vecOut0H = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempH); + + vecOutTempL = IVP_PACKVRN_2X64W(vecOut1L, shift); + //sign extending to 64bit + vecOut1L = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempL); + vecOutTempH = IVP_PACKVRN_2X64W(vecOut1H, shift); + //sign extending to 64bit + vecOut1H = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempH); + + vecOutTempL = IVP_PACKVRN_2X64W(vecOut2L, shift); + //sign extending to 64bit + vecOut2L = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempL); + vecOutTempH = IVP_PACKVRN_2X64W(vecOut2H, shift); + //sign extending to 64bit + vecOut2H = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempH); + + vecOutTempL = IVP_PACKVRN_2X64W(vecOut3L, shift); + //sign extending to 64bit + vecOut3L = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempL); + vecOutTempH = IVP_PACKVRN_2X64W(vecOut3H, shift); + //sign extending to 64bit + vecOut3H = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempH); + + + /* Store output data */ + IVP_SAN_2X64W_IP(vecOut0L, vaOut, pvecOut); + IVP_SAN_2X64W_IP(vecOut0H, vaOut, pvecOut); + IVP_SAN_2X64W_IP(vecOut1L, vaOut, pvecOut); + IVP_SAN_2X64W_IP(vecOut1H, vaOut, pvecOut); + IVP_SAN_2X64W_IP(vecOut2L, vaOut, pvecOut); + IVP_SAN_2X64W_IP(vecOut2H, vaOut, pvecOut); + IVP_SAVN_2X64W_XP(vecOut3L, vaOut, pvecOut, (varLen << 3)); + IVP_SAVN_2X64W_XP(vecOut3H, vaOut, pvecOut, ((varLen << 3) - (XCHAL_IVPN_SIMD_WIDTH << 2))); + IVP_SAPOSN_2X64W_FP(vaOut, pvecOut); + } + } + /* Remainder of dim1 after the unrolled loop, case 1: three vectors per row, the third stored with variable length */ if (x < (dim1Size - vectorizationWidth2X)) + { + /* Initialize input and output data pointer */ + uint8_t * pIn = &pInput[z * inTilePitch2 + x]; + int64_t *pOut = &pOutput[z * outTilePitch2 + x]; + int32_t varLen = dim1Size - (x + vectorizationWidth2X); + + for (y = 0; y < dim2Size; y++) /* along 2nd dimension */ + { + pvecIn = (xb_vecNx8U *) (pIn + (y * inTilePitch1)); + pvecOut = (xb_vecN_2x64w *) (pOut + (y * outTilePitch1)); + + valign vaInData = IVP_LANX8U_PP(pvecIn); + /* load input data */ + IVP_LANX8U_IP(vecInData0, vaInData, 
pvecIn); + IVP_LANX8U_IP(vecInData1, vaInData, pvecIn); + IVP_LANX8U_IP(vecInData2, vaInData, pvecIn); + + xb_vecNx48 vecIntRes0 = IVP_MULUUNX16((xb_vecNx16U) scale, vecInData0); + xb_vecNx48 vecIntRes1 = IVP_MULUUNX16((xb_vecNx16U) scale, vecInData1); + xb_vecNx48 vecIntRes2 = IVP_MULUUNX16((xb_vecNx16U) scale, vecInData2); + + vecOut0L = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecIntRes0), IVP_CVT64SNX48LL(vecIntRes0)); + vecOut0H = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecIntRes0), IVP_CVT64SNX48HL(vecIntRes0)); + vecOut1L = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecIntRes1), IVP_CVT64SNX48LL(vecIntRes1)); + vecOut1H = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecIntRes1), IVP_CVT64SNX48HL(vecIntRes1)); + vecOut2L = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecIntRes2), IVP_CVT64SNX48LL(vecIntRes2)); + vecOut2H = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecIntRes2), IVP_CVT64SNX48HL(vecIntRes2)); + + vecOutTempL = IVP_PACKVRN_2X64W(vecOut0L, shift); + //sign extending to 64bit + vecOut0L = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempL); + vecOutTempH = IVP_PACKVRN_2X64W(vecOut0H, shift); + //sign extending to 64bit + vecOut0H = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempH); + + vecOutTempL = IVP_PACKVRN_2X64W(vecOut1L, shift); + //sign extending to 64bit + vecOut1L = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempL); + vecOutTempH = IVP_PACKVRN_2X64W(vecOut1H, shift); + //sign extending to 64bit + vecOut1H = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempH); + + vecOutTempL = IVP_PACKVRN_2X64W(vecOut2L, shift); + //sign extending to 64bit + vecOut2L = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempL); + vecOutTempH = IVP_PACKVRN_2X64W(vecOut2H, shift); + //sign extending to 64bit + vecOut2H = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempH); + + + /* Store output data */ + IVP_SAN_2X64W_IP(vecOut0L, vaOut, pvecOut); + IVP_SAN_2X64W_IP(vecOut0H, vaOut, pvecOut); + IVP_SAN_2X64W_IP(vecOut1L, vaOut, pvecOut); + IVP_SAN_2X64W_IP(vecOut1H, vaOut, pvecOut); + IVP_SAVN_2X64W_XP(vecOut2L, 
vaOut, pvecOut, (varLen << 3)); + IVP_SAVN_2X64W_XP(vecOut2H, vaOut, pvecOut, ((varLen << 3) - (XCHAL_IVPN_SIMD_WIDTH << 2))); + IVP_SAPOSN_2X64W_FP(vaOut, pvecOut); + } + } + /* Remainder case 2: two vectors per row, the second stored with variable length */ else if (x < (dim1Size - vectorizationWidth)) + { + /* Initialize input and output data pointer */ + uint8_t * pIn = &pInput[z * inTilePitch2 + x]; + int64_t *pOut = &pOutput[z * outTilePitch2 + x]; + int32_t varLen = dim1Size - (x + vectorizationWidth); + + for (y = 0; y < dim2Size; y++) /* along 2nd dimension */ + { + pvecIn = (xb_vecNx8U *) (pIn + (y * inTilePitch1)); + pvecOut = (xb_vecN_2x64w *) (pOut + (y * outTilePitch1)); + + valign vaInData = IVP_LANX8U_PP(pvecIn); + /* load input data */ + IVP_LANX8U_IP(vecInData0, vaInData, pvecIn); + IVP_LANX8U_IP(vecInData1, vaInData, pvecIn); + + xb_vecNx48 vecIntRes0 = IVP_MULUUNX16((xb_vecNx16U) scale, vecInData0); + xb_vecNx48 vecIntRes1 = IVP_MULUUNX16((xb_vecNx16U) scale, vecInData1); + + vecOut0L = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecIntRes0), IVP_CVT64SNX48LL(vecIntRes0)); + vecOut0H = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecIntRes0), IVP_CVT64SNX48HL(vecIntRes0)); + vecOut1L = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecIntRes1), IVP_CVT64SNX48LL(vecIntRes1)); + vecOut1H = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecIntRes1), IVP_CVT64SNX48HL(vecIntRes1)); + + + vecOutTempL = IVP_PACKVRN_2X64W(vecOut0L, shift); + //sign extending to 64bit + vecOut0L = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempL); + vecOutTempH = IVP_PACKVRN_2X64W(vecOut0H, shift); + //sign extending to 64bit + vecOut0H = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempH); + + vecOutTempL = IVP_PACKVRN_2X64W(vecOut1L, shift); + //sign extending to 64bit + vecOut1L = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempL); + vecOutTempH = IVP_PACKVRN_2X64W(vecOut1H, shift); + //sign extending to 64bit + vecOut1H = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempH); + + + /* Store output data */ + IVP_SAN_2X64W_IP(vecOut0L, vaOut, pvecOut); + IVP_SAN_2X64W_IP(vecOut0H, vaOut, pvecOut); + 
IVP_SAVN_2X64W_XP(vecOut1L, vaOut, pvecOut, (varLen << 3)); + IVP_SAVN_2X64W_XP(vecOut1H, vaOut, pvecOut, ((varLen << 3) - (XCHAL_IVPN_SIMD_WIDTH << 2))); + IVP_SAPOSN_2X64W_FP(vaOut, pvecOut); + } + } + /* Remainder case 3: a single variable length vector per row */ else if (x < dim1Size) + { + /* Initialize input and output data pointer */ + uint8_t * pIn = &pInput[z * inTilePitch2 + x]; + int64_t *pOut = &pOutput[z * outTilePitch2 + x]; + int32_t varLen = dim1Size - x;; /* NOTE(review): the second semicolon is a stray empty statement */ + + for (y = 0; y < dim2Size; y++) /* along 2nd dimension */ + { + pvecIn = (xb_vecNx8U *) (pIn + (y * inTilePitch1)); + pvecOut = (xb_vecN_2x64w *) (pOut + (y * outTilePitch1)); + + valign vaInData = IVP_LANX8U_PP(pvecIn); + /* load input data */ + IVP_LANX8U_IP(vecInData0, vaInData, pvecIn); + + xb_vecNx48 vecIntRes0 = IVP_MULUUNX16((xb_vecNx16U) scale, vecInData0); + + vecOut0L = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecIntRes0), IVP_CVT64SNX48LL(vecIntRes0)); + vecOut0H = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecIntRes0), IVP_CVT64SNX48HL(vecIntRes0)); + + vecOutTempL = IVP_PACKVRN_2X64W(vecOut0L, shift); + //sign extending to 64bit + vecOut0L = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempL); + vecOutTempH = IVP_PACKVRN_2X64W(vecOut0H, shift); + //sign extending to 64bit + vecOut0H = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempH); + + /* Store output data */ + IVP_SAVN_2X64W_XP(vecOut0L, vaOut, pvecOut, (varLen << 3)); + IVP_SAVN_2X64W_XP(vecOut0H, vaOut, pvecOut, ((varLen << 3) - (XCHAL_IVPN_SIMD_WIDTH << 2))); + IVP_SAPOSN_2X64W_FP(vaOut, pvecOut); + } + } + } + } + return(XAI_ERROR_STATUS()); +} + +/********************* xaiDataConversion3D_S16I64 *****************************/ +/* Description : P6 implementation for conversion S16 to I64 */ +/* depending on Output Tile type */ +/* Inputs : Input Tile, scale, shift */ +/* Outputs : XI Error Code */ +/* InOuts : Output Tile */ +/* Assumptions : InData is signed 16bit */ +/**************************************************************************/ +XAI_ERR_TYPE xaiDataConversion3D_S16I64(const 
xai_pTile3D inTile, + xai_pTile3D outTile, + const uint16_t scale, + const uint8_t shift) +{ + /* Error Checks */ + XAI_ERROR_CHECKS() + { + XAI_CHECK_TILE3D_S16(inTile); + XAI_CHECK_TILE3D_I64(outTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile); + XAI_CHECK_TILE3D_SIZE_EQ(inTile, outTile); + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTile); + XAI_CHECK_ERROR(shift < 24, XAI_ERR_NORM, \ + "Shift = %hhu, value should be less than 24", shift); + XAI_CHECK_ERROR(XAI_TILE3D_GET_DATA_ORDER(inTile) == XAI_TILE3D_GET_DATA_ORDER(outTile), XAI_ERR_BADARG, \ + "\nData Order of InputTile = %d, OutputTile = %d\nData Order of InputTile and OutputTile should be same", \ + XAI_TILE3D_GET_DATA_ORDER(inTile), XAI_TILE3D_GET_DATA_ORDER(outTile)); + } + + /* Get Tile Parameters */ + const int32_t dim1Size = XAI_TILE3D_GET_DIM1(inTile); + const int32_t dim2Size = XAI_TILE3D_GET_DIM2(inTile); + const int32_t dim3Size = XAI_TILE3D_GET_DIM3(inTile); + const int32_t inTilePitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile); + const int32_t inTilePitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile); + const int32_t outTilePitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile); + const int32_t outTilePitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile); + + valign vaOut = IVP_ZALIGN(); + + /* Get Data Pointers */ + int16_t *pInput = (int16_t *) XAI_TILE3D_GET_DATA_PTR(inTile); + int64_t *pOutput = (int64_t *) XAI_TILE3D_GET_DATA_PTR(outTile); + + /* vectorization width */ + const int32_t vectorizationWidth = XCHAL_IVPN_SIMD_WIDTH; + const int32_t vectorizationWidth2X = vectorizationWidth * 2; + const int32_t vectorizationWidth3X = vectorizationWidth * 3; + const int32_t vectorizationWidth4X = vectorizationWidth * 4; + const int32_t minLim = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S64)) ? INT_MIN : 0; + //S16 x U16 = S32 , rounded and shifted back to S32. + //even though the output type is S64. Data is within S32 range so INT_MIN is sufficient. 
+ /* loop variables */ + int32_t x, y, z; + + /* input and output pointers */ + xb_vecNx16 *restrict pvecIn; + xb_vecN_2x64w *restrict pvecOut; + + + /******************************************************************************/ + /* The overall design approach is split into 2 parts */ + /* 1. When input tile pitch is equal to input tile dim1 and input tile pitch */ + /* is equal to output tile pitch */ + /* - If above condition holds good, data elements for which data */ + /* conversion from S16 bit to I64 bit need to done present in contiguous */ + /* memory location. Hence vectorization can be utilized effectively */ + /* */ + /* 2. When input tile pitch is not equal to input tile size or input tile */ + /* pitch is not equal to output tile pitch */ + /* - In this scenario, data elements for which data conversion from S16 bit */ + /* I64 bit need to done exist in non-contiguous memory location. */ + /* In order to do vectorization across first dimension, output data */ + /* pointers need to be updated based on output tile size and output tile */ + /* pitch. 
*/ + /******************************************************************************/ + if ((inTilePitch1 == dim1Size) && (outTilePitch1 == dim1Size)) + { + /******************************************************************************/ + /* Data exist in contiguous memory location with respect to first dimension */ + /******************************************************************************/ + + /* input and output vectors */ + xb_vecNx16 vecInData; + + /* Initialize max loop counter */ + int32_t dim3MaxLoopCount = dim3Size; + int32_t maxLoopCount = dim1Size * dim2Size; + + /* Updated Loop count based on tile dimension configuration */ + if ((inTilePitch2 == maxLoopCount) && (outTilePitch2 == maxLoopCount)) + { + /**********************************************************************/ + /* Data exist in contiguous memory location with respect to first and */ + /* second dimension */ + /**********************************************************************/ + + /* Update max loop counter */ + dim3MaxLoopCount = 1; + maxLoopCount *= dim3Size; + } + for (z = 0; z < dim3MaxLoopCount; z++) + { + xb_vecN_2x32v vecOutTempL, vecOutTempH; + xb_vecN_2x64w vecOutL, vecOutH; + /* initialize input and output data pointer */ + pvecIn = (xb_vecNx16 *) (pInput + (z * inTilePitch2)); + pvecOut = (xb_vecN_2x64w *) (pOutput + (z * outTilePitch2)); + valign vaInData = IVP_LANX16_PP(pvecIn); + int32_t varlen; + + for (x = 0; x < maxLoopCount - vectorizationWidth; x += vectorizationWidth) + { + /* Load input data */ + IVP_LANX16_IP(vecInData, vaInData, pvecIn); + + xb_vecNx48 vecIntRes = IVP_MULUSNX16((xb_vecNx16U) scale, vecInData); + xb_vecN_2x64w vecOutIntm1 = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecIntRes), IVP_CVT64SNX48LL(vecIntRes)); + vecOutTempL = IVP_PACKVRN_2X64W(vecOutIntm1, shift); + vecOutTempL = IVP_MAXN_2X32(vecOutTempL, (xb_vecN_2x32v) minLim); + + //sign extending to 64bit + vecOutL = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempL); + + xb_vecN_2x64w vecOutIntm2 
= IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecIntRes), IVP_CVT64SNX48HL(vecIntRes)); + vecOutTempH = IVP_PACKVRN_2X64W(vecOutIntm2, shift); + vecOutTempH = IVP_MAXN_2X32(vecOutTempH, (xb_vecN_2x32v) minLim); + //sign extending to 64bit + vecOutH = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempH); + + IVP_SAN_2X64W_IP(vecOutL, vaOut, pvecOut); + IVP_SAN_2X64W_IP(vecOutH, vaOut, pvecOut); + } + varlen = (maxLoopCount - x); + IVP_LANX16_IP(vecInData, vaInData, pvecIn); + + xb_vecNx48 vecIntRes = IVP_MULUSNX16((xb_vecNx16U) scale, vecInData); + xb_vecN_2x64w vecOutIntm1 = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecIntRes), IVP_CVT64SNX48LL(vecIntRes)); + vecOutTempL = IVP_PACKVRN_2X64W(vecOutIntm1, shift); + vecOutTempL = IVP_MAXN_2X32(vecOutTempL, (xb_vecN_2x32v) minLim); + //sign extending to 64bit + vecOutL = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempL); + + xb_vecN_2x64w vecOutIntm2 = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecIntRes), IVP_CVT64SNX48HL(vecIntRes)); + vecOutTempH = IVP_PACKVRN_2X64W(vecOutIntm2, shift); + vecOutTempH = IVP_MAXN_2X32(vecOutTempH, (xb_vecN_2x32v) minLim); + vecOutH = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempH); + + /* store output data */ + IVP_SAVN_2X64W_XP(vecOutL, vaOut, pvecOut, (varlen << 3)); + IVP_SAVN_2X64W_XP(vecOutH, vaOut, pvecOut, ((varlen << 3) - (XCHAL_IVPN_SIMD_WIDTH << 2))); + IVP_SAPOSN_2X64W_FP(vaOut, pvecOut); + } + } + else + { + /* else block is executed if input tile pitch is not equal to input tile width or input tile */ + /* pitch is not equal to output tile pitch */ + xb_vecNx16 vecInData0, vecInData1, vecInData2, vecInData3; + xb_vecN_2x64w vecOut0L, vecOut0H, vecOut1L, vecOut1H, vecOut2L, vecOut2H, vecOut3L, vecOut3H; + xb_vecN_2x32v vecOutTempL, vecOutTempH; + + for (z = 0; z < dim3Size; z++) /* along 3rd dimension */ + { + x = 0; + /* Loop Unroll=4 along 1st dimension */ + for (; x < (dim1Size - vectorizationWidth3X); x += vectorizationWidth4X) + { + /* Initialize input and output data pointer */ + int16_t * 
pIn = &pInput[z * inTilePitch2 + x]; + int64_t *pOut = &pOutput[z * outTilePitch2 + x]; + int32_t varLen = dim1Size - (x + vectorizationWidth3X); + + for (y = 0; y < dim2Size; y++) /* along 2nd dimension */ + { + pvecIn = (xb_vecNx16 *) (pIn + (y * inTilePitch1)); + pvecOut = (xb_vecN_2x64w *) (pOut + (y * outTilePitch1)); + + valign vaInData = IVP_LANX16_PP(pvecIn); + /* load input data */ + IVP_LANX16_IP(vecInData0, vaInData, pvecIn); + IVP_LANX16_IP(vecInData1, vaInData, pvecIn); + IVP_LANX16_IP(vecInData2, vaInData, pvecIn); + IVP_LANX16_IP(vecInData3, vaInData, pvecIn); + + xb_vecNx48 vecIntRes0 = IVP_MULUSNX16((xb_vecNx16U) scale, vecInData0); + xb_vecNx48 vecIntRes1 = IVP_MULUSNX16((xb_vecNx16U) scale, vecInData1); + xb_vecNx48 vecIntRes2 = IVP_MULUSNX16((xb_vecNx16U) scale, vecInData2); + xb_vecNx48 vecIntRes3 = IVP_MULUSNX16((xb_vecNx16U) scale, vecInData3); + + vecOut0L = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecIntRes0), IVP_CVT64SNX48LL(vecIntRes0)); + vecOut0H = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecIntRes0), IVP_CVT64SNX48HL(vecIntRes0)); + vecOut1L = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecIntRes1), IVP_CVT64SNX48LL(vecIntRes1)); + vecOut1H = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecIntRes1), IVP_CVT64SNX48HL(vecIntRes1)); + vecOut2L = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecIntRes2), IVP_CVT64SNX48LL(vecIntRes2)); + vecOut2H = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecIntRes2), IVP_CVT64SNX48HL(vecIntRes2)); + vecOut3L = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecIntRes3), IVP_CVT64SNX48LL(vecIntRes3)); + vecOut3H = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecIntRes3), IVP_CVT64SNX48HL(vecIntRes3)); + + vecOutTempL = IVP_PACKVRN_2X64W(vecOut0L, shift); + vecOutTempL = IVP_MAXN_2X32(vecOutTempL, (xb_vecN_2x32v) minLim); + //sign extending to 64bit + vecOut0L = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempL); + vecOutTempH = IVP_PACKVRN_2X64W(vecOut0H, shift); + vecOutTempH = IVP_MAXN_2X32(vecOutTempH, (xb_vecN_2x32v) minLim); + //sign extending to 64bit + vecOut0H = 
IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempH); + + vecOutTempL = IVP_PACKVRN_2X64W(vecOut1L, shift); + vecOutTempL = IVP_MAXN_2X32(vecOutTempL, (xb_vecN_2x32v) minLim); + //sign extending to 64bit + vecOut1L = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempL); + vecOutTempH = IVP_PACKVRN_2X64W(vecOut1H, shift); + vecOutTempH = IVP_MAXN_2X32(vecOutTempH, (xb_vecN_2x32v) minLim); + //sign extending to 64bit + vecOut1H = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempH); + + vecOutTempL = IVP_PACKVRN_2X64W(vecOut2L, shift); + vecOutTempL = IVP_MAXN_2X32(vecOutTempL, (xb_vecN_2x32v) minLim); + //sign extending to 64bit + vecOut2L = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempL); + vecOutTempH = IVP_PACKVRN_2X64W(vecOut2H, shift); + vecOutTempH = IVP_MAXN_2X32(vecOutTempH, (xb_vecN_2x32v) minLim); + //sign extending to 64bit + vecOut2H = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempH); + + vecOutTempL = IVP_PACKVRN_2X64W(vecOut3L, shift); + vecOutTempL = IVP_MAXN_2X32(vecOutTempL, (xb_vecN_2x32v) minLim); + //sign extending to 64bit + vecOut3L = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempL); + vecOutTempH = IVP_PACKVRN_2X64W(vecOut3H, shift); + vecOutTempH = IVP_MAXN_2X32(vecOutTempH, (xb_vecN_2x32v) minLim); + //sign extending to 64bit + vecOut3H = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempH); + + + /* Store output data */ + IVP_SAN_2X64W_IP(vecOut0L, vaOut, pvecOut); + IVP_SAN_2X64W_IP(vecOut0H, vaOut, pvecOut); + IVP_SAN_2X64W_IP(vecOut1L, vaOut, pvecOut); + IVP_SAN_2X64W_IP(vecOut1H, vaOut, pvecOut); + IVP_SAN_2X64W_IP(vecOut2L, vaOut, pvecOut); + IVP_SAN_2X64W_IP(vecOut2H, vaOut, pvecOut); + IVP_SAVN_2X64W_XP(vecOut3L, vaOut, pvecOut, (varLen << 3)); + IVP_SAVN_2X64W_XP(vecOut3H, vaOut, pvecOut, ((varLen << 3) - (XCHAL_IVPN_SIMD_WIDTH << 2))); + IVP_SAPOSN_2X64W_FP(vaOut, pvecOut); + } + } + if (x < (dim1Size - vectorizationWidth2X)) + { + /* Initialize input and output data pointer */ + int16_t * pIn = &pInput[z * inTilePitch2 + x]; + 
int64_t *pOut = &pOutput[z * outTilePitch2 + x]; + int32_t varLen = dim1Size - (x + vectorizationWidth2X); + + for (y = 0; y < dim2Size; y++) /* along 2nd dimension */ + { + pvecIn = (xb_vecNx16 *) (pIn + (y * inTilePitch1)); + pvecOut = (xb_vecN_2x64w *) (pOut + (y * outTilePitch1)); + + valign vaInData = IVP_LANX16_PP(pvecIn); + /* load input data */ + IVP_LANX16_IP(vecInData0, vaInData, pvecIn); + IVP_LANX16_IP(vecInData1, vaInData, pvecIn); + IVP_LANX16_IP(vecInData2, vaInData, pvecIn); + + xb_vecNx48 vecIntRes0 = IVP_MULUSNX16((xb_vecNx16U) scale, vecInData0); + xb_vecNx48 vecIntRes1 = IVP_MULUSNX16((xb_vecNx16U) scale, vecInData1); + xb_vecNx48 vecIntRes2 = IVP_MULUSNX16((xb_vecNx16U) scale, vecInData2); + + vecOut0L = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecIntRes0), IVP_CVT64SNX48LL(vecIntRes0)); + vecOut0H = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecIntRes0), IVP_CVT64SNX48HL(vecIntRes0)); + vecOut1L = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecIntRes1), IVP_CVT64SNX48LL(vecIntRes1)); + vecOut1H = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecIntRes1), IVP_CVT64SNX48HL(vecIntRes1)); + vecOut2L = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecIntRes2), IVP_CVT64SNX48LL(vecIntRes2)); + vecOut2H = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecIntRes2), IVP_CVT64SNX48HL(vecIntRes2)); + + vecOutTempL = IVP_PACKVRN_2X64W(vecOut0L, shift); + vecOutTempL = IVP_MAXN_2X32(vecOutTempL, (xb_vecN_2x32v) minLim); + //sign extending to 64bit + vecOut0L = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempL); + vecOutTempH = IVP_PACKVRN_2X64W(vecOut0H, shift); + vecOutTempH = IVP_MAXN_2X32(vecOutTempH, (xb_vecN_2x32v) minLim); + //sign extending to 64bit + vecOut0H = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempH); + + vecOutTempL = IVP_PACKVRN_2X64W(vecOut1L, shift); + vecOutTempL = IVP_MAXN_2X32(vecOutTempL, (xb_vecN_2x32v) minLim); + //sign extending to 64bit + vecOut1L = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempL); + vecOutTempH = IVP_PACKVRN_2X64W(vecOut1H, shift); + vecOutTempH = 
IVP_MAXN_2X32(vecOutTempH, (xb_vecN_2x32v) minLim); + //sign extending to 64bit + vecOut1H = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempH); + + vecOutTempL = IVP_PACKVRN_2X64W(vecOut2L, shift); + vecOutTempL = IVP_MAXN_2X32(vecOutTempL, (xb_vecN_2x32v) minLim); + //sign extending to 64bit + vecOut2L = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempL); + vecOutTempH = IVP_PACKVRN_2X64W(vecOut2H, shift); + vecOutTempH = IVP_MAXN_2X32(vecOutTempH, (xb_vecN_2x32v) minLim); + //sign extending to 64bit + vecOut2H = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempH); + + + /* Store output data */ + IVP_SAN_2X64W_IP(vecOut0L, vaOut, pvecOut); + IVP_SAN_2X64W_IP(vecOut0H, vaOut, pvecOut); + IVP_SAN_2X64W_IP(vecOut1L, vaOut, pvecOut); + IVP_SAN_2X64W_IP(vecOut1H, vaOut, pvecOut); + IVP_SAVN_2X64W_XP(vecOut2L, vaOut, pvecOut, (varLen << 3)); + IVP_SAVN_2X64W_XP(vecOut2H, vaOut, pvecOut, ((varLen << 3) - (XCHAL_IVPN_SIMD_WIDTH << 2))); + IVP_SAPOSN_2X64W_FP(vaOut, pvecOut); + } + } + else if (x < (dim1Size - vectorizationWidth)) + { + /* Initialize input and output data pointer */ + int16_t * pIn = &pInput[z * inTilePitch2 + x]; + int64_t *pOut = &pOutput[z * outTilePitch2 + x]; + int32_t varLen = dim1Size - (x + vectorizationWidth); + + for (y = 0; y < dim2Size; y++) /* along 2nd dimension */ + { + pvecIn = (xb_vecNx16 *) (pIn + (y * inTilePitch1)); + pvecOut = (xb_vecN_2x64w *) (pOut + (y * outTilePitch1)); + + valign vaInData = IVP_LANX16_PP(pvecIn); + /* load input data */ + IVP_LANX16_IP(vecInData0, vaInData, pvecIn); + IVP_LANX16_IP(vecInData1, vaInData, pvecIn); + + xb_vecNx48 vecIntRes0 = IVP_MULUSNX16((xb_vecNx16U) scale, vecInData0); + xb_vecNx48 vecIntRes1 = IVP_MULUSNX16((xb_vecNx16U) scale, vecInData1); + + vecOut0L = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecIntRes0), IVP_CVT64SNX48LL(vecIntRes0)); + vecOut0H = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecIntRes0), IVP_CVT64SNX48HL(vecIntRes0)); + vecOut1L = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecIntRes1), 
IVP_CVT64SNX48LL(vecIntRes1)); + vecOut1H = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecIntRes1), IVP_CVT64SNX48HL(vecIntRes1)); + + + vecOutTempL = IVP_PACKVRN_2X64W(vecOut0L, shift); + vecOutTempL = IVP_MAXN_2X32(vecOutTempL, (xb_vecN_2x32v) minLim); + //sign extending to 64bit + vecOut0L = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempL); + vecOutTempH = IVP_PACKVRN_2X64W(vecOut0H, shift); + vecOutTempH = IVP_MAXN_2X32(vecOutTempH, (xb_vecN_2x32v) minLim); + //sign extending to 64bit + vecOut0H = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempH); + + vecOutTempL = IVP_PACKVRN_2X64W(vecOut1L, shift); + vecOutTempL = IVP_MAXN_2X32(vecOutTempL, (xb_vecN_2x32v) minLim); + //sign extending to 64bit + vecOut1L = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempL); + vecOutTempH = IVP_PACKVRN_2X64W(vecOut1H, shift); + vecOutTempH = IVP_MAXN_2X32(vecOutTempH, (xb_vecN_2x32v) minLim); + //sign extending to 64bit + vecOut1H = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempH); + + + /* Store output data */ + IVP_SAN_2X64W_IP(vecOut0L, vaOut, pvecOut); + IVP_SAN_2X64W_IP(vecOut0H, vaOut, pvecOut); + IVP_SAVN_2X64W_XP(vecOut1L, vaOut, pvecOut, (varLen << 3)); + IVP_SAVN_2X64W_XP(vecOut1H, vaOut, pvecOut, ((varLen << 3) - (XCHAL_IVPN_SIMD_WIDTH << 2))); + IVP_SAPOSN_2X64W_FP(vaOut, pvecOut); + } + } + else if (x < dim1Size) + { + /* Initialize input and output data pointer */ + int16_t * pIn = &pInput[z * inTilePitch2 + x]; + int64_t *pOut = &pOutput[z * outTilePitch2 + x]; + int32_t varLen = dim1Size - x; + + for (y = 0; y < dim2Size; y++) /* along 2nd dimension */ + { + pvecIn = (xb_vecNx16 *) (pIn + (y * inTilePitch1)); + pvecOut = (xb_vecN_2x64w *) (pOut + (y * outTilePitch1)); + + valign vaInData = IVP_LANX16_PP(pvecIn); + /* load input data */ + IVP_LANX16_IP(vecInData0, vaInData, pvecIn); + + xb_vecNx48 vecIntRes0 = IVP_MULUSNX16((xb_vecNx16U) scale, vecInData0); + + vecOut0L = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecIntRes0), IVP_CVT64SNX48LL(vecIntRes0)); + vecOut0H = 
IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecIntRes0), IVP_CVT64SNX48HL(vecIntRes0)); + + vecOutTempL = IVP_PACKVRN_2X64W(vecOut0L, shift); + vecOutTempL = IVP_MAXN_2X32(vecOutTempL, (xb_vecN_2x32v) minLim); + //sign extending to 64bit + vecOut0L = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempL); + vecOutTempH = IVP_PACKVRN_2X64W(vecOut0H, shift); + vecOutTempH = IVP_MAXN_2X32(vecOutTempH, (xb_vecN_2x32v) minLim); + //sign extending to 64bit + vecOut0H = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempH); + + /* Store output data */ + IVP_SAVN_2X64W_XP(vecOut0L, vaOut, pvecOut, (varLen << 3)); + IVP_SAVN_2X64W_XP(vecOut0H, vaOut, pvecOut, ((varLen << 3) - (XCHAL_IVPN_SIMD_WIDTH << 2))); + IVP_SAPOSN_2X64W_FP(vaOut, pvecOut); + } + } + } + } + return(XAI_ERROR_STATUS()); +} + +/********************* xaiDataConversion3D_U16I64 *****************************/ +/* Description : P6 implementation for conversion U16 to I64 */ +/* depending on Output Tile type */ +/* Inputs : Input Tile, scale, shift */ +/* Outputs : XI Error Code */ +/* InOuts : Output Tile */ +/* Assumptions : InData is unsigned 16bit */ +/**************************************************************************/ +XAI_ERR_TYPE xaiDataConversion3D_U16I64(const xai_pTile3D inTile, + xai_pTile3D outTile, + const uint16_t scale, + const uint8_t shift) +{ + /* Error Checks */ + XAI_ERROR_CHECKS() + { + XAI_CHECK_TILE3D_U16(inTile); + XAI_CHECK_TILE3D_I64(outTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile); + XAI_CHECK_TILE3D_SIZE_EQ(inTile, outTile); + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTile); + XAI_CHECK_ERROR(shift < 24, XAI_ERR_NORM, \ + "Shift = %hhu, value should be less than 24", shift); + XAI_CHECK_ERROR(XAI_TILE3D_GET_DATA_ORDER(inTile) == XAI_TILE3D_GET_DATA_ORDER(outTile), XAI_ERR_BADARG, \ + "\nData Order of InputTile = %d, OutputTile = %d\nData Order of InputTile and OutputTile should be same", \ + XAI_TILE3D_GET_DATA_ORDER(inTile), 
XAI_TILE3D_GET_DATA_ORDER(outTile)); + } + + /* Get Tile Parameters */ + const int32_t dim1Size = XAI_TILE3D_GET_DIM1(inTile); + const int32_t dim2Size = XAI_TILE3D_GET_DIM2(inTile); + const int32_t dim3Size = XAI_TILE3D_GET_DIM3(inTile); + const int32_t inTilePitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile); + const int32_t inTilePitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile); + const int32_t outTilePitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile); + const int32_t outTilePitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile); + + valign vaOut = IVP_ZALIGN(); + + /* Get Data Pointers */ + uint16_t *pInput = (uint16_t *) XAI_TILE3D_GET_DATA_PTR(inTile); + int64_t *pOutput = (int64_t *) XAI_TILE3D_GET_DATA_PTR(outTile); + + /* vectorization width */ + const int32_t vectorizationWidth = XCHAL_IVPN_SIMD_WIDTH; + const int32_t vectorizationWidth2X = vectorizationWidth * 2; + const int32_t vectorizationWidth3X = vectorizationWidth * 3; + const int32_t vectorizationWidth4X = vectorizationWidth * 4; + const uint32_t rndVal = (1 << (shift - 1)); + + /* loop variables */ + int32_t x, y, z; + + /* input and output pointers */ + xb_vecNx16U *restrict pvecIn; + xb_vecN_2x64w *restrict pvecOut; + + + /******************************************************************************/ + /* The overall design approach is split into 2 parts */ + /* 1. When input tile pitch is equal to input tile dim1 and input tile pitch */ + /* is equal to output tile pitch */ + /* - If above condition holds good, data elements for which data */ + /* conversion from U16 bit to I64 bit need to done present in contiguous */ + /* memory location. Hence vectorization can be utilized effectively */ + /* */ + /* 2. When input tile pitch is not equal to input tile size or input tile */ + /* pitch is not equal to output tile pitch */ + /* - In this scenario, data elements for which data conversion from U16 bit */ + /* I64 bit need to done exist in non-contiguous memory location. 
*/ + /* In order to do vectorization across first dimension, output data */ + /* pointers need to be updated based on output tile size and output tile */ + /* pitch. */ + /******************************************************************************/ + if ((inTilePitch1 == dim1Size) && (outTilePitch1 == dim1Size)) + { + /******************************************************************************/ + /* Data exist in contiguous memory location with respect to first dimension */ + /******************************************************************************/ + + /* input and output vectors */ + xb_vecNx16U vecInData; + + /* Initialize max loop counter */ + int32_t dim3MaxLoopCount = dim3Size; + int32_t maxLoopCount = dim1Size * dim2Size; + + /* Updated Loop count based on tile dimension configuration */ + if ((inTilePitch2 == maxLoopCount) && (outTilePitch2 == maxLoopCount)) + { + /**********************************************************************/ + /* Data exist in contiguous memory location with respect to first and */ + /* second dimension */ + /**********************************************************************/ + + /* Update max loop counter */ + dim3MaxLoopCount = 1; + maxLoopCount *= dim3Size; + } + for (z = 0; z < dim3MaxLoopCount; z++) + { + xb_vecN_2x32Uv vecOutTempL, vecOutTempH; + xb_vecN_2x64w vecOutL, vecOutH; + /* initialize input and output data pointer */ + pvecIn = (xb_vecNx16U *) (pInput + (z * inTilePitch2)); + pvecOut = (xb_vecN_2x64w *) (pOutput + (z * outTilePitch2)); + valign vaInData = IVP_LANX16U_PP(pvecIn); + int32_t varlen; + + for (x = 0; x < maxLoopCount - vectorizationWidth; x += vectorizationWidth) + { + /* Load input data */ + IVP_LANX16U_IP(vecInData, vaInData, pvecIn); + + xb_vecNx48 vecIntRes = IVP_MULUUNX16((xb_vecNx16U) scale, vecInData); + xb_vecN_2x64w vecOutIntm1 = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecIntRes), IVP_CVT64SNX48LL(vecIntRes)); + IVP_MULUUAN_2X16X32_0(vecOutIntm1, (xb_vecNx16U) 1, (xb_vecN_2x32Uv) 
rndVal); //rounding + vecOutTempL = xb_vecN_2x32v_rtor_xb_vecN_2x32Uv(IVP_PACKVRNRN_2X64W(vecOutIntm1, shift)); + //sign extending to 64bit + vecOutL = IVP_MULUUN_2X16X32_0((xb_vecNx16U) 1, vecOutTempL); + + xb_vecN_2x64w vecOutIntm2 = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecIntRes), IVP_CVT64SNX48HL(vecIntRes)); + IVP_MULUUAN_2X16X32_0(vecOutIntm2, (xb_vecNx16U) 1, (xb_vecN_2x32Uv) rndVal); //rounding + vecOutTempH = xb_vecN_2x32v_rtor_xb_vecN_2x32Uv(IVP_PACKVRNRN_2X64W(vecOutIntm2, shift)); + //sign extending to 64bit + vecOutH = IVP_MULUUN_2X16X32_0((xb_vecNx16U) 1, vecOutTempH); + + IVP_SAN_2X64W_IP(vecOutL, vaOut, pvecOut); + IVP_SAN_2X64W_IP(vecOutH, vaOut, pvecOut); + } + varlen = (maxLoopCount - x); + IVP_LANX16_IP(vecInData, vaInData, pvecIn); + + xb_vecNx48 vecIntRes = IVP_MULUUNX16((xb_vecNx16U) scale, vecInData); + xb_vecN_2x64w vecOutIntm1 = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecIntRes), IVP_CVT64SNX48LL(vecIntRes)); + IVP_MULUUAN_2X16X32_0(vecOutIntm1, (xb_vecNx16U) 1, (xb_vecN_2x32Uv) rndVal); //rounding + vecOutTempL = xb_vecN_2x32v_rtor_xb_vecN_2x32Uv(IVP_PACKVRNRN_2X64W(vecOutIntm1, shift)); + //sign extending to 64bit + vecOutL = IVP_MULUUN_2X16X32_0((xb_vecNx16U) 1, vecOutTempL); + + xb_vecN_2x64w vecOutIntm2 = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecIntRes), IVP_CVT64SNX48HL(vecIntRes)); + IVP_MULUUAN_2X16X32_0(vecOutIntm2, (xb_vecNx16U) 1, (xb_vecN_2x32Uv) rndVal); //rounding + vecOutTempH = xb_vecN_2x32v_rtor_xb_vecN_2x32Uv(IVP_PACKVRNRN_2X64W(vecOutIntm2, shift)); + //sign extending to 64bit + vecOutH = IVP_MULUUN_2X16X32_0((xb_vecNx16U) 1, vecOutTempH); + + /* store output data */ + IVP_SAVN_2X64W_XP(vecOutL, vaOut, pvecOut, (varlen << 3)); + IVP_SAVN_2X64W_XP(vecOutH, vaOut, pvecOut, ((varlen << 3) - (XCHAL_IVPN_SIMD_WIDTH << 2))); + IVP_SAPOSN_2X64W_FP(vaOut, pvecOut); + } + } + else + { + /* else block is executed if input tile pitch is not equal to input tile width or input tile */ + /* pitch is not equal to output tile pitch */ + xb_vecNx16U 
vecInData0, vecInData1, vecInData2, vecInData3; + xb_vecN_2x64w vecOut0L, vecOut0H, vecOut1L, vecOut1H, vecOut2L, vecOut2H, vecOut3L, vecOut3H; + xb_vecN_2x32Uv vecOutTempL, vecOutTempH; + + for (z = 0; z < dim3Size; z++) /* along 3rd dimension */ + { + x = 0; + /* Loop Unroll=4 along 1st dimension */ + for (; x < (dim1Size - vectorizationWidth3X); x += vectorizationWidth4X) + { + /* Initialize input and output data pointer */ + uint16_t * pIn = &pInput[z * inTilePitch2 + x]; + int64_t *pOut = &pOutput[z * outTilePitch2 + x]; + int32_t varLen = dim1Size - (x + vectorizationWidth3X); + + for (y = 0; y < dim2Size; y++) /* along 2nd dimension */ + { + pvecIn = (xb_vecNx16U *) (pIn + (y * inTilePitch1)); + pvecOut = (xb_vecN_2x64w *) (pOut + (y * outTilePitch1)); + + valign vaInData = IVP_LANX16U_PP(pvecIn); + /* load input data */ + IVP_LANX16U_IP(vecInData0, vaInData, pvecIn); + IVP_LANX16U_IP(vecInData1, vaInData, pvecIn); + IVP_LANX16U_IP(vecInData2, vaInData, pvecIn); + IVP_LANX16U_IP(vecInData3, vaInData, pvecIn); + + xb_vecNx48 vecIntRes0 = IVP_MULUUNX16((xb_vecNx16U) scale, vecInData0); + xb_vecNx48 vecIntRes1 = IVP_MULUUNX16((xb_vecNx16U) scale, vecInData1); + xb_vecNx48 vecIntRes2 = IVP_MULUUNX16((xb_vecNx16U) scale, vecInData2); + xb_vecNx48 vecIntRes3 = IVP_MULUUNX16((xb_vecNx16U) scale, vecInData3); + + vecOut0L = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecIntRes0), IVP_CVT64SNX48LL(vecIntRes0)); + vecOut0H = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecIntRes0), IVP_CVT64SNX48HL(vecIntRes0)); + vecOut1L = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecIntRes1), IVP_CVT64SNX48LL(vecIntRes1)); + vecOut1H = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecIntRes1), IVP_CVT64SNX48HL(vecIntRes1)); + vecOut2L = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecIntRes2), IVP_CVT64SNX48LL(vecIntRes2)); + vecOut2H = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecIntRes2), IVP_CVT64SNX48HL(vecIntRes2)); + vecOut3L = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecIntRes3), IVP_CVT64SNX48LL(vecIntRes3)); + vecOut3H = 
IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecIntRes3), IVP_CVT64SNX48HL(vecIntRes3)); + + IVP_MULUUAN_2X16X32_0(vecOut0L, (xb_vecNx16U) 1, (xb_vecN_2x32Uv) rndVal); //rounding + vecOutTempL = xb_vecN_2x32v_rtor_xb_vecN_2x32Uv(IVP_PACKVRNRN_2X64W(vecOut0L, shift)); + //sign extending to 64bit + vecOut0L = IVP_MULUUN_2X16X32_0((xb_vecNx16U) 1, vecOutTempL); + IVP_MULUUAN_2X16X32_0(vecOut0H, (xb_vecNx16U) 1, (xb_vecN_2x32Uv) rndVal); //rounding + vecOutTempH = xb_vecN_2x32v_rtor_xb_vecN_2x32Uv(IVP_PACKVRNRN_2X64W(vecOut0H, shift)); + //sign extending to 64bit + vecOut0H = IVP_MULUUN_2X16X32_0((xb_vecNx16U) 1, vecOutTempH); + + IVP_MULUUAN_2X16X32_0(vecOut1L, (xb_vecNx16U) 1, (xb_vecN_2x32Uv) rndVal); //rounding + vecOutTempL = xb_vecN_2x32v_rtor_xb_vecN_2x32Uv(IVP_PACKVRNRN_2X64W(vecOut1L, shift)); + //sign extending to 64bit + vecOut1L = IVP_MULUUN_2X16X32_0((xb_vecNx16U) 1, vecOutTempL); + IVP_MULUUAN_2X16X32_0(vecOut1H, (xb_vecNx16U) 1, (xb_vecN_2x32Uv) rndVal); //rounding + vecOutTempH = xb_vecN_2x32v_rtor_xb_vecN_2x32Uv(IVP_PACKVRNRN_2X64W(vecOut1H, shift)); + //sign extending to 64bit + vecOut1H = IVP_MULUUN_2X16X32_0((xb_vecNx16U) 1, vecOutTempH); + + IVP_MULUUAN_2X16X32_0(vecOut2L, (xb_vecNx16U) 1, (xb_vecN_2x32Uv) rndVal); //rounding + vecOutTempL = xb_vecN_2x32v_rtor_xb_vecN_2x32Uv(IVP_PACKVRNRN_2X64W(vecOut2L, shift)); + //sign extending to 64bit + vecOut2L = IVP_MULUUN_2X16X32_0((xb_vecNx16U) 1, vecOutTempL); + IVP_MULUUAN_2X16X32_0(vecOut2H, (xb_vecNx16U) 1, (xb_vecN_2x32Uv) rndVal); //rounding + vecOutTempH = xb_vecN_2x32v_rtor_xb_vecN_2x32Uv(IVP_PACKVRNRN_2X64W(vecOut2H, shift)); + //sign extending to 64bit + vecOut2H = IVP_MULUUN_2X16X32_0((xb_vecNx16U) 1, vecOutTempH); + + IVP_MULUUAN_2X16X32_0(vecOut3L, (xb_vecNx16U) 1, (xb_vecN_2x32Uv) rndVal); //rounding + vecOutTempL = xb_vecN_2x32v_rtor_xb_vecN_2x32Uv(IVP_PACKVRNRN_2X64W(vecOut3L, shift)); + //sign extending to 64bit + vecOut3L = IVP_MULUUN_2X16X32_0((xb_vecNx16U) 1, vecOutTempL); + 
IVP_MULUUAN_2X16X32_0(vecOut3H, (xb_vecNx16U) 1, (xb_vecN_2x32Uv) rndVal); //rounding + vecOutTempH = xb_vecN_2x32v_rtor_xb_vecN_2x32Uv(IVP_PACKVRNRN_2X64W(vecOut3H, shift)); + //sign extending to 64bit + vecOut3H = IVP_MULUUN_2X16X32_0((xb_vecNx16U) 1, vecOutTempH); + + + /* Store output data */ + IVP_SAN_2X64W_IP(vecOut0L, vaOut, pvecOut); + IVP_SAN_2X64W_IP(vecOut0H, vaOut, pvecOut); + IVP_SAN_2X64W_IP(vecOut1L, vaOut, pvecOut); + IVP_SAN_2X64W_IP(vecOut1H, vaOut, pvecOut); + IVP_SAN_2X64W_IP(vecOut2L, vaOut, pvecOut); + IVP_SAN_2X64W_IP(vecOut2H, vaOut, pvecOut); + IVP_SAVN_2X64W_XP(vecOut3L, vaOut, pvecOut, (varLen << 3)); + IVP_SAVN_2X64W_XP(vecOut3H, vaOut, pvecOut, ((varLen << 3) - (XCHAL_IVPN_SIMD_WIDTH << 2))); + IVP_SAPOSN_2X64W_FP(vaOut, pvecOut); + } + } + if (x < (dim1Size - vectorizationWidth2X)) + { + /* Initialize input and output data pointer */ + uint16_t * pIn = &pInput[z * inTilePitch2 + x]; + int64_t *pOut = &pOutput[z * outTilePitch2 + x]; + int32_t varLen = dim1Size - (x + vectorizationWidth2X); + + for (y = 0; y < dim2Size; y++) /* along 2nd dimension */ + { + pvecIn = (xb_vecNx16U *) (pIn + (y * inTilePitch1)); + pvecOut = (xb_vecN_2x64w *) (pOut + (y * outTilePitch1)); + + valign vaInData = IVP_LANX16U_PP(pvecIn); + /* load input data */ + IVP_LANX16U_IP(vecInData0, vaInData, pvecIn); + IVP_LANX16U_IP(vecInData1, vaInData, pvecIn); + IVP_LANX16U_IP(vecInData2, vaInData, pvecIn); + + xb_vecNx48 vecIntRes0 = IVP_MULUUNX16((xb_vecNx16U) scale, vecInData0); + xb_vecNx48 vecIntRes1 = IVP_MULUUNX16((xb_vecNx16U) scale, vecInData1); + xb_vecNx48 vecIntRes2 = IVP_MULUUNX16((xb_vecNx16U) scale, vecInData2); + + vecOut0L = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecIntRes0), IVP_CVT64SNX48LL(vecIntRes0)); + vecOut0H = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecIntRes0), IVP_CVT64SNX48HL(vecIntRes0)); + vecOut1L = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecIntRes1), IVP_CVT64SNX48LL(vecIntRes1)); + vecOut1H = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecIntRes1), 
IVP_CVT64SNX48HL(vecIntRes1)); + vecOut2L = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecIntRes2), IVP_CVT64SNX48LL(vecIntRes2)); + vecOut2H = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecIntRes2), IVP_CVT64SNX48HL(vecIntRes2)); + + IVP_MULUUAN_2X16X32_0(vecOut0L, (xb_vecNx16U) 1, (xb_vecN_2x32Uv) rndVal); //rounding + vecOutTempL = xb_vecN_2x32v_rtor_xb_vecN_2x32Uv(IVP_PACKVRNRN_2X64W(vecOut0L, shift)); + //sign extending to 64bit + vecOut0L = IVP_MULUUN_2X16X32_0((xb_vecNx16U) 1, vecOutTempL); + IVP_MULUUAN_2X16X32_0(vecOut0H, (xb_vecNx16U) 1, (xb_vecN_2x32Uv) rndVal); //rounding + vecOutTempH = xb_vecN_2x32v_rtor_xb_vecN_2x32Uv(IVP_PACKVRNRN_2X64W(vecOut0H, shift)); + //sign extending to 64bit + vecOut0H = IVP_MULUUN_2X16X32_0((xb_vecNx16U) 1, vecOutTempH); + + IVP_MULUUAN_2X16X32_0(vecOut1L, (xb_vecNx16U) 1, (xb_vecN_2x32Uv) rndVal); //rounding + vecOutTempL = xb_vecN_2x32v_rtor_xb_vecN_2x32Uv(IVP_PACKVRNRN_2X64W(vecOut1L, shift)); + //sign extending to 64bit + vecOut1L = IVP_MULUUN_2X16X32_0((xb_vecNx16U) 1, vecOutTempL); + IVP_MULUUAN_2X16X32_0(vecOut1H, (xb_vecNx16U) 1, (xb_vecN_2x32Uv) rndVal); //rounding + vecOutTempH = xb_vecN_2x32v_rtor_xb_vecN_2x32Uv(IVP_PACKVRNRN_2X64W(vecOut1H, shift)); + //sign extending to 64bit + vecOut1H = IVP_MULUUN_2X16X32_0((xb_vecNx16U) 1, vecOutTempH); + + IVP_MULUUAN_2X16X32_0(vecOut2L, (xb_vecNx16U) 1, (xb_vecN_2x32Uv) rndVal); //rounding + vecOutTempL = xb_vecN_2x32v_rtor_xb_vecN_2x32Uv(IVP_PACKVRNRN_2X64W(vecOut2L, shift)); + //sign extending to 64bit + vecOut2L = IVP_MULUUN_2X16X32_0((xb_vecNx16U) 1, vecOutTempL); + IVP_MULUUAN_2X16X32_0(vecOut2H, (xb_vecNx16U) 1, (xb_vecN_2x32Uv) rndVal); //rounding + vecOutTempH = xb_vecN_2x32v_rtor_xb_vecN_2x32Uv(IVP_PACKVRNRN_2X64W(vecOut2H, shift)); + //sign extending to 64bit + vecOut2H = IVP_MULUUN_2X16X32_0((xb_vecNx16U) 1, vecOutTempH); + + /* Store output data */ + IVP_SAN_2X64W_IP(vecOut0L, vaOut, pvecOut); + IVP_SAN_2X64W_IP(vecOut0H, vaOut, pvecOut); + IVP_SAN_2X64W_IP(vecOut1L, vaOut, 
pvecOut); + IVP_SAN_2X64W_IP(vecOut1H, vaOut, pvecOut); + IVP_SAVN_2X64W_XP(vecOut2L, vaOut, pvecOut, (varLen << 3)); + IVP_SAVN_2X64W_XP(vecOut2H, vaOut, pvecOut, ((varLen << 3) - (XCHAL_IVPN_SIMD_WIDTH << 2))); + IVP_SAPOSN_2X64W_FP(vaOut, pvecOut); + } + } + else if (x < (dim1Size - vectorizationWidth)) + { + /* Initialize input and output data pointer */ + uint16_t * pIn = &pInput[z * inTilePitch2 + x]; + int64_t *pOut = &pOutput[z * outTilePitch2 + x]; + int32_t varLen = dim1Size - (x + vectorizationWidth); + + for (y = 0; y < dim2Size; y++) /* along 2nd dimension */ + { + pvecIn = (xb_vecNx16U *) (pIn + (y * inTilePitch1)); + pvecOut = (xb_vecN_2x64w *) (pOut + (y * outTilePitch1)); + + valign vaInData = IVP_LANX16U_PP(pvecIn); + /* load input data */ + IVP_LANX16U_IP(vecInData0, vaInData, pvecIn); + IVP_LANX16U_IP(vecInData1, vaInData, pvecIn); + + xb_vecNx48 vecIntRes0 = IVP_MULUUNX16((xb_vecNx16U) scale, vecInData0); + xb_vecNx48 vecIntRes1 = IVP_MULUUNX16((xb_vecNx16U) scale, vecInData1); + + vecOut0L = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecIntRes0), IVP_CVT64SNX48LL(vecIntRes0)); + vecOut0H = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecIntRes0), IVP_CVT64SNX48HL(vecIntRes0)); + vecOut1L = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecIntRes1), IVP_CVT64SNX48LL(vecIntRes1)); + vecOut1H = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecIntRes1), IVP_CVT64SNX48HL(vecIntRes1)); + + + IVP_MULUUAN_2X16X32_0(vecOut0L, (xb_vecNx16U) 1, (xb_vecN_2x32Uv) rndVal); //rounding + vecOutTempL = xb_vecN_2x32v_rtor_xb_vecN_2x32Uv(IVP_PACKVRNRN_2X64W(vecOut0L, shift)); + //sign extending to 64bit + vecOut0L = IVP_MULUUN_2X16X32_0((xb_vecNx16U) 1, vecOutTempL); + IVP_MULUUAN_2X16X32_0(vecOut0H, (xb_vecNx16U) 1, (xb_vecN_2x32Uv) rndVal); //rounding + vecOutTempH = xb_vecN_2x32v_rtor_xb_vecN_2x32Uv(IVP_PACKVRNRN_2X64W(vecOut0H, shift)); + //sign extending to 64bit + vecOut0H = IVP_MULUUN_2X16X32_0((xb_vecNx16U) 1, vecOutTempH); + + IVP_MULUUAN_2X16X32_0(vecOut1L, (xb_vecNx16U) 1, (xb_vecN_2x32Uv) 
rndVal); //rounding + vecOutTempL = xb_vecN_2x32v_rtor_xb_vecN_2x32Uv(IVP_PACKVRNRN_2X64W(vecOut1L, shift)); + //sign extending to 64bit + vecOut1L = IVP_MULUUN_2X16X32_0((xb_vecNx16U) 1, vecOutTempL); + IVP_MULUUAN_2X16X32_0(vecOut1H, (xb_vecNx16U) 1, (xb_vecN_2x32Uv) rndVal); //rounding + vecOutTempH = xb_vecN_2x32v_rtor_xb_vecN_2x32Uv(IVP_PACKVRNRN_2X64W(vecOut1H, shift)); + //sign extending to 64bit + vecOut1H = IVP_MULUUN_2X16X32_0((xb_vecNx16U) 1, vecOutTempH); + + + /* Store output data */ + IVP_SAN_2X64W_IP(vecOut0L, vaOut, pvecOut); + IVP_SAN_2X64W_IP(vecOut0H, vaOut, pvecOut); + IVP_SAVN_2X64W_XP(vecOut1L, vaOut, pvecOut, (varLen << 3)); + IVP_SAVN_2X64W_XP(vecOut1H, vaOut, pvecOut, ((varLen << 3) - (XCHAL_IVPN_SIMD_WIDTH << 2))); + IVP_SAPOSN_2X64W_FP(vaOut, pvecOut); + } + } + else if (x < dim1Size) + { + /* Initialize input and output data pointer */ + uint16_t * pIn = &pInput[z * inTilePitch2 + x]; + int64_t *pOut = &pOutput[z * outTilePitch2 + x]; + int32_t varLen = dim1Size - x; + + for (y = 0; y < dim2Size; y++) /* along 2nd dimension */ + { + pvecIn = (xb_vecNx16U *) (pIn + (y * inTilePitch1)); + pvecOut = (xb_vecN_2x64w *) (pOut + (y * outTilePitch1)); + + valign vaInData = IVP_LANX16U_PP(pvecIn); + /* load input data */ + IVP_LANX16U_IP(vecInData0, vaInData, pvecIn); + + xb_vecNx48 vecIntRes0 = IVP_MULUUNX16((xb_vecNx16U) scale, vecInData0); + + vecOut0L = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(vecIntRes0), IVP_CVT64SNX48LL(vecIntRes0)); + vecOut0H = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(vecIntRes0), IVP_CVT64SNX48HL(vecIntRes0)); + + IVP_MULUUAN_2X16X32_0(vecOut0L, (xb_vecNx16U) 1, (xb_vecN_2x32Uv) rndVal); //rounding + vecOutTempL = xb_vecN_2x32v_rtor_xb_vecN_2x32Uv(IVP_PACKVRNRN_2X64W(vecOut0L, shift)); + //sign extending to 64bit + vecOut0L = IVP_MULUUN_2X16X32_0((xb_vecNx16U) 1, vecOutTempL); + IVP_MULUUAN_2X16X32_0(vecOut0H, (xb_vecNx16U) 1, (xb_vecN_2x32Uv) rndVal); //rounding + vecOutTempH = 
xb_vecN_2x32v_rtor_xb_vecN_2x32Uv(IVP_PACKVRNRN_2X64W(vecOut0H, shift)); + //sign extending to 64bit + vecOut0H = IVP_MULUUN_2X16X32_0((xb_vecNx16U) 1, vecOutTempH); + + /* Store output data */ + IVP_SAVN_2X64W_XP(vecOut0L, vaOut, pvecOut, (varLen << 3)); + IVP_SAVN_2X64W_XP(vecOut0H, vaOut, pvecOut, ((varLen << 3) - (XCHAL_IVPN_SIMD_WIDTH << 2))); + IVP_SAPOSN_2X64W_FP(vaOut, pvecOut); + } + } + } + } + return(XAI_ERROR_STATUS()); +} + +/********************* xaiDataConversion3D ****************************************/ +/* Description : General API for DataConversion3D optimized implementation */ +/* Calls one of the DataConversion3D functions based */ +/* on the parameters */ +/* Inputs : Input Tile, scale, shift */ +/* Outputs : XI Error Code */ +/* InOuts : Output Tile */ +/*********************************************************************************/ +XAI_ERR_TYPE xaiDataConversion3D(const xai_pTile3D inTile, + xai_pTile3D outTile, + const uint16_t scale, + const uint8_t shift) +{ + if ((!inTile) || (!outTile)) + { + return(XAI_ERR_NULLARG); + } + if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_U16)) + { + if (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) + { + return(xaiDataConversion3D_U16S16(inTile, outTile, scale, shift)); + } + else if (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S32) || XAI_TILE3D_CHECK_TYPE(outTile, XAI_U32)) + { + return(xaiDataConversion3D_U16I32(inTile, outTile, scale, shift)); + } + else if (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S64) || XAI_TILE3D_CHECK_TYPE(outTile, XAI_U64)) + { + return(xaiDataConversion3D_U16I64(inTile, outTile, scale, shift)); + } + else + { + return(xaiDataConversion3D_U16I8(inTile, outTile, scale, shift)); + } + } + else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S16)) + { + if (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) + { + return(xaiDataConversion3D_S16(inTile, outTile, scale, shift)); + } + else if (XAI_TILE3D_CHECK_TYPE(outTile, XAI_U16)) + { + return(xaiDataConversion3D_S16U16(inTile, outTile, scale, shift)); + } + else if 
(XAI_TILE3D_CHECK_TYPE(outTile, XAI_S32) || XAI_TILE3D_CHECK_TYPE(outTile, XAI_U32)) + { + return(xaiDataConversion3D_S16I32(inTile, outTile, scale, shift)); + } + else if (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S64) || XAI_TILE3D_CHECK_TYPE(outTile, XAI_U64)) + { + return(xaiDataConversion3D_S16I64(inTile, outTile, scale, shift)); + } + else + { + return(xaiDataConversion3D_S16I8(inTile, outTile, scale, shift)); + } + } + else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S8)) + { + if (XAI_TILE3D_CHECK_TYPE(outTile, XAI_U8)) + { + return(xaiDataConversion3D_S8U8(inTile, outTile, scale, shift)); + } + else if (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) + { + return(xaiDataConversion3D_S8S16(inTile, outTile, scale, shift)); + } + else if ((XAI_TILE3D_CHECK_TYPE(outTile, XAI_S32)) || (XAI_TILE3D_CHECK_TYPE(outTile, XAI_U32))) + { + return(xaiDataConversion3D_S8I32(inTile, outTile, scale, shift)); + } + else if ((XAI_TILE3D_CHECK_TYPE(outTile, XAI_S64)) || (XAI_TILE3D_CHECK_TYPE(outTile, XAI_U64))) + { + return(xaiDataConversion3D_S8I64(inTile, outTile, scale, shift)); + } + } + else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_U8)) + { + if (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8)) + { + return(xaiDataConversion3D_U8S8(inTile, outTile, scale, shift)); + } + else if (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) + { + return(xaiDataConversion3D_U8S16(inTile, outTile, scale, shift)); + } + else if (XAI_TILE3D_CHECK_TYPE(outTile, XAI_U16)) + { + return(xaiDataConversion3D_U8U16(inTile, outTile, scale, shift)); + } + else if ((XAI_TILE3D_CHECK_TYPE(outTile, XAI_S32)) || (XAI_TILE3D_CHECK_TYPE(outTile, XAI_U32))) + { + return(xaiDataConversion3D_U8I32(inTile, outTile, scale, shift)); + } + else if ((XAI_TILE3D_CHECK_TYPE(outTile, XAI_S64)) || (XAI_TILE3D_CHECK_TYPE(outTile, XAI_U64))) + { + return(xaiDataConversion3D_U8I64(inTile, outTile, scale, shift)); + } + } + else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S32)) + { + if (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8)) + { + 
return(xaiDataConversion3D_S32S8(inTile, outTile, scale, shift)); + } + else if (XAI_TILE3D_CHECK_TYPE(outTile, XAI_U8)) + { + return(xaiDataConversion3D_S32U8(inTile, outTile, scale, shift)); + } + else if (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) + { + return(xaiDataConversion3D_S32S16(inTile, outTile, scale, shift)); + } + else if (XAI_TILE3D_CHECK_TYPE(outTile, XAI_U16)) + { + return(xaiDataConversion3D_S32U16(inTile, outTile, scale, shift)); + } + } + return(XAI_ERR_NO_VARIANT); +} + +/********************* xaiDataConversion3D_AsymQ_U8S8 ********************/ +/* Description : P6 implementation for conversion from U8_SYM to S8_ASYM */ +/* Inputs : Input Tile, zeroOut, scale, shift */ +/* Outputs : XI Error Code */ +/* InOuts : Output Tile */ +/* Assumptions : InData is unsigned 8bit */ +/*************************************************************************/ +XAI_ERR_TYPE xaiDataConversion3D_AsymQ_U8S8(const xai_pTile3D inTile, + xai_pTile3D outTile, + const int16_t zeroOut, + const uint16_t scale, + const uint8_t shift) +{ + /* Error Checks */ + XAI_ERROR_CHECKS() + { + XAI_CHECK_TILE3D_U8(inTile); + XAI_CHECK_TILE3D_S8(outTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile); + XAI_CHECK_TILE3D_SIZE_EQ(inTile, outTile); + XAI_CHECK_ERROR(shift < 24, XAI_ERR_NORM, \ + "Shift = %hhu, value should be less than 24", shift); + XAI_CHECK_ERROR((zeroOut >= -128) && (zeroOut < 128), XAI_ERR_NORM, \ + "\nzeroOut = %hi, value must be greater than or equal to -128 and less than 128", zeroOut); + XAI_CHECK_ERROR(XAI_TILE3D_GET_DATA_ORDER(inTile) == XAI_TILE3D_GET_DATA_ORDER(outTile), XAI_ERR_BADARG, \ + "\nData Order of InputTile = %d, OutputTile = %d\nData Order of InputTile and OutputTile should be same", \ + XAI_TILE3D_GET_DATA_ORDER(inTile), XAI_TILE3D_GET_DATA_ORDER(outTile)); + } + /* Get Tile Parameters */ + const int32_t dim1Size = XAI_TILE3D_GET_DIM1(inTile); + const int32_t dim2Size = XAI_TILE3D_GET_DIM2(inTile); + 
const int32_t dim3Size = XAI_TILE3D_GET_DIM3(inTile); + const int32_t inTilePitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile); + const int32_t inTilePitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile); + const int32_t outTilePitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile); + const int32_t outTilePitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile); + + valign vaOut = IVP_ZALIGN(); + + /* Get Data Pointers */ + uint8_t *pInput = (uint8_t *) XAI_TILE3D_GET_DATA_PTR(inTile); + int8_t *pOutput = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile); + + /* vectorization width */ + const int32_t vectorizationWidth = XCHAL_IVPN_SIMD_WIDTH; + const int32_t vectorizationWidth2X = vectorizationWidth * 2; + const int32_t vectorizationWidth3X = vectorizationWidth * 3; + const int32_t vectorizationWidth4X = vectorizationWidth * 4; + + /* loop variables */ + int32_t x, y, z; + + /* input and output pointers */ + xb_vecNx8U * restrict pvecIn; + xb_vecNx8 * restrict pvecOut; + + /* input and output data vectors */ + xb_vecNx16U vecInData0, vecInData1, vecInData2, vecInData3; + xb_vecNx16 vecOut0, vecOut1, vecOut2, vecOut3; + + int32_t zeroOutParam = zeroOut; + xb_vecNx48 zeroOutShift = IVP_CVT48SNX32((xb_vecN_2x32v) (zeroOutParam << shift), (xb_vecN_2x32v) (zeroOutParam << shift)); + xb_vecNx16U vecScale = (xb_vecNx16U) (scale); + + /********************************************************************************/ + /* The overall design approach is split into 2 parts */ + /* 1. When input tile pitch is equal to input tile width and input tile pitch */ + /* is equal to output tile pitch */ + /* - If above condition holds good, data elements for which data */ + /* conversion from U8 bit to S8 bit need to done is present in contiguous */ + /* memory location. Hence vectorization can be utilized effectively */ + /* */ + /* 2. 
When input tile pitch is not equal to input tile size or input tile */ + /* pitch is not equal to output tile pitch */ + /* - In this scenario, data elements for which data conversion from U8 bit */ + /* S8 bit need to done exist in non-contiguous memory location. */ + /* In order to do vectorization across first dimension, output data */ + /* pointers need to be updated based on output tile size and output tile */ + /* pitch. */ + /********************************************************************************/ + if ((inTilePitch1 == dim1Size) && (outTilePitch1 == dim1Size)) + { + /******************************************************************************/ + /* Data exist in contiguous memory location with respect to first dimension */ + /******************************************************************************/ + /*Input and output vectors*/ + xb_vecNx16U vecInData; + xb_vecNx16 vecOut; + + /* Initialize max loop counter */ + int32_t dim3MaxLoopCount = dim3Size; + int32_t maxLoopCount = dim1Size * dim2Size; + + /* Updated Loop count based on tile dimension configuration */ + if ((inTilePitch2 == maxLoopCount) && (outTilePitch2 == maxLoopCount)) + { + /**********************************************************************/ + /* Data exist in contiguous memory location with respect to first and */ + /* second dimension */ + /**********************************************************************/ + + /* Update max loop counter */ + dim3MaxLoopCount = 1; + maxLoopCount *= dim3Size; + } + for (z = 0; z < dim3MaxLoopCount; z++) + { + /* initialize input and output data pointer */ + pvecIn = (xb_vecNx8U *) (pInput + (z * inTilePitch2)); + pvecOut = (xb_vecNx8 *) (pOutput + (z * outTilePitch2)); + valign vaInData = IVP_LANX8U_PP(pvecIn); + int32_t varlen; + + for (x = 0; x < maxLoopCount - vectorizationWidth; x += vectorizationWidth) + { + /* Load input data */ + IVP_LANX8U_IP(vecInData, vaInData, pvecIn); + + /* add zeroOut, apply scale and shift to input 
data. + * To 48bit shifted zeroOut values, add scaled input which is 32 way 48-bit data, + * then shift is applied and data is truncated in the 8 bit range + * SCHAR_MIN to SCHAR_MAX. So the final result is 32-way, 8-bit. + */ + xb_vecNx48 acc = zeroOutShift; + IVP_MULUUANX16(acc, vecInData, vecScale); + vecOut = IVP_PACKVRNX48(acc, shift); + + /* store output data */ + IVP_SANX8S_IP(vecOut, vaOut, pvecOut); + } + varlen = (maxLoopCount - x); + IVP_LANX8U_IP(vecInData, vaInData, pvecIn); + + /* add zeroOut, apply scale and shift to input data. + * To 48bit shifted zeroOut values, add scaled input which is 32 way 48-bit data, + * then shift is applied and data is truncated in the 8 bit range + * SCHAR_MIN to SCHAR_MAX. So the final result is 32-way, 8-bit. + */ + xb_vecNx48 acc = zeroOutShift; + IVP_MULUUANX16(acc, vecInData, vecScale); + vecOut = IVP_PACKVRNX48(acc, shift); + + /* store output data */ + IVP_SAVNX8S_XP(vecOut, vaOut, pvecOut, varlen); + IVP_SAPOSNX8S_FP(vaOut, pvecOut); + } + } + else + { + /* else block is executed if input tile pitch is not equal to input tile width or input tile */ + /* pitch is not equal to output tile pitch */ + + for (z = 0; z < dim3Size; z++) /* along 3rd dimension */ + { + x = 0; + /* Loop Unroll=4 along 1st dimension */ + for (; x < (dim1Size - vectorizationWidth3X); x += vectorizationWidth4X) + { + /* Initialize input and output data pointer */ + uint8_t * pIn = &pInput[z * inTilePitch2 + x]; + int8_t *pOut = &pOutput[z * outTilePitch2 + x]; + int32_t varLen = dim1Size - (x + vectorizationWidth3X); + + for (y = 0; y < dim2Size; y++) /* along 2nd dimension */ + { + pvecIn = (xb_vecNx8U *) (pIn + (y * inTilePitch1)); + pvecOut = (xb_vecNx8 *) (pOut + (y * outTilePitch1)); + + valign vaInData = IVP_LANX8U_PP(pvecIn); + + /* load input data */ + IVP_LANX8U_IP(vecInData0, vaInData, pvecIn); + IVP_LANX8U_IP(vecInData1, vaInData, pvecIn); + IVP_LANX8U_IP(vecInData2, vaInData, pvecIn); + IVP_LANX8U_IP(vecInData3, vaInData, 
pvecIn); + + /* add zeroOut, apply scale and shift to input data. + * To 48bit shifted zeroOut values, add scaled input which is 32 way 48-bit data, + * then shift is applied and data is truncated in the 8 bit range + * SCHAR_MIN to SCHAR_MAX. So the final result is 32-way, 8-bit. + */ + xb_vecNx48 acc0, acc1, acc2, acc3; + acc0 = zeroOutShift; + acc1 = zeroOutShift; + acc2 = zeroOutShift; + acc3 = zeroOutShift; + + IVP_MULUUANX16(acc0, vecInData0, vecScale); + vecOut0 = IVP_PACKVRNX48(acc0, shift); + + IVP_MULUUANX16(acc1, vecInData1, vecScale); + vecOut1 = IVP_PACKVRNX48(acc1, shift); + + IVP_MULUUANX16(acc2, vecInData2, vecScale); + vecOut2 = IVP_PACKVRNX48(acc2, shift); + + IVP_MULUUANX16(acc3, vecInData3, vecScale); + vecOut3 = IVP_PACKVRNX48(acc3, shift); + + /* Store output data */ + IVP_SANX8S_IP(vecOut0, vaOut, pvecOut); + IVP_SANX8S_IP(vecOut1, vaOut, pvecOut); + IVP_SANX8S_IP(vecOut2, vaOut, pvecOut); + IVP_SAVNX8S_XP(vecOut3, vaOut, pvecOut, varLen); + IVP_SAPOSNX8S_FP(vaOut, pvecOut); + } + } + if (x < (dim1Size - vectorizationWidth2X)) + { + /* Initialize input and output data pointer */ + uint8_t * pIn = &pInput[z * inTilePitch2 + x]; + int8_t *pOut = &pOutput[z * outTilePitch2 + x]; + int32_t varLen = dim1Size - (x + vectorizationWidth2X); + + for (y = 0; y < dim2Size; y++) /* along 2nd dimension */ + { + pvecIn = (xb_vecNx8U *) (pIn + (y * inTilePitch1)); + pvecOut = (xb_vecNx8 *) (pOut + (y * outTilePitch1)); + + valign vaInData = IVP_LANX8U_PP(pvecIn); + + /* load input data */ + IVP_LANX8U_IP(vecInData0, vaInData, pvecIn); + IVP_LANX8U_IP(vecInData1, vaInData, pvecIn); + IVP_LANX8U_IP(vecInData2, vaInData, pvecIn); + + /* add zeroOut, apply scale and shift to input data. + * To 48bit shifted zeroOut values, add scaled input which is 32 way 48-bit data, + * then shift is applied and data is truncated in the 8 bit range + * SCHAR_MIN to SCHAR_MAX. So the final result is 32-way, 8-bit. 
+ */ + xb_vecNx48 acc0, acc1, acc2; + acc0 = zeroOutShift; + acc1 = zeroOutShift; + acc2 = zeroOutShift; + + IVP_MULUUANX16(acc0, vecInData0, vecScale); + vecOut0 = IVP_PACKVRNX48(acc0, shift); + + IVP_MULUUANX16(acc1, vecInData1, vecScale); + vecOut1 = IVP_PACKVRNX48(acc1, shift); + + IVP_MULUUANX16(acc2, vecInData2, vecScale); + vecOut2 = IVP_PACKVRNX48(acc2, shift); + + /* Store output data */ + IVP_SANX8S_IP(vecOut0, vaOut, pvecOut); + IVP_SANX8S_IP(vecOut1, vaOut, pvecOut); + IVP_SAVNX8S_XP(vecOut2, vaOut, pvecOut, varLen); + IVP_SAPOSNX8S_FP(vaOut, pvecOut); + } + } + else if (x < (dim1Size - vectorizationWidth)) + { + /* Initialize input and output data pointer */ + uint8_t * pIn = &pInput[z * inTilePitch2 + x]; + int8_t *pOut = &pOutput[z * outTilePitch2 + x]; + int32_t varLen = dim1Size - (x + vectorizationWidth); + + for (y = 0; y < dim2Size; y++) /* along 2nd dimension */ + { + pvecIn = (xb_vecNx8U *) (pIn + (y * inTilePitch1)); + pvecOut = (xb_vecNx8 *) (pOut + (y * outTilePitch1)); + + valign vaInData = IVP_LANX8U_PP(pvecIn); + + /* load input data */ + IVP_LANX8U_IP(vecInData0, vaInData, pvecIn); + IVP_LANX8U_IP(vecInData1, vaInData, pvecIn); + + /* add zeroOut, apply scale and shift to input data. + * To 48bit shifted zeroOut values, add scaled input which is 32 way 48-bit data, + * then shift is applied and data is truncated in the 8 bit range + * SCHAR_MIN to SCHAR_MAX. So the final result is 32-way, 8-bit. 
+ */ + + xb_vecNx48 acc0, acc1; + acc0 = zeroOutShift; + acc1 = zeroOutShift; + + IVP_MULUUANX16(acc0, vecInData0, vecScale); + vecOut0 = IVP_PACKVRNX48(acc0, shift); + + IVP_MULUUANX16(acc1, vecInData1, vecScale); + vecOut1 = IVP_PACKVRNX48(acc1, shift); + + /* Store output data */ + IVP_SANX8S_IP(vecOut0, vaOut, pvecOut); + IVP_SAVNX8S_XP(vecOut1, vaOut, pvecOut, varLen); + IVP_SAPOSNX8S_FP(vaOut, pvecOut); + } + } + else if (x < dim1Size) + { + /* Initialize input and output data pointer */ + uint8_t * pIn = &pInput[z * inTilePitch2 + x]; + int8_t *pOut = &pOutput[z * outTilePitch2 + x]; + int32_t varLen = dim1Size - x; + + for (y = 0; y < dim2Size; y++) /* along 2nd dimension */ + { + pvecIn = (xb_vecNx8U *) (pIn + (y * inTilePitch1)); + pvecOut = (xb_vecNx8 *) (pOut + (y * outTilePitch1)); + + valign vaInData = IVP_LANX8U_PP(pvecIn); + + /* load input data */ + IVP_LANX8U_IP(vecInData0, vaInData, pvecIn); + + /* add zeroOut, apply scale and shift to input data. + * To 48bit shifted zeroOut values, add scaled input which is 32 way 48-bit data, + * then shift is applied and data is truncated in the 8 bit range + * SCHAR_MIN to SCHAR_MAX. So the final result is 32-way, 8-bit. 
+ */ + xb_vecNx48 acc0; + acc0 = zeroOutShift; + + IVP_MULUUANX16(acc0, vecInData0, vecScale); + vecOut0 = IVP_PACKVRNX48(acc0, shift); + + /* Store output data */ + IVP_SAVNX8S_XP(vecOut0, vaOut, pvecOut, varLen); + IVP_SAPOSNX8S_FP(vaOut, pvecOut); + } + } + } + } + return(XAI_ERROR_STATUS()); +} + +/********************* xaiDataConversion3D_AsymQ_S16S8 ********************/ +/* Description : P6 implementation for conversion from S16_SYM to S8_ASYM */ +/* zeroOut, pre-shifted left by shift, seeds a 48-bit */ +/* accumulator to which scale * input is added; the sum is */ +/* packed with IVP_PACKVRNX48(acc, shift) and clamped to */ +/* [SCHAR_MIN, SCHAR_MAX] before being stored as 8-bit */ +/* Inputs : Input Tile, zeroOut, scale, shift */ +/* Outputs : XI Error Code */ +/* InOuts : Output Tile */ +/* Assumptions : InData is signed 16bit */ +/**************************************************************************/ +XAI_ERR_TYPE xaiDataConversion3D_AsymQ_S16S8(const xai_pTile3D inTile, + xai_pTile3D outTile, + const int16_t zeroOut, + const uint16_t scale, + const uint8_t shift) +{ + /* Error Checks */ + XAI_ERROR_CHECKS() + { + XAI_CHECK_TILE3D_S16(inTile); + XAI_CHECK_TILE3D_S8(outTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile); + XAI_CHECK_TILE3D_SIZE_EQ(inTile, outTile); + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTile); + XAI_CHECK_ERROR(shift < 32, XAI_ERR_NORM, \ + "Shift = %hhu, value should be less than 32", shift); + XAI_CHECK_ERROR((zeroOut >= -128) && (zeroOut < 128), XAI_ERR_NORM, \ + "\nzeroOut = %hi, value must be greater than or equal to -128 and less than 128", zeroOut); + XAI_CHECK_ERROR(XAI_TILE3D_GET_DATA_ORDER(inTile) == XAI_TILE3D_GET_DATA_ORDER(outTile), XAI_ERR_BADARG, \ + "\nData Order of InputTile = %d, OutputTile = %d\nData Order of InputTile and OutputTile should be same", \ + XAI_TILE3D_GET_DATA_ORDER(inTile), XAI_TILE3D_GET_DATA_ORDER(outTile)); + } + + /* Get Tile Parameters */ + const int32_t dim1Size = XAI_TILE3D_GET_DIM1(inTile); + const int32_t dim2Size = XAI_TILE3D_GET_DIM2(inTile); + const int32_t dim3Size = XAI_TILE3D_GET_DIM3(inTile); + const int32_t inTilePitch1 = 
XAI_TILE3D_GET_DIM1_PITCH(inTile); + const int32_t inTilePitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile); + const int32_t outTilePitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile); + const int32_t outTilePitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile); + + /* clamp limits applied to every packed result before the 8-bit store */ + const int16_t minLim = SCHAR_MIN; + const int16_t maxLim = SCHAR_MAX; + + /* Get Data Pointers */ + int16_t *pInput = (int16_t *) XAI_TILE3D_GET_DATA_PTR(inTile); + int8_t *pOutput = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile); + valign vaOut = IVP_ZALIGN(); + + /* vectorization width */ + const int32_t vectorizationWidth = XCHAL_IVPN_SIMD_WIDTH; + const int32_t vectorizationWidth2X = vectorizationWidth * 2; + const int32_t vectorizationWidth3X = vectorizationWidth * 3; + const int32_t vectorizationWidth4X = vectorizationWidth * 4; + + /* loop variables */ + int32_t x, y, z; + + /* input and output pointers */ + xb_vecNx16 * restrict pvecIn; + xb_vecNx8U * restrict pvecOut; + + /* Build the 48-bit accumulator seed zeroOutShift from the low and high */ + /* 32-bit halves of (zeroOut << shift). */ + /* NOTE(review): zeroOut may be negative (the error check allows >= -128); */ + /* left-shifting a negative value is undefined in ISO C - presumably this */ + /* relies on the target compiler's behavior; confirm. */ + int64_t zerOutShifted = (int64_t) zeroOut << shift; + xb_vecN_2x32v hvecZeroL = (xb_vecN_2x32v) ((int32_t) (zerOutShifted & 0xFFFFFFFF)); + xb_vecN_2x32v hvecZeroH = (xb_vecN_2x32v) ((int32_t) ((zerOutShifted >> 32) & 0xFFFFFFFF)); + xb_vec2Nx8 dvecZeroSh = IVP_MOV2NX8_FROMNX16(IVP_MOVNX16_FROMN_2X32 \ + (IVP_SELN_2X32I(hvecZeroH, hvecZeroL, IVP_SELI_32B_INTERLEAVE_1_LO))); + xb_vecNx48 zeroOutShift = IVP_CVT48UN_2X64L(dvecZeroSh, dvecZeroSh); + IVP_CVT48UN_2X64H(zeroOutShift, dvecZeroSh, dvecZeroSh); + + xb_vecNx16U vecScale = (xb_vecNx16U) (scale); + + /******************************************************************************/ + /* The overall design approach is split into 2 parts */ + /* 1. When input tile pitch is equal to input tile width and input tile pitch */ + /* is equal to output tile pitch */ + /* - If above condition holds good, data elements for which data */ + /* conversion from signed 16 bit to S8 needs to be done are present */ + /* in contiguous memory location. 
Hence vectorization can be utilized */ + /* effectively */ + /* */ + /* 2. When input tile pitch is not equal to input tile size or input tile */ + /* pitch is not equal to output tile pitch */ + /* - In this scenario, data elements for which data conversion from signed */ + /* 16 bit to S8 needs to be done exist in non-contiguous memory */ + /* location. In order to do vectorization across first dimension, */ + /* output data pointers need to be updated based on output tile size */ + /* and output tile pitch */ + /******************************************************************************/ + + if ((inTilePitch1 == dim1Size) && (outTilePitch1 == dim1Size)) + { + /******************************************************************************/ + /* Data exist in contiguous memory location with respect to first dimension */ + /******************************************************************************/ + + /* input data vectors */ + xb_vecNx16 vecInData; + /* Initialize max loop counter */ + int32_t dim3MaxLoopCount = dim3Size; + int32_t maxLoopCount = dim1Size * dim2Size; + + /* Updated Loop count based on tile dimension configuration */ + if ((inTilePitch2 == maxLoopCount) && (outTilePitch2 == maxLoopCount)) + { + /**********************************************************************/ + /* Data exist in contiguous memory location with respect to first and */ + /* second dimension */ + /**********************************************************************/ + + /* Update max loop counter */ + dim3MaxLoopCount = 1; + maxLoopCount *= dim3Size; + } + for (z = 0; z < dim3MaxLoopCount; z++) + { + /* initialize input and output data pointer */ + pvecIn = (xb_vecNx16 *) (pInput + (z * inTilePitch2)); + pvecOut = (xb_vecNx8U *) (pOutput + (z * outTilePitch2)); + + valign vaInData = IVP_LANX16_PP(pvecIn); + xb_vecNx16 vecOut; + + /* full vectors; the last (possibly partial) vector is handled after the loop */ + for (x = 0; x < maxLoopCount - vectorizationWidth; x += vectorizationWidth) + { + /* load input data */ + IVP_LANX16_IP(vecInData, 
vaInData, pvecIn); + + /* apply scale and shift to input data. + * multiplying with scale results in 32 way 48-bit + * data to which shift is applied, so final result is + * 32 way 16 bit, clamped to [minLim, maxLim] below. + */ + xb_vecNx48 acc = zeroOutShift; + IVP_MULUSANX16(acc, vecScale, vecInData); + vecOut = IVP_PACKVRNX48(acc, shift); + vecOut = IVP_MAXNX16(IVP_MINNX16(vecOut, (xb_vecNx16) maxLim), (xb_vecNx16) minLim); + + /* store output data */ + IVP_SANX8U_IP(vecOut, vaOut, pvecOut); + } + /* load input data */ + IVP_LANX16_IP(vecInData, vaInData, pvecIn); + + /* apply scale and shift to input data. + * multiplying with scale results in 32 way 48-bit + * data to which shift is applied, so final result is + * 32 way 16 bit, clamped to [minLim, maxLim] below. + */ + xb_vecNx48 acc = zeroOutShift; + IVP_MULUSANX16(acc, vecScale, vecInData); + vecOut = IVP_PACKVRNX48(acc, shift); + vecOut = IVP_MAXNX16(IVP_MINNX16(vecOut, (xb_vecNx16) maxLim), (xb_vecNx16) minLim); + + /* store output data; the variable-length store is given the remaining (maxLoopCount - x) count */ + IVP_SAVNX8U_XP(vecOut, vaOut, pvecOut, (maxLoopCount - x)); + IVP_SAPOSNX8U_FP(vaOut, pvecOut); + } + } + else + { + /* else block is executed if input tile pitch is not equal to input tile width or input tile */ + /* pitch is not equal to output tile pitch */ + + for (z = 0; z < dim3Size; z++) /* along 3rd dimension */ + { + x = 0; + /* Loop Unroll=4 along 1st dimension */ + for (; x < (dim1Size - vectorizationWidth3X); x += vectorizationWidth4X) + { + /* Initialize input and output data pointer */ + int16_t * pIn = &pInput[z * inTilePitch2 + x]; + int8_t *pOut = &pOutput[z * outTilePitch2 + x]; + int32_t varLen = dim1Size - (x + vectorizationWidth3X); + + for (y = 0; y < dim2Size; y++) /* along 2nd dimension */ + { + /* input and output data vectors */ + xb_vecNx16 vecInData0, vecInData1, vecInData2, vecInData3; + xb_vecNx16 vecOut0, vecOut1, vecOut2, vecOut3; + + pvecIn = (xb_vecNx16 *) (pIn + (y * inTilePitch1)); + pvecOut = (xb_vecNx8U *) (pOut + (y * outTilePitch1)); + valign vaInData = IVP_LANX16_PP(pvecIn); + /* load input data 
*/ + IVP_LANX16_IP(vecInData0, vaInData, pvecIn); + IVP_LANX16_IP(vecInData1, vaInData, pvecIn); + IVP_LANX16_IP(vecInData2, vaInData, pvecIn); + IVP_LANX16_IP(vecInData3, vaInData, pvecIn); + + /* apply scale and shift to input data. + * multiplying with scale results in 32 way 48-bit + * data to which shift is applied, so final result is + * 32 way 16 bit, clamped to [minLim, maxLim] below. + */ + xb_vecNx48 acc0, acc1, acc2, acc3; + acc0 = zeroOutShift; + acc1 = zeroOutShift; + acc2 = zeroOutShift; + acc3 = zeroOutShift; + + IVP_MULUSANX16(acc0, vecScale, vecInData0); + vecOut0 = IVP_PACKVRNX48(acc0, shift); + vecOut0 = IVP_MAXNX16(IVP_MINNX16(vecOut0, (xb_vecNx16) maxLim), (xb_vecNx16) minLim); + + IVP_MULUSANX16(acc1, vecScale, vecInData1); + vecOut1 = IVP_PACKVRNX48(acc1, shift); + vecOut1 = IVP_MAXNX16(IVP_MINNX16(vecOut1, (xb_vecNx16) maxLim), (xb_vecNx16) minLim); + + IVP_MULUSANX16(acc2, vecScale, vecInData2); + vecOut2 = IVP_PACKVRNX48(acc2, shift); + vecOut2 = IVP_MAXNX16(IVP_MINNX16(vecOut2, (xb_vecNx16) maxLim), (xb_vecNx16) minLim); + + IVP_MULUSANX16(acc3, vecScale, vecInData3); + vecOut3 = IVP_PACKVRNX48(acc3, shift); + vecOut3 = IVP_MAXNX16(IVP_MINNX16(vecOut3, (xb_vecNx16) maxLim), (xb_vecNx16) minLim); + + /* Store output data */ + IVP_SANX8U_IP(vecOut0, vaOut, pvecOut); + IVP_SANX8U_IP(vecOut1, vaOut, pvecOut); + IVP_SANX8U_IP(vecOut2, vaOut, pvecOut); + IVP_SAVNX8U_XP(vecOut3, vaOut, pvecOut, varLen); + IVP_SAPOSNX8U_FP(vaOut, pvecOut); + } + } + /* remainder along 1st dimension: 3, 2 or 1 vectors per row, selected by how much is left after x */ + if (x < (dim1Size - vectorizationWidth2X)) + { + /* Initialize input and output data pointer */ + int16_t * pIn = &pInput[z * inTilePitch2 + x]; + int8_t *pOut = &pOutput[z * outTilePitch2 + x]; + int32_t varLen = dim1Size - (x + vectorizationWidth2X); + + for (y = 0; y < dim2Size; y++) /* along 2nd dimension */ + { + /* input and output data vectors */ + xb_vecNx16 vecInData0, vecInData1, vecInData2; + xb_vecNx16 vecOut0, vecOut1, vecOut2; + + pvecIn = (xb_vecNx16 *) (pIn + (y * inTilePitch1)); + pvecOut = (xb_vecNx8U *) (pOut + (y * 
outTilePitch1)); + valign vaInData = IVP_LANX16_PP(pvecIn); + + /* load input data */ + IVP_LANX16_IP(vecInData0, vaInData, pvecIn); + IVP_LANX16_IP(vecInData1, vaInData, pvecIn); + IVP_LANX16_IP(vecInData2, vaInData, pvecIn); + + /* apply scale and shift to input data. + * multiplying with scale results in 32 way 48-bit + * data to which shift is applied, so final result is + * 32 way 16 bit, clamped to [minLim, maxLim] below. + */ + xb_vecNx48 acc0, acc1, acc2; + acc0 = zeroOutShift; + acc1 = zeroOutShift; + acc2 = zeroOutShift; + + IVP_MULUSANX16(acc0, vecScale, vecInData0); + vecOut0 = IVP_PACKVRNX48(acc0, shift); + vecOut0 = IVP_MAXNX16(IVP_MINNX16(vecOut0, (xb_vecNx16) maxLim), (xb_vecNx16) minLim); + + IVP_MULUSANX16(acc1, vecScale, vecInData1); + vecOut1 = IVP_PACKVRNX48(acc1, shift); + vecOut1 = IVP_MAXNX16(IVP_MINNX16(vecOut1, (xb_vecNx16) maxLim), (xb_vecNx16) minLim); + + IVP_MULUSANX16(acc2, vecScale, vecInData2); + vecOut2 = IVP_PACKVRNX48(acc2, shift); + vecOut2 = IVP_MAXNX16(IVP_MINNX16(vecOut2, (xb_vecNx16) maxLim), (xb_vecNx16) minLim); + + /* Store output data */ + IVP_SANX8U_IP(vecOut0, vaOut, pvecOut); + IVP_SANX8U_IP(vecOut1, vaOut, pvecOut); + IVP_SAVNX8U_XP(vecOut2, vaOut, pvecOut, varLen); + IVP_SAPOSNX8U_FP(vaOut, pvecOut); + } + } + else if (x < (dim1Size - vectorizationWidth)) + { + /* Initialize input and output data pointer */ + int16_t * pIn = &pInput[z * inTilePitch2 + x]; + int8_t *pOut = &pOutput[z * outTilePitch2 + x]; + int32_t varLen = dim1Size - (x + vectorizationWidth); + + for (y = 0; y < dim2Size; y++) /* along 2nd dimension */ + { + /* input and output data vectors */ + xb_vecNx16 vecInData0, vecInData1; + xb_vecNx16 vecOut0, vecOut1; + + pvecIn = (xb_vecNx16 *) (pIn + (y * inTilePitch1)); + pvecOut = (xb_vecNx8U *) (pOut + (y * outTilePitch1)); + valign vaInData = IVP_LANX16_PP(pvecIn); + + /* load input data */ + IVP_LANX16_IP(vecInData0, vaInData, pvecIn); + IVP_LANX16_IP(vecInData1, vaInData, pvecIn); + + /* apply scale and shift to input data. 
+ * multiplying with scale results in 32 way 48-bit + * data to which shift is applied, so final result is + * 32 way 16 bit, clamped to [minLim, maxLim] below. + */ + xb_vecNx48 acc0, acc1; + acc0 = zeroOutShift; + acc1 = zeroOutShift; + + IVP_MULUSANX16(acc0, vecScale, vecInData0); + vecOut0 = IVP_PACKVRNX48(acc0, shift); + vecOut0 = IVP_MAXNX16(IVP_MINNX16(vecOut0, (xb_vecNx16) maxLim), (xb_vecNx16) minLim); + + IVP_MULUSANX16(acc1, vecScale, vecInData1); + vecOut1 = IVP_PACKVRNX48(acc1, shift); + vecOut1 = IVP_MAXNX16(IVP_MINNX16(vecOut1, (xb_vecNx16) maxLim), (xb_vecNx16) minLim); + + /* Store output data */ + IVP_SANX8U_IP(vecOut0, vaOut, pvecOut); + IVP_SAVNX8U_XP(vecOut1, vaOut, pvecOut, varLen); + IVP_SAPOSNX8U_FP(vaOut, pvecOut); + } + } + else if (x < dim1Size) + { + /* Initialize input and output data pointer */ + int16_t * pIn = &pInput[z * inTilePitch2 + x]; + int8_t *pOut = &pOutput[z * outTilePitch2 + x]; + int32_t varLen = dim1Size - x; + + for (y = 0; y < dim2Size; y++) /* along 2nd dimension */ + { + /* input and output data vectors */ + xb_vecNx16 vecInData0; + xb_vecNx16 vecOut0; + + pvecIn = (xb_vecNx16 *) (pIn + (y * inTilePitch1)); + pvecOut = (xb_vecNx8U *) (pOut + (y * outTilePitch1)); + valign vaInData = IVP_LANX16_PP(pvecIn); + + /* load input data */ + IVP_LANX16_IP(vecInData0, vaInData, pvecIn); + + /* apply scale and shift to input data. + * multiplying with scale results in 32 way 48-bit + * data to which shift is applied, so final result is + * 32 way 16 bit, clamped to [minLim, maxLim] below. 
+ */ + xb_vecNx48 acc0 = zeroOutShift; + + IVP_MULUSANX16(acc0, vecScale, vecInData0); + vecOut0 = IVP_PACKVRNX48(acc0, shift); + vecOut0 = IVP_MAXNX16(IVP_MINNX16(vecOut0, (xb_vecNx16) maxLim), (xb_vecNx16) minLim); + + /* Store output data */ + IVP_SAVNX8U_XP(vecOut0, vaOut, pvecOut, varLen); + IVP_SAPOSNX8U_FP(vaOut, pvecOut); + } + } + } + } + return(XAI_ERROR_STATUS()); +} + +/********************** xaiDataConversion3D_AsymQ_U16S8 *******************/ +/* Description : P6 implementation for conversion from U16_SYM to S8_ASYM */ +/* depending on Output Tile type */ +/* Inputs : Input Tile, zeroOut, scale, shift */ +/* Outputs : XI Error Code */ +/* InOuts : Output Tile */ +/* Assumptions : InData is unsigned 16bit */ +/**************************************************************************/ +XAI_ERR_TYPE xaiDataConversion3D_AsymQ_U16S8(const xai_pTile3D inTile, + xai_pTile3D outTile, + const int16_t zeroOut, + const uint16_t scale, + const uint8_t shift) +{ + /* Error Checks */ + XAI_ERROR_CHECKS() + { + XAI_CHECK_TILE3D_U16(inTile); + XAI_CHECK_TILE3D_S8(outTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile); + XAI_CHECK_TILE3D_SIZE_EQ(inTile, outTile); + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTile); + XAI_CHECK_ERROR(shift < 32, XAI_ERR_NORM, \ + "Shift = %hhu, value should be less than 32", shift); + XAI_CHECK_ERROR((zeroOut >= -128) && (zeroOut < 128), XAI_ERR_NORM, \ + "\nzeroOut = %hi, value must be greater than or equal to -128 and less than 128", zeroOut); + XAI_CHECK_ERROR(XAI_TILE3D_GET_DATA_ORDER(inTile) == XAI_TILE3D_GET_DATA_ORDER(outTile), XAI_ERR_BADARG, \ + "\nData Order of InputTile = %d, OutputTile = %d\nData Order of InputTile and OutputTile should be same", \ + XAI_TILE3D_GET_DATA_ORDER(inTile), XAI_TILE3D_GET_DATA_ORDER(outTile)); + } + + /* Get Tile Parameters */ + const int32_t dim1Size = XAI_TILE3D_GET_DIM1(inTile); + const int32_t dim2Size = XAI_TILE3D_GET_DIM2(inTile); + const 
int32_t dim3Size = XAI_TILE3D_GET_DIM3(inTile); + const int32_t inTilePitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile); + const int32_t inTilePitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile); + const int32_t outTilePitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile); + const int32_t outTilePitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile); + + const int16_t minLim = SCHAR_MIN; + const int16_t maxLim = SCHAR_MAX; + + /* Get Data Pointers */ + uint16_t *pInput = (uint16_t *) XAI_TILE3D_GET_DATA_PTR(inTile); + int8_t *pOutput = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile); + valign vaOut = IVP_ZALIGN(); + + /* vectorization width */ + const int32_t vectorizationWidth = XCHAL_IVPN_SIMD_WIDTH; + const int32_t vectorizationWidth2X = vectorizationWidth * 2; + const int32_t vectorizationWidth3X = vectorizationWidth * 3; + const int32_t vectorizationWidth4X = vectorizationWidth * 4; + + /* loop variables */ + int32_t x, y, z; + + /* input and output pointers */ + xb_vecNx16U * restrict pvecIn; + xb_vecNx8U * restrict pvecOut; + + int64_t zerOutShifted = (int64_t) zeroOut << shift; + xb_vecN_2x32v hvecZeroL = (xb_vecN_2x32v) ((int32_t) (zerOutShifted & 0xFFFFFFFF)); + xb_vecN_2x32v hvecZeroH = (xb_vecN_2x32v) ((int32_t) ((zerOutShifted >> 32) & 0xFFFFFFFF)); + xb_vec2Nx8 dvecZeroSh = IVP_MOV2NX8_FROMNX16(IVP_MOVNX16_FROMN_2X32 \ + (IVP_SELN_2X32I(hvecZeroH, hvecZeroL, IVP_SELI_32B_INTERLEAVE_1_LO))); + xb_vecNx48 zeroOutShift = IVP_CVT48UN_2X64L(dvecZeroSh, dvecZeroSh); + IVP_CVT48UN_2X64H(zeroOutShift, dvecZeroSh, dvecZeroSh); + + xb_vecNx16U vecScale = (xb_vecNx16U) (scale); + + /******************************************************************************/ + /* The overall design approach is split into 2 parts */ + /* 1. 
When input tile pitch is equal to input tile width and input tile pitch */ + /* is equal to output tile pitch */ + /* - If above condition holds good, data elements for which data */ + /* conversion from unsigned 16 bit to S8 bit needs to be done are present */ + /* in contiguous memory location. Hence vectorization can be utilized */ + /* effectively */ + /* */ + /* 2. When input tile pitch is not equal to input tile size or input tile */ + /* pitch is not equal to output tile pitch */ + /* - In this scenario, data elements for which data conversion from unsigned */ + /* 16 bit to S8 bit needs to be done exist in non-contiguous memory */ + /* location. In order to do vectorization across first dimension, */ + /* output data pointers need to be updated based on output tile size */ + /* and output tile pitch */ + /******************************************************************************/ + + if ((inTilePitch1 == dim1Size) && (outTilePitch1 == dim1Size)) + { + /******************************************************************************/ + /* Data exist in contiguous memory location with respect to first dimension */ + /******************************************************************************/ + + /* input data vectors */ + xb_vecNx16U vecInData; + /* Initialize max loop counter */ + int32_t dim3MaxLoopCount = dim3Size; + int32_t maxLoopCount = dim1Size * dim2Size; + + /* Updated Loop count based on tile dimension configuration */ + if ((inTilePitch2 == maxLoopCount) && (outTilePitch2 == maxLoopCount)) + { + /**********************************************************************/ + /* Data exist in contiguous memory location with respect to first and */ + /* second dimension */ + /**********************************************************************/ + + /* Update max loop counter */ + dim3MaxLoopCount = 1; + maxLoopCount *= dim3Size; + } + for (z = 0; z < dim3MaxLoopCount; z++) + { + /* initialize input and output data pointer */ + pvecIn = (xb_vecNx16U *) 
(pInput + (z * inTilePitch2)); + pvecOut = (xb_vecNx8U *) (pOutput + (z * outTilePitch2)); + + valign vaInData = IVP_LANX16U_PP(pvecIn); + xb_vecNx16 vecOut; + + for (x = 0; x < maxLoopCount - vectorizationWidth; x += vectorizationWidth) + { + /* load input data */ + IVP_LANX16U_IP(vecInData, vaInData, pvecIn); + + /* apply scale and shift to input data. + * multiplying with scale results in 32 way 48-bit + * data to which shift is applied, so final result is + * 32 way 16 bit. + */ + xb_vecNx48 acc = zeroOutShift; + IVP_MULUUANX16(acc, vecScale, vecInData); + vecOut = IVP_PACKVRNX48(acc, shift); + vecOut = IVP_MAXNX16(IVP_MINNX16(vecOut, (xb_vecNx16) maxLim), (xb_vecNx16) minLim); + + /* store output data */ + IVP_SANX8U_IP(vecOut, vaOut, pvecOut); + } + /* load input data */ + IVP_LANX16U_IP(vecInData, vaInData, pvecIn); + + /* apply scale and shift to input data. + * multiplying with scale results in 32 way 48-bit + * data to which shift is applied, so final result is + * 32 way 16 bit. 
+ */ + xb_vecNx48 acc = zeroOutShift; + IVP_MULUUANX16(acc, vecScale, vecInData); + vecOut = IVP_PACKVRNX48(acc, shift); + vecOut = IVP_MAXNX16(IVP_MINNX16(vecOut, (xb_vecNx16) maxLim), (xb_vecNx16) minLim); + + /* store output data */ + IVP_SAVNX8U_XP(vecOut, vaOut, pvecOut, (maxLoopCount - x)); + IVP_SAPOSNX8U_FP(vaOut, pvecOut); + } + } + else + { + /* else block is executed if input tile pitch is not equal to input tile width or input tile */ + /* pitch is not equal to output tile pitch */ + + for (z = 0; z < dim3Size; z++) /* along 3rd dimension */ + { + x = 0; + /* Loop Unroll=4 along 1st dimension */ + for (; x < (dim1Size - vectorizationWidth3X); x += vectorizationWidth4X) + { + /* Initialize input and output data pointer */ + uint16_t * pIn = &pInput[z * inTilePitch2 + x]; + int8_t *pOut = &pOutput[z * outTilePitch2 + x]; + int32_t varLen = dim1Size - (x + vectorizationWidth3X); + + for (y = 0; y < dim2Size; y++) /* along 2nd dimension */ + { + /* input and output data vectors */ + xb_vecNx16U vecInData0, vecInData1, vecInData2, vecInData3; + xb_vecNx16 vecOut0, vecOut1, vecOut2, vecOut3; + + pvecIn = (xb_vecNx16U *) (pIn + (y * inTilePitch1)); + pvecOut = (xb_vecNx8U *) (pOut + (y * outTilePitch1)); + valign vaInData = IVP_LANX16U_PP(pvecIn); + /* load input data */ + IVP_LANX16U_IP(vecInData0, vaInData, pvecIn); + IVP_LANX16U_IP(vecInData1, vaInData, pvecIn); + IVP_LANX16U_IP(vecInData2, vaInData, pvecIn); + IVP_LANX16U_IP(vecInData3, vaInData, pvecIn); + + /* apply scale and shift to input data. + * multiplying with scale results in 32 way 48-bit + * data to which shift is applied, so final result is + * 32 way 16 bit. 
+ */ + xb_vecNx48 acc0, acc1, acc2, acc3; + acc0 = zeroOutShift; + acc1 = zeroOutShift; + acc2 = zeroOutShift; + acc3 = zeroOutShift; + + IVP_MULUUANX16(acc0, vecScale, vecInData0); + vecOut0 = IVP_PACKVRNX48(acc0, shift); + vecOut0 = IVP_MAXNX16(IVP_MINNX16(vecOut0, (xb_vecNx16) maxLim), (xb_vecNx16) minLim); + + IVP_MULUUANX16(acc1, vecScale, vecInData1); + vecOut1 = IVP_PACKVRNX48(acc1, shift); + vecOut1 = IVP_MAXNX16(IVP_MINNX16(vecOut1, (xb_vecNx16) maxLim), (xb_vecNx16) minLim); + + IVP_MULUUANX16(acc2, vecScale, vecInData2); + vecOut2 = IVP_PACKVRNX48(acc2, shift); + vecOut2 = IVP_MAXNX16(IVP_MINNX16(vecOut2, (xb_vecNx16) maxLim), (xb_vecNx16) minLim); + + IVP_MULUUANX16(acc3, vecScale, vecInData3); + vecOut3 = IVP_PACKVRNX48(acc3, shift); + vecOut3 = IVP_MAXNX16(IVP_MINNX16(vecOut3, (xb_vecNx16) maxLim), (xb_vecNx16) minLim); + + /* Store output data */ + IVP_SANX8U_IP(vecOut0, vaOut, pvecOut); + IVP_SANX8U_IP(vecOut1, vaOut, pvecOut); + IVP_SANX8U_IP(vecOut2, vaOut, pvecOut); + IVP_SAVNX8U_XP(vecOut3, vaOut, pvecOut, varLen); + IVP_SAPOSNX8U_FP(vaOut, pvecOut); + } + } + if (x < (dim1Size - vectorizationWidth2X)) + { + /* Initialize input and output data pointer */ + uint16_t * pIn = &pInput[z * inTilePitch2 + x]; + int8_t *pOut = &pOutput[z * outTilePitch2 + x]; + int32_t varLen = dim1Size - (x + vectorizationWidth2X); + + for (y = 0; y < dim2Size; y++) /* along 2nd dimension */ + { + /* input and output data vectors */ + xb_vecNx16U vecInData0, vecInData1, vecInData2; + xb_vecNx16 vecOut0, vecOut1, vecOut2; + + pvecIn = (xb_vecNx16U *) (pIn + (y * inTilePitch1)); + pvecOut = (xb_vecNx8U *) (pOut + (y * outTilePitch1)); + valign vaInData = IVP_LANX16U_PP(pvecIn); + + /* load input data */ + IVP_LANX16U_IP(vecInData0, vaInData, pvecIn); + IVP_LANX16U_IP(vecInData1, vaInData, pvecIn); + IVP_LANX16U_IP(vecInData2, vaInData, pvecIn); + + /* apply scale and shift to input data. 
+ * multiplying with scale results in 32 way 48-bit + * data to which shift is applied, so final result is + * 32 way 16 bit. + */ + xb_vecNx48 acc0, acc1, acc2; + acc0 = zeroOutShift; + acc1 = zeroOutShift; + acc2 = zeroOutShift; + + IVP_MULUUANX16(acc0, vecScale, vecInData0); + vecOut0 = IVP_PACKVRNX48(acc0, shift); + vecOut0 = IVP_MAXNX16(IVP_MINNX16(vecOut0, (xb_vecNx16) maxLim), (xb_vecNx16) minLim); + + IVP_MULUUANX16(acc1, vecScale, vecInData1); + vecOut1 = IVP_PACKVRNX48(acc1, shift); + vecOut1 = IVP_MAXNX16(IVP_MINNX16(vecOut1, (xb_vecNx16) maxLim), (xb_vecNx16) minLim); + + IVP_MULUUANX16(acc2, vecScale, vecInData2); + vecOut2 = IVP_PACKVRNX48(acc2, shift); + vecOut2 = IVP_MAXNX16(IVP_MINNX16(vecOut2, (xb_vecNx16) maxLim), (xb_vecNx16) minLim); + + /* Store output data */ + IVP_SANX8U_IP(vecOut0, vaOut, pvecOut); + IVP_SANX8U_IP(vecOut1, vaOut, pvecOut); + IVP_SAVNX8U_XP(vecOut2, vaOut, pvecOut, varLen); + IVP_SAPOSNX8U_FP(vaOut, pvecOut); + } + } + else if (x < (dim1Size - vectorizationWidth)) + { + /* Remaining row width is in (vectorizationWidth, 2 * vectorizationWidth]; initialize input and output data pointer */ + uint16_t * pIn = &pInput[z * inTilePitch2 + x]; + int8_t *pOut = &pOutput[z * outTilePitch2 + x]; + int32_t varLen = dim1Size - (x + vectorizationWidth); + + for (y = 0; y < dim2Size; y++) /* along 2nd dimension */ + { + /* input and output data vectors */ + xb_vecNx16U vecInData0, vecInData1; + xb_vecNx16 vecOut0, vecOut1; + + pvecIn = (xb_vecNx16U *) (pIn + (y * inTilePitch1)); + pvecOut = (xb_vecNx8U *) (pOut + (y * outTilePitch1)); + valign vaInData = IVP_LANX16U_PP(pvecIn); + + /* load input data: both vectors use the unsigned 16-bit load, matching the xb_vecNx16U operands and pvecIn */ + IVP_LANX16U_IP(vecInData0, vaInData, pvecIn); + IVP_LANX16U_IP(vecInData1, vaInData, pvecIn); + + /* apply scale and shift to input data. + * multiplying with scale results in 32 way 48-bit + * data to which shift is applied, so final result is + * 32 way 16 bit. 
+ */ + xb_vecNx48 acc0, acc1; + acc0 = zeroOutShift; + acc1 = zeroOutShift; + + IVP_MULUUANX16(acc0, vecScale, vecInData0); + vecOut0 = IVP_PACKVRNX48(acc0, shift); + vecOut0 = IVP_MAXNX16(IVP_MINNX16(vecOut0, (xb_vecNx16) maxLim), (xb_vecNx16) minLim); + + IVP_MULUUANX16(acc1, vecScale, vecInData1); + vecOut1 = IVP_PACKVRNX48(acc1, shift); + vecOut1 = IVP_MAXNX16(IVP_MINNX16(vecOut1, (xb_vecNx16) maxLim), (xb_vecNx16) minLim); + + /* Store output data */ + IVP_SANX8U_IP(vecOut0, vaOut, pvecOut); + IVP_SAVNX8U_XP(vecOut1, vaOut, pvecOut, varLen); + IVP_SAPOSNX8U_FP(vaOut, pvecOut); + } + } + else if (x < dim1Size) + { + /* Initialize input and output data pointer */ + uint16_t * pIn = &pInput[z * inTilePitch2 + x]; + int8_t *pOut = &pOutput[z * outTilePitch2 + x]; + int32_t varLen = dim1Size - x; + + for (y = 0; y < dim2Size; y++) /* along 2nd dimension */ + { + /* input and output data vectors */ + xb_vecNx16U vecInData0; + xb_vecNx16 vecOut0; + + pvecIn = (xb_vecNx16U *) (pIn + (y * inTilePitch1)); + pvecOut = (xb_vecNx8U *) (pOut + (y * outTilePitch1)); + valign vaInData = IVP_LANX16U_PP(pvecIn); + + /* load input data */ + IVP_LANX16U_IP(vecInData0, vaInData, pvecIn); + + /* apply scale and shift to input data. + * multiplying with scale results in 32 way 48-bit + * data to which shift is applied, so final result is + * 32 way 16 bit. 
+ */ + xb_vecNx48 acc0 = zeroOutShift; + + IVP_MULUUANX16(acc0, vecScale, vecInData0); + vecOut0 = IVP_PACKVRNX48(acc0, shift); + vecOut0 = IVP_MAXNX16(IVP_MINNX16(vecOut0, (xb_vecNx16) maxLim), (xb_vecNx16) minLim); + + /* Store output data */ + IVP_SAVNX8U_XP(vecOut0, vaOut, pvecOut, varLen); + IVP_SAPOSNX8U_FP(vaOut, pvecOut); + } + } + } + } + return(XAI_ERROR_STATUS()); +} + +// Temporary wrapper, to be removed later; forwards all five arguments unchanged to xaiDataConversion3D_AsymQ_U16S8 and returns its status +XAI_ERR_TYPE xaiDataConversion3D_U16AS8(const xai_pTile3D inTile, + xai_pTile3D outTile, + const int16_t zeroOut, + const uint16_t scale, + const uint8_t shift) +{ + return(xaiDataConversion3D_AsymQ_U16S8(inTile, outTile, zeroOut, scale, shift)); +} + +/********************* xaiDataConversion3D_AsymQ_S32S8 ********************/ +/* Description : P6 implementation for conversion from S32_SYM to S8_ASYM */ +/* depending on Output Tile type */ +/* Inputs : Input Tile, zeroOut, scale, shift */ +/* Outputs : XI Error Code */ +/* InOuts : Output Tile */ +/* Assumptions : InData is signed 32bit */ +/**************************************************************************/ +XAI_ERR_TYPE xaiDataConversion3D_AsymQ_S32S8(const xai_pTile3D inTile, + xai_pTile3D outTile, + const int16_t zeroOut, + const uint16_t scale, + const uint8_t shift) +{ + /* Error Checks */ + XAI_ERROR_CHECKS() + { + XAI_CHECK_TILE3D_S32(inTile); + XAI_CHECK_TILE3D_S8(outTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile); + XAI_CHECK_TILE3D_SIZE_EQ(inTile, outTile); + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTile); + XAI_CHECK_ERROR(shift < 32, XAI_ERR_NORM, \ + "Shift = %hhu, value should be less than 32", shift); + XAI_CHECK_ERROR((zeroOut >= -128) && (zeroOut < 128), XAI_ERR_NORM, \ + "\nzeroOut = %hi, value must be greater than or equal to -128 and less than 128", zeroOut); + XAI_CHECK_ERROR(XAI_TILE3D_GET_DATA_ORDER(inTile) == XAI_TILE3D_GET_DATA_ORDER(outTile), XAI_ERR_BADARG, \ + "\nData Order of InputTile = %d, OutputTile = 
%d\nData Order of InputTile and OutputTile should be same", \ + XAI_TILE3D_GET_DATA_ORDER(inTile), XAI_TILE3D_GET_DATA_ORDER(outTile)); + } + + /* Get Tile Parameters */ + const int32_t dim1Size = XAI_TILE3D_GET_DIM1(inTile); + const int32_t dim2Size = XAI_TILE3D_GET_DIM2(inTile); + const int32_t dim3Size = XAI_TILE3D_GET_DIM3(inTile); + const int32_t inTilePitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile); + const int32_t inTilePitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile); + const int32_t outTilePitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile); + const int32_t outTilePitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile); + + const int16_t minLim = SCHAR_MIN; + const int16_t maxLim = SCHAR_MAX; + + /* Get Data Pointers */ + int32_t *pInput = (int32_t *) XAI_TILE3D_GET_DATA_PTR(inTile); + int8_t *pOutput = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile); + + valign vaOut = IVP_ZALIGN(); + + /* vectorization width */ + const int32_t vectorizationWidth = XCHAL_IVPN_SIMD_WIDTH / 2; + const int32_t vectorizationWidth2X = vectorizationWidth * 2; + + /* loop variables */ + int32_t x, y, z; + + /* input and output pointers */ + xb_vecN_2x32v * restrict pvecIn; + xb_vecNx8 * restrict pvecOut; + + xb_vecN_2x64w vec0scaledIn64B, vec1scaledIn64B; + + /* SCALE*/ + xb_vecNx16U vecScale = (xb_vecNx16U) (scale); + + /******************************************************************************/ + /* The overall design approach is split into 2 parts */ + /* 1. When input tile pitch is equal to input tile width and input tile pitch */ + /* is equal to output tile pitch */ + /* - If above condition holds good, data elements for which data */ + /* conversion from signed 32 bit to S8/U8 bit need to done present in */ + /* in contiguous memory location. Hence vectorization can be utilized */ + /* effectively */ + /* */ + /* 2. 
When input tile pitch is not equal to input tile size or input tile */ + /* pitch is not equal to output tile pitch */ + /* - In this scenario, data elements for which data conversion from signed */ + /* 32 bit to S8/U8 bit need to done exist in non-contiguous memory */ + /* location. In order to do vectorization across first dimension, */ + /* output data pointers need to be updated based on output tile size */ + /* and output tile pitch */ + /******************************************************************************/ + + if ((inTilePitch1 == dim1Size) && (outTilePitch1 == dim1Size)) + { + /******************************************************************************/ + /* Data exist in contiguous memory location with respect to first dimension */ + /******************************************************************************/ + + /* input data vectors */ + xb_vecN_2x32v vecInData0, vecInData1; + + /* Initialize max loop counter */ + int32_t dim3MaxLoopCount = dim3Size; + int32_t maxLoopCount = dim1Size * dim2Size; + + /* Updated Loop count based on tile dimension configuration */ + if ((inTilePitch2 == maxLoopCount) && (outTilePitch2 == maxLoopCount)) + { + /**********************************************************************/ + /* Data exist in contiguous memory location with respect to first and */ + /* second dimension */ + /**********************************************************************/ + + /* Update max loop counter */ + dim3MaxLoopCount = 1; + maxLoopCount *= dim3Size; + } + for (z = 0; z < dim3MaxLoopCount; z++) + { + /* initialize input and output data pointer */ + pvecIn = (xb_vecN_2x32v *) (pInput + (z * inTilePitch2)); + pvecOut = (xb_vecNx8 *) (pOutput + (z * outTilePitch2)); + + valign vaInData = IVP_LAN_2X32_PP(pvecIn); + xb_vecNx16 vecOut, vecOut0, vecOut1; + x = 0; + for (; x < maxLoopCount - vectorizationWidth2X; x += vectorizationWidth2X) + { + /* Load input data */ + IVP_LAN_2X32_IP(vecInData0, vaInData, pvecIn); + 
IVP_LAN_2X32_IP(vecInData1, vaInData, pvecIn); + + /* Initialize the 64-bit wide vector with (zeroOut << shift)*/ + vec0scaledIn64B = IVP_MULSUN_2X16X32_0(zeroOut, (1 << shift)); + vec1scaledIn64B = IVP_MULSUN_2X16X32_0(zeroOut, (1 << shift)); + + /* Multiply U16 scale with S32 input and ACCUMULATE in 64-bit wide vector */ + IVP_MULUSAN_2X16X32_0(vec0scaledIn64B, vecScale, vecInData0); + IVP_MULUSAN_2X16X32_0(vec1scaledIn64B, vecScale, vecInData1); + + /* Pack the 64-bit wide vector in 32 bit vecotr, by applying shift */ + xb_vecN_2x32v vec0scaledIn32B = IVP_PACKVRN_2X64W(vec0scaledIn64B, shift); + xb_vecN_2x32v vec1scaledIn32B = IVP_PACKVRN_2X64W(vec1scaledIn64B, shift); + + /* CLAMP the 32bit scaled-shift data to minLim and maxLim, & store it + * in 16-bit vector, whose odd lanes (1, 3, 5...) are zeroes.*/ + vecOut0 = IVP_MOVNX16_FROMN_2X32(IVP_MAXN_2X32(IVP_MINN_2X32(vec0scaledIn32B, (xb_vecN_2x32v) maxLim), (xb_vecN_2x32v) minLim)); + vecOut1 = IVP_MOVNX16_FROMN_2X32(IVP_MAXN_2X32(IVP_MINN_2X32(vec1scaledIn32B, (xb_vecN_2x32v) maxLim), (xb_vecN_2x32v) minLim)); + + /* Select the actual data present at even lanes, i.e. 0, 2, 4,... 
*/ + vecOut = IVP_SELNX16I(vecOut1, vecOut0, IVP_SELI_EXTRACT_1_OF_2_OFF_0); + + /* Store output data */ + IVP_SANX8S_IP(vecOut, vaOut, pvecOut); + } + + /* Load remaining input data: vecInData0 takes up to vectorizationWidth elements, vecInData1 only what is left beyond those (byte counts, 4 bytes per element) */ + IVP_LAVN_2X32_XP(vecInData0, vaInData, pvecIn, (maxLoopCount - x) * 4); + IVP_LAVN_2X32_XP(vecInData1, vaInData, pvecIn, ((maxLoopCount - x) - vectorizationWidth) * 4); + + /* Initialize the 64-bit wide vector with (zeroOut << shift)*/ + vec0scaledIn64B = IVP_MULSUN_2X16X32_0(zeroOut, (1 << shift)); + vec1scaledIn64B = IVP_MULSUN_2X16X32_0(zeroOut, (1 << shift)); + + /* Multiply U16 scale with S32 input and ACCUMULATE in 64-bit wide vector */ + IVP_MULUSAN_2X16X32_0(vec0scaledIn64B, vecScale, vecInData0); + IVP_MULUSAN_2X16X32_0(vec1scaledIn64B, vecScale, vecInData1); + + /* Pack the 64-bit wide vector in 32 bit vector, by applying shift */ + xb_vecN_2x32v vec0scaledIn32B = IVP_PACKVRN_2X64W(vec0scaledIn64B, shift); + xb_vecN_2x32v vec1scaledIn32B = IVP_PACKVRN_2X64W(vec1scaledIn64B, shift); + + /* CLAMP the 32bit scaled-shift data to minLim and maxLim, & store it + * in 16-bit vector, whose odd lanes (1, 3, 5...) are zeroes.*/ + vecOut0 = IVP_MOVNX16_FROMN_2X32(IVP_MAXN_2X32(IVP_MINN_2X32(vec0scaledIn32B, (xb_vecN_2x32v) maxLim), (xb_vecN_2x32v) minLim)); + vecOut1 = IVP_MOVNX16_FROMN_2X32(IVP_MAXN_2X32(IVP_MINN_2X32(vec1scaledIn32B, (xb_vecN_2x32v) maxLim), (xb_vecN_2x32v) minLim)); + + /* Select the actual data present at even lanes, i.e. 0, 2, 4,... 
*/ + vecOut = IVP_SELNX16I(vecOut1, vecOut0, IVP_SELI_EXTRACT_1_OF_2_OFF_0); + + /* Store output data */ + IVP_SAVNX8S_XP(vecOut, vaOut, pvecOut, (maxLoopCount - x)); + IVP_SAPOSNX8S_FP(vaOut, pvecOut); + } + } + else + { + /* else block is executed if input tile pitch is not equal to input tile width or input tile */ + /* pitch is not equal to output tile pitch */ + + for (z = 0; z < dim3Size; z++) /* along 3rd dimension */ + { + x = 0; + for (; x < dim1Size; x += vectorizationWidth2X) + { + /* Initialize input and output data pointer */ + int32_t * pIn = &pInput[z * inTilePitch2 + x]; + int8_t *pOut = &pOutput[z * outTilePitch2 + x]; + int32_t varLen = dim1Size - x; + + for (y = 0; y < dim2Size; y++) /* along 2nd dimension */ + { + /* input and output data vectors */ + xb_vecN_2x32v vecInData0, vecInData1; + xb_vecNx16 vecOut0, vecOut1, vecOut; + + pvecIn = (xb_vecN_2x32v *) (pIn + (y * inTilePitch1)); + pvecOut = (xb_vecNx8 *) (pOut + (y * outTilePitch1)); + + /* Load input data: vecInData0 takes up to vectorizationWidth elements of the row, vecInData1 only what is left beyond those */ + valign vaInData = IVP_LAN_2X32_PP(pvecIn); + IVP_LAVN_2X32_XP(vecInData0, vaInData, pvecIn, varLen * 4); + IVP_LAVN_2X32_XP(vecInData1, vaInData, pvecIn, (varLen - vectorizationWidth) * 4); + + /* Initialize the 64-bit wide vector with (zeroOut << shift)*/ + vec0scaledIn64B = IVP_MULSUN_2X16X32_0(zeroOut, (1 << shift)); + vec1scaledIn64B = IVP_MULSUN_2X16X32_0(zeroOut, (1 << shift)); + + /* Multiply U16 scale with S32 input and ACCUMULATE in 64-bit wide vector */ + IVP_MULUSAN_2X16X32_0(vec0scaledIn64B, vecScale, vecInData0); + IVP_MULUSAN_2X16X32_0(vec1scaledIn64B, vecScale, vecInData1); + + /* Pack the 64-bit wide vector in 32 bit vector, by applying shift */ + xb_vecN_2x32v vec0scaledIn32B = IVP_PACKVRN_2X64W(vec0scaledIn64B, shift); + xb_vecN_2x32v vec1scaledIn32B = IVP_PACKVRN_2X64W(vec1scaledIn64B, shift); + + /* CLAMP the 32bit scaled-shift data to minLim and maxLim, & store it + * in 16-bit vector, whose odd lanes (1, 3, 5...) 
are zeroes.*/ + vecOut0 = IVP_MOVNX16_FROMN_2X32(IVP_MAXN_2X32(IVP_MINN_2X32(vec0scaledIn32B, (xb_vecN_2x32v) maxLim), (xb_vecN_2x32v) minLim)); + vecOut1 = IVP_MOVNX16_FROMN_2X32(IVP_MAXN_2X32(IVP_MINN_2X32(vec1scaledIn32B, (xb_vecN_2x32v) maxLim), (xb_vecN_2x32v) minLim)); + + /* Select the actual data present at even lanes, i.e. 0, 2, 4,... */ + vecOut = IVP_SELNX16I(vecOut1, vecOut0, IVP_SELI_EXTRACT_1_OF_2_OFF_0); + + /* Store output data */ + IVP_SAVNX8S_XP(vecOut, vaOut, pvecOut, varLen); + IVP_SAPOSNX8S_FP(vaOut, pvecOut); + } + } + } + } + return(XAI_ERROR_STATUS()); +} + +/********************* xaiDataConversion3D_AsymQ_S8I32 ********************/ +/* Description : P6 implementation for conversion from S8_ASYM to I32_SYM */ +/* Inputs : Input Tile, zeroIn, scale, shift */ +/* Outputs : XI Error Code */ +/* InOuts : Output Tile */ +/* Assumptions : InData is signed 8bit */ +/**************************************************************************/ +XAI_ERR_TYPE xaiDataConversion3D_AsymQ_S8I32(const xai_pTile3D inTile, + xai_pTile3D outTile, + const int16_t zeroIn, + const uint16_t scale, + const uint8_t shift) +{ + /* Error Checks */ + XAI_ERROR_CHECKS() + { + XAI_CHECK_TILE3D_S8(inTile); + XAI_CHECK_TILE3D_I32(outTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile); + XAI_CHECK_TILE3D_SIZE_EQ(inTile, outTile); + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTile); + XAI_CHECK_ERROR(shift < 24, XAI_ERR_NORM, \ + "Shift = %hhu, value should be less than 24", shift); + XAI_CHECK_ERROR((zeroIn >= -128) && (zeroIn < 128), XAI_ERR_NORM, \ + "\nzeroIn = %hi, value must be greater than or equal to -128 and less than 128", zeroIn); + XAI_CHECK_ERROR(XAI_TILE3D_GET_DATA_ORDER(inTile) == XAI_TILE3D_GET_DATA_ORDER(outTile), XAI_ERR_BADARG, \ + "\nData Order of InputTile = %d, OutputTile = %d\nData Order of InputTile and OutputTile should be same", \ + XAI_TILE3D_GET_DATA_ORDER(inTile), XAI_TILE3D_GET_DATA_ORDER(outTile)); 
+ } + + /* Get Tile Parameters */ + const int32_t dim1Size = XAI_TILE3D_GET_DIM1(inTile); + const int32_t dim2Size = XAI_TILE3D_GET_DIM2(inTile); + const int32_t dim3Size = XAI_TILE3D_GET_DIM3(inTile); + const int32_t inTilePitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile); + const int32_t inTilePitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile); + const int32_t outTilePitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile); + const int32_t outTilePitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile); + + valign vaOut = IVP_ZALIGN(); + + /* Get Data Pointers */ + int8_t *pInput = (int8_t *) XAI_TILE3D_GET_DATA_PTR(inTile); + int32_t *pOutput = (int32_t *) XAI_TILE3D_GET_DATA_PTR(outTile); + + /* vectorization width */ + const int32_t vectorizationWidth = XCHAL_IVPN_SIMD_WIDTH; + const int32_t vectorizationWidth2X = vectorizationWidth * 2; + const int32_t vectorizationWidth3X = vectorizationWidth * 3; + const int32_t vectorizationWidth4X = vectorizationWidth * 4; + + const int32_t minLim = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S32)) ? INT_MIN : 0; + + /* loop variables */ + int32_t x, y, z; + + /* input and output pointers */ + xb_vecNx8 * restrict pvecIn; + xb_vecN_2x32v * restrict pvecOut; + + /* input and output data vectors */ + xb_vecNx16 vecInData0, vecInData1, vecInData2, vecInData3; + xb_vecN_2x32v vecOut0L, vecOut0H, vecOut1L, vecOut1H, vecOut2L, vecOut2H, vecOut3L, vecOut3H; + + xb_vecNx16U vecScale = (xb_vecNx16U) (scale); + xb_vecNx16 vecZeroIn = (xb_vecNx16) (-zeroIn); + xb_vecNx48 zeroInScale = IVP_MULUSNX16(vecScale, vecZeroIn); + + /******************************************************************************/ + /* The overall design approach is split into 2 parts */ + /* 1. When input tile pitch is equal to input tile dim1 and input tile pitch */ + /* is equal to output tile pitch */ + /* - If above condition holds good, data elements for which data */ + /* conversion from S8 bit to S16 bit need to done present in contiguous */ + /* memory location. 
Hence vectorization can be utilized effectively */ + /* */ + /* 2. When input tile pitch is not equal to input tile size or input tile */ + /* pitch is not equal to output tile pitch */ + /* - In this scenario, data elements for which data conversion from S8 bit */ + /* S16 bit need to done exist in non-contiguous memory location. */ + /* In order to do vectorization across first dimension, output data */ + /* pointers need to be updated based on output tile size and output tile */ + /* pitch. */ + /******************************************************************************/ + + if ((inTilePitch1 == dim1Size) && (outTilePitch1 == dim1Size)) + { + /******************************************************************************/ + /* Data exist in contiguous memory location with respect to first dimension */ + /******************************************************************************/ + + /* input and output vectors */ + xb_vecNx16 vecInData; + /* Initialize max loop counter */ + int32_t dim3MaxLoopCount = dim3Size; + int32_t maxLoopCount = dim1Size * dim2Size; + + /* Updated Loop count based on tile dimension configuration */ + if ((inTilePitch2 == maxLoopCount) && (outTilePitch2 == maxLoopCount)) + { + /**********************************************************************/ + /* Data exist in contiguous memory location with respect to first and */ + /* second dimension */ + /**********************************************************************/ + + /* Update max loop counter */ + dim3MaxLoopCount = 1; + maxLoopCount *= dim3Size; + } + for (z = 0; z < dim3MaxLoopCount; z++) + { + /* initialize input and output data pointer */ + pvecIn = (xb_vecNx8 *) (pInput + (z * inTilePitch2)); + pvecOut = (xb_vecN_2x32v *) (pOutput + (z * outTilePitch2)); + valign vaInData = IVP_LANX8S_PP(pvecIn); + int32_t varlen; + + for (x = 0; x < maxLoopCount - vectorizationWidth; x += vectorizationWidth) + { + /* Load input data */ + IVP_LANX8S_IP(vecInData, vaInData, pvecIn); + + 
xb_vecNx48 acc = zeroInScale; + IVP_MULUSANX16(acc, vecScale, vecInData); + //vecOut = IVP_PACKVRNX48(acc, shift); + xb_vecN_2x32v vecIntResL = IVP_CVT32SNX48L(acc); + xb_vecN_2x32v vecIntResH = IVP_CVT32SNX48H(acc); + vecIntResL = IVP_ADDN_2X32(vecIntResL, IVP_SLAN_2X32((xb_vecN_2x32v) 1, (xb_vecN_2x32v) (shift - 1))); + vecIntResH = IVP_ADDN_2X32(vecIntResH, IVP_SLAN_2X32((xb_vecN_2x32v) 1, (xb_vecN_2x32v) (shift - 1))); + vecIntResL = IVP_SRAN_2X32(vecIntResL, (xb_vecN_2x32v) (shift)); + vecIntResH = IVP_SRAN_2X32(vecIntResH, (xb_vecN_2x32v) (shift)); + vecOut0L = IVP_MAXN_2X32(vecIntResL, (xb_vecN_2x32v) minLim); + vecOut0H = IVP_MAXN_2X32(vecIntResH, (xb_vecN_2x32v) minLim); + + + /* store output data */ + IVP_SAN_2X32_IP(vecOut0L, vaOut, pvecOut); + IVP_SAN_2X32_IP(vecOut0H, vaOut, pvecOut); + } + varlen = (maxLoopCount - x); + IVP_LANX8S_IP(vecInData, vaInData, pvecIn); + + xb_vecNx48 acc = zeroInScale; + IVP_MULUSANX16(acc, vecScale, vecInData); + xb_vecN_2x32v vecIntResL = IVP_CVT32SNX48L(acc); + xb_vecN_2x32v vecIntResH = IVP_CVT32SNX48H(acc); + + vecIntResL = IVP_ADDN_2X32(vecIntResL, IVP_SLAN_2X32((xb_vecN_2x32v) 1, (xb_vecN_2x32v) (shift - 1))); + vecIntResH = IVP_ADDN_2X32(vecIntResH, IVP_SLAN_2X32((xb_vecN_2x32v) 1, (xb_vecN_2x32v) (shift - 1))); + vecIntResL = IVP_SRAN_2X32(vecIntResL, (xb_vecN_2x32v) (shift)); + vecIntResH = IVP_SRAN_2X32(vecIntResH, (xb_vecN_2x32v) (shift)); + vecOut0L = IVP_MAXN_2X32(vecIntResL, (xb_vecN_2x32v) minLim); + vecOut0H = IVP_MAXN_2X32(vecIntResH, (xb_vecN_2x32v) minLim); + + /* store output data */ + IVP_SAVN_2X32_XP(vecOut0L, vaOut, pvecOut, (varlen << 2)); + IVP_SAVN_2X32_XP(vecOut0H, vaOut, pvecOut, ((varlen << 2) - (XCHAL_IVPN_SIMD_WIDTH << 1))); + IVP_SAPOSN_2X32_FP(vaOut, pvecOut); + } + } + else + { + /* else block is executed if input tile pitch is not equal to input tile width or input tile */ + /* pitch is not equal to output tile pitch */ + + for (z = 0; z < dim3Size; z++) /* along 3rd dimension */ + { + x 
= 0; + /* Loop Unroll=4 along 1st dimension */ + for (; x < (dim1Size - vectorizationWidth3X); x += vectorizationWidth4X) + { + /* Initialize input and output data pointer */ + int8_t * pIn = &pInput[z * inTilePitch2 + x]; + int32_t *pOut = &pOutput[z * outTilePitch2 + x]; + int32_t varLen = dim1Size - (x + vectorizationWidth3X); + + for (y = 0; y < dim2Size; y++) /* along 2nd dimension */ + { + pvecIn = (xb_vecNx8 *) (pIn + (y * inTilePitch1)); + pvecOut = (xb_vecN_2x32v *) (pOut + (y * outTilePitch1)); + + valign vaInData = IVP_LANX8S_PP(pvecIn); + /* load input data */ + IVP_LANX8S_IP(vecInData0, vaInData, pvecIn); + IVP_LANX8S_IP(vecInData1, vaInData, pvecIn); + IVP_LANX8S_IP(vecInData2, vaInData, pvecIn); + IVP_LANX8S_IP(vecInData3, vaInData, pvecIn); + + /* apply scale and shift to input data. + * multiplying with scale results in 32 way 48-bit + * data to which shift is applied, so final result is + * 32 way 16 bit. + */ + xb_vecNx48 acc0, acc1, acc2, acc3; + acc0 = zeroInScale; + acc1 = zeroInScale; + acc2 = zeroInScale; + acc3 = zeroInScale; + + IVP_MULUSANX16(acc0, vecScale, vecInData0); + IVP_MULUSANX16(acc1, vecScale, vecInData1); + IVP_MULUSANX16(acc2, vecScale, vecInData2); + IVP_MULUSANX16(acc3, vecScale, vecInData3); + + xb_vecN_2x32v vecIntRes0L = IVP_CVT32SNX48L(acc0); + xb_vecN_2x32v vecIntRes0H = IVP_CVT32SNX48H(acc0); + xb_vecN_2x32v vecIntRes1L = IVP_CVT32SNX48L(acc1); + xb_vecN_2x32v vecIntRes1H = IVP_CVT32SNX48H(acc1); + xb_vecN_2x32v vecIntRes2L = IVP_CVT32SNX48L(acc2); + xb_vecN_2x32v vecIntRes2H = IVP_CVT32SNX48H(acc2); + xb_vecN_2x32v vecIntRes3L = IVP_CVT32SNX48L(acc3); + xb_vecN_2x32v vecIntRes3H = IVP_CVT32SNX48H(acc3); + + vecIntRes0L = IVP_ADDN_2X32(vecIntRes0L, IVP_SLAN_2X32((xb_vecN_2x32v) 1, (xb_vecN_2x32v) (shift - 1))); + vecIntRes0H = IVP_ADDN_2X32(vecIntRes0H, IVP_SLAN_2X32((xb_vecN_2x32v) 1, (xb_vecN_2x32v) (shift - 1))); + vecOut0L = IVP_SRAN_2X32(vecIntRes0L, (xb_vecN_2x32v) (shift)); + vecOut0H = 
IVP_SRAN_2X32(vecIntRes0H, (xb_vecN_2x32v) (shift)); + vecOut0L = IVP_MAXN_2X32(vecOut0L, (xb_vecN_2x32v) minLim); + vecOut0H = IVP_MAXN_2X32(vecOut0H, (xb_vecN_2x32v) minLim); + + + vecIntRes1L = IVP_ADDN_2X32(vecIntRes1L, IVP_SLAN_2X32((xb_vecN_2x32v) 1, (xb_vecN_2x32v) (shift - 1))); + vecIntRes1H = IVP_ADDN_2X32(vecIntRes1H, IVP_SLAN_2X32((xb_vecN_2x32v) 1, (xb_vecN_2x32v) (shift - 1))); + vecOut1L = IVP_SRAN_2X32(vecIntRes1L, (xb_vecN_2x32v) (shift)); + vecOut1H = IVP_SRAN_2X32(vecIntRes1H, (xb_vecN_2x32v) (shift)); + vecOut1L = IVP_MAXN_2X32(vecOut1L, (xb_vecN_2x32v) minLim); + vecOut1H = IVP_MAXN_2X32(vecOut1H, (xb_vecN_2x32v) minLim); + + + vecIntRes2L = IVP_ADDN_2X32(vecIntRes2L, IVP_SLAN_2X32((xb_vecN_2x32v) 1, (xb_vecN_2x32v) (shift - 1))); + vecIntRes2H = IVP_ADDN_2X32(vecIntRes2H, IVP_SLAN_2X32((xb_vecN_2x32v) 1, (xb_vecN_2x32v) (shift - 1))); + vecOut2L = IVP_SRAN_2X32(vecIntRes2L, (xb_vecN_2x32v) (shift)); + vecOut2H = IVP_SRAN_2X32(vecIntRes2H, (xb_vecN_2x32v) (shift)); + vecOut2L = IVP_MAXN_2X32(vecOut2L, (xb_vecN_2x32v) minLim); + vecOut2H = IVP_MAXN_2X32(vecOut2H, (xb_vecN_2x32v) minLim); + + + vecIntRes3L = IVP_ADDN_2X32(vecIntRes3L, IVP_SLAN_2X32((xb_vecN_2x32v) 1, (xb_vecN_2x32v) (shift - 1))); + vecIntRes3H = IVP_ADDN_2X32(vecIntRes3H, IVP_SLAN_2X32((xb_vecN_2x32v) 1, (xb_vecN_2x32v) (shift - 1))); + vecOut3L = IVP_SRAN_2X32(vecIntRes3L, (xb_vecN_2x32v) (shift)); + vecOut3H = IVP_SRAN_2X32(vecIntRes3H, (xb_vecN_2x32v) (shift)); + vecOut3L = IVP_MAXN_2X32(vecOut3L, (xb_vecN_2x32v) minLim); + vecOut3H = IVP_MAXN_2X32(vecOut3H, (xb_vecN_2x32v) minLim); + + + /* Store output data */ + IVP_SAN_2X32_IP(vecOut0L, vaOut, pvecOut); + IVP_SAN_2X32_IP(vecOut0H, vaOut, pvecOut); + IVP_SAN_2X32_IP(vecOut1L, vaOut, pvecOut); + IVP_SAN_2X32_IP(vecOut1H, vaOut, pvecOut); + IVP_SAN_2X32_IP(vecOut2L, vaOut, pvecOut); + IVP_SAN_2X32_IP(vecOut2H, vaOut, pvecOut); + + IVP_SAVN_2X32_XP(vecOut3L, vaOut, pvecOut, (varLen << 2)); + IVP_SAVN_2X32_XP(vecOut3H, vaOut, 
pvecOut, ((varLen << 2) - (XCHAL_IVPN_SIMD_WIDTH << 1))); + IVP_SAPOSN_2X32_FP(vaOut, pvecOut); + } + } + if (x < (dim1Size - vectorizationWidth2X)) + { + /* Initialize input and output data pointer */ + int8_t * pIn = &pInput[z * inTilePitch2 + x]; + int32_t *pOut = &pOutput[z * outTilePitch2 + x]; + int32_t varLen = dim1Size - (x + vectorizationWidth2X); + + for (y = 0; y < dim2Size; y++) /* along 2nd dimension */ + { + pvecIn = (xb_vecNx8 *) (pIn + (y * inTilePitch1)); + pvecOut = (xb_vecN_2x32v *) (pOut + (y * outTilePitch1)); + valign vaInData = IVP_LANX8S_PP(pvecIn); + /* load input data */ + IVP_LANX8S_IP(vecInData0, vaInData, pvecIn); + IVP_LANX8S_IP(vecInData1, vaInData, pvecIn); + IVP_LANX8S_IP(vecInData2, vaInData, pvecIn); + + xb_vecNx48 acc0, acc1, acc2; + acc0 = zeroInScale; + acc1 = zeroInScale; + acc2 = zeroInScale; + + IVP_MULUSANX16(acc0, vecScale, vecInData0); + IVP_MULUSANX16(acc1, vecScale, vecInData1); + IVP_MULUSANX16(acc2, vecScale, vecInData2); + + xb_vecN_2x32v vecIntRes0L = IVP_CVT32SNX48L(acc0); + xb_vecN_2x32v vecIntRes0H = IVP_CVT32SNX48H(acc0); + xb_vecN_2x32v vecIntRes1L = IVP_CVT32SNX48L(acc1); + xb_vecN_2x32v vecIntRes1H = IVP_CVT32SNX48H(acc1); + xb_vecN_2x32v vecIntRes2L = IVP_CVT32SNX48L(acc2); + xb_vecN_2x32v vecIntRes2H = IVP_CVT32SNX48H(acc2); + + + vecIntRes0L = IVP_ADDN_2X32(vecIntRes0L, IVP_SLAN_2X32((xb_vecN_2x32v) 1, (xb_vecN_2x32v) (shift - 1))); + vecIntRes0H = IVP_ADDN_2X32(vecIntRes0H, IVP_SLAN_2X32((xb_vecN_2x32v) 1, (xb_vecN_2x32v) (shift - 1))); + vecOut0L = IVP_SRAN_2X32(vecIntRes0L, (xb_vecN_2x32v) (shift)); + vecOut0H = IVP_SRAN_2X32(vecIntRes0H, (xb_vecN_2x32v) (shift)); + vecOut0L = IVP_MAXN_2X32(vecOut0L, (xb_vecN_2x32v) minLim); + vecOut0H = IVP_MAXN_2X32(vecOut0H, (xb_vecN_2x32v) minLim); + + + vecIntRes1L = IVP_ADDN_2X32(vecIntRes1L, IVP_SLAN_2X32((xb_vecN_2x32v) 1, (xb_vecN_2x32v) (shift - 1))); + vecIntRes1H = IVP_ADDN_2X32(vecIntRes1H, IVP_SLAN_2X32((xb_vecN_2x32v) 1, (xb_vecN_2x32v) (shift - 1))); + 
vecOut1L = IVP_SRAN_2X32(vecIntRes1L, (xb_vecN_2x32v) (shift)); + vecOut1H = IVP_SRAN_2X32(vecIntRes1H, (xb_vecN_2x32v) (shift)); + vecOut1L = IVP_MAXN_2X32(vecOut1L, (xb_vecN_2x32v) minLim); + vecOut1H = IVP_MAXN_2X32(vecOut1H, (xb_vecN_2x32v) minLim); + + + vecIntRes2L = IVP_ADDN_2X32(vecIntRes2L, IVP_SLAN_2X32((xb_vecN_2x32v) 1, (xb_vecN_2x32v) (shift - 1))); + vecIntRes2H = IVP_ADDN_2X32(vecIntRes2H, IVP_SLAN_2X32((xb_vecN_2x32v) 1, (xb_vecN_2x32v) (shift - 1))); + vecOut2L = IVP_SRAN_2X32(vecIntRes2L, (xb_vecN_2x32v) (shift)); + vecOut2H = IVP_SRAN_2X32(vecIntRes2H, (xb_vecN_2x32v) (shift)); + vecOut2L = IVP_MAXN_2X32(vecOut2L, (xb_vecN_2x32v) minLim); + vecOut2H = IVP_MAXN_2X32(vecOut2H, (xb_vecN_2x32v) minLim); + + + + /* Store output data */ + IVP_SAN_2X32_IP(vecOut0L, vaOut, pvecOut); + IVP_SAN_2X32_IP(vecOut0H, vaOut, pvecOut); + IVP_SAN_2X32_IP(vecOut1L, vaOut, pvecOut); + IVP_SAN_2X32_IP(vecOut1H, vaOut, pvecOut); + IVP_SAVN_2X32_XP(vecOut2L, vaOut, pvecOut, (varLen << 2)); + IVP_SAVN_2X32_XP(vecOut2H, vaOut, pvecOut, ((varLen << 2) - (XCHAL_IVPN_SIMD_WIDTH << 1))); + IVP_SAPOSN_2X32_FP(vaOut, pvecOut); + } + } + else if (x < (dim1Size - vectorizationWidth)) + { + /* Initialize input and output data pointer */ + int8_t * pIn = &pInput[z * inTilePitch2 + x]; + int32_t *pOut = &pOutput[z * outTilePitch2 + x]; + int32_t varLen = dim1Size - (x + vectorizationWidth); + + for (y = 0; y < dim2Size; y++) /* along 2nd dimension */ + { + pvecIn = (xb_vecNx8 *) (pIn + (y * inTilePitch1)); + pvecOut = (xb_vecN_2x32v *) (pOut + (y * outTilePitch1)); + valign vaInData = IVP_LANX8S_PP(pvecIn); + + /* load input data */ + IVP_LANX8S_IP(vecInData0, vaInData, pvecIn); + IVP_LANX8S_IP(vecInData1, vaInData, pvecIn); + + /* apply scale and shift to input data. + * multiplying with scale results in 32 way 48-bit + * data to which shift is applied, so final result is + * 32 way 16 bit. 
+ */ + xb_vecNx48 acc0, acc1; + acc0 = zeroInScale; + acc1 = zeroInScale; + + IVP_MULUSANX16(acc0, vecScale, vecInData0); + IVP_MULUSANX16(acc1, vecScale, vecInData1); + + xb_vecN_2x32v vecIntRes0L = IVP_CVT32SNX48L(acc0); + xb_vecN_2x32v vecIntRes0H = IVP_CVT32SNX48H(acc0); + xb_vecN_2x32v vecIntRes1L = IVP_CVT32SNX48L(acc1); + xb_vecN_2x32v vecIntRes1H = IVP_CVT32SNX48H(acc1); + + + + vecIntRes0L = IVP_ADDN_2X32(vecIntRes0L, IVP_SLAN_2X32((xb_vecN_2x32v) 1, (xb_vecN_2x32v) (shift - 1))); + vecIntRes0H = IVP_ADDN_2X32(vecIntRes0H, IVP_SLAN_2X32((xb_vecN_2x32v) 1, (xb_vecN_2x32v) (shift - 1))); + vecOut0L = IVP_SRAN_2X32(vecIntRes0L, (xb_vecN_2x32v) (shift)); + vecOut0H = IVP_SRAN_2X32(vecIntRes0H, (xb_vecN_2x32v) (shift)); + vecOut0L = IVP_MAXN_2X32(vecOut0L, (xb_vecN_2x32v) minLim); + vecOut0H = IVP_MAXN_2X32(vecOut0H, (xb_vecN_2x32v) minLim); + + + vecIntRes1L = IVP_ADDN_2X32(vecIntRes1L, IVP_SLAN_2X32((xb_vecN_2x32v) 1, (xb_vecN_2x32v) (shift - 1))); + vecIntRes1H = IVP_ADDN_2X32(vecIntRes1H, IVP_SLAN_2X32((xb_vecN_2x32v) 1, (xb_vecN_2x32v) (shift - 1))); + vecOut1L = IVP_SRAN_2X32(vecIntRes1L, (xb_vecN_2x32v) (shift)); + vecOut1H = IVP_SRAN_2X32(vecIntRes1H, (xb_vecN_2x32v) (shift)); + vecOut1L = IVP_MAXN_2X32(vecOut1L, (xb_vecN_2x32v) minLim); + vecOut1H = IVP_MAXN_2X32(vecOut1H, (xb_vecN_2x32v) minLim); + + + + /* Store output data */ + IVP_SAN_2X32_IP(vecOut0L, vaOut, pvecOut); + IVP_SAN_2X32_IP(vecOut0H, vaOut, pvecOut); + IVP_SAVN_2X32_XP(vecOut1L, vaOut, pvecOut, (varLen << 2)); + IVP_SAVN_2X32_XP(vecOut1H, vaOut, pvecOut, ((varLen << 2) - (XCHAL_IVPN_SIMD_WIDTH << 1))); + IVP_SAPOSN_2X32_FP(vaOut, pvecOut); + } + } + else if (x < dim1Size) + { + /* Initialize input and output data pointer */ + int8_t * pIn = &pInput[z * inTilePitch2 + x]; + int32_t *pOut = &pOutput[z * outTilePitch2 + x]; + int32_t varLen = dim1Size - x; + + for (y = 0; y < dim2Size; y++) /* along 2nd dimension */ + { + pvecIn = (xb_vecNx8 *) (pIn + (y * inTilePitch1)); + pvecOut = 
(xb_vecN_2x32v *) (pOut + (y * outTilePitch1)); + valign vaInData = IVP_LANX8S_PP(pvecIn); + /* load input data */ + IVP_LANX8S_IP(vecInData0, vaInData, pvecIn); + + /* apply scale and shift to input data. + * multiplying with scale results in 32 way 48-bit + * data to which shift is applied, so final result is + * 32 way 16 bit. + */ + xb_vecNx48 acc0 = zeroInScale; + + IVP_MULUSANX16(acc0, vecScale, vecInData0); + + xb_vecN_2x32v vecIntRes0L = IVP_CVT32SNX48L(acc0); + xb_vecN_2x32v vecIntRes0H = IVP_CVT32SNX48H(acc0); + + vecIntRes0L = IVP_ADDN_2X32(vecIntRes0L, IVP_SLAN_2X32((xb_vecN_2x32v) 1, (xb_vecN_2x32v) (shift - 1))); + vecIntRes0H = IVP_ADDN_2X32(vecIntRes0H, IVP_SLAN_2X32((xb_vecN_2x32v) 1, (xb_vecN_2x32v) (shift - 1))); + vecOut0L = IVP_SRAN_2X32(vecIntRes0L, (xb_vecN_2x32v) (shift)); + vecOut0H = IVP_SRAN_2X32(vecIntRes0H, (xb_vecN_2x32v) (shift)); + vecOut0L = IVP_MAXN_2X32(vecOut0L, (xb_vecN_2x32v) minLim); + vecOut0H = IVP_MAXN_2X32(vecOut0H, (xb_vecN_2x32v) minLim); + + + /* Store output data */ + IVP_SAVN_2X32_XP(vecOut0L, vaOut, pvecOut, (varLen << 2)); + IVP_SAVN_2X32_XP(vecOut0H, vaOut, pvecOut, ((varLen << 2) - (XCHAL_IVPN_SIMD_WIDTH << 1))); + IVP_SAPOSN_2X32_FP(vaOut, pvecOut); + } + } + } + } + return(XAI_ERROR_STATUS()); +} + +/********************* xaiDataConversion3D_AsymQ_S8I64 ********************/ +/* Description : Q8 implementation for conversion from S8_ASYM to I64_SYM: computes scale * (in - zeroIn), rounds and shifts it by shift, clamps it from below (INT_MIN for S64 output, 0 otherwise) and widens it to 64 bit */ +/* Inputs : Input Tile, zeroIn (-128 to 127), scale, shift (less than 24) */ +/* Outputs : XI Error Code */ +/* InOuts : Output Tile */ +/* Assumptions : InData is signed 8bit */ +/**************************************************************************/ +XAI_ERR_TYPE xaiDataConversion3D_AsymQ_S8I64(const xai_pTile3D inTile, + xai_pTile3D outTile, + const int16_t zeroIn, + const uint16_t scale, + const uint8_t shift) +{ + /* Error Checks */ + XAI_ERROR_CHECKS() + { + XAI_CHECK_TILE3D_S8(inTile); + XAI_CHECK_TILE3D_I64(outTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile); +
XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile); + XAI_CHECK_TILE3D_SIZE_EQ(inTile, outTile); + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTile); + XAI_CHECK_ERROR(shift < 24, XAI_ERR_NORM, \ + "Shift = %hhu, value should be less than 24", shift); + XAI_CHECK_ERROR((zeroIn >= -128) && (zeroIn < 128), XAI_ERR_NORM, \ + "\nzeroIn = %hi, value must be greater than or equal to -128 and less than 128", zeroIn); + XAI_CHECK_ERROR(XAI_TILE3D_GET_DATA_ORDER(inTile) == XAI_TILE3D_GET_DATA_ORDER(outTile), XAI_ERR_BADARG, \ + "\nData Order of InputTile = %d, OutputTile = %d\nData Order of InputTile and OutputTile should be same", \ + XAI_TILE3D_GET_DATA_ORDER(inTile), XAI_TILE3D_GET_DATA_ORDER(outTile)); + } + + /* Get Tile Parameters */ + const int32_t dim1Size = XAI_TILE3D_GET_DIM1(inTile); + const int32_t dim2Size = XAI_TILE3D_GET_DIM2(inTile); + const int32_t dim3Size = XAI_TILE3D_GET_DIM3(inTile); + const int32_t inTilePitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile); + const int32_t inTilePitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile); + const int32_t outTilePitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile); + const int32_t outTilePitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile); + + valign vaOut = IVP_ZALIGN(); + + /* Get Data Pointers */ + int8_t *pInput = (int8_t *) XAI_TILE3D_GET_DATA_PTR(inTile); + int64_t *pOutput = (int64_t *) XAI_TILE3D_GET_DATA_PTR(outTile); + + /* vectorization width */ + const int32_t vectorizationWidth = XCHAL_IVPN_SIMD_WIDTH; + const int32_t vectorizationWidth2X = vectorizationWidth * 2; + const int32_t vectorizationWidth3X = vectorizationWidth * 3; + const int32_t vectorizationWidth4X = vectorizationWidth * 4; + + const int32_t minLim = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S64)) ? INT_MIN : 0; + //Lower clamp: INT_MIN for S64 output, 0 for any other output type. S16 x U16 = S32, rounded and shifted back to S32. + //Even though the output type is S64, the data is within S32 range, so INT_MIN is sufficient.
+ /* loop variables */ + int32_t x, y, z; + + /* input and output pointers */ + xb_vecNx8 * restrict pvecIn; + xb_vecN_2x64w * restrict pvecOut; + + /* input and output data vectors */ + xb_vecNx16 vecInData0, vecInData1, vecInData2, vecInData3; + xb_vecN_2x64w vecOut0L, vecOut0H, vecOut1L, vecOut1H, vecOut2L, vecOut2H, vecOut3L, vecOut3H; + + xb_vecNx16U vecScale = (xb_vecNx16U) (scale); + xb_vecNx16 vecZeroIn = (xb_vecNx16) (-zeroIn); + xb_vecNx48 zeroInScale = IVP_MULUSNX16(vecScale, vecZeroIn); + + /******************************************************************************/ + /* The overall design approach is split into 2 parts */ + /* 1. When input tile pitch is equal to input tile dim1 and output tile pitch */ + /* is equal to input tile dim1 */ + /* - If the above condition holds, the data elements to be converted */ + /* from S8 to S64 are present in contiguous memory locations along the */ + /* first dimension. Hence vectorization can be utilized effectively */ + /* */ + /* 2. When input tile pitch is not equal to input tile dim1 or output tile */ + /* pitch is not equal to input tile dim1 */ + /* - In this scenario, the data elements to be converted from S8 to */ + /* S64 exist in non-contiguous memory locations. */ + /* In order to do vectorization across first dimension, output data */ + /* pointers need to be updated based on output tile size and output tile */ + /* pitch.
*/ + /******************************************************************************/ + + if ((inTilePitch1 == dim1Size) && (outTilePitch1 == dim1Size)) + { + /******************************************************************************/ + /* Data exists in contiguous memory locations with respect to the first dimension */ + /******************************************************************************/ + + /* input and output vectors */ + xb_vecNx16 vecInData; + /* Initialize max loop counter */ + int32_t dim3MaxLoopCount = dim3Size; + int32_t maxLoopCount = dim1Size * dim2Size; + + /* Update loop count based on tile dimension configuration */ + if ((inTilePitch2 == maxLoopCount) && (outTilePitch2 == maxLoopCount)) + { + /**********************************************************************/ + /* Data exists in contiguous memory locations with respect to the first */ + /* and second dimensions, so all three dimensions collapse into one loop */ + /**********************************************************************/ + + /* Update max loop counter */ + dim3MaxLoopCount = 1; + maxLoopCount *= dim3Size; + } + for (z = 0; z < dim3MaxLoopCount; z++) + { + xb_vecN_2x32v vecOutTempL, vecOutTempH; + /* initialize input and output data pointer */ + pvecIn = (xb_vecNx8 *) (pInput + (z * inTilePitch2)); + pvecOut = (xb_vecN_2x64w *) (pOutput + (z * outTilePitch2)); + valign vaInData = IVP_LANX8S_PP(pvecIn); + int32_t varlen; + + for (x = 0; x < maxLoopCount - vectorizationWidth; x += vectorizationWidth) + { + /* Load input data */ + IVP_LANX8S_IP(vecInData, vaInData, pvecIn); + + xb_vecNx48 acc = zeroInScale; + IVP_MULUSANX16(acc, vecScale, vecInData); + + xb_vecN_2x64w vecOutIntm1 = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(acc), IVP_CVT64SNX48LL(acc)); + vecOutTempL = IVP_PACKVRN_2X64W(vecOutIntm1, shift); + vecOutTempL = IVP_MAXN_2X32(vecOutTempL, (xb_vecN_2x32v) minLim); + //sign extending to 64bit + vecOut0L = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempL); + + xb_vecN_2x64w vecOutIntm2 = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(acc),
IVP_CVT64SNX48HL(acc)); + vecOutTempH = IVP_PACKVRN_2X64W(vecOutIntm2, shift); + vecOutTempH = IVP_MAXN_2X32(vecOutTempH, (xb_vecN_2x32v) minLim); + //sign extending to 64bit + vecOut0H = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempH); + + /* store output data */ + IVP_SAN_2X64W_IP(vecOut0L, vaOut, pvecOut); + IVP_SAN_2X64W_IP(vecOut0H, vaOut, pvecOut); + } + varlen = (maxLoopCount - x); + IVP_LANX8S_IP(vecInData, vaInData, pvecIn); + + xb_vecNx48 acc = zeroInScale; + IVP_MULUSANX16(acc, vecScale, vecInData); + + xb_vecN_2x64w vecOutIntm1 = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(acc), IVP_CVT64SNX48LL(acc)); + vecOutTempL = IVP_PACKVRN_2X64W(vecOutIntm1, shift); + vecOutTempL = IVP_MAXN_2X32(vecOutTempL, (xb_vecN_2x32v) minLim); + //sign extending to 64bit + vecOut0L = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempL); + + xb_vecN_2x64w vecOutIntm2 = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(acc), IVP_CVT64SNX48HL(acc)); + vecOutTempH = IVP_PACKVRN_2X64W(vecOutIntm2, shift); + vecOutTempH = IVP_MAXN_2X32(vecOutTempH, (xb_vecN_2x32v) minLim); + //sign extending to 64bit + vecOut0H = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempH); + + /* store the remaining varlen outputs (store lengths are byte counts, 8 bytes per element) */ + IVP_SAVN_2X64W_XP(vecOut0L, vaOut, pvecOut, (varlen << 3)); + IVP_SAVN_2X64W_XP(vecOut0H, vaOut, pvecOut, ((varlen << 3) - (XCHAL_IVPN_SIMD_WIDTH << 2))); + IVP_SAPOSN_2X64W_FP(vaOut, pvecOut); + } + } + else + { + /* else block is executed if input tile pitch is not equal to input tile width or output tile */ + /* pitch is not equal to input tile width */ + + for (z = 0; z < dim3Size; z++) /* along 3rd dimension */ + { + xb_vecN_2x32v vecOutTempL, vecOutTempH; + x = 0; + /* Loop Unroll=4 along 1st dimension */ + for (; x < (dim1Size - vectorizationWidth3X); x += vectorizationWidth4X) + { + /* Initialize input and output data pointer */ + int8_t * pIn = &pInput[z * inTilePitch2 + x]; + int64_t *pOut = &pOutput[z * outTilePitch2 + x]; + int32_t varLen = dim1Size - (x + vectorizationWidth3X); + + for (y = 0; y <
dim2Size; y++) /* along 2nd dimension */ + { + pvecIn = (xb_vecNx8 *) (pIn + (y * inTilePitch1)); + pvecOut = (xb_vecN_2x64w *) (pOut + (y * outTilePitch1)); + + valign vaInData = IVP_LANX8S_PP(pvecIn); + /* load input data */ + IVP_LANX8S_IP(vecInData0, vaInData, pvecIn); + IVP_LANX8S_IP(vecInData1, vaInData, pvecIn); + IVP_LANX8S_IP(vecInData2, vaInData, pvecIn); + IVP_LANX8S_IP(vecInData3, vaInData, pvecIn); + + /* apply scale and shift to input data. + * multiplying with scale results in 48-bit + * accumulators to which the shift is applied; the result is + * clamped at minLim and widened to 64 bit. + */ + xb_vecNx48 acc0, acc1, acc2, acc3; + acc0 = zeroInScale; + acc1 = zeroInScale; + acc2 = zeroInScale; + acc3 = zeroInScale; + + IVP_MULUSANX16(acc0, vecScale, vecInData0); + IVP_MULUSANX16(acc1, vecScale, vecInData1); + IVP_MULUSANX16(acc2, vecScale, vecInData2); + IVP_MULUSANX16(acc3, vecScale, vecInData3); + + vecOut0L = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(acc0), IVP_CVT64SNX48LL(acc0)); + vecOut0H = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(acc0), IVP_CVT64SNX48HL(acc0)); + vecOut1L = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(acc1), IVP_CVT64SNX48LL(acc1)); + vecOut1H = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(acc1), IVP_CVT64SNX48HL(acc1)); + vecOut2L = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(acc2), IVP_CVT64SNX48LL(acc2)); + vecOut2H = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(acc2), IVP_CVT64SNX48HL(acc2)); + vecOut3L = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(acc3), IVP_CVT64SNX48LL(acc3)); + vecOut3H = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(acc3), IVP_CVT64SNX48HL(acc3)); + + vecOutTempL = IVP_PACKVRN_2X64W(vecOut0L, shift); + vecOutTempL = IVP_MAXN_2X32(vecOutTempL, (xb_vecN_2x32v) minLim); + //sign extending to 64bit + vecOut0L = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempL); + + vecOutTempH = IVP_PACKVRN_2X64W(vecOut0H, shift); + vecOutTempH = IVP_MAXN_2X32(vecOutTempH, (xb_vecN_2x32v) minLim); + //sign extending to 64bit + vecOut0H = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempH); + + vecOutTempL =
IVP_PACKVRN_2X64W(vecOut1L, shift); + vecOutTempL = IVP_MAXN_2X32(vecOutTempL, (xb_vecN_2x32v) minLim); + //sign extending to 64bit + vecOut1L = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempL); + + vecOutTempH = IVP_PACKVRN_2X64W(vecOut1H, shift); + vecOutTempH = IVP_MAXN_2X32(vecOutTempH, (xb_vecN_2x32v) minLim); + //sign extending to 64bit + vecOut1H = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempH); + + vecOutTempL = IVP_PACKVRN_2X64W(vecOut2L, shift); + vecOutTempL = IVP_MAXN_2X32(vecOutTempL, (xb_vecN_2x32v) minLim); + //sign extending to 64bit + vecOut2L = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempL); + + vecOutTempH = IVP_PACKVRN_2X64W(vecOut2H, shift); + vecOutTempH = IVP_MAXN_2X32(vecOutTempH, (xb_vecN_2x32v) minLim); + //sign extending to 64bit + vecOut2H = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempH); + + vecOutTempL = IVP_PACKVRN_2X64W(vecOut3L, shift); + vecOutTempL = IVP_MAXN_2X32(vecOutTempL, (xb_vecN_2x32v) minLim); + //sign extending to 64bit + vecOut3L = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempL); + + vecOutTempH = IVP_PACKVRN_2X64W(vecOut3H, shift); + vecOutTempH = IVP_MAXN_2X32(vecOutTempH, (xb_vecN_2x32v) minLim); + //sign extending to 64bit + vecOut3H = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempH); + + /* Store output data */ + IVP_SAN_2X64W_IP(vecOut0L, vaOut, pvecOut); + IVP_SAN_2X64W_IP(vecOut0H, vaOut, pvecOut); + IVP_SAN_2X64W_IP(vecOut1L, vaOut, pvecOut); + IVP_SAN_2X64W_IP(vecOut1H, vaOut, pvecOut); + IVP_SAN_2X64W_IP(vecOut2L, vaOut, pvecOut); + IVP_SAN_2X64W_IP(vecOut2H, vaOut, pvecOut); + IVP_SAVN_2X64W_XP(vecOut3L, vaOut, pvecOut, (varLen << 3)); + IVP_SAVN_2X64W_XP(vecOut3H, vaOut, pvecOut, ((varLen << 3) - (XCHAL_IVPN_SIMD_WIDTH << 2))); + IVP_SAPOSN_2X64W_FP(vaOut, pvecOut); + } + } + if (x < (dim1Size - vectorizationWidth2X)) + { + /* Initialize input and output data pointer */ + int8_t * pIn = &pInput[z * inTilePitch2 + x]; + int64_t *pOut = &pOutput[z * outTilePitch2 + x]; + int32_t varLen =
dim1Size - (x + vectorizationWidth2X); + + for (y = 0; y < dim2Size; y++) /* along 2nd dimension */ + { + pvecIn = (xb_vecNx8 *) (pIn + (y * inTilePitch1)); + pvecOut = (xb_vecN_2x64w *) (pOut + (y * outTilePitch1)); + + valign vaInData = IVP_LANX8S_PP(pvecIn); + /* load input data */ + IVP_LANX8S_IP(vecInData0, vaInData, pvecIn); + IVP_LANX8S_IP(vecInData1, vaInData, pvecIn); + IVP_LANX8S_IP(vecInData2, vaInData, pvecIn); + + xb_vecNx48 acc0, acc1, acc2; + acc0 = zeroInScale; + acc1 = zeroInScale; + acc2 = zeroInScale; + + IVP_MULUSANX16(acc0, vecScale, vecInData0); + IVP_MULUSANX16(acc1, vecScale, vecInData1); + IVP_MULUSANX16(acc2, vecScale, vecInData2); + + vecOut0L = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(acc0), IVP_CVT64SNX48LL(acc0)); + vecOut0H = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(acc0), IVP_CVT64SNX48HL(acc0)); + vecOut1L = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(acc1), IVP_CVT64SNX48LL(acc1)); + vecOut1H = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(acc1), IVP_CVT64SNX48HL(acc1)); + vecOut2L = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(acc2), IVP_CVT64SNX48LL(acc2)); + vecOut2H = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(acc2), IVP_CVT64SNX48HL(acc2)); + + vecOutTempL = IVP_PACKVRN_2X64W(vecOut0L, shift); + vecOutTempL = IVP_MAXN_2X32(vecOutTempL, (xb_vecN_2x32v) minLim); + //sign extending to 64bit + vecOut0L = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempL); + + vecOutTempH = IVP_PACKVRN_2X64W(vecOut0H, shift); + vecOutTempH = IVP_MAXN_2X32(vecOutTempH, (xb_vecN_2x32v) minLim); + //sign extending to 64bit + vecOut0H = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempH); + + vecOutTempL = IVP_PACKVRN_2X64W(vecOut1L, shift); + vecOutTempL = IVP_MAXN_2X32(vecOutTempL, (xb_vecN_2x32v) minLim); + //sign extending to 64bit + vecOut1L = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempL); + + vecOutTempH = IVP_PACKVRN_2X64W(vecOut1H, shift); + vecOutTempH = IVP_MAXN_2X32(vecOutTempH, (xb_vecN_2x32v) minLim); + //sign extending to 64bit + vecOut1H = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1,
vecOutTempH); + + vecOutTempL = IVP_PACKVRN_2X64W(vecOut2L, shift); + vecOutTempL = IVP_MAXN_2X32(vecOutTempL, (xb_vecN_2x32v) minLim); + //sign extending to 64bit + vecOut2L = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempL); + + vecOutTempH = IVP_PACKVRN_2X64W(vecOut2H, shift); + vecOutTempH = IVP_MAXN_2X32(vecOutTempH, (xb_vecN_2x32v) minLim); + //sign extending to 64bit + vecOut2H = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempH); + + + /* Store output data */ + IVP_SAN_2X64W_IP(vecOut0L, vaOut, pvecOut); + IVP_SAN_2X64W_IP(vecOut0H, vaOut, pvecOut); + IVP_SAN_2X64W_IP(vecOut1L, vaOut, pvecOut); + IVP_SAN_2X64W_IP(vecOut1H, vaOut, pvecOut); + IVP_SAVN_2X64W_XP(vecOut2L, vaOut, pvecOut, (varLen << 3)); + IVP_SAVN_2X64W_XP(vecOut2H, vaOut, pvecOut, ((varLen << 3) - (XCHAL_IVPN_SIMD_WIDTH << 2))); + IVP_SAPOSN_2X64W_FP(vaOut, pvecOut); + } + } + else if (x < (dim1Size - vectorizationWidth)) + { + /* Initialize input and output data pointer */ + int8_t * pIn = &pInput[z * inTilePitch2 + x]; + int64_t *pOut = &pOutput[z * outTilePitch2 + x]; + int32_t varLen = dim1Size - (x + vectorizationWidth); + + for (y = 0; y < dim2Size; y++) /* along 2nd dimension */ + { + pvecIn = (xb_vecNx8 *) (pIn + (y * inTilePitch1)); + pvecOut = (xb_vecN_2x64w *) (pOut + (y * outTilePitch1)); + + valign vaInData = IVP_LANX8S_PP(pvecIn); + /* load input data */ + IVP_LANX8S_IP(vecInData0, vaInData, pvecIn); + IVP_LANX8S_IP(vecInData1, vaInData, pvecIn); + + xb_vecNx48 acc0, acc1; + acc0 = zeroInScale; + acc1 = zeroInScale; + + IVP_MULUSANX16(acc0, vecScale, vecInData0); + IVP_MULUSANX16(acc1, vecScale, vecInData1); + + vecOut0L = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(acc0), IVP_CVT64SNX48LL(acc0)); + vecOut0H = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(acc0), IVP_CVT64SNX48HL(acc0)); + vecOut1L = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(acc1), IVP_CVT64SNX48LL(acc1)); + vecOut1H = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(acc1), IVP_CVT64SNX48HL(acc1)); + + vecOutTempL = IVP_PACKVRN_2X64W(vecOut0L, shift); +
vecOutTempL = IVP_MAXN_2X32(vecOutTempL, (xb_vecN_2x32v) minLim); + //sign extending to 64bit + vecOut0L = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempL); + + vecOutTempH = IVP_PACKVRN_2X64W(vecOut0H, shift); + vecOutTempH = IVP_MAXN_2X32(vecOutTempH, (xb_vecN_2x32v) minLim); + //sign extending to 64bit + vecOut0H = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempH); + + vecOutTempL = IVP_PACKVRN_2X64W(vecOut1L, shift); + vecOutTempL = IVP_MAXN_2X32(vecOutTempL, (xb_vecN_2x32v) minLim); + //sign extending to 64bit + vecOut1L = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempL); + + vecOutTempH = IVP_PACKVRN_2X64W(vecOut1H, shift); + vecOutTempH = IVP_MAXN_2X32(vecOutTempH, (xb_vecN_2x32v) minLim); + //sign extending to 64bit + vecOut1H = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempH); + + /* Store output data */ + IVP_SAN_2X64W_IP(vecOut0L, vaOut, pvecOut); + IVP_SAN_2X64W_IP(vecOut0H, vaOut, pvecOut); + IVP_SAVN_2X64W_XP(vecOut1L, vaOut, pvecOut, (varLen << 3)); + IVP_SAVN_2X64W_XP(vecOut1H, vaOut, pvecOut, ((varLen << 3) - (XCHAL_IVPN_SIMD_WIDTH << 2))); + IVP_SAPOSN_2X64W_FP(vaOut, pvecOut); + } + } + else if (x < dim1Size) + { + /* Initialize input and output data pointer */ + int8_t * pIn = &pInput[z * inTilePitch2 + x]; + int64_t *pOut = &pOutput[z * outTilePitch2 + x]; + int32_t varLen = dim1Size - x; + + for (y = 0; y < dim2Size; y++) /* along 2nd dimension */ + { + pvecIn = (xb_vecNx8 *) (pIn + (y * inTilePitch1)); + pvecOut = (xb_vecN_2x64w *) (pOut + (y * outTilePitch1)); + + valign vaInData = IVP_LANX8S_PP(pvecIn); + /* load input data */ + IVP_LANX8S_IP(vecInData0, vaInData, pvecIn); + + xb_vecNx48 acc0; + acc0 = zeroInScale; + + IVP_MULUSANX16(acc0, vecScale, vecInData0); + + vecOut0L = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(acc0), IVP_CVT64SNX48LL(acc0)); + vecOut0H = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(acc0), IVP_CVT64SNX48HL(acc0)); + + vecOutTempL = IVP_PACKVRN_2X64W(vecOut0L, shift); + vecOutTempL = IVP_MAXN_2X32(vecOutTempL, (xb_vecN_2x32v)
minLim); + //sign extending to 64bit + vecOut0L = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempL); + + vecOutTempH = IVP_PACKVRN_2X64W(vecOut0H, shift); + vecOutTempH = IVP_MAXN_2X32(vecOutTempH, (xb_vecN_2x32v) minLim); + //sign extending to 64bit + vecOut0H = IVP_MULUSN_2X16X32_0((xb_vecNx16U) 1, vecOutTempH); + + /* Store output data */ + IVP_SAVN_2X64W_XP(vecOut0L, vaOut, pvecOut, (varLen << 3)); + IVP_SAVN_2X64W_XP(vecOut0H, vaOut, pvecOut, ((varLen << 3) - (XCHAL_IVPN_SIMD_WIDTH << 2))); + IVP_SAPOSN_2X64W_FP(vaOut, pvecOut); + } + } + } + } + return(XAI_ERROR_STATUS()); +} + +/********************* xaiDataConversion3D_AsymQ *********************************/ +/* Description : General API for DataConversion3D_AsymQ optimized implementation */ +/* Calls one of the DataConversion3D_AsymQ functions based */ +/* on the input/output tile types; for some type pairs a */ +/* zeroPoint of 0 selects the variant without a zeroPoint arg */ +/* Inputs : Input Tile, zeroPoint, scale, shift */ +/* Outputs : XI Error Code */ +/* XAI_ERR_NULLARG if inTile or outTile is NULL */ +/* XAI_ERR_NO_VARIANT if no variant handles the type pair */ +/* otherwise the code returned by the selected variant */ +/* InOuts : Output Tile */ +/*********************************************************************************/ +XAI_ERR_TYPE xaiDataConversion3D_AsymQ(const xai_pTile3D inTile, + xai_pTile3D outTile, + const int16_t zeroPoint, + const uint16_t scale, + const uint8_t shift) +{ + if ((!inTile) || (!outTile)) + { + return(XAI_ERR_NULLARG); + } + + if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S8)) + { + if (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8)) + { + // Converts S8_SYM/S8_ASYM input to S8_SYM/S8_ASYM output (The "zeroPoint" used here serves as "fixUp" for the API) + return(xaiDataConversion3D_AsymQ_S8S8(inTile, outTile, zeroPoint, scale, shift)); + } + else if (XAI_TILE3D_CHECK_TYPE(outTile, XAI_U8)) + { + // Converts S8_ASYM input to U8_SYM output (The "zeroPoint" used here serves as "fixUp" for the API) + return(xaiDataConversion3D_AsymQ_S8U8(inTile, outTile, zeroPoint, scale, shift)); + } + else if (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) + { + // zeroPoint is 0: call the variant that takes no zeroPoint argument + if (zeroPoint == 0) + { + return(xaiDataConversion3D_S8S16(inTile, outTile, scale, shift)); + } + else + { +
// Converts S8_ASYM input to S16_SYM output (The "zeroPoint" used here serves as "fixUp" for the API) + return(xaiDataConversion3D_AsymQ_S8S16(inTile, outTile, zeroPoint, scale, shift)); + } + } + else if (XAI_TILE3D_CHECK_TYPE(outTile, XAI_U16)) + { + // Converts S8_ASYM input to U16_SYM output (The "zeroPoint" used here serves as "fixUp" for the API) + return(xaiDataConversion3D_AsymQ_S8U16(inTile, outTile, zeroPoint, scale, shift)); + } + else if ((XAI_TILE3D_CHECK_TYPE(outTile, XAI_S32)) || (XAI_TILE3D_CHECK_TYPE(outTile, XAI_U32))) + { + // Converts S8_ASYM input to I32 output (The "zeroPoint" used here serves as "ZeroIn" for the API) + return(xaiDataConversion3D_AsymQ_S8I32(inTile, outTile, zeroPoint, scale, shift)); + } + else if ((XAI_TILE3D_CHECK_TYPE(outTile, XAI_S64)) || (XAI_TILE3D_CHECK_TYPE(outTile, XAI_U64))) + { + // Converts S8_ASYM input to I64 output (The "zeroPoint" used here serves as "ZeroIn" for the API) + return(xaiDataConversion3D_AsymQ_S8I64(inTile, outTile, zeroPoint, scale, shift)); + } + } + else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_U8)) + { + if (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8)) + { + if (zeroPoint == 0) + { + return(xaiDataConversion3D_U8S8(inTile, outTile, scale, shift)); + } + else + { + // Converts U8_SYM input to S8_ASYM output (The "zeroPoint" used here serves as "ZeroOut" for the API) + return(xaiDataConversion3D_AsymQ_U8S8(inTile, outTile, zeroPoint, scale, shift)); + } + } + } + else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S16)) + { + if (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8)) + { + if (zeroPoint == 0) + { + return(xaiDataConversion3D_S16I8(inTile, outTile, scale, shift)); + } + else + { + // Converts S16_SYM input to S8_ASYM output (The "zeroPoint" used here serves as "ZeroOut" for the API) + return(xaiDataConversion3D_AsymQ_S16S8(inTile, outTile, zeroPoint, scale, shift)); + } + } + } + else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_U16)) + { + if (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8)) + { + if (zeroPoint == 0) + { +
return(xaiDataConversion3D_U16I8(inTile, outTile, scale, shift)); + } + else + { + // Converts U16_SYM input to S8_ASYM output (The "zeroPoint" used here serves as "ZeroOut" for the API) + // return(xaiDataConversion3D_U16AS8(inTile, outTile, zeroPoint, scale, shift)); + return(xaiDataConversion3D_AsymQ_U16S8(inTile, outTile, zeroPoint, scale, shift)); + } + } + } + else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S32)) + { + if (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8)) + { + if (zeroPoint == 0) + { + return(xaiDataConversion3D_S32S8(inTile, outTile, scale, shift)); + } + else + { + // Converts S32_SYM input to S8_ASYM output (The "zeroPoint" used here serves as "ZeroOut" for the API) + return(xaiDataConversion3D_AsymQ_S32S8(inTile, outTile, zeroPoint, scale, shift)); + } + } + } + + /* No variant handles this input/output tile type combination */ + return(XAI_ERR_NO_VARIANT); +} +#endif //if ((XCHAL_VISION_TYPE >= 6)) diff --git a/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_MOD.c b/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_MOD.c new file mode 100644 index 00000000000..d10991e879c --- /dev/null +++ b/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_MOD.c @@ -0,0 +1,24 @@ +/* + * Copyright (c) 2018 by Cadence Design Systems, Inc. ALL RIGHTS RESERVED. + * These coded instructions, statements, and computer programs are the + * copyrighted works and confidential proprietary information of + * Cadence Design Systems Inc. They may be adapted and modified by bona fide + * purchasers for internal use, but neither the original nor any adapted + * or modified version may be disclosed or distributed to third parties + * in any manner, medium, or form, in whole or in part, without the prior + * written consent of Cadence Design Systems Inc. This software and its + * derivatives are to be executed solely on products incorporating a Cadence + * Design Systems processor.
+ */ +#include "xai_cnn.h" +#include "xai_intrin.h" +#include "limits.h" + +#if ((XCHAL_VISION_TYPE >= 6)) + +#undef DILATED_VQ_CONV +#include "cnn_dilated_conv_MOD.h" + +/******************************* end of MOD variants ***************************************/ +/*******************************************************************************************/ +#endif /*#if ((XCHAL_VISION_TYPE >= 6))*/ diff --git a/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_MOD.h b/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_MOD.h new file mode 100644 index 00000000000..0c5dd9a0e33 --- /dev/null +++ b/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_MOD.h @@ -0,0 +1,16078 @@ +/* + * Copyright (c) 2018 by Cadence Design Systems, Inc. ALL RIGHTS RESERVED. + * These coded instructions, statements, and computer programs are the + * copyrighted works and confidential proprietary information of + * Cadence Design Systems Inc. They may be adapted and modified by bona fide + * purchasers for internal use, but neither the original nor any adapted + * or modified version may be disclosed or distributed to third parties + * in any manner, medium, or form, in whole or in part, without the prior + * written consent of Cadence Design Systems Inc. This software and its + * derivatives are to be executed solely on products incorporating a Cadence + * Design Systems processor. 
+ */ +#include "xai_cnn.h" +#include "xai_intrin.h" +#include "limits.h" + + +#if ((XCHAL_VISION_TYPE >= 6)) + +/****************************************************************************************** +* MOD WHD DWH variants +******************************************************************************************/ + +/***************************************************************************** +* xaiConvolvedVQ3D_S_1x1_S8S8IXCa2_MOD_WHD_DWH +* **************************************************************************/ + +/****************************************************************************/ +/* Description : P6 optimized generic implementation for 1x1 MOD_WHD_DWH */ +/* 3D convolution. Based on pre-processor specifiers. Code */ +/* implementation is generated during preprocessing stage. */ +/* This method can be used to generate 1x1 MOD_WHD_DWH 3D */ +/* dilated convolution function and 1x1 MOD_WHD_DWH 3D VQ */ +/* dilated convolution function */ +/* Inputs : Input Data Tile, Coeff Data Tile, Bias Array, */ +/* Output scale array, CNN convolution params structure */ +/* Outputs : XI Error Code */ +/* InOuts : Output Tile */ +/* Assumptions : InData, CoeffData are S8 */ +/* biasArray is signed 32b, value not exceeding signed 24b */ +/* Output scale array is U16 */ +/* OutData is S8 / U8 / S16 */ +/* Kernel Size is 1x1xDxN */ +/* Input is in WHD and Output is in DWH format */ +/* Coeff is in NDWH format */ +/* CoeffDim1Pitch is aligned to 2N (Ca2) */ +/****************************************************************************/ + +#ifdef DILATED_VQ_CONV +XAI_ERR_TYPE xaiConvolvedVQ3D_S_1x1_S8S8IXCa2_MOD_WHD_DWH( + const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param + ) +#else +XAI_ERR_TYPE xaiConvolved3D_S_1x1_S8S8IXCa2_MOD_WHD_DWH( + const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + 
xai_pTile3D outTile, + const xai_cnn_conv_params *param + ) +#endif +{ + /* Error Checks */ + XAI_ERROR_CHECKS() + { + XAI_CHECK_TILE3D_S8(inTile); + XAI_CHECK_CONV_OUTPUT_TILE3D(outTile); + XAI_CHECK_TILE4D_S8(coeffTile); + XAI_CHECK_TILE3D_FITS_IN_SINGLE_DRAM(inTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile); + XAI_CHECK_TILE4D_IN_DRAM_BOUNDARY(coeffTile); + XAI_CHECK_POINTER(param); + XAI_CHECK_ARRAY_S32(biasArray); + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTile); + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(coeffTile, outTile); + XAI_CHECK_KERNEL_SIZE(coeffTile, 1); + XAI_CHECK_ERROR((XAI_CNN_CONV_GET_STRIDE(param) == 1) || \ + (XAI_CNN_CONV_GET_STRIDE(param) == 2) || \ + (XAI_CNN_CONV_GET_STRIDE(param) == 4), XAI_ERR_BADARG, \ + "Stride = %hhu, value should be 1, 2 or 4", XAI_CNN_CONV_GET_STRIDE(param)); + XAI_CHECK_ERROR((XAI_CNN_CONV_GET_STRIDEX(param) == XAI_CNN_CONV_GET_STRIDEY(param)), \ + XAI_ERR_BADARG, "\nStride along width = %hhu and height = %hhu\nStride along width and height should be equal", \ + XAI_CNN_CONV_GET_STRIDEX(param), XAI_CNN_CONV_GET_STRIDEY(param)); + XAI_CHECK_ERROR((XAI_CNN_CONV_GET_DILATION(param) == 1), \ + XAI_ERR_BADARG, "\nDilation = %hhu\nDilation should be 1", XAI_CNN_CONV_GET_DILATION(param)); + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_DILATIONX(param) == XAI_CNN_CONV_GET_DILATIONY(param), \ + XAI_ERR_BADARG, "\nDilation along width = %hhu and height = %hhu\nDilation along width and height should be equal", \ + XAI_CNN_CONV_GET_DILATIONX(param), XAI_CNN_CONV_GET_DILATIONY(param)); + XAI_CHECK_TILE4D_IALIGNMENT_2NX8(coeffTile); + XAI_CHECK_TILE3D_DATA_ORDER(inTile, XAI_WHD); + XAI_CHECK_TILE3D_DATA_ORDER(outTile, XAI_DWH); + XAI_CHECK_TILE4D_DATA_ORDER(coeffTile, XAI_NDWH); + XAI_CHECK_CONSISTENCY_MOD_WHD_DWH(inTile, coeffTile, biasArray, outTile, param); + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_ACCUM_SHIFT(param) < 24, \ + XAI_ERR_NORM, "\nThe accumulator shift = %hhu, value should be less than 24", \ + 
XAI_CNN_CONV_GET_ACCUM_SHIFT(param)); + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_OUTPUT_SHIFT(param) < 32, \ + XAI_ERR_NORM, "\nThe output shift = %hhu, value should be less than 32", \ + XAI_CNN_CONV_GET_OUTPUT_SHIFT(param)); + XAI_CHECK_CONV_RELU_LIMITS_IX(param, outTile); +#ifdef DILATED_VQ_CONV + XAI_CHECK_ARRAY_U16(outputScaleArray); + XAI_CHECK_ERROR(XAI_ARRAY_GET_WIDTH(outputScaleArray) >= XAI_TILE4D_GET_DIM1(coeffTile), \ + XAI_ERR_DATASIZE, "\nWidth of Output Scale Array = %d, Number of Kernels = %d\nWidth of Output Scale Array should be greater than or equal to Number of Kernels", \ + XAI_ARRAY_GET_WIDTH(outputScaleArray), XAI_TILE4D_GET_DIM1(coeffTile)); +#endif + } + +#ifndef DILATED_VQ_CONV + if (XAI_CNN_CONV_GET_OUTPUT_SCALE(param) == 0) + { + int32_t fillValue; + int32_t reluFlag = XAI_CNN_CONV_GET_FLAG_RELU(param); + fillValue = reluFlag ? (CLAMP(0, XAI_CNN_CONV_GET_RELU_MIN(param), XAI_CNN_CONV_GET_RELU_MAX(param))) : 0; + return(xaiFillTile3D(outTile, fillValue, 0)); + } +#endif + + /* Getting parameters from the tile structures */ + const int32_t outW = XAI_TILE3D_GET_DIM2(outTile); + const int32_t outH = XAI_TILE3D_GET_DIM3(outTile); + const int32_t numInCh = XAI_TILE3D_GET_DIM3(inTile); + const int32_t numOutCh = XAI_TILE3D_GET_DIM1(outTile); + + XAI_ERROR_CHECKS_CONTINUE() + { + /* Max value of Gather Offset is (min(numInCh-1,7)*inDataPitch2 + stride*min(3,outWidth-1)) */ + if (numInCh > 1) + { + XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM2_PITCH(inTile) < \ + ((USHRT_MAX - XAI_CNN_CONV_GET_STRIDE(param) * XT_MIN(3, outW - 1)) / XT_MIN(numInCh - 1, 7)), \ + XAI_ERR_BADARG, "\ndim2Pitch value of inTile = %d, should be less than Gather Offset(16-bit limit) - %d", \ + XAI_TILE3D_GET_DIM2_PITCH(inTile), \ + ((USHRT_MAX - XAI_CNN_CONV_GET_STRIDE(param) * XT_MIN(3, outW - 1)) / XT_MIN(numInCh - 1, 7))); + } + } + + /* CNN convolution parameters */ + const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param); + const uint8_t outShiftU = 
XAI_CNN_CONV_GET_OUTPUT_SHIFT(param); + const uint8_t enableReLu = XAI_CNN_CONV_GET_FLAG_RELU(param); + const uint8_t strideU = XAI_CNN_CONV_GET_STRIDE(param); + + /* Data Pointers of input, output, coefficient and bias data */ + int8_t *pInData = (int8_t *) XAI_TILE3D_GET_DATA_PTR(inTile); + int8_t *pOutData = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile); + int8_t *pCoeffData = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile); + int32_t *pBiasData = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray); +#ifdef DILATED_VQ_CONV + xb_vecNx16U* restrict pOutScaleData = (xb_vecNx16U *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray); +#else + const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param); +#endif + + /* Pitch of Coefficient Data (NDWH) in dim1 (W = 1 and H = 1) */ + const int32_t coeffPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTile); + + /* Pitches of Input Data (WHD) in dim1 and dim2 */ + const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile); + const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile); + + /* Pitch of Output Data (DWH) in dim1 and dim2 */ + const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile); + const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile); + + /* Setting the limits for output data according to ReLu Flag and outTileType */ + int32_t minLim, maxLim; + if (enableReLu) + { + minLim = XAI_CNN_CONV_GET_RELU_MIN(param); + maxLim = XAI_CNN_CONV_GET_RELU_MAX(param); + } + else + { + minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \ + SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0); + maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \ + : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX); + } + const int8_t typeFlag = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 
1 : 0; + const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile); + + /* Variable Declarations */ + int32_t inCh, outCh, x, y; + + xb_vecN_2x32v* restrict phvecBias; + xb_vec2Nx8* restrict pdvecOut; + +#if XCHAL_HAVE_SUPERGATHER == 0 + xb_vec2Nx8* pdvecCoeff1; + xb_vec2Nx8* pdvecCoeff2; + valign vIn; + xb_vec2Nx8* pdvecIn1; + xb_vec2Nx8* pdvecIn2; + + /*updating sel1 corresponding to 8 outCh and,4 width from input, hence + for 8 input channel and 4 width elements from each load selection, + sel1=0,64,0+strideU,64+strideU,0+2*strideU,64+2*strideU,0+3*strideU,64+3*strideU,0+4*strideU,64+4*strideU,... + ...0+7*strideU,64+7*strideU*/ + xb_vec2Nx8U sel = IVP_SEQ2NX8(); + xb_vecNx16U off = IVP_MULNX16PACKL(IVP_ANDNX16(1, IVP_SEQNX16()), 64); + xb_vec2Nx8U off1 = IVP_SEL2NX8I(IVP_MOV2NX8_FROMNX16(off), IVP_MOV2NX8_FROMNX16(off), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0); + xb_vec2Nx8U sel1 = 0, sel2 = 0; + sel2 = IVP_SEL2NX8UI(IVP_MUL2NX8(IVP_SEQ2NX8U(), strideU), IVP_MUL2NX8(IVP_SEQ2NX8U(), strideU), IVP_SELI_8B_INTERLEAVE_1_LO); + sel2 = IVP_ADD2NX8U(sel2, off1); + IVP_SEL2NX8UT(sel1, 0, sel2, IVP_SEQ2NX8U(), IVP_LT2NX8(sel, 16)); + + xb_vec2Nx8 dvecIn = 0, dvecIn1 = 0, dvecIn2 = 0, dvecIn3 = 0, dvecIn4 = 0; + xb_vec2Nx8 dvecIn5 = 0, dvecIn6 = 0, dvecIn7 = 0, dvecIn8 = 0; + + /*implementation follows loading 8 input vectors corresponding to 8 inCh and ,first four elements + along width */ + + int32_t remainingInCh = numInCh - ((numInCh >> 3) << 3); + + uint8_t remCh1 = 0, remCh2 = 0, remCh3 = 0, remCh4 = 0, remCh5 = 0, remCh6 = 0; + int32_t sumMask1 = 0, sumMask2 = 0; + + if (remainingInCh != 0) /* if numInCh is not a multiple of 8*/ + { + /* Generating Coefficient mask such that coefficient load happens only for valid channel number*/ + /* Coefficient mask entries for channels greater than the remainingInCh are set to 0 */ + /* Generating Coefficient mask such that coefficient load happens only for valid channel number*/ + /* Coefficient mask entries for channels 
grea ter than the remainingInCh are set to 0 */ + remCh1 = XT_SALT(1, remainingInCh); + remCh2 = XT_SALT(2, remainingInCh); + remCh3 = XT_SALT(3, remainingInCh); + remCh4 = XT_SALT(4, remainingInCh); + remCh5 = XT_SALT(5, remainingInCh); + remCh6 = XT_SALT(6, remainingInCh); + + /*Generation of maskLut for handling cases when remainingInCh is not equal to 0 */ + /*eg. if remainingInCh is equal to 2 then sumMask1 is 00FFFFFF and sumMask2 is 0 */ + /* if remainingInCh is equal to 3 then sumMask1 is FFFFFFFF and sumMask2 is 0 */ + /* if remainingInCh is equal to 4 then sumMask1 is FFFFFFFF and sumMask2 is FF */ + const uint32_t maskLut[4] = { 0xff, 0xff00, 0xff0000, 0xff000000 }; + + sumMask1 = maskLut[0] + maskLut[1] * remCh1 + maskLut[2] * remCh2 + maskLut[3] * remCh3; + sumMask2 = maskLut[0] * remCh4 + maskLut[1] * remCh5 + maskLut[2] * remCh6; + } + + /* Unrolling of 4 is done along output width and 8 along input channels */ + /** Loop Starts **/ + for (outCh = 0; outCh < numOutCh; outCh += 2 * XCHAL_IVPN_SIMD_WIDTH) /* Along output channels*/ + { + /* To handle corner case when number of output channels + * is not a multiple of 2 * XCHAL_IVPN_SIMD_WIDTH*/ + int32_t remainingOutCh = numOutCh - outCh; +#ifdef DILATED_VQ_CONV + xb_vecNx16U outScaleDataEven, outScaleDataOdd; + /*Load output scale values*/ + VQ_INIT_OUTSCALE(pOutScaleData, remainingOutCh, outScaleDataEven, outScaleDataOdd); +#endif + for (y = 0; y < outH; y++) /* Along output height*/ + { + for (x = 0; x < outW; x += 4) /*Along output width*/ + { + /* Input Data and Output Data Pointers */ + int8_t* pSrc = pInData + y * inDataPitch1 * strideU + x * strideU; + int8_t* pOut = &pOutData[(y * outDataPitch2 + x * outDataPitch1) * bytesPerPixel]; + + /* For corner case handling */ + int32_t remainingX = XT_MIN(4, outW - x); + + /* Loading bias and initializing sum with bias*/ + xb_vec2Nx24 dvecSum0, dvecSum1, dvecSum2, dvecSum3; + phvecBias = (xb_vecN_2x32v *) (pBiasData + outCh); + ACC_INIT_BIAS(phvecBias, 
remainingOutCh, dvecSum0, dvecSum1, dvecSum2, dvecSum3); + + /* Coefficient Pointer */ + pdvecCoeff1 = (xb_vec2Nx8 *) (&pCoeffData[outCh]); + pdvecCoeff2 = (xb_vec2Nx8 *) (&pCoeffData[outCh] + coeffPitch1); + pdvecIn1 = (xb_vec2Nx8 *) pSrc; + pdvecIn2 = (xb_vec2Nx8 *) (pSrc + inDataPitch2); + + for (inCh = 0; inCh < (numInCh - 7); inCh += 8) + { + /*Loading input vector */ + vIn = IVP_LA2NX8_PP(pdvecIn1); + IVP_LA2NX8_XP(dvecIn1, vIn, pdvecIn1, 2 * inDataPitch2); + + vIn = IVP_LA2NX8_PP(pdvecIn2); + IVP_LA2NX8_XP(dvecIn2, vIn, pdvecIn2, 2 * inDataPitch2); + + vIn = IVP_LA2NX8_PP(pdvecIn1); + IVP_LA2NX8_XP(dvecIn3, vIn, pdvecIn1, 2 * inDataPitch2); + + vIn = IVP_LA2NX8_PP(pdvecIn2); + IVP_LA2NX8_XP(dvecIn4, vIn, pdvecIn2, 2 * inDataPitch2); + + vIn = IVP_LA2NX8_PP(pdvecIn1); + IVP_LA2NX8_XP(dvecIn5, vIn, pdvecIn1, 2 * inDataPitch2); + + vIn = IVP_LA2NX8_PP(pdvecIn2); + IVP_LA2NX8_XP(dvecIn6, vIn, pdvecIn2, 2 * inDataPitch2); + + vIn = IVP_LA2NX8_PP(pdvecIn1); + IVP_LA2NX8_XP(dvecIn7, vIn, pdvecIn1, 2 * inDataPitch2); + + vIn = IVP_LA2NX8_PP(pdvecIn2); + IVP_LA2NX8_XP(dvecIn8, vIn, pdvecIn2, 2 * inDataPitch2); + + /*dvecIn,dvecIn1 loaded with first 2 and next 2 elements of inChannels as x + is unrolled 4 times loaded as first element of dvecIn1,first element of dvecIn2....first element of dvecIn4, + second element of dvecIn1,second element of dvecIn2....second element of dvecIn4, + third element of dvecIn1,third element of dvecIn2....third element of dvecIn4, + fourth element of dvecIn1,fourth element of dvecIn2....fourth element of dvecIn4 for + dvecIn, for dvecIn2 next four elements of input*/ + dvecIn = IVP_SEL2NX8(dvecIn2, dvecIn1, sel1); + dvecIn2 = IVP_SEL2NX8(dvecIn4, dvecIn3, sel1); + dvecIn1 = IVP_SEL2NX8(dvecIn6, dvecIn5, sel1); + dvecIn3 = IVP_SEL2NX8(dvecIn8, dvecIn7, sel1); + dvecIn = IVP_SEL2NX8I(dvecIn2, dvecIn, IVP_SELI_INTERLEAVE_1_LO); + dvecIn1 = IVP_SEL2NX8I(dvecIn3, dvecIn1, IVP_SELI_INTERLEAVE_1_LO); + + /* 8 Coefficient Vector Loads */ + /* 
Load Coefficients to vector - coefficients already aligned */ + xb_vec2Nx8 dvecCoeff0; + IVP_LV2NX8_XP(dvecCoeff0, pdvecCoeff1, 2 * coeffPitch1); + + xb_vec2Nx8 dvecCoeff1; + IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff2, 2 * coeffPitch1); + + xb_vec2Nx8 dvecCoeff2; + IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff1, 2 * coeffPitch1); + + xb_vec2Nx8 dvecCoeff3; + IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff2, 2 * coeffPitch1); + + xb_vec2Nx8 dvecCoeff4; + IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff1, 2 * coeffPitch1); + + xb_vec2Nx8 dvecCoeff5; + IVP_LV2NX8_XP(dvecCoeff5, pdvecCoeff2, 2 * coeffPitch1); + + xb_vec2Nx8 dvecCoeff6; + IVP_LV2NX8_XP(dvecCoeff6, pdvecCoeff1, 2 * coeffPitch1); + + xb_vec2Nx8 dvecCoeff7; + IVP_LV2NX8_XP(dvecCoeff7, pdvecCoeff2, 2 * coeffPitch1); + + + /* Load 4 bytes(4 channels) of input data along the depth to int32_t scalar */ + xb_vecN_2x32v hvecIn = IVP_MOVN_2X32_FROM2NX8(dvecIn); + xb_vecN_2x32v hvecIn1 = IVP_MOVN_2X32_FROM2NX8(dvecIn1); + + int32_t scalarInData0 = IVP_EXTRN_2X32(hvecIn, 0); + int32_t scalarInData1 = IVP_EXTRN_2X32(hvecIn1, 0); + + int32_t scalarInData2 = IVP_EXTRN_2X32(hvecIn, 1); + int32_t scalarInData3 = IVP_EXTRN_2X32(hvecIn1, 1); + + int32_t scalarInData4 = IVP_EXTRN_2X32(hvecIn, 2); + int32_t scalarInData5 = IVP_EXTRN_2X32(hvecIn1, 2); + + int32_t scalarInData6 = IVP_EXTRN_2X32(hvecIn, 3); + int32_t scalarInData7 = IVP_EXTRN_2X32(hvecIn1, 3); + + /* Multiply and accumulate */ + IVP_MULQA2N8XR8(dvecSum0, dvecCoeff3, dvecCoeff2, dvecCoeff1, dvecCoeff0, scalarInData0); + IVP_MULQA2N8XR8(dvecSum1, dvecCoeff3, dvecCoeff2, dvecCoeff1, dvecCoeff0, scalarInData2); + IVP_MULQA2N8XR8(dvecSum2, dvecCoeff3, dvecCoeff2, dvecCoeff1, dvecCoeff0, scalarInData4); + IVP_MULQA2N8XR8(dvecSum3, dvecCoeff3, dvecCoeff2, dvecCoeff1, dvecCoeff0, scalarInData6); + + IVP_MULQA2N8XR8(dvecSum0, dvecCoeff7, dvecCoeff6, dvecCoeff5, dvecCoeff4, scalarInData1); + IVP_MULQA2N8XR8(dvecSum1, dvecCoeff7, dvecCoeff6, dvecCoeff5, dvecCoeff4, scalarInData3); + 
IVP_MULQA2N8XR8(dvecSum2, dvecCoeff7, dvecCoeff6, dvecCoeff5, dvecCoeff4, scalarInData5); + IVP_MULQA2N8XR8(dvecSum3, dvecCoeff7, dvecCoeff6, dvecCoeff5, dvecCoeff4, scalarInData7); + } /* end of for(inCh = 0; inCh < numInCh; inCh+=8)*/ + + if (inCh < numInCh) + { + /*Loading input vector */ + vIn = IVP_LA2NX8_PP(pdvecIn1); + IVP_LA2NX8_XP(dvecIn1, vIn, pdvecIn1, inDataPitch2 * remCh1); + + vIn = IVP_LA2NX8_PP(pdvecIn1); + IVP_LA2NX8_XP(dvecIn2, vIn, pdvecIn1, inDataPitch2 * remCh2); + + vIn = IVP_LA2NX8_PP(pdvecIn1); + IVP_LA2NX8_XP(dvecIn3, vIn, pdvecIn1, inDataPitch2 * remCh3); + + vIn = IVP_LA2NX8_PP(pdvecIn1); + IVP_LA2NX8_XP(dvecIn4, vIn, pdvecIn1, inDataPitch2 * remCh4); + + vIn = IVP_LA2NX8_PP(pdvecIn1); + IVP_LA2NX8_XP(dvecIn5, vIn, pdvecIn1, inDataPitch2 * remCh5); + + vIn = IVP_LA2NX8_PP(pdvecIn1); + IVP_LA2NX8_XP(dvecIn6, vIn, pdvecIn1, inDataPitch2 * remCh6); + + vIn = IVP_LA2NX8_PP(pdvecIn1); + IVP_LA2NX8_XP(dvecIn7, vIn, pdvecIn1, inDataPitch2); + + dvecIn = IVP_SEL2NX8(dvecIn2, dvecIn1, sel1); + dvecIn2 = IVP_SEL2NX8(dvecIn4, dvecIn3, sel1); + dvecIn1 = IVP_SEL2NX8(dvecIn6, dvecIn5, sel1); + dvecIn3 = IVP_SEL2NX8(dvecIn8, dvecIn7, sel1); + dvecIn = IVP_SEL2NX8I(dvecIn2, dvecIn, IVP_SELI_INTERLEAVE_1_LO); + dvecIn1 = IVP_SEL2NX8I(dvecIn3, dvecIn1, IVP_SELI_INTERLEAVE_1_LO); + + /* Load 4 bytes(4 channels) of input data along the depth to int32_t scalar */ + xb_vecN_2x32v hvecIn = IVP_MOVN_2X32_FROM2NX8(dvecIn); + xb_vecN_2x32v hvecIn1 = IVP_MOVN_2X32_FROM2NX8(dvecIn1); + + int32_t scalarInData0 = IVP_EXTRN_2X32(hvecIn, 0); + int32_t scalarInData1 = IVP_EXTRN_2X32(hvecIn1, 0); + + int32_t scalarInData2 = IVP_EXTRN_2X32(hvecIn, 1); + int32_t scalarInData3 = IVP_EXTRN_2X32(hvecIn1, 1); + + int32_t scalarInData4 = IVP_EXTRN_2X32(hvecIn, 2); + int32_t scalarInData5 = IVP_EXTRN_2X32(hvecIn1, 2); + + int32_t scalarInData6 = IVP_EXTRN_2X32(hvecIn, 3); + int32_t scalarInData7 = IVP_EXTRN_2X32(hvecIn1, 3); + + /* 8 Coefficient Vector Loads */ + /* Load 
Coefficients to vector - coefficients already aligned */ + xb_vec2Nx8 dvecCoeff0; + IVP_LV2NX8_XP(dvecCoeff0, pdvecCoeff1, coeffPitch1 * remCh1); + + xb_vec2Nx8 dvecCoeff1; + IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff1, coeffPitch1 * remCh2); + + xb_vec2Nx8 dvecCoeff2; + IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff1, coeffPitch1 * remCh3); + + xb_vec2Nx8 dvecCoeff3; + IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff1, coeffPitch1 * remCh4); + + xb_vec2Nx8 dvecCoeff4; + IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff1, coeffPitch1 * remCh5); + + xb_vec2Nx8 dvecCoeff5; + IVP_LV2NX8_XP(dvecCoeff5, pdvecCoeff1, coeffPitch1 * remCh6); + + xb_vec2Nx8 dvecCoeff6; + IVP_LV2NX8_XP(dvecCoeff6, pdvecCoeff1, coeffPitch1); + + /* Multiply and accumulate */ + /* Masking the scalarInData to avoid accumulation with unintended values*/ + IVP_MULQA2N8XR8(dvecSum0, dvecCoeff3, dvecCoeff2, dvecCoeff1, dvecCoeff0, scalarInData0 & sumMask1); + IVP_MULQA2N8XR8(dvecSum1, dvecCoeff3, dvecCoeff2, dvecCoeff1, dvecCoeff0, scalarInData2 & sumMask1); + IVP_MULQA2N8XR8(dvecSum2, dvecCoeff3, dvecCoeff2, dvecCoeff1, dvecCoeff0, scalarInData4 & sumMask1); + IVP_MULQA2N8XR8(dvecSum3, dvecCoeff3, dvecCoeff2, dvecCoeff1, dvecCoeff0, scalarInData6 & sumMask1); + + IVP_MULQA2N8XR8(dvecSum0, 0, dvecCoeff6, dvecCoeff5, dvecCoeff4, scalarInData1 & sumMask2); + IVP_MULQA2N8XR8(dvecSum1, 0, dvecCoeff6, dvecCoeff5, dvecCoeff4, scalarInData3 & sumMask2); + IVP_MULQA2N8XR8(dvecSum2, 0, dvecCoeff6, dvecCoeff5, dvecCoeff4, scalarInData5 & sumMask2); + IVP_MULQA2N8XR8(dvecSum3, 0, dvecCoeff6, dvecCoeff5, dvecCoeff4, scalarInData7 & sumMask2); + } /* end of if (inCh < numInCh)*/ + + /* Storing output vector to memory */ + xb_vec2Nx8 dvecOutData0L, dvecOutData1L, dvecOutData2L, dvecOutData3L; + xb_vec2Nx8 dvecOutData0H, dvecOutData1H, dvecOutData2H, dvecOutData3H; +#ifdef DILATED_VQ_CONV + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOutData0L, dvecOutData0H, dvecSum0, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); 
+ PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOutData1L, dvecOutData1H, dvecSum1, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOutData2L, dvecOutData2H, dvecSum2, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOutData3L, dvecOutData3H, dvecSum3, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); +#else + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOutData0L, dvecOutData0H, dvecSum0, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOutData1L, dvecOutData1H, dvecSum1, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOutData2L, dvecOutData2H, dvecSum2, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOutData3L, dvecOutData3H, dvecSum3, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); +#endif + pdvecOut = (xb_vec2Nx8 *) &pOut[outCh * bytesPerPixel]; + valign vaOutData = IVP_ZALIGN(); + IVP_SAV2NX8_XP(dvecOutData0L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh); + IVP_SAV2NX8_XP(dvecOutData0H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + pdvecOut = (xb_vec2Nx8 *) &pOut[(outCh + outDataPitch1) * bytesPerPixel * XT_SALT(0, remainingX - 1)]; + IVP_SAV2NX8_XP(dvecOutData1L, vaOutData, pdvecOut, bytesPerPixel * \ + remainingOutCh * XT_SALT(0, remainingX - 1)); + IVP_SAV2NX8_XP(dvecOutData1H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * XT_SALT(0, remainingX - 1)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + pdvecOut = (xb_vec2Nx8 *) &pOut[(outCh + 2 * outDataPitch1) * bytesPerPixel * XT_SALT(0, remainingX - 2)]; + IVP_SAV2NX8_XP(dvecOutData2L, vaOutData, pdvecOut, bytesPerPixel * \ 
+ remainingOutCh * XT_SALT(0, remainingX - 2)); + IVP_SAV2NX8_XP(dvecOutData2H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * XT_SALT(0, remainingX - 2)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + pdvecOut = (xb_vec2Nx8 *) &pOut[(outCh + 3 * outDataPitch1) * bytesPerPixel * XT_SALT(0, remainingX - 3)]; + IVP_SAV2NX8_XP(dvecOutData3L, vaOutData, pdvecOut, bytesPerPixel * \ + remainingOutCh * XT_SALT(0, remainingX - 3)); + IVP_SAV2NX8_XP(dvecOutData3H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * XT_SALT(0, remainingX - 3)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + } /* end of for(x = 0; x < outW; x+=4)*/ + } /* end of for(y = 0; y < outH; y++)*/ + } /* end of for(outCh = 0; outCh < numOutCh; outCh+=2*XCHAL_IVPN_SIMD_WIDTH)*/ + +#else +#ifdef __XCC__ + XT_MEMW(); /* Adding Memory Wait as Gather and Normal Load/Stores are not synchronized */ +#endif + + xb_vec2Nx8* restrict pdvecCoeff; + + /* This implementation uses gather operation to load 4 bytes of data each from 8 channels */ + + /***** Gather Offset Computation - 8channels, 4cols, 1row *****/ + /*offset = pitch*[0 1 2 3 4 5 6 7 ... 0 1 2 3 4 5 6 7] + */ + /* stride*[0 0 0 0 0 0 0 0 ... 3 3 3 3 3 3 3 3] */ + /* where [0 0 0 0 0 0 0 0 ... 3 3 3 3 3 3 3 3] =>> column indices */ + /* [0 1 2 3 4 5 6 7 ... 
0 1 2 3 4 5 6 7] =>> channel indices */ + xb_vecNx16U vecOffsets0 = IVP_MULNX16PACKL(IVP_ANDNX16(7, IVP_SEQNX16()), inDataPitch2); + IVP_MULANX16PACKL(vecOffsets0, IVP_SRLINX16(IVP_SEQNX16(), 3), strideU); + + + /******* Gather Offset Computation and Coeff Mask ********/ + /******* for Corner Case : (InCh < numInCh) && (InCh > (numInCh -7)) ********/ + + int32_t remainingInCh = numInCh - ((numInCh >> 3) << 3); + + xb_vecNx16U vecOffsets1 = (xb_vecNx16U) 0; + uint8_t remCh1 = 0, remCh2 = 0, remCh3 = 0, remCh4 = 0, remCh5 = 0, remCh6 = 0; + int32_t sumMask1 = 0, sumMask2 = 0; + + if (remainingInCh != 0) /* if numInCh is not a multiple of 8*/ + { + /* Generating Coefficient mask such that coefficient load happens only for valid channel number*/ + /* Coefficient mask entries for channels greater than the remainingInCh are set to 0 */ + + /* Finding the gather offset such that valid memory locations are accessed */ + /* [0 1 2 3 4 5 6 7 ... 0 1 2 3 4 5 6 7] in offset calculation is modified such */ + /* that columns greater than (remainingInCh-1) are set to (remainingInCh-1) */ + xb_vecNx16 vecRemainingInChIdx = IVP_MINNX16(IVP_ANDNX16(7, IVP_SEQNX16()), remainingInCh - 1); + vecOffsets1 = IVP_MULNX16PACKL(vecRemainingInChIdx, inDataPitch2); + IVP_MULANX16PACKL(vecOffsets1, IVP_SRLINX16(IVP_SEQNX16(), 3), strideU); + + /* Generating Coefficient mask such that coefficient load happens only for valid channel number*/ + /* Coefficient mask entries for channels greater than the remainingInCh are set to 0 */ + remCh1 = XT_SALT(1, remainingInCh); + remCh2 = XT_SALT(2, remainingInCh); + remCh3 = XT_SALT(3, remainingInCh); + remCh4 = XT_SALT(4, remainingInCh); + remCh5 = XT_SALT(5, remainingInCh); + remCh6 = XT_SALT(6, remainingInCh); + + /*Generation of maskLut for handling cases when remainingInCh is not equal to 0 */ + /*eg. 
if remainingInCh is equal to 2 then sumMask1 is 00FFFFFF and sumMask2 is 0 */ + /* if remainingInCh is equal to 3 then sumMask1 is FFFFFFFF and sumMask2 is 0 */ + /* if remainingInCh is equal to 4 then sumMask1 is FFFFFFFF and sumMask2 is FF */ + const uint32_t maskLut[4] = { 0xff, 0xff00, 0xff0000, 0xff000000 }; + + sumMask1 = maskLut[0] + maskLut[1] * remCh1 + maskLut[2] * remCh2 + maskLut[3] * remCh3; + sumMask2 = maskLut[0] * remCh4 + maskLut[1] * remCh5 + maskLut[2] * remCh6; + } + + /* Unrolling of 4 is done along output width and 8 along input channels */ + /** Loop Starts **/ + for (outCh = 0; outCh < numOutCh; outCh += 2 * XCHAL_IVPN_SIMD_WIDTH) /* Along output channels*/ + { + /* To handle corner case when number of output channels + * is not a multiple of 2 * XCHAL_IVPN_SIMD_WIDTH*/ + int32_t remainingOutCh = numOutCh - outCh; +#ifdef DILATED_VQ_CONV + xb_vecNx16U outScaleDataEven, outScaleDataOdd; + /*Load output scale values*/ + VQ_INIT_OUTSCALE(pOutScaleData, remainingOutCh, outScaleDataEven, outScaleDataOdd); +#endif + for (y = 0; y < outH; y++) /* Along output height*/ + { + for (x = 0; x < outW; x += 4) /*Along output width*/ + { + xb_vecNx16U vecOffsets2; + xb_vecNx16U vecOffsets3; + /* Input Data and Output Data Pointers */ + int8_t* pSrc = pInData + y * inDataPitch1 * strideU + x * strideU; + int8_t* pOut = &pOutData[(y * outDataPitch2 + x * outDataPitch1) * bytesPerPixel]; + + /* For corner case handling */ + int32_t remainingX = XT_MIN(4, outW - x); + vboolN vbOffsetMask = IVP_LTRSN(8 * remainingX); /*8 channels*/ + /* Assign valid address for predicated false lines */ + vecOffsets2 = IVP_MOVNX16UT(vecOffsets0, 0, vbOffsetMask); + vecOffsets3 = IVP_MOVNX16UT(vecOffsets1, 0, vbOffsetMask); + /* Loading bias and initializing sum with bias*/ + xb_vec2Nx24 dvecSum0, dvecSum1, dvecSum2, dvecSum3; + phvecBias = (xb_vecN_2x32v *) (pBiasData + outCh); + ACC_INIT_BIAS(phvecBias, remainingOutCh, dvecSum0, dvecSum1, dvecSum2, dvecSum3); + + /* 
Coefficient Pointer */ + pdvecCoeff = (xb_vec2Nx8 *) (&pCoeffData[outCh]); + + for (inCh = 0; inCh < (numInCh - 7); inCh += 8) + { + /* Gather Operation to load 8 channels of 1x4 block of input . dvecIn will contain data */ + /* from 8 channels corresponding to same x and y value in consecutive positions. */ + xb_gsr gatherReg = IVP_GATHERANX8S(pSrc + inCh * inDataPitch2, vecOffsets2); + xb_vec2Nx8 dvecIn = IVP_GATHERD2NX8_L(gatherReg); /* LSB 8 bits of gatherReg contain the desired data*/ + + /* 8 Coefficient Vector Loads */ + /* Load Coefficients to vector - coefficients already aligned */ + xb_vec2Nx8 dvecCoeff0; + IVP_LV2NX8_XP(dvecCoeff0, pdvecCoeff, coeffPitch1); + + xb_vec2Nx8 dvecCoeff1; + IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1); + + xb_vec2Nx8 dvecCoeff2; + IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1); + + xb_vec2Nx8 dvecCoeff3; + IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1); + + xb_vec2Nx8 dvecCoeff4; + IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch1); + + xb_vec2Nx8 dvecCoeff5; + IVP_LV2NX8_XP(dvecCoeff5, pdvecCoeff, coeffPitch1); + + xb_vec2Nx8 dvecCoeff6; + IVP_LV2NX8_XP(dvecCoeff6, pdvecCoeff, coeffPitch1); + + xb_vec2Nx8 dvecCoeff7; + IVP_LV2NX8_XP(dvecCoeff7, pdvecCoeff, coeffPitch1); + + + /* Load 4 bytes(4 channels) of input data along the depth to int32_t scalar */ + int32_t scalarInData0 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecIn)), 0); + int32_t scalarInData1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecIn)), 1); + + int32_t scalarInData2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecIn)), 2); + int32_t scalarInData3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecIn)), 3); + + int32_t scalarInData4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecIn)), 4); + int32_t scalarInData5 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecIn)), 5); + + int32_t scalarInData6 = 
IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecIn)), 6); + int32_t scalarInData7 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecIn)), 7); + + /* Multiply and accumulate */ + IVP_MULQA2N8XR8(dvecSum0, dvecCoeff3, dvecCoeff2, dvecCoeff1, dvecCoeff0, scalarInData0); + IVP_MULQA2N8XR8(dvecSum1, dvecCoeff3, dvecCoeff2, dvecCoeff1, dvecCoeff0, scalarInData2); + IVP_MULQA2N8XR8(dvecSum2, dvecCoeff3, dvecCoeff2, dvecCoeff1, dvecCoeff0, scalarInData4); + IVP_MULQA2N8XR8(dvecSum3, dvecCoeff3, dvecCoeff2, dvecCoeff1, dvecCoeff0, scalarInData6); + + IVP_MULQA2N8XR8(dvecSum0, dvecCoeff7, dvecCoeff6, dvecCoeff5, dvecCoeff4, scalarInData1); + IVP_MULQA2N8XR8(dvecSum1, dvecCoeff7, dvecCoeff6, dvecCoeff5, dvecCoeff4, scalarInData3); + IVP_MULQA2N8XR8(dvecSum2, dvecCoeff7, dvecCoeff6, dvecCoeff5, dvecCoeff4, scalarInData5); + IVP_MULQA2N8XR8(dvecSum3, dvecCoeff7, dvecCoeff6, dvecCoeff5, dvecCoeff4, scalarInData7); + } /* end of for(inCh = 0; inCh < numInCh; inCh+=8)*/ + + if (inCh < numInCh) + { + /* Gather Operation to load remainingCh number of channels corresponding to 1x4 block */ + /* of input. 
The channels to be loaded are handled by vecOffsets1 */ + xb_gsr gatherReg = IVP_GATHERANX8S(pSrc + inCh * inDataPitch2, vecOffsets3); + xb_vec2Nx8 dvecIn = IVP_GATHERD2NX8_L(gatherReg); /* LSB 8 bits of gatherReg contain the desired data*/ + + /* Load 4 bytes of input data along the depth to int32_t scalar */ + int32_t scalarInData0 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecIn)), 0); + int32_t scalarInData1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecIn)), 1); + + int32_t scalarInData2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecIn)), 2); + int32_t scalarInData3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecIn)), 3); + + int32_t scalarInData4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecIn)), 4); + int32_t scalarInData5 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecIn)), 5); + + int32_t scalarInData6 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecIn)), 6); + int32_t scalarInData7 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecIn)), 7); + + /* 8 Coefficient Vector Loads */ + /* Load Coefficients to vector - coefficients already aligned */ + xb_vec2Nx8 dvecCoeff0; + IVP_LV2NX8_XP(dvecCoeff0, pdvecCoeff, coeffPitch1 * remCh1); + + xb_vec2Nx8 dvecCoeff1; + IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1 * remCh2); + + xb_vec2Nx8 dvecCoeff2; + IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1 * remCh3); + + xb_vec2Nx8 dvecCoeff3; + IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1 * remCh4); + + xb_vec2Nx8 dvecCoeff4; + IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch1 * remCh5); + + xb_vec2Nx8 dvecCoeff5; + IVP_LV2NX8_XP(dvecCoeff5, pdvecCoeff, coeffPitch1 * remCh6); + xb_vec2Nx8 dvecCoeff6; + IVP_LV2NX8_XP(dvecCoeff6, pdvecCoeff, coeffPitch1); + + /* Multiply and accumulate */ + /* Masking the scalarInData to avoid accumulation with unintended values*/ + 
IVP_MULQA2N8XR8(dvecSum0, dvecCoeff3, dvecCoeff2, dvecCoeff1, dvecCoeff0, scalarInData0 & sumMask1); + IVP_MULQA2N8XR8(dvecSum1, dvecCoeff3, dvecCoeff2, dvecCoeff1, dvecCoeff0, scalarInData2 & sumMask1); + IVP_MULQA2N8XR8(dvecSum2, dvecCoeff3, dvecCoeff2, dvecCoeff1, dvecCoeff0, scalarInData4 & sumMask1); + IVP_MULQA2N8XR8(dvecSum3, dvecCoeff3, dvecCoeff2, dvecCoeff1, dvecCoeff0, scalarInData6 & sumMask1); + + IVP_MULQA2N8XR8(dvecSum0, 0, dvecCoeff6, dvecCoeff5, dvecCoeff4, scalarInData1 & sumMask2); + IVP_MULQA2N8XR8(dvecSum1, 0, dvecCoeff6, dvecCoeff5, dvecCoeff4, scalarInData3 & sumMask2); + IVP_MULQA2N8XR8(dvecSum2, 0, dvecCoeff6, dvecCoeff5, dvecCoeff4, scalarInData5 & sumMask2); + IVP_MULQA2N8XR8(dvecSum3, 0, dvecCoeff6, dvecCoeff5, dvecCoeff4, scalarInData7 & sumMask2); + } /* end of if (inCh < numInCh)*/ + + /* Storing output vector to memory */ + xb_vec2Nx8 dvecOutData0L, dvecOutData1L, dvecOutData2L, dvecOutData3L; + xb_vec2Nx8 dvecOutData0H, dvecOutData1H, dvecOutData2H, dvecOutData3H; +#ifdef DILATED_VQ_CONV + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOutData0L, dvecOutData0H, dvecSum0, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOutData1L, dvecOutData1H, dvecSum1, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOutData2L, dvecOutData2H, dvecSum2, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOutData3L, dvecOutData3H, dvecSum3, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); +#else + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOutData0L, dvecOutData0H, dvecSum0, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOutData1L, dvecOutData1H, dvecSum1, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + 
PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOutData2L, dvecOutData2H, dvecSum2, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOutData3L, dvecOutData3H, dvecSum3, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); +#endif + pdvecOut = (xb_vec2Nx8 *) &pOut[outCh * bytesPerPixel]; + valign vaOutData = IVP_ZALIGN(); + IVP_SAV2NX8_XP(dvecOutData0L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh); + IVP_SAV2NX8_XP(dvecOutData0H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + pdvecOut = (xb_vec2Nx8 *) &pOut[(outCh + outDataPitch1) * bytesPerPixel * XT_SALT(0, remainingX - 1)]; + IVP_SAV2NX8_XP(dvecOutData1L, vaOutData, pdvecOut, bytesPerPixel * \ + remainingOutCh * XT_SALT(0, remainingX - 1)); + IVP_SAV2NX8_XP(dvecOutData1H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * XT_SALT(0, remainingX - 1)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + pdvecOut = (xb_vec2Nx8 *) &pOut[(outCh + 2 * outDataPitch1) * bytesPerPixel * XT_SALT(0, remainingX - 2)]; + IVP_SAV2NX8_XP(dvecOutData2L, vaOutData, pdvecOut, bytesPerPixel * \ + remainingOutCh * XT_SALT(0, remainingX - 2)); + IVP_SAV2NX8_XP(dvecOutData2H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * XT_SALT(0, remainingX - 2)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + pdvecOut = (xb_vec2Nx8 *) &pOut[(outCh + 3 * outDataPitch1) * bytesPerPixel * XT_SALT(0, remainingX - 3)]; + IVP_SAV2NX8_XP(dvecOutData3L, vaOutData, pdvecOut, bytesPerPixel * \ + remainingOutCh * XT_SALT(0, remainingX - 3)); + IVP_SAV2NX8_XP(dvecOutData3H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * XT_SALT(0, remainingX - 3)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + } /* end of for(x = 0; x < outW; x+=4)*/ + } /* end of for(y = 0; y < outH; y++)*/ + } /* end of for(outCh = 0; outCh < numOutCh; 
outCh+=2*XCHAL_IVPN_SIMD_WIDTH)*/ +#endif + + return(XAI_ERROR_STATUS()); +} + +/***************************************************************************** +* xaiConvolvedVQ3D_S_2x2_S8S8IXCa2_MOD_WHD_DWH +* **************************************************************************/ + +/****************************************************************************/ +/* Description : P6 optimized generic implementation for 2x2 MOD_WHD_DWH */ +/* 3D convolution. Based on pre-processor specifiers. Code */ +/* implementation is generated during preprocessing stage. */ +/* This method can be used to generate 2x2 MOD_WHD_DWH 3D */ +/* dilated convolution function and 2x2 MOD_WHD_DWH 3D VQ */ +/* dilated convolution function */ +/* Inputs : Input Data Tile, Coeff Data Tile, Bias Array, */ +/* Output scale array (VQ variant only), CNN convolution params structure */ +/* Outputs : XI Error Code */ +/* InOuts : Output Tile */ +/* Assumptions : InData, CoeffData are S8 */ +/* biasArray is signed 32b, value not exceeding signed 24b */ +/* Output scale array is U16 */ +/* OutData is S8 / U8 / S16 */ +/* Kernel Size is 2x2xDxN */ +/* Input is in WHD and Output is in DWH format */ +/* Coeff is in NDWH format */ +/* CoeffDim1Pitch is aligned to 2N (Ca2); stride is 1, 2 or 4 and equal along width and height; dilation is > 0, equal along width and height, and requires stride 1 when it is > 1 (all enforced by the error checks in the function body) */ +/****************************************************************************/ + +#ifdef DILATED_VQ_CONV +XAI_ERR_TYPE xaiConvolvedVQ3D_S_2x2_S8S8IXCa2_MOD_WHD_DWH( + const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param + ) +#else +XAI_ERR_TYPE xaiConvolved3D_S_2x2_S8S8IXCa2_MOD_WHD_DWH( + const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param + ) +#endif +{ + /* Error Checks: argument validation (tile types, data orders, kernel size, stride/dilation, shifts) */ + XAI_ERROR_CHECKS() + { + XAI_CHECK_TILE3D_S8(inTile); + XAI_CHECK_CONV_OUTPUT_TILE3D(outTile); + XAI_CHECK_TILE4D_S8(coeffTile); + 
XAI_CHECK_TILE3D_FITS_IN_SINGLE_DRAM(inTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile); + XAI_CHECK_TILE4D_IN_DRAM_BOUNDARY(coeffTile); + XAI_CHECK_POINTER(param); + XAI_CHECK_ARRAY_S32(biasArray); + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTile); + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(coeffTile, outTile); + XAI_CHECK_KERNEL_SIZE(coeffTile, 2); + XAI_CHECK_ERROR((XAI_CNN_CONV_GET_STRIDE(param) == 1) || \ + (XAI_CNN_CONV_GET_STRIDE(param) == 2) || \ + (XAI_CNN_CONV_GET_STRIDE(param) == 4), XAI_ERR_BADARG, \ + "Stride = %hhu, value should be 1, 2 or 4", XAI_CNN_CONV_GET_STRIDE(param)); + XAI_CHECK_ERROR((XAI_CNN_CONV_GET_STRIDEX(param) == XAI_CNN_CONV_GET_STRIDEY(param)), \ + XAI_ERR_BADARG, "\nStride along width = %hhu and height = %hhu\nStride along width and height should be equal", \ + XAI_CNN_CONV_GET_STRIDEX(param), XAI_CNN_CONV_GET_STRIDEY(param)); + XAI_CHECK_ERROR((XAI_CNN_CONV_GET_DILATION(param) > 0), \ + XAI_ERR_BADARG, "\nDilation = %hhu, value should be greater than zero", \ + XAI_CNN_CONV_GET_DILATION(param)); + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_DILATIONX(param) == XAI_CNN_CONV_GET_DILATIONY(param), \ + XAI_ERR_BADARG, "\nDilation along width = %hhu and height = %hhu\nDilation along width and height should be equal", \ + XAI_CNN_CONV_GET_DILATIONX(param), XAI_CNN_CONV_GET_DILATIONY(param)); + XAI_CHECK_TILE4D_IALIGNMENT_2NX8(coeffTile); + XAI_CHECK_TILE3D_DATA_ORDER(inTile, XAI_WHD); + XAI_CHECK_TILE3D_DATA_ORDER(outTile, XAI_DWH); + XAI_CHECK_TILE4D_DATA_ORDER(coeffTile, XAI_NDWH); + XAI_CHECK_EDGES_MOD_WHD(inTile, coeffTile, param); + XAI_CHECK_CONSISTENCY_MOD_WHD_DWH(inTile, coeffTile, biasArray, outTile, param); + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_ACCUM_SHIFT(param) < 24, \ + XAI_ERR_NORM, "\nThe accumulator shift = %hhu, value should be less than 24", \ + XAI_CNN_CONV_GET_ACCUM_SHIFT(param)); + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_OUTPUT_SHIFT(param) < 32, \ + XAI_ERR_NORM, "\nThe output shift = %hhu, value should be less than 32", \ + 
XAI_CNN_CONV_GET_OUTPUT_SHIFT(param)); + if (XAI_CNN_CONV_GET_DILATION(param) > 1) + { + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_STRIDE(param) == 1, \ + XAI_ERR_BADARG, "\nStride = %hhu, Dilation = %hhu\nWhen dilation parameter is more than 1 stride always has to be 1", \ + XAI_CNN_CONV_GET_STRIDE(param), XAI_CNN_CONV_GET_DILATION(param)); + } + XAI_CHECK_CONV_RELU_LIMITS_IX(param, outTile); +#ifdef DILATED_VQ_CONV + XAI_CHECK_ARRAY_U16(outputScaleArray); + XAI_CHECK_ERROR(XAI_ARRAY_GET_WIDTH(outputScaleArray) >= XAI_TILE4D_GET_DIM1(coeffTile), \ + XAI_ERR_DATASIZE, "\nWidth of Output Scale Array = %d, Number of Kernels = %d\nWidth of Output Scale Array should be greater than or equal to Number of Kernels", \ + XAI_ARRAY_GET_WIDTH(outputScaleArray), XAI_TILE4D_GET_DIM1(coeffTile)); +#endif + } + +#ifndef DILATED_VQ_CONV + if (XAI_CNN_CONV_GET_OUTPUT_SCALE(param) == 0) + { /* Non-VQ build only: a zero output scale skips the convolution and fills outTile with a constant, 0 without ReLU or CLAMP(0, reluMin, reluMax) with ReLU */ + int32_t fillValue; + int32_t reluFlag = XAI_CNN_CONV_GET_FLAG_RELU(param); + fillValue = reluFlag ? (CLAMP(0, XAI_CNN_CONV_GET_RELU_MIN(param), XAI_CNN_CONV_GET_RELU_MAX(param))) : 0; + return(xaiFillTile3D(outTile, fillValue, 0)); + } +#endif + /* Getting parameters from the tile structures (outTile is DWH: dim1 = channels, dim2 = width, dim3 = height; inTile is WHD: dim3 = channels) */ + const int32_t outW = XAI_TILE3D_GET_DIM2(outTile); + const int32_t outH = XAI_TILE3D_GET_DIM3(outTile); + const int32_t numInCh = XAI_TILE3D_GET_DIM3(inTile); + const int32_t numOutCh = XAI_TILE3D_GET_DIM1(outTile); + + XAI_ERROR_CHECKS_CONTINUE() + { + if (numInCh > 1) + { + /* Max value of Gather Offset is (stride* inDataPitch1)+ stride + (min(numInCh-1,3)*inDataPitch2 + dilation); the check below bounds inDataPitch2 so that this maximum stays within USHRT_MAX (16-bit gather offsets) */ + XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM2_PITCH(inTile) < \ + ((USHRT_MAX - (XAI_CNN_CONV_GET_STRIDE(param) * XAI_TILE3D_GET_DIM1_PITCH(inTile) * \ + XT_MIN(1, outH - 1)) - XAI_CNN_CONV_GET_STRIDE(param)) - XAI_CNN_CONV_GET_DILATION(param)) / \ + XT_MIN(numInCh - 1, 3), \ + XAI_ERR_BADARG, "\ndim2Pitch value of inTile = %d, should be less than Gather Offset(16-bit limit) - %d", \ + XAI_TILE3D_GET_DIM2_PITCH(inTile), \ + ((USHRT_MAX 
- (XAI_CNN_CONV_GET_STRIDE(param) * XAI_TILE3D_GET_DIM1_PITCH(inTile) * \ + XT_MIN(1, outH - 1)) - XAI_CNN_CONV_GET_STRIDE(param)) - XAI_CNN_CONV_GET_DILATION(param)) / \ + XT_MIN(numInCh - 1, 3)); + } + } + + /* CNN convolution parameters */ + const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param); + const uint8_t outShiftU = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param); + const uint8_t enableReLu = XAI_CNN_CONV_GET_FLAG_RELU(param); + const uint8_t stride = XAI_CNN_CONV_GET_STRIDE(param); + const uint8_t dilation = XAI_CNN_CONV_GET_DILATION(param); + + /* Data Pointers of input, output, coefficient and bias data */ + int8_t *pInData = (int8_t *) XAI_TILE3D_GET_DATA_PTR(inTile); + int8_t *pOutData = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile); + int8_t *pCoeffData = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile); + int32_t *pBiasData = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray); +#ifdef DILATED_VQ_CONV + xb_vecNx16U* restrict pOutScaleData = (xb_vecNx16U *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray); +#else + const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param); +#endif + + /* Pitches of Coefficient Data (NDWH) in dim1, dim2 and dim3; kSizeU is the kernel size taken from dim3 */ + const int32_t coeffPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTile); + const int32_t coeffPitch2 = XAI_TILE4D_GET_DIM2_PITCH(coeffTile); + const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile); + const int32_t kSizeU = XAI_TILE4D_GET_DIM3(coeffTile); + + /* Pitches of Input Data (WHD) in dim1 and dim2: inDataPitch1 steps one input row, inDataPitch2 steps one input channel */ + const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile); + const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile); + + /* Pitch of Output Data (DWH) in dim1 and dim2 */ + const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile); + const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile); + const uint8_t leftEdgeFlag = XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param); + const uint8_t topEdgeFlag = XAI_CNN_CONV_GET_FLAG_TOPEDGE(param); + int32_t dilatedKWidthU = dilation * (kSizeU - 1) + 
1; + int32_t dilatedKHeightU = dilation * (kSizeU - 1) + 1; + int32_t leftEdge, topEdge; /* Edge sizes: half of the dilated kernel extent; when that extent is even the edge flag selects extent/2 (flag set) or extent/2 - 1 (flag clear) */ + + if ((dilatedKWidthU % 2) != 0) + { + leftEdge = dilatedKWidthU / 2; + } + else + { + leftEdge = leftEdgeFlag ? (dilatedKWidthU / 2) : ((dilatedKWidthU / 2) - 1); + } + + if ((dilatedKHeightU % 2) != 0) + { + topEdge = dilatedKHeightU / 2; + } + else + { + topEdge = topEdgeFlag ? (dilatedKHeightU / 2) : ((dilatedKHeightU / 2) - 1); + } + + /* Move to the start of the input data including edges: step back topEdge rows and leftEdge columns */ + pInData = &pInData[-(topEdge * inDataPitch1 + leftEdge)]; + + /* Setting the limits for output data according to ReLu Flag and outTileType */ + int32_t minLim, maxLim; + if (enableReLu) + { + minLim = XAI_CNN_CONV_GET_RELU_MIN(param); + maxLim = XAI_CNN_CONV_GET_RELU_MAX(param); + } + else + { + minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \ + SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0); + maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \ + : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX); + } + const int8_t typeFlag = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0; + const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile); + + /* Variable Declarations */ + int32_t inCh, outCh, x, y; + valign vaOutData = IVP_ZALIGN(); + + /* One Gather per kernel row (ky) is used in the inner most loop in this + * approach to get the Input Data for 4 Output Vectors. + * In every Gather, 32 elements are read, where first 16 + * of them correspond to two vectors of Output along the width + * and the other 16 of them correspond to two vectors of Output + * along the height. To get the index values for the Gather, + * the following calculations are made. + */ + + /* Sequence - 0 1 2 3 0 1 2 3 0 1 2 3 0 1 2 3 0 1 2 3 0 1 2 3 ... */ + xb_vecNx16 vecGather0123 = IVP_ANDNX16(IVP_SEQNX16(), 3); + xb_vecNx16 vecSelIdx = IVP_SEQNX16(); + /* To get the Select indexes as - 0 1 2 3 4...7 32 33 34 35 36.... 
*/ + IVP_ADDNX16T(vecSelIdx, vecSelIdx, 24, IVP_NOTBN(IVP_LTRNI(8))); + /* To get - 0 0 0 0 d*1 d*1 d*1 d*1 d*2 d*2 d*2 d*2 d*3 d*3 d*3 d*3... */ + xb_vecNx16U vecGatherOff = IVP_SRLINX16(IVP_SEQNX16(), 2); + vecGatherOff = IVP_MULNX16UPACKL(vecGatherOff, (uint16_t) dilation); + /* Sequence - 0 P2 2*P2 3*P2 d*1 P2+d*1 2*P2+d*1 3*P2+d*1 d*2 P2+d*2 2*P2+d*2 3*P2+d*2 .. */ + IVP_MULANX16PACKL(vecGatherOff, vecGather0123, inDataPitch2); + vecGatherOff = IVP_SELNX16(IVP_ADDNX16(vecGatherOff, stride), \ + vecGatherOff, vecSelIdx); + vecSelIdx = IVP_SEQNX16(); + IVP_ADDNX16T(vecSelIdx, vecSelIdx, 16, IVP_NOTBN(IVP_LTRNI(16))); + vecGatherOff = IVP_SELNX16(IVP_ADDNX16(vecGatherOff, stride * inDataPitch1), vecGatherOff, vecSelIdx); + + /* + The generated sequence is (P1 = inDataPitch1, P2 = inDataPitch2, s = stride, d = dilation): + * 0 P2 2*P2 3*P2 + * d P2+d 2*P2+d 3*P2+d + * s s+P2 s+2*P2 s+3*P2 + * s+d*1 s+P2+d s+2*P2+d s+3*P2+d + * (s*P1)+0 (s*P1)+P2 (s*P1)+2*P2 (s*P1)+3*P2 + * (s*P1)+d (s*P1)+P2+d (s*P1)+2*P2+d (s*P1)+3*P2+d + * (s*P1)+s (s*P1)+s+P2 (s*P1)+s+2*P2 (s*P1)+s+3*P2 + * (s*P1)+s+d (s*P1)+s+P2+d (s*P1)+s+2*P2+d (s*P1)+s+3*P2+d + */ + + xb_vecN_2x32v* restrict phvecBias; + xb_vec2Nx8* restrict pdvecCoeff; + xb_vec2Nx8* restrict pdvecOut; + int8_t* restrict pData1; + int8_t* restrict pData2; + + + int32_t remInCh = numInCh & 3; + + /*Generation of maskLut for handling cases when remInCh is not equal to 0; one mask byte is enabled per valid leftover input channel */ + /*eg. if remInCh is equal to 1 then sumMask is 000000FF */ + /* if remInCh is equal to 2 then sumMask is 0000FFFF, and if remInCh is equal to 3 then sumMask is 00FFFFFF (assuming XT_SALT(a, b) evaluates to 1 when a < b and to 0 otherwise) */ + const uint32_t maskLut[3] = { 0xff, 0xff00, 0xff0000 }; + + uint8_t remCh1 = XT_SALT(2, remInCh + 1); + uint8_t remCh2 = XT_SALT(3, remInCh + 1); + + uint32_t sumMask = maskLut[0] + maskLut[1] * remCh1 + maskLut[2] * remCh2; + +#ifdef __XCC__ + XT_MEMW(); /* Adding Memory Wait as Gather and Normal Load/Stores are not synchronized */ +#endif + + /* Unrolled by 2 along both Output Width and Output Height. + * Also, unrolled along Input Channels by 4 and completely + * along the Kernel Width. 
Gathers are used for loading Input Data. + */ + /* Loops Start */ + for (outCh = 0; outCh < numOutCh; outCh += 2 * XCHAL_IVPN_SIMD_WIDTH) /* Output channels */ + { /* walk across the kernels */ + /* To handle corner case when number of output channels + * is not a multiple of 2 * XCHAL_IVPN_SIMD_WIDTH*/ + int32_t remainingOutCh = (numOutCh - outCh); +#ifdef DILATED_VQ_CONV + xb_vecNx16U outScaleDataEven, outScaleDataOdd; + /*Load output scale values*/ + VQ_INIT_OUTSCALE(pOutScaleData, remainingOutCh, outScaleDataEven, outScaleDataOdd); +#endif + for (y = 0; y < outH; y += 2) /* Image Height */ + { /* walk down the rows */ + /* Variable used to handle the corner case of OutHeight being odd: numY is 1 when two output rows remain and 0 when only one remains */ + int32_t numY = XT_MIN(2, outH - y) - 1; + for (x = 0; x < outW; x += 2) /* Image Width */ + { /* walk across the columns */ + xb_vecNx16U vecGatherOff1; + + /* Variable used to handle the corner case of Output Width being odd: numX is 1 when two output columns remain and 0 when only one remains */ + int32_t numX = XT_MIN(2, outW - x) - 1; + + /* Output, Input and Coefficient Data Pointers */ + int8_t *pOut = pOutData + (x * outDataPitch1 + y * outDataPitch2) * bytesPerPixel; + int8_t *pData = pInData + (x * stride) + (y * stride) * inDataPitch1; + int8_t *pCoeff = pCoeffData + outCh; + + /* Initialize accumulators with bias values */ + xb_vec2Nx24 daccSum1, daccSum2, daccSum3, daccSum4; + phvecBias = (xb_vecN_2x32v *) (pBiasData + outCh); + ACC_INIT_BIAS(phvecBias, remainingOutCh, daccSum1, daccSum2, daccSum3, daccSum4); + + /* Boolean vectors to handle the corner cases of Out Width and Height being odd */ + vboolN vbXY = IVP_LTRSN((16 * numY) + 8 * (numX + 1)); + + /* Initialise input data pointers: pData1 is gathered for kernel row ky=0, pData2 (dilation input rows further down) for kernel row ky=1 */ + pData1 = pData; + pData2 = pData + (dilation * inDataPitch1); + + /* Initialise co-efficient pointer */ + pdvecCoeff = (xb_vec2Nx8 *) (pCoeff); + + /* Assign gather offset considering corner cases of odd output height and width */ + vecGatherOff1 = IVP_MOVNX16UT(vecGatherOff, 0, vbXY); + + for (inCh = 0; inCh < numInCh - 3; inCh += 4) /* Input 
Channels Loop */ + { + /* Gather Input Data corresponding to ky=0 */ + xb_gsr gather1 = IVP_GATHERANX8S(pData1, vecGatherOff1); + xb_vec2Nx8 dvecData1 = IVP_GATHERD2NX8_L(gather1); + + /* Gather Input Data corresponding to ky=1 */ + xb_gsr gather2 = IVP_GATHERANX8S(pData2, vecGatherOff1); + xb_vec2Nx8 dvecData2 = IVP_GATHERD2NX8_L(gather2); + + /* kx = 0, ky =0 */ + /* Extracting scalar integers for QMULs: each 32-bit word holds 4 input channels of one (output position, kx) pair; even words are kx=0, odd words are kx=1 */ + int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData1)), 0); + int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData1)), 2); + int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData1)), 4); + int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData1)), 6); + + /* 4 Aligned Vector Loads of coefficients; the last post-increment moves the pointer to kernel position kx=1 (one coeffPitch2 past the start of this tap) */ + xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1); + xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1); + xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1); + xb_vec2Nx8 dvecCoeff4; IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, \ + coeffPitch2 - (3 * coeffPitch1)); + + IVP_MULQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1); + IVP_MULQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2); + IVP_MULQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3); + IVP_MULQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4); + + /* kx = 1, ky = 0*/ + /* Extracting scalar integers for QMULs */ + qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), \ + 1); + qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), \ + 3); + qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), \ + 5); + qmulScalar4 = 
IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), \ + 7); + + /* 4 Aligned Vector Loads of coefficients; the last post-increment moves the pointer from kernel position (kx=1, ky=0) to (kx=0, ky=1) */ + IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1); + IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1); + IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1); + IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, -3 * coeffPitch1 - coeffPitch2 + coeffPitch3); + + IVP_MULQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1); + IVP_MULQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2); + IVP_MULQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3); + IVP_MULQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4); + + /* kx = 0, ky =1 */ + /* Extracting scalar integers for QMULs */ + qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData2)), 0); + qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData2)), 2); + qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData2)), 4); + qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData2)), 6); + + /* 4 Aligned Vector Loads of coefficients */ + IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1); + IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1); + IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1); + IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch2 - (3 * coeffPitch1)); + + IVP_MULQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1); + IVP_MULQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2); + IVP_MULQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3); + IVP_MULQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4); + + + /* kx = 1, ky = 1*/ + /* Extracting scalar integers for QMULs */ + qmulScalar1 = 
IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \ + 1); + qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \ + 3); + qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \ + 5); + qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \ + 7); + + /* 4 Aligned Vector Loads of coefficients; the last post-increment undoes the kx/ky offsets, so over the four taps the pointer advances by 4 * coeffPitch1 in total (the next 4 input channels at kx=0, ky=0) */ + IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1); + IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1); + IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1); + IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch1 - coeffPitch2 - coeffPitch3); + + IVP_MULQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1); + IVP_MULQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2); + IVP_MULQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3); + IVP_MULQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4); + pData1 += (4 * inDataPitch2); + pData2 += (4 * inDataPitch2); + } /* End Input Channels */ + + /* Handling Corner cases of Number of Input Channels not being multiple of 4 */ + if (remInCh) + { + vboolN vbRemInCh = IVP_LTNX16(IVP_ANDNX16(IVP_SEQNX16(), 3), remInCh); + + /* Gather Input Data */ + xb_vec2Nx8 dvecData1 = 0; + + /* Assign valid address for predicated false lines */ + vecGatherOff1 = IVP_MOVNX16UT(vecGatherOff, 0, IVP_ANDBN(vbRemInCh, vbXY)); + xb_gsr gather1 = IVP_GATHERANX8S(pData1, vecGatherOff1); + dvecData1 = IVP_GATHERD2NX8_L(gather1); + + /* Gather Input Data */ + xb_vec2Nx8 dvecData2 = 0; + xb_gsr gather2 = IVP_GATHERANX8S(pData2, vecGatherOff1); + dvecData2 = IVP_GATHERD2NX8_L(gather2); + + /* kx = 0, ky = 0 */ + /* Extracting scalar integers for QMULs */ + int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData1)), 0); + int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + 
(IVP_MOVNX16_FROM2NX8(dvecData1)), 2); + int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData1)), 4); + int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData1)), 6); + + /* Aligned Vector Loads of coefficients; remCh1 / remCh2 zero the post-increment when the next leftover channel does not exist, so only rows of valid channels are loaded before the jump to the next kernel position */ + xb_vec2Nx8 dvecCoeff1; + IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1 * remCh1); + xb_vec2Nx8 dvecCoeff2; + IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1 * remCh2); + xb_vec2Nx8 dvecCoeff3; + IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch2 - (coeffPitch1 * (remCh1 + remCh2))); + + /* Masking the qmulScalar values to avoid accumulation with unintended values*/ + IVP_MULQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1 & sumMask); + IVP_MULQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2 & sumMask); + IVP_MULQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3 & sumMask); + IVP_MULQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4 & sumMask); + + /* kx = 1, ky = 0 */ + /* Extracting scalar integers for QMULs */ + qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), \ + 1); + qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), \ + 3); + qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), \ + 5); + qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), \ + 7); + + /* Aligned Vector Loads of coefficients */ + IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1 * remCh1); + IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1 * remCh2); + IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, (-(remCh1 + remCh2) * coeffPitch1) - coeffPitch2 + coeffPitch3); + + /* Masking the qmulScalar values to avoid accumulation with unintended values*/ + IVP_MULQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1 & sumMask); + IVP_MULQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, 
dvecCoeff1, qmulScalar2 & sumMask); + IVP_MULQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3 & sumMask); + IVP_MULQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4 & sumMask); + + /* kx = 0, ky = 1 */ + /* Extracting scalar integers for QMULs */ + qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData2)), 0); + qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData2)), 2); + qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData2)), 4); + qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData2)), 6); + + /* Aligned Vector Loads of coefficients */ + IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1 * remCh1); + IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1 * remCh2); + IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch2 - (remCh1 + remCh2) * coeffPitch1); + + /* Masking the qmulScalar values to avoid accumulation with unintended values*/ + IVP_MULQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1 & sumMask); + IVP_MULQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2 & sumMask); + IVP_MULQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3 & sumMask); + IVP_MULQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4 & sumMask); + + /* kx = 1, ky = 1*/ + /* Extracting scalar integers for QMULs */ + qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \ + 1); + qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \ + 3); + qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \ + 5); + qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \ + 7); + + /* Aligned Vector Loads of coefficients; this is the last tap, so the final load leaves the pointer where it is (post-increment 0) */ + IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1 * remCh1); + IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1 
* remCh2); + IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, 0); + + /* Masking the qmulScalar values to avoid accumulation with unintended values*/ + IVP_MULQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1 & sumMask); + IVP_MULQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2 & sumMask); + IVP_MULQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3 & sumMask); + IVP_MULQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4 & sumMask); + } /* End Input Channels Corner case Handling */ + + /* Pack, Output Scale, Output Shift and clamping */ + xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L; + xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H; +#ifdef DILATED_VQ_CONV + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); +#else + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); +#endif + /* Store the output dvecOut1 along the output depth; the H half is only written for S16 output (typeFlag == 1) */ + pdvecOut = 
(xb_vec2Nx8 *) (pOut + outCh * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh); + IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Store the output dvecOut2 along the output depth; when numX is 0 (odd-width tail) the address falls back to the first output and the store lengths evaluate to 0, which presumably makes these stores no-ops */ + pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1 * numX) * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numX); + IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numX); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Store the output dvecOut3 along the output depth; scaled by numY in the same way for the odd-height tail */ + pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch2 * numY) * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numY); + IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numY); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Store the output dvecOut4 along the output depth; scaled by both numX and numY */ + pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1 * numX + outDataPitch2 * numY) * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numX * \ + numY); + IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numX * numY); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + } /* End image width */ + } /* End image height */ + } /* End Output Channels */ + return(XAI_ERROR_STATUS()); +} + +/***************************************************************************** +* xaiConvolvedVQ3D_S_3x3_S8S8IXCa2_MOD_WHD_DWH +* **************************************************************************/ + +/****************************************************************************/ +/* Description : P6 optimized generic implementation for 3x3 MOD_WHD_DWH */ +/* 3D 
convolution. Based on pre-processor specifiers. Code */ +/* implementation is generated during preprocessing stage. */ +/* This method can be used to generate 3x3 MOD_WHD_DWH 3D */ +/* dilated convolution function and 3x3 MOD_WHD_DWH 3D VQ */ +/* dilated convolution function */ +/* Inputs : Input Data Tile, Coeff Data Tile, Bias Array, */ +/* Output scale array, CNN convolution params structure */ +/* Outputs : XI Error Code */ +/* InOuts : Output Tile */ +/* Assumptions : InData, CoeffData are S8 */ +/* biasArray is signed 32b, value not exceeding signed 24b */ +/* Output scale array is U16 */ +/* OutData is S8 / U8 / S16 */ +/* Kernel Size is 3x3xDxN */ +/* Input is in WHD and Output is in DWH format */ +/* Coeff is in NDWH format */ +/* CoeffDim1Pitch is aligned to 2N (Ca2) */ +/****************************************************************************/ + + +#ifdef DILATED_VQ_CONV +XAI_ERR_TYPE xaiConvolvedVQ3D_S_3x3_S8S8IXCa2_MOD_WHD_DWH( + const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param + ) +#else +XAI_ERR_TYPE xaiConvolved3D_S_3x3_S8S8IXCa2_MOD_WHD_DWH( + const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param + ) +#endif +{ + /* Error Checks */ + XAI_ERROR_CHECKS() + { + XAI_CHECK_TILE3D_S8(inTile); + XAI_CHECK_CONV_OUTPUT_TILE3D(outTile); + XAI_CHECK_TILE4D_S8(coeffTile); + XAI_CHECK_TILE3D_FITS_IN_SINGLE_DRAM(inTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile); + XAI_CHECK_TILE4D_IN_DRAM_BOUNDARY(coeffTile); + XAI_CHECK_POINTER(param); + XAI_CHECK_ARRAY_S32(biasArray); + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTile); + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(coeffTile, outTile); + XAI_CHECK_KERNEL_SIZE(coeffTile, 3); + XAI_CHECK_ERROR((XAI_CNN_CONV_GET_STRIDEX(param) == XAI_CNN_CONV_GET_STRIDEY(param)), \ + XAI_ERR_BADARG, "Stride
along width = %hhu and height = %hhu\nStride along width and height should be equal", \ + XAI_CNN_CONV_GET_STRIDEX(param), XAI_CNN_CONV_GET_STRIDEY(param)); + XAI_CHECK_ERROR((XAI_CNN_CONV_GET_STRIDE(param) == 1) || \ + (XAI_CNN_CONV_GET_STRIDE(param) == 2) || \ + (XAI_CNN_CONV_GET_STRIDE(param) == 4), XAI_ERR_BADARG, \ + "\nStride = %hhu, value should be 1, 2 or 4", XAI_CNN_CONV_GET_STRIDE(param)); + XAI_CHECK_ERROR((XAI_CNN_CONV_GET_DILATION(param) > 0), \ + XAI_ERR_BADARG, "\nDilation = %hhu, value should be greater than zero", \ + XAI_CNN_CONV_GET_DILATION(param)); + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_DILATIONX(param) == XAI_CNN_CONV_GET_DILATIONY(param), \ + XAI_ERR_BADARG, "\nDilation along width = %hhu and height = %hhu\nDilation along width and height should be equal", \ + XAI_CNN_CONV_GET_DILATIONX(param), XAI_CNN_CONV_GET_DILATIONY(param)); + XAI_CHECK_TILE4D_IALIGNMENT_2NX8(coeffTile); + XAI_CHECK_TILE3D_DATA_ORDER(inTile, XAI_WHD); + XAI_CHECK_TILE3D_DATA_ORDER(outTile, XAI_DWH); + XAI_CHECK_TILE4D_DATA_ORDER(coeffTile, XAI_NDWH); + XAI_CHECK_TILE3D_EDGE(inTile, 1 + (XAI_CNN_CONV_GET_DILATION(param) - 1)); + XAI_CHECK_CONSISTENCY_MOD_WHD_DWH(inTile, coeffTile, biasArray, outTile, param); + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_OUTPUT_SHIFT(param) < 32, \ + XAI_ERR_NORM, "\nThe output shift = %hhu, value should be less than 32", \ + XAI_CNN_CONV_GET_OUTPUT_SHIFT(param)); + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_ACCUM_SHIFT(param) < 24, \ + XAI_ERR_NORM, "\nThe accumulator shift = %hhu, value should be less than 24", \ + XAI_CNN_CONV_GET_ACCUM_SHIFT(param)); + + if (XAI_CNN_CONV_GET_DILATION(param) > 1) + { + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_STRIDE(param) == 1, \ + XAI_ERR_BADARG, "\nStride = %hhu, Dilation = %hhu\nWhen dilation parameter is more than 1 stride always has to be 1", \ + XAI_CNN_CONV_GET_STRIDE(param), XAI_CNN_CONV_GET_DILATION(param)); + } + XAI_CHECK_CONV_RELU_LIMITS_IX(param, outTile); +#ifdef DILATED_VQ_CONV + XAI_CHECK_ARRAY_U16(outputScaleArray); +
XAI_CHECK_ERROR(XAI_ARRAY_GET_WIDTH(outputScaleArray) >= XAI_TILE4D_GET_DIM1(coeffTile), \ + XAI_ERR_DATASIZE, "\nWidth of Output Scale Array = %d, Number of Kernels = %d\nWidth of Output Scale Array should be greater than or equal to Number of Kernels", \ + XAI_ARRAY_GET_WIDTH(outputScaleArray), XAI_TILE4D_GET_DIM1(coeffTile)); +#endif + } + + +#ifndef DILATED_VQ_CONV + if (XAI_CNN_CONV_GET_OUTPUT_SCALE(param) == 0) + { + int32_t fillValue; + int32_t reluFlag = XAI_CNN_CONV_GET_FLAG_RELU(param); + fillValue = reluFlag ? (CLAMP(0, XAI_CNN_CONV_GET_RELU_MIN(param), XAI_CNN_CONV_GET_RELU_MAX(param))) : 0; + return(xaiFillTile3D(outTile, fillValue, 0)); + } +#endif + /* Getting parameters from the tile structures */ + const int32_t outW = XAI_TILE3D_GET_DIM2(outTile); + const int32_t outH = XAI_TILE3D_GET_DIM3(outTile); + const int32_t numInCh = XAI_TILE3D_GET_DIM3(inTile); + const int32_t numOutCh = XAI_TILE3D_GET_DIM1(outTile); + + XAI_ERROR_CHECKS_CONTINUE() + { + if (numInCh > 1) + { + /* Max value of Gather Offset is (min(numInCh-1,3)*inDataPitch2 + stride + 2 * dilation) */ + XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM2_PITCH(inTile) < \ + ((USHRT_MAX - XAI_CNN_CONV_GET_STRIDE(param) - \ + 2 * XAI_CNN_CONV_GET_DILATION(param)) / XT_MIN(numInCh - 1, 3)), \ + XAI_ERR_BADARG, "\ndim2Pitch value of inTile = %d, should be less than Gather Offset(16-bit limit) - %d", \ + XAI_TILE3D_GET_DIM2_PITCH(inTile), \ + ((USHRT_MAX - XAI_CNN_CONV_GET_STRIDE(param) - \ + 2 * XAI_CNN_CONV_GET_DILATION(param)) / XT_MIN(numInCh - 1, 3))); + } + } + + /* CNN convolution parameters */ + const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param); + const uint8_t outShiftU = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param); + const uint8_t enableReLu = XAI_CNN_CONV_GET_FLAG_RELU(param); + const uint8_t stride = XAI_CNN_CONV_GET_STRIDE(param); + const uint8_t dilation = XAI_CNN_CONV_GET_DILATION(param); + + /* Data Pointers of input, output, coefficient and bias data */ + int8_t *pInData = (int8_t *)
XAI_TILE3D_GET_DATA_PTR(inTile); + int8_t *pOutData = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile); + int8_t *pCoeffData = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile); + int32_t *pBiasData = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray); +#ifdef DILATED_VQ_CONV + xb_vecNx16U* restrict pOutScaleData = (xb_vecNx16U *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray); +#else + const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param); +#endif + /* Pitches of Coefficient Data (NDWH) in dim1, dim2 and dim3 */ + const int32_t coeffPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTile); + const int32_t coeffPitch2 = XAI_TILE4D_GET_DIM2_PITCH(coeffTile); + const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile); + const int32_t kSizeU = XAI_TILE4D_GET_DIM3(coeffTile); + + /* Pitches of Input Data (WHD) in dim1 and dim2 */ + const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile); + const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile); + + /* Pitch of Output Data (DWH) in dim1 and dim2 */ + const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile); + const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile); + + int32_t dilatedKSize = dilation * (kSizeU - 1) + 1; + + /* move to start of edge data only when input is already padded. */ + pInData = &pInData[-((dilatedKSize / 2) * inDataPitch1 + (dilatedKSize / 2))]; + + /* Setting the limits for output data according to ReLu Flag and outTileType */ + int32_t minLim, maxLim; + if (enableReLu) + { + minLim = XAI_CNN_CONV_GET_RELU_MIN(param); + maxLim = XAI_CNN_CONV_GET_RELU_MAX(param); + } + else + { + minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \ + SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0); + maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \ + : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX); + } + const int8_t typeFlag = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ?
1 : 0; + const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile); + + /* Variable Declarations */ + int32_t inCh, outCh, x, y, ky; + valign vaOutData = IVP_ZALIGN(); + + /* Only 2 Gathers are used in this approach to get the + * Input Data for 4 Output Vectors. In each Gather, + * 24 elements are read, where each 12 of them correspond + * to one vector of Output along the width. To get the + * index values for the Gather, the following calculations + * are made. + */ + + /* Gather Index Calculations */ + /* Sequence - 0 1 2 3 0 1 2 3 0 1 2 3 0 1 2 3 0 1 2 3 0 1 2 3 ... */ + xb_vecNx16 vecGather0123 = IVP_ANDNX16(IVP_SEQNX16(), 3); + xb_vecNx16 vecSelIdx = IVP_SEQNX16(); + /* To get the Select indexes as - 0 1 2 3 4...11 32 33 34 35 36.... */ + IVP_ADDNX16T(vecSelIdx, vecSelIdx, (XCHAL_IVPN_SIMD_WIDTH - 12), IVP_NOTBN(IVP_LTRNI(12))); + /* To get - 0 0 0 0 d*1 d*1 d*1 d*1 d*2 d*2 d*2 d*2 d*3 d*3 d*3 d*3... */ + xb_vecNx16U vecGatherOff = IVP_SRLINX16(IVP_SEQNX16(), 2); + vecGatherOff = IVP_MULNX16UPACKL(vecGatherOff, (uint16_t) dilation); + /* Sequence - 0 P2 2*P2 3*P2 d*1 P2+d*1 2*P2+d*1 3*P2+d*1 d*2 P2+d*2 2*P2+d*2 3*P2+d*2 .. */ + IVP_MULANX16PACKL(vecGatherOff, vecGather0123, inDataPitch2); + vecGatherOff = IVP_SELNX16(IVP_ADDNX16(vecGatherOff, stride), \ + vecGatherOff, vecSelIdx); + /* Final Index Pattern is - + * 0 P2 2*P2 3*P2 d*1 P2+d*1 2*P2+d*1 3*P2+d*1 d*2 P2+d*2 2*P2+d*2 3*P2+d*2 + * s s+P2 s+2*P2 s+3*P2 s+d*1 s+P2+d*1 s+2*P2+d*1 s+3*P2+d*1 s+d*2 s+P2+d*2 s+2*P2+d*2 s+3*P2+d*2*/ + + xb_vecN_2x32v* restrict phvecBias; + xb_vec2Nx8* restrict pdvecCoeff; + xb_vec2Nx8* restrict pdvecOut; + int8_t* restrict pData1; + int8_t* restrict pData2; + + int32_t remInCh = numInCh & 3; + + /*Generation of maskLut for handling cases when remInCh is not equal to 0 */ + /*eg.
if remInCh is equal to 1 then sumMask is 000000FF (byte 0 only; remCh1 = remCh2 = 0) */ + /* if remInCh is equal to 2 then sumMask is 0000FFFF (remCh1 enables byte 1); 3 gives 00FFFFFF (remCh2 enables byte 2) - assumes XT_SALT(a, b) yields a < b, TODO confirm */ + const uint32_t maskLut[3] = { 0xff, 0xff00, 0xff0000 }; + + uint8_t remCh1 = XT_SALT(2, remInCh + 1); + uint8_t remCh2 = XT_SALT(3, remInCh + 1); + + uint32_t sumMask = maskLut[0] + maskLut[1] * remCh1 + maskLut[2] * remCh2; + +#ifdef __XCC__ + XT_MEMW(); /* Adding Memory Wait as Gather and Normal Load/Stores are not synchronized */ +#endif + + /* Unrolled by 2 along both Output Width and Output Height. + * Also, unrolled along Input Channels by 4 and completely + * along the Kernel Width. Gathers are used for loading Input Data. + */ + + /* Loops Start */ + for (outCh = 0; outCh < numOutCh; outCh += 2 * XCHAL_IVPN_SIMD_WIDTH) /* Output channels */ + { /* walk across the kernels */ + /* To handle corner case when number of output channels + * is not a multiple of 2 * XCHAL_IVPN_SIMD_WIDTH*/ + int32_t remainingOutCh = numOutCh - outCh; +#ifdef DILATED_VQ_CONV + xb_vecNx16U outScaleDataEven, outScaleDataOdd; + /*Load output scale values*/ + VQ_INIT_OUTSCALE(pOutScaleData, remainingOutCh, outScaleDataEven, outScaleDataOdd); +#endif + for (y = 0; y < outH; y += 2) /* Image Height */ + { /* walk down the rows */ + /* Variable used to handle the corner case of OutHeight being odd */ + int32_t numY = XT_MIN(2, outH - y) - 1; + for (x = 0; x < outW; x += 2) /* Image Width */ + { /* walk across the columns */ + xb_vecNx16U vecGatherOff1; + xb_vecNx16U vecGatherOff2; + + /* Variable used to handle the corner case of Output Width being odd */ + int32_t numX = XT_MIN(2, outW - x) - 1; + + /* Output, Input and Coefficient Data Pointers */ + int8_t *pOut = pOutData + (x * outDataPitch1 + y * outDataPitch2) * bytesPerPixel; + int8_t *pData = pInData + (x * stride) + (y * stride) * inDataPitch1; + int8_t *pCoeff = pCoeffData + outCh; + + /* Initialize accumulators with bias values */ + xb_vec2Nx24 daccSum1, daccSum2, daccSum3, daccSum4; + phvecBias = (xb_vecN_2x32v *)
(pBiasData + outCh); + ACC_INIT_BIAS(phvecBias, remainingOutCh, daccSum1, daccSum2, daccSum3, daccSum4); + + /* Boolean vectors to handle the corner cases of Out Width and Height being odd */ + vboolN vbX = IVP_LTRSN(12 * (numX + 1)); + vboolN vbY = IVP_LTRSN(12 * (numX + 1) * numY); + + for (ky = 0; ky < 3; ky++) /* Kernel Height Loop */ + { + /* Pointer for Input Data Load */ + pData1 = pData + ky * dilation * inDataPitch1; + pData2 = pData1 + (stride * inDataPitch1 * numY); + + /* Pointer for Coefficient Load */ + pdvecCoeff = (xb_vec2Nx8 *) (pCoeff + ky * coeffPitch3); + /* Assign valid address for predicated false lines */ + vecGatherOff1 = IVP_MOVNX16UT(vecGatherOff, 0, vbX); + vecGatherOff2 = IVP_MOVNX16UT(vecGatherOff, 0, vbY); + + for (inCh = 0; inCh < numInCh - 3; inCh += 4) /* Input Channels Loop */ + { + /* Gather Input Data */ + xb_gsr gather1 = IVP_GATHERANX8S(pData1, vecGatherOff1); + xb_vec2Nx8 dvecData1 = IVP_GATHERD2NX8_L(gather1); + xb_gsr gather2 = IVP_GATHERANX8S(pData2, vecGatherOff2); + xb_vec2Nx8 dvecData2 = IVP_GATHERD2NX8_L(gather2); + + pData1 += 4 * inDataPitch2; + pData2 += 4 * inDataPitch2; + + /* kx = 1 */ + /* Extracting scalar integers for QMULs */ + int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData1)), 0); + int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData1)), 3); + int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData2)), 0); + int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData2)), 3); + + /* 4 Aligned Vector Loads of coefficients */ + xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1); + xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1); + xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1); + xb_vec2Nx8 dvecCoeff4; IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch2 - \ + 3 * coeffPitch1);
+ + IVP_MULQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1); + IVP_MULQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2); + IVP_MULQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3); + IVP_MULQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4); + + /* kx = 2 */ + /* Extracting scalar integers for QMULs */ + qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), \ + 1); + qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), \ + 4); + qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \ + 1); + qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \ + 4); + + /* 4 Aligned Vector Loads of coefficients */ + IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1); + IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1); + IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1); + IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch2 - 3 * coeffPitch1); + + IVP_MULQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1); + IVP_MULQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2); + IVP_MULQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3); + IVP_MULQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4); + + /* kx = 3 */ + /* Extracting scalar integers for QMULs */ + qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), \ + 2); + qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), \ + 5); + qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \ + 2); + qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \ + 5); + + /* 4 Aligned Vector Loads of coefficients */ + IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff,
coeffPitch1); + IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1); + IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1); + IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch1 - 2 * coeffPitch2); + + IVP_MULQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1); + IVP_MULQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2); + IVP_MULQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3); + IVP_MULQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4); + } /* End Input Channels */ + + /* Handling Corner cases of Number of Input Channels not being multiple of 4 */ + if (inCh < numInCh) + { + int32_t remInCh = numInCh - inCh; + vboolN vbRemInCh = IVP_LTNX16(IVP_ANDNX16(IVP_SEQNX16(), 3), remInCh); + + /* Gather Input Data */ + xb_vec2Nx8 dvecData1 = 0; + xb_vec2Nx8 dvecData2 = 0; + /* Assign valid address for predicated false lines */ + vecGatherOff1 = IVP_MOVNX16UT(vecGatherOff, 0, IVP_ANDBN(vbRemInCh, vbX)); + vecGatherOff2 = IVP_MOVNX16UT(vecGatherOff, 0, IVP_ANDBN(vbRemInCh, vbY)); + + xb_gsr gather1 = IVP_GATHERANX8S(pData1, vecGatherOff1); + dvecData1 = IVP_GATHERD2NX8_L(gather1); + xb_gsr gather2 = IVP_GATHERANX8S(pData2, vecGatherOff2); + dvecData2 = IVP_GATHERD2NX8_L(gather2); + + /* kx = 1 */ + /* Extracting scalar integers for QMULs */ + int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData1)), 0); + int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData1)), 3); + int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData2)), 0); + int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData2)), 3); + + /* Aligned Vector Loads of coefficients */ + xb_vec2Nx8 dvecCoeff1; + IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1 * remCh1); + xb_vec2Nx8 dvecCoeff2; + IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1 *
remCh2); + xb_vec2Nx8 dvecCoeff3; + IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch2 - (coeffPitch1 * (remCh2 + remCh1))); + + /* Masking the qmulScalar values to avoid accumulation with unintended values*/ + IVP_MULQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1 & sumMask); + IVP_MULQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2 & sumMask); + IVP_MULQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3 & sumMask); + IVP_MULQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4 & sumMask); + + /* kx = 2 */ + /* Extracting scalar integers for QMULs */ + qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), \ + 1); + qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), \ + 4); + qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \ + 1); + qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \ + 4); + + /* Aligned Vector Loads of coefficients */ + IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1 * remCh1); + IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1 * remCh2); + IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch2 - (coeffPitch1 * (remCh1 + remCh2))); + + /* Masking the qmulScalar values to avoid accumulation with unintended values */ + IVP_MULQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1 & sumMask); + IVP_MULQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2 & sumMask); + IVP_MULQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3 & sumMask); + IVP_MULQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4 & sumMask); + + /* kx = 3 */ + /* Extracting scalar integers for QMULs */ + qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), \ + 2); + qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), \ + 5); + qmulScalar3 =
IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \ + 2); + qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \ + 5); + + /* Aligned Vector Loads of coefficients */ + IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1 * remCh1); + IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1 * remCh2); + IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, 0); + + /* Masking the qmulScalar values to avoid accumulation with unintended values */ + IVP_MULQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1 & sumMask); + IVP_MULQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2 & sumMask); + IVP_MULQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3 & sumMask); + IVP_MULQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4 & sumMask); + } /* End Input Channels Corner case Handling */ + } /* End Kernel Height Loop */ + + /* Pack, Output Scale, Output Shift and clamping */ + xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L; + xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H; +#ifdef DILATED_VQ_CONV + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); +#else + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \ + outScale, outShiftU,
minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); +#endif + /* Store the output dvecOut1 along the output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOut + outCh * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh); + IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Store the output dvecOut2 along the output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1 * numX) * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numX); + IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numX); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Store the output dvecOut3 along the output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch2 * numY) * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numY); + IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numY); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Store the output dvecOut4 along the output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1 * numX + outDataPitch2 * numY) * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numX * \ + numY); + IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numX * numY); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + } /* End image width */ + } /* End image height */ + } /* End Output Channels */ + return(XAI_ERROR_STATUS()); +} +
+/***************************************************************************** +* xaiConvolvedVQ3D_S_4x4_S8S8IXCa2_MOD_WHD_DWH +* **************************************************************************/ + +/****************************************************************************/ +/* Description : P6 optimized generic implementation for 4x4 MOD_WHD_DWH */ +/* 3D convolution. Based on pre-processor specifiers. Code */ +/* implementation is generated during preprocessing stage. */ +/* This method can be used to generate 4x4 MOD_WHD_DWH 3D */ +/* dilated convolution function and 4x4 MOD_WHD_DWH 3D VQ */ +/* dilated convolution function */ +/* Inputs : Input Data Tile, Coeff Data Tile, Bias Array, */ +/* Output scale array, CNN convolution params structure */ +/* Outputs : XI Error Code */ +/* InOuts : Output Tile */ +/* Assumptions : InData, CoeffData are S8 */ +/* biasArray is signed 32b, value not exceeding signed 24b */ +/* Output scale array is U16 */ +/* OutData is S8 / U8 / S16 */ +/* Kernel Size is 4x4xDxN */ +/* Input is in WHD and Output is in DWH format */ +/* Coeff is in NDWH format */ +/* CoeffDim1Pitch is aligned to 2N (Ca2) */ +/****************************************************************************/ + +#ifdef DILATED_VQ_CONV +XAI_ERR_TYPE xaiConvolvedVQ3D_S_4x4_S8S8IXCa2_MOD_WHD_DWH( + const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param + ) +#else +XAI_ERR_TYPE xaiConvolved3D_S_4x4_S8S8IXCa2_MOD_WHD_DWH( + const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param) +#endif +{ + /* Error Checks */ + XAI_ERROR_CHECKS() + { + XAI_CHECK_TILE3D_S8(inTile); + XAI_CHECK_CONV_OUTPUT_TILE3D(outTile); + XAI_CHECK_TILE4D_S8(coeffTile); + XAI_CHECK_TILE3D_FITS_IN_SINGLE_DRAM(inTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile); +
XAI_CHECK_TILE4D_IN_DRAM_BOUNDARY(coeffTile); + XAI_CHECK_POINTER(param); + XAI_CHECK_ARRAY_S32(biasArray); + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTile); + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(coeffTile, outTile); + XAI_CHECK_KERNEL_SIZE(coeffTile, 4); + XAI_CHECK_ERROR((XAI_CNN_CONV_GET_STRIDE(param) == 1) || \ + (XAI_CNN_CONV_GET_STRIDE(param) == 2) || \ + (XAI_CNN_CONV_GET_STRIDE(param) == 4), XAI_ERR_BADARG, \ + "Stride = %hhu, value should be 1, 2 or 4", XAI_CNN_CONV_GET_STRIDE(param)); + XAI_CHECK_ERROR((XAI_CNN_CONV_GET_STRIDEX(param) == XAI_CNN_CONV_GET_STRIDEY(param)), \ + XAI_ERR_BADARG, "\nStride along width = %hhu and height = %hhu\nStride along width and height should be equal", \ + XAI_CNN_CONV_GET_STRIDEX(param), XAI_CNN_CONV_GET_STRIDEY(param)); + XAI_CHECK_ERROR((XAI_CNN_CONV_GET_DILATION(param) > 0), \ + XAI_ERR_BADARG, "\nDilation = %hhu, value should be greater than zero", \ + XAI_CNN_CONV_GET_DILATION(param)); + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_DILATIONX(param) == XAI_CNN_CONV_GET_DILATIONY(param), \ + XAI_ERR_BADARG, "\nDilation along width = %hhu and height = %hhu\nDilation along width and height should be equal", \ + XAI_CNN_CONV_GET_DILATIONX(param), XAI_CNN_CONV_GET_DILATIONY(param)); + XAI_CHECK_TILE4D_IALIGNMENT_2NX8(coeffTile); + XAI_CHECK_TILE3D_DATA_ORDER(inTile, XAI_WHD); + XAI_CHECK_TILE3D_DATA_ORDER(outTile, XAI_DWH); + XAI_CHECK_TILE4D_DATA_ORDER(coeffTile, XAI_NDWH); + XAI_CHECK_EDGES_MOD_WHD(inTile, coeffTile, param); + XAI_CHECK_CONSISTENCY_MOD_WHD_DWH(inTile, coeffTile, biasArray, outTile, param); + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_ACCUM_SHIFT(param) < 24, \ + XAI_ERR_NORM, "\nThe accumulator shift = %hhu, value should be less than 24", \ + XAI_CNN_CONV_GET_ACCUM_SHIFT(param)); + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_OUTPUT_SHIFT(param) < 32, \ + XAI_ERR_NORM, "\nThe output shift = %hhu, value should be less than 32", \ + XAI_CNN_CONV_GET_OUTPUT_SHIFT(param)); + if (XAI_CNN_CONV_GET_DILATION(param) > 1) + { + 
XAI_CHECK_ERROR(XAI_CNN_CONV_GET_STRIDE(param) == 1, \ + XAI_ERR_BADARG, "\nStride = %hhu, Dilation = %hhu\nWhen dilation parameter is more than 1 stride always has to be 1", \ + XAI_CNN_CONV_GET_STRIDE(param), XAI_CNN_CONV_GET_DILATION(param)); + } + XAI_CHECK_CONV_RELU_LIMITS_IX(param, outTile); +#ifdef DILATED_VQ_CONV + XAI_CHECK_ARRAY_U16(outputScaleArray); + XAI_CHECK_ERROR(XAI_ARRAY_GET_WIDTH(outputScaleArray) >= XAI_TILE4D_GET_DIM1(coeffTile), \ + XAI_ERR_DATASIZE, "\nWidth of Output Scale Array = %d, Number of Kernels = %d\nWidth of Output Scale Array should be greater than or equal to Number of Kernels", \ + XAI_ARRAY_GET_WIDTH(outputScaleArray), XAI_TILE4D_GET_DIM1(coeffTile)); +#endif + } + +#ifndef DILATED_VQ_CONV + if (XAI_CNN_CONV_GET_OUTPUT_SCALE(param) == 0) + { + int32_t fillValue; + int32_t reluFlag = XAI_CNN_CONV_GET_FLAG_RELU(param); + fillValue = reluFlag ? (CLAMP(0, XAI_CNN_CONV_GET_RELU_MIN(param), XAI_CNN_CONV_GET_RELU_MAX(param))) : 0; + return(xaiFillTile3D(outTile, fillValue, 0)); + } +#endif + /* Getting parameters from the tile structures */ + const int32_t outW = XAI_TILE3D_GET_DIM2(outTile); + const int32_t outH = XAI_TILE3D_GET_DIM3(outTile); + const int32_t numInCh = XAI_TILE3D_GET_DIM3(inTile); + const int32_t numOutCh = XAI_TILE3D_GET_DIM1(outTile); + + XAI_ERROR_CHECKS_CONTINUE() + { + if (numInCh > 1) + { + /* Max value of Gather Offset is (min(numInCh-1,3)*inDataPitch2 + stride + 3 * dilation) */ + XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM2_PITCH(inTile) < \ + ((USHRT_MAX - XAI_CNN_CONV_GET_STRIDE(param) - 3 * XAI_CNN_CONV_GET_DILATION(param)) / \ + XT_MIN(numInCh - 1, 3)), \ + XAI_ERR_BADARG, "\ndim2Pitch value of inTile = %d, should be less than Gather Offset(16-bit limit) - %d", \ + XAI_TILE3D_GET_DIM2_PITCH(inTile), \ + ((USHRT_MAX - XAI_CNN_CONV_GET_STRIDE(param) - 3 * XAI_CNN_CONV_GET_DILATION(param)) / \ + XT_MIN(numInCh - 1, 3))); + } + } + + /* CNN convolution parameters */ + const uint8_t packShiftAccU = 
XAI_CNN_CONV_GET_ACCUM_SHIFT(param); + const uint8_t outShiftU = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param); + const uint8_t enableReLu = XAI_CNN_CONV_GET_FLAG_RELU(param); + const uint8_t stride = XAI_CNN_CONV_GET_STRIDE(param); + const uint8_t dilation = XAI_CNN_CONV_GET_DILATION(param); + const uint8_t leftEdgeFlag = XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param); + const uint8_t topEdgeFlag = XAI_CNN_CONV_GET_FLAG_TOPEDGE(param); + + /* Data Pointers of input, output, coefficient and bias data */ + int8_t *pInData = (int8_t *) XAI_TILE3D_GET_DATA_PTR(inTile); + int8_t *pOutData = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile); + int8_t *pCoeffData = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile); + int32_t *pBiasData = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray); +#ifdef DILATED_VQ_CONV + xb_vecNx16U* restrict pOutScaleData = (xb_vecNx16U *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray); +#else + const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param); +#endif + + /* Pitches of Coefficient Data (NDWH) in dim1, dim2 and dim3 */ + const int32_t coeffPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTile); + const int32_t coeffPitch2 = XAI_TILE4D_GET_DIM2_PITCH(coeffTile); + const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile); + const int32_t kSizeU = XAI_TILE4D_GET_DIM3(coeffTile); + + /* Pitches of Input Data (DWH) in dim1 and dim2 */ + const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile); + const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile); + + /* Pitch of Output Data (DWH) in dim1 and dim2 */ + const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile); + const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile); + + int32_t dilatedKSize = dilation * (kSizeU - 1) + 1; + int32_t leftEdge, topEdge; + + if ((dilatedKSize % 2) != 0) + { + leftEdge = dilatedKSize / 2; + } + else + { + leftEdge = leftEdgeFlag ? 
(dilatedKSize / 2) : ((dilatedKSize / 2) - 1); + } + + if ((dilatedKSize % 2) != 0) + { + topEdge = dilatedKSize / 2; + } + else + { + topEdge = topEdgeFlag ? (dilatedKSize / 2) : ((dilatedKSize / 2) - 1); + } + + /* Move pointer to the start of the active data (including edge) */ + pInData = &pInData[-(topEdge * inDataPitch1 + leftEdge)]; + + /* Setting the limits for output data according to ReLu Flag and outTileType */ + int32_t minLim, maxLim; + if (enableReLu) + { + minLim = XAI_CNN_CONV_GET_RELU_MIN(param); + maxLim = XAI_CNN_CONV_GET_RELU_MAX(param); + } + else + { + minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \ + SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0); + maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \ + : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX); + } + const int8_t typeFlag = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0; + const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile); + + /* Variable Declarations */ + int32_t inCh, outCh, x, y, ky; + valign vaOutData = IVP_ZALIGN(); + + /* Only 2 Gathers are used in this approach to get the + * Input Data for 4 Output Vectors. In each Gather, + * 32 elements are read, where each 16 of them correspond + * to one vector of Output along the width. To get the + * index values for the Gather, the following calculations + * are made. + */ + + /* Gather Index Calculations */ + /* Sequence - 0 1 2 3 0 1 2 3 0 1 2 3 0 1 2 3 0 1 2 3 0 1 2 3 ... */ + xb_vecNx16 vecGather0123 = IVP_ANDNX16(IVP_SEQNX16(), 3); + xb_vecNx16 vecSelIdx = IVP_SEQNX16(); + /* To get the Select indexes as - 0 1 2 3 4...11 32 33 34 35 36.... */ + IVP_ADDNX16T(vecSelIdx, vecSelIdx, 16, IVP_NOTBN(IVP_LTRNI(16))); + /* To get - 0 0 0 0 d*1 d*1 d*1 d*1 d*2 d*2 d*2 d*2 d*3 d*3 d*3 d*3... 
*/ + xb_vecNx16U vecGatherOff = IVP_SRLINX16(IVP_SEQNX16(), 2); + vecGatherOff = IVP_MULNX16UPACKL(vecGatherOff, (uint16_t) dilation); + /* Sequence - 0 P2 2*P2 3*P2 d*1 P2+d*1 2*P2+d*1 3*P2+d*1 d*2 P2+d*2 2*P2+d*2 3*P2+d*2 .. */ + IVP_MULANX16PACKL(vecGatherOff, vecGather0123, inDataPitch2); + vecGatherOff = IVP_SELNX16(IVP_ADDNX16(vecGatherOff, stride), \ + vecGatherOff, vecSelIdx); + + /* Final Index Pattern is - + * First 16 elements + * 0 P2 2*P2 3*P2 + * d*1 P2+d*1 2*P2+d*1 3*P2+d*1 + * d*2 P2+d*2 2*P2+d*2 3*P2+d*2 + * d*3 P2+d*3 2*P2+d*3 3*P2+d*3 + * + * Last 16 elements + * s s+P2 s+2*P2 s+3*P2 + * s+d*1 s+P2+d*1 s+2*P2+d*1 s+3*P2+d*1 + * s+d*2 s+P2+d*2 s+2*P2+d*2 s+3*P2+d*2 + * s+d*3 s+P2+d*3 s+2*P2+d*3 s+3*P2+d*3 + */ + + xb_vecN_2x32v* restrict phvecBias; + xb_vec2Nx8* restrict pdvecCoeff1; + xb_vec2Nx8* restrict pdvecCoeff2; + xb_vec2Nx8* restrict pdvecCoeff3; + xb_vec2Nx8* restrict pdvecCoeff4; + xb_vec2Nx8* restrict pdvecOut; + int8_t* restrict pData1; + int8_t* restrict pData2; + + + int32_t remInCh = numInCh & 3; + + /*Generation of maskLut for handling cases when remInCh is not equal to 0 */ + /*eg. if remInCh is equal to 1 then sumMask is 0000FFFF */ + /* if remInCh is equal to 2 then sumMask is 00FFFFFF */ + const uint32_t maskLut[3] = { 0xff, 0xff00, 0xff0000 }; + + uint8_t remCh1 = XT_SALT(2, remInCh + 1); + uint8_t remCh2 = XT_SALT(3, remInCh + 1); + + uint32_t sumMask = maskLut[0] + maskLut[1] * remCh1 + maskLut[2] * remCh2; + +#ifdef __XCC__ + XT_MEMW(); /* Adding Memory Wait as Gather and Normal Load/Stores are not synchronized */ +#endif + + /* Unrolled by 2 along both Output Width and Output Height. + * Also, unrolled along Input Channels by 4 and completely + * along the Kernel Width. Gathers are used for loading Input Data. 
+ */ + + /* Loops Start */ + for (outCh = 0; outCh < numOutCh; outCh += 2 * XCHAL_IVPN_SIMD_WIDTH) /* Output channels */ + { /* walk across the kernels */ + /* To handle corner case when number of output channels + * is not a multiple of 2 * XCHAL_IVPN_SIMD_WIDTH*/ + int32_t remainingOutCh = numOutCh - outCh; +#ifdef DILATED_VQ_CONV + xb_vecNx16U outScaleDataEven, outScaleDataOdd; + /*Load output scale values*/ + VQ_INIT_OUTSCALE(pOutScaleData, remainingOutCh, outScaleDataEven, outScaleDataOdd); +#endif + for (y = 0; y < outH; y += 2) /* Image Height */ + { /* walk down the rows */ + /* Variable used to handle the corner case of OutHeight being odd */ + int32_t numY = XT_MIN(2, outH - y) - 1; + for (x = 0; x < outW; x += 2) /* Image Width */ + { /* walk across the columns */ + xb_vecNx16U vecGatherOff1; + xb_vecNx16U vecGatherOff2; + + /* Variable used to handle the corner case of Output Width being odd */ + int32_t numX = XT_MIN(2, outW - x) - 1; + + /* Output, Input and Coefficient Data Pointers */ + int8_t *pOut = pOutData + (x * outDataPitch1 + y * outDataPitch2) * bytesPerPixel; + int8_t *pData = pInData + (x * stride) + (y * stride) * inDataPitch1; + int8_t *pCoeff = pCoeffData + outCh; + + /* Initialize accumulators with bias values */ + xb_vec2Nx24 daccSum1, daccSum2, daccSum3, daccSum4; + phvecBias = (xb_vecN_2x32v *) (pBiasData + outCh); + ACC_INIT_BIAS(phvecBias, remainingOutCh, daccSum1, daccSum2, daccSum3, daccSum4); + + /* Boolean vectors to handle the corner cases of Out Width and Height being odd */ + vboolN vbX = IVP_LTRSN(16 * (numX + 1)); + vboolN vbY = IVP_LTRSN(16 * (numX + 1) * numY); + + for (ky = 0; ky < 4; ky++) /* Kernel Height Loop */ + { + /* Pointer for Input Data Load */ + pData1 = pData + ky * dilation * inDataPitch1; + pData2 = pData1 + (stride * inDataPitch1 * numY); + + /* Pointer for Coefficient Load */ + pdvecCoeff1 = (xb_vec2Nx8 *) (pCoeff + ky * coeffPitch3); + pdvecCoeff2 = (xb_vec2Nx8 *) (pCoeff + ky * coeffPitch3 + 
coeffPitch2); + pdvecCoeff3 = (xb_vec2Nx8 *) (pCoeff + ky * coeffPitch3 + 2 * coeffPitch2); + pdvecCoeff4 = (xb_vec2Nx8 *) (pCoeff + ky * coeffPitch3 + 3 * coeffPitch2); + /* Assign valid address for predicated false lines */ + vecGatherOff1 = IVP_MOVNX16UT(vecGatherOff, 0, vbX); + vecGatherOff2 = IVP_MOVNX16UT(vecGatherOff, 0, vbY); + + for (inCh = 0; inCh < numInCh - 3; inCh += 4) /* Input Channels Loop */ + { + /* Gather Input Data */ + xb_gsr gather1 = IVP_GATHERANX8S(pData1, vecGatherOff1); + xb_vec2Nx8 dvecData1 = IVP_GATHERD2NX8_L(gather1); + xb_gsr gather2 = IVP_GATHERANX8S(pData2, vecGatherOff2); + xb_vec2Nx8 dvecData2 = IVP_GATHERD2NX8_L(gather2); + + pData1 += 4 * inDataPitch2; + pData2 += 4 * inDataPitch2; + + /* kx = 1 */ + /* Extracting scalar integers for QMULs */ + int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData1)), 0); + int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData1)), 4); + int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData2)), 0); + int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData2)), 4); + + /* 4 Aligned Vector Loads of coefficients */ + xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff1, coeffPitch1); + xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff1, coeffPitch1); + xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff1, coeffPitch1); + xb_vec2Nx8 dvecCoeff4; IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff1, coeffPitch1); + + IVP_MULQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1); + IVP_MULQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2); + IVP_MULQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3); + IVP_MULQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4); + + /* kx = 2 */ + /* Extracting scalar integers for QMULs */ + 
qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), \ + 1); + qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), \ + 5); + qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \ + 1); + qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \ + 5); + + /* 4 Aligned Vector Loads of coefficients */ + IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff2, coeffPitch1); + IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff2, coeffPitch1); + IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff2, coeffPitch1); + IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff2, coeffPitch1); + + IVP_MULQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1); + IVP_MULQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2); + IVP_MULQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3); + IVP_MULQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4); + + /* kx = 3 */ + /* Extracting scalar integers for QMULs */ + qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), \ + 2); + qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), \ + 6); + qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \ + 2); + qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \ + 6); + + /* 4 Aligned Vector Loads of coefficients */ + IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff3, coeffPitch1); + IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff3, coeffPitch1); + IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff3, coeffPitch1); + IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff3, coeffPitch1); + + IVP_MULQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1); + IVP_MULQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2); + IVP_MULQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3); 
+ IVP_MULQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4); + + /* kx = 4 */ + /* Extracting scalar integers for QMULs */ + qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), \ + 3); + qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), \ + 7); + qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \ + 3); + qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \ + 7); + + /* 4 Aligned Vector Loads of coefficients */ + IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff4, coeffPitch1); + IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff4, coeffPitch1); + IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff4, coeffPitch1); + IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff4, coeffPitch1); + + IVP_MULQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1); + IVP_MULQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2); + IVP_MULQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3); + IVP_MULQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4); + } /* End Input Channels */ + + /* Handling Corner cases of Number of Input Channels not being multiple of 4 */ + if (inCh < numInCh) + { + int32_t remInCh = numInCh - inCh; + vboolN vbRemInCh = IVP_LTNX16(IVP_ANDNX16(IVP_SEQNX16(), 3), remInCh); + + /* Gather Input Data */ + xb_vec2Nx8 dvecData1 = 0; + xb_vec2Nx8 dvecData2 = 0; + /* Assign valid address for predicated false lines */ + vecGatherOff1 = IVP_MOVNX16UT(vecGatherOff, 0, IVP_ANDBN(vbRemInCh, vbX)); + vecGatherOff2 = IVP_MOVNX16UT(vecGatherOff, 0, IVP_ANDBN(vbRemInCh, vbY)); + + xb_gsr gather1 = IVP_GATHERANX8S(pData1, vecGatherOff1); + dvecData1 = IVP_GATHERD2NX8_L(gather1); + xb_gsr gather2 = IVP_GATHERANX8S(pData2, vecGatherOff2); + dvecData2 = IVP_GATHERD2NX8_L(gather2); + + /* kx = 1 */ + /* Extracting scalar integers for QMULs */ + int32_t qmulScalar1 = 
IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData1)), 0); + int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData1)), 4); + int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData2)), 0); + int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData2)), 4); + + /* Aligned Vector Loads of coefficients */ + xb_vec2Nx8 dvecCoeff1; + IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff1, coeffPitch1 * remCh1); + xb_vec2Nx8 dvecCoeff2; + IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff1, coeffPitch1 * remCh2); + xb_vec2Nx8 dvecCoeff3; + IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff1, 0); + + /* Masking the qmulScalar values to avoid accumulation with unintended values */ + IVP_MULQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1 & sumMask); + IVP_MULQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2 & sumMask); + IVP_MULQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3 & sumMask); + IVP_MULQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4 & sumMask); + + /* kx = 2 */ + /* Extracting scalar integers for QMULs */ + qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), \ + 1); + qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), \ + 5); + qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \ + 1); + qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \ + 5); + + /* Aligned Vector Loads of coefficients */ + IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff2, coeffPitch1 * remCh1); + IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff2, coeffPitch1 * remCh2); + IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff2, 0); + + /* Masking the qmulScalar values to avoid accumulation with unintended values */ + IVP_MULQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1 & sumMask); + 
IVP_MULQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2 & sumMask); + IVP_MULQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3 & sumMask); + IVP_MULQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4 & sumMask); + + /* kx = 3 */ + /* Extracting scalar integers for QMULs */ + qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), \ + 2); + qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), \ + 6); + qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \ + 2); + qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \ + 6); + + /* Aligned Vector Loads of coefficients */ + IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff3, coeffPitch1 * remCh1); + IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff3, coeffPitch1 * remCh2); + IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff3, 0); + + /* Masking the qmulScalar values to avoid accumulation with unintended values */ + IVP_MULQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1 & sumMask); + IVP_MULQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2 & sumMask); + IVP_MULQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3 & sumMask); + IVP_MULQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4 & sumMask); + + /* kx = 4 */ + /* Extracting scalar integers for QMULs */ + qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), \ + 3); + qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), \ + 7); + qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \ + 3); + qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \ + 7); + + /* Aligned Vector Loads of coefficients */ + IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff4, coeffPitch1 * remCh1); + IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff4, coeffPitch1 * 
remCh2); + IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff4, 0); + + /* Masking the qmulScalar values to avoid accumulation with unintended values */ + IVP_MULQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1 & sumMask); + IVP_MULQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2 & sumMask); + IVP_MULQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3 & sumMask); + IVP_MULQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4 & sumMask); + } /* End Input Channels Corner case Handling */ + } /* End Kernel Height Loop */ + + /* Pack, Output Scale, Output Shift and clamping */ + xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L; + xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H; +#ifdef DILATED_VQ_CONV + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); +#else + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); +#endif + /* Store the output dvecOut1 along the 
output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOut + outCh * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh); + IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Store the output dvecOut2 along the output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1 * numX) * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numX); + IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numX); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Store the output dvecOut3 along the output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch2 * numY) * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numY); + IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numY); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Store the output dvecOut4 along the output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1 * numX + outDataPitch2 * numY) * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numX * \ + numY); + IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numX * numY); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + } /* End image width */ + } /* End image height */ + } /* End Output Channels */ + return(XAI_ERROR_STATUS()); +} + +/***************************************************************************** +* xaiConvolvedVQ3D_S_5x5_S8S8IXCa2_MOD_WHD_DWH +* **************************************************************************/ + +/****************************************************************************/ +/* Description : P6 optimized generic implementation for 5x5 
MOD_WHD_DWH */ +/* 3D convolution. Based on pre-processor specifiers. Code */ +/* implementation is generated during preprocessing stage. */ +/* This method can be used to generate 5x5 MOD_WHD_DWH 3D */ +/* dilated convolution function and 5x5 MOD_WHD_DWH 3D VQ */ +/* dilated convolution function */ +/* Inputs : Input Data Tile, Coeff Data Tile, Bias Array, */ +/* Output scale array, CNN convolution params structure */ +/* Outputs : XI Error Code */ +/* InOuts : Output Tile */ +/* Assumptions : InData, CoeffData are S8 */ +/* biasArray is signed 32b, value not exceeding signed 24b */ +/* Output scale array is U16 */ +/* OutData is S8 / U8 / S16 */ +/* Kernel Size is 5x5xDxN */ +/* Input is in WHD and Output is in DWH format */ +/* Coeff is in NDWH format */ +/* CoeffDim1Pitch is aligned to 2N (Ca2) */ +/****************************************************************************/ + +#ifdef DILATED_VQ_CONV +XAI_ERR_TYPE xaiConvolvedVQ3D_S_5x5_S8S8IXCa2_MOD_WHD_DWH( + const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param + ) +#else +XAI_ERR_TYPE xaiConvolved3D_S_5x5_S8S8IXCa2_MOD_WHD_DWH( + const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param + ) +#endif +{ + /* Error Checks */ + XAI_ERROR_CHECKS() + { + XAI_CHECK_TILE3D_S8(inTile); + XAI_CHECK_CONV_OUTPUT_TILE3D(outTile); + XAI_CHECK_TILE4D_S8(coeffTile); + XAI_CHECK_TILE3D_FITS_IN_SINGLE_DRAM(inTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile); + XAI_CHECK_TILE4D_IN_DRAM_BOUNDARY(coeffTile); + XAI_CHECK_POINTER(param); + XAI_CHECK_ARRAY_S32(biasArray); + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTile); + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(coeffTile, outTile); + XAI_CHECK_KERNEL_SIZE(coeffTile, 5); + XAI_CHECK_ERROR((XAI_CNN_CONV_GET_STRIDEX(param) == XAI_CNN_CONV_GET_STRIDEY(param)), \ + 
XAI_ERR_BADARG, "Stride along width = %hhu and height = %hhu\nStride along width and height should be equal", \ + XAI_CNN_CONV_GET_STRIDEX(param), XAI_CNN_CONV_GET_STRIDEY(param)); + XAI_CHECK_ERROR((XAI_CNN_CONV_GET_STRIDE(param) == 1) || \ + (XAI_CNN_CONV_GET_STRIDE(param) == 2) || \ + (XAI_CNN_CONV_GET_STRIDE(param) == 4), XAI_ERR_BADARG, \ + "\nStride = %hhu, value should be 1, 2 or 4", XAI_CNN_CONV_GET_STRIDE(param)); + XAI_CHECK_ERROR((XAI_CNN_CONV_GET_DILATION(param) > 0), \ + XAI_ERR_BADARG, "\nDilation = %hhu, value should be greater than zero", \ + XAI_CNN_CONV_GET_DILATION(param)); + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_DILATIONX(param) == XAI_CNN_CONV_GET_DILATIONY(param), \ + XAI_ERR_BADARG, "\nDilation along width = %hhu and height = %hhu\nDilation along width and height should be equal", \ + XAI_CNN_CONV_GET_DILATIONX(param), XAI_CNN_CONV_GET_DILATIONY(param)); + XAI_CHECK_TILE4D_IALIGNMENT_2NX8(coeffTile); + XAI_CHECK_TILE3D_DATA_ORDER(inTile, XAI_WHD); + XAI_CHECK_TILE3D_DATA_ORDER(outTile, XAI_DWH); + XAI_CHECK_TILE4D_DATA_ORDER(coeffTile, XAI_NDWH); + XAI_CHECK_TILE3D_EDGE(inTile, 2 + 2 * (XAI_CNN_CONV_GET_DILATION(param) - 1)); + XAI_CHECK_CONSISTENCY_MOD_WHD_DWH(inTile, coeffTile, biasArray, outTile, param); + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_OUTPUT_SHIFT(param) < 32, \ + XAI_ERR_NORM, "\nThe output shift = %hhu, value should be less than 32", \ + XAI_CNN_CONV_GET_OUTPUT_SHIFT(param)); + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_ACCUM_SHIFT(param) < 24, \ + XAI_ERR_NORM, "\nThe accumulator shift = %hhu, value should be less than 24", \ + XAI_CNN_CONV_GET_ACCUM_SHIFT(param)); + if (XAI_CNN_CONV_GET_DILATION(param) > 1) + { + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_STRIDE(param) == 1, \ + XAI_ERR_BADARG, "\nStride = %hhu, Dilation = %hhu\nWhen dilation parameter is more than 1 stride always has to be 1", \ + XAI_CNN_CONV_GET_STRIDE(param), XAI_CNN_CONV_GET_DILATION(param)); + } + XAI_CHECK_CONV_RELU_LIMITS_IX(param, outTile); +#ifdef DILATED_VQ_CONV + 
XAI_CHECK_ARRAY_U16(outputScaleArray); + XAI_CHECK_ERROR(XAI_ARRAY_GET_WIDTH(outputScaleArray) >= XAI_TILE4D_GET_DIM1(coeffTile), \ + XAI_ERR_DATASIZE, "\nWidth of Output Scale Array = %d, Number of Kernels = %d\nWidth of Output Scale Array should be greater than or equal to Number of Kernels", \ + XAI_ARRAY_GET_WIDTH(outputScaleArray), XAI_TILE4D_GET_DIM1(coeffTile)); +#endif + } + +#ifndef DILATED_VQ_CONV + if (XAI_CNN_CONV_GET_OUTPUT_SCALE(param) == 0) + { + int32_t fillValue; + int32_t reluFlag = XAI_CNN_CONV_GET_FLAG_RELU(param); + fillValue = reluFlag ? (CLAMP(0, XAI_CNN_CONV_GET_RELU_MIN(param), XAI_CNN_CONV_GET_RELU_MAX(param))) : 0; + return(xaiFillTile3D(outTile, fillValue, 0)); + } +#endif + /* Getting parameters from the tile structures */ + const int32_t outW = XAI_TILE3D_GET_DIM2(outTile); + const int32_t outH = XAI_TILE3D_GET_DIM3(outTile); + const int32_t numInCh = XAI_TILE3D_GET_DIM3(inTile); + const int32_t numOutCh = XAI_TILE3D_GET_DIM1(outTile); + + XAI_ERROR_CHECKS_CONTINUE() + { + if (numInCh > 1) + { + /* Max value of Gather Offset is (min(numInCh-1,3)*inDataPitch2 + stride + 4 * dilation) */ + XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM2_PITCH(inTile) < \ + ((USHRT_MAX - XAI_CNN_CONV_GET_STRIDE(param) - 4 * XAI_CNN_CONV_GET_DILATION(param)) / \ + XT_MIN(numInCh - 1, 3)), \ + XAI_ERR_BADARG, "\ndim2Pitch value of inTile = %d, should be less than Gather Offset(16-bit limit) - %d", \ + XAI_TILE3D_GET_DIM2_PITCH(inTile), \ + ((USHRT_MAX - XAI_CNN_CONV_GET_STRIDE(param) - 4 * XAI_CNN_CONV_GET_DILATION(param)) / \ + XT_MIN(numInCh - 1, 3))); + } + } + + /* CNN convolution parameters */ + const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param); + const uint8_t outShiftU = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param); + const uint8_t enableReLu = XAI_CNN_CONV_GET_FLAG_RELU(param); + const uint8_t stride = XAI_CNN_CONV_GET_STRIDE(param); + const uint8_t dilation = XAI_CNN_CONV_GET_DILATION(param); + const int32_t kSizeU = XAI_TILE4D_GET_DIM3(coeffTile); + + 
/* Data Pointers of input, output, coefficient and bias data */ + int8_t *pInData = (int8_t *) XAI_TILE3D_GET_DATA_PTR(inTile); + int8_t *pOutData = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile); + int8_t *pCoeffData = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile); + int32_t *pBiasData = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray); +#ifdef DILATED_VQ_CONV + xb_vecNx16U* restrict pOutScaleData = (xb_vecNx16U *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray); +#else + const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param); +#endif + + /* Pitches of Coefficient Data (NDWH) in dim1, dim2 and dim3 */ + const int32_t coeffPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTile); + const int32_t coeffPitch2 = XAI_TILE4D_GET_DIM2_PITCH(coeffTile); + const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile); + + /* Pitches of Input Data (DWH) in dim1 and dim2 */ + const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile); + const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile); + + /* Pitch of Output Data (DWH) in dim1 and dim2 */ + const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile); + const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile); + + int32_t dilatedKSize = dilation * (kSizeU - 1) + 1; + + /* move to start of edge data only when input is already padded. */ + pInData = &pInData[-((dilatedKSize / 2) * inDataPitch1 + (dilatedKSize / 2))]; + + /* Setting the limits for output data according to ReLu Flag and outTileType */ + int32_t minLim, maxLim; + if (enableReLu) + { + minLim = XAI_CNN_CONV_GET_RELU_MIN(param); + maxLim = XAI_CNN_CONV_GET_RELU_MAX(param); + } + else + { + minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \ + SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0); + maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \ + : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX); + } + const int8_t typeFlag = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 
1 : 0; + const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile); + + /* Variable Declarations */ + int32_t inCh, outCh, x, y, ky; + + + /* 4 Gathers are being used to Load Input Data. Many common elements + * will be loaded in separate Gathers, especially in the case of + * stride 1 and 2. To take the advantage of having common offsets in + * a single Gather, 2 Gather Patterns are generated as given below. + * For example, in the Gather Patterns generated below, + * if stride is 1 and dilation equal to 1, then 8 offsets are common and if stride is 2, 4 offsets + * are common in each Gather. + */ + + /* Gather Index Calculations */ + xb_vecNx16 vecGather = IVP_SRLINX16(IVP_SEQNX16(), 2); + vecGather = IVP_MULNX16UPACKL(vecGather, (uint16_t) dilation); + IVP_MULANX16PACKL(vecGather, inDataPitch2, IVP_ANDNX16(IVP_SEQNX16(), 3)); + xb_vecNx16 vecGather1 = IVP_ADDNX16(vecGather, stride); + + xb_vecNx16 vecSelIdx1 = IVP_SEQNX16(); + IVP_ADDNX16T(vecSelIdx1, vecSelIdx1, (XCHAL_IVPN_SIMD_WIDTH - 12), IVP_NOTBN(IVP_LTRNI(12))); + xb_vecNx16U vecGatherOff1 = IVP_SELNX16(vecGather1, vecGather, vecSelIdx1); + xb_vecNx16 vecSelIdx2 = IVP_ADDNX16(IVP_SEQNX16(), 12); + IVP_ADDNX16T(vecSelIdx2, vecSelIdx2, (XCHAL_IVPN_SIMD_WIDTH - 12), IVP_NOTBN(IVP_LTRNI(8))); + xb_vecNx16U vecGatherOff2 = IVP_SELNX16(vecGather1, vecGather, vecSelIdx2); + /* Index Pattern of vecGatherOff1 is - + * 0 P2 2*P2 3*P2 d*1 P2+d*1 2*P2+d*1 3*P2+d*1 d*2 P2+d*2 2*P2+d*2 3*P2+d*2 + * s s+P2 s+2*P2 s+3*P2 s+d*1 s+d*1+P2 s+d*1+2*P2 s+d*1+3*P2 */ + + /* Index Pattern of vecGatherOff2 is - + * d*3 P2+d*3 2*P2+d*3 3*P2+d*3 d*4 P2+d*4 2*P2+d*4 3*P2+d*4 s+d*2 s+d*2+P2 s+d*2+2*P2 s+d*2+3*P2 + * s+d*3 s+d*3+P2 s+d*3+2*P2 s+d*3+3*P2 s+d*4 s+d*4+P2 s+d*4+2*P2 s+d*4+3*P2 */ + + valign vaOutData = IVP_ZALIGN(); + + xb_vecN_2x32v* restrict phvecBias; + xb_vec2Nx8* restrict pdvecCoeff; + xb_vec2Nx8* restrict pdvecOut; + int8_t* restrict pData1; + int8_t* restrict pData2; + + int32_t remInCh = numInCh & 3; + + 
/*Generation of maskLut for handling cases when remInCh is not equal to 0 */ + /*eg. if remInCh is equal to 1 then sumMask is 0000FFFF */ + /* if remInCh is equal to 2 then sumMask is 00FFFFFF */ + const uint32_t maskLut[3] = { 0xff, 0xff00, 0xff0000 }; + + uint32_t sumMask = maskLut[0] + maskLut[1] * XT_SALT(2, remInCh + 1) + maskLut[2] * XT_SALT(3, remInCh + 1); + + +#ifdef __XCC__ + XT_MEMW(); /* Adding Memory Wait as Gather and Normal Load/Stores are not synchronized */ +#endif + + /* 4 Gathers used for Input Data Load. Unrolled along */ + /* Output Width and Height by 2. Also, unrolled along */ + /* Input Channels by 4 and Kernel Width. */ + + /* Loops Start */ + for (outCh = 0; outCh < numOutCh; outCh += 2 * XCHAL_IVPN_SIMD_WIDTH) /* Out Channels */ + { /* walk across the kernels */ + /* To handle corner case when number of output channels + * is not a multiple of 2 * XCHAL_IVPN_SIMD_WIDTH*/ + int32_t remainingOutCh = numOutCh - outCh; +#ifdef DILATED_VQ_CONV + xb_vecNx16U outScaleDataEven, outScaleDataOdd; + /*Load output scale values*/ + VQ_INIT_OUTSCALE(pOutScaleData, remainingOutCh, outScaleDataEven, outScaleDataOdd); +#endif + for (y = 0; y < outH; y += 2) /* Image Height */ + { /* walk down the rows */ + /* Variable used for corner case handling of Out Height odd */ + int32_t numY = XT_MIN(2, outH - y) - 1; + for (x = 0; x < outW; x += 2) /* Image Width */ + { /* walk across the columns */ + xb_vecNx16U vecGatherOff00; + xb_vecNx16U vecGatherOff01; + xb_vecNx16U vecGatherOff10; + xb_vecNx16U vecGatherOff11; + + /* Variable used for corner case handling of Out Width odd */ + int32_t numX = XT_MIN(2, outW - x) - 1; + + /* Output, Input and Coefficient Data Pointers */ + int8_t *pOut = pOutData + (x * outDataPitch1 + y * outDataPitch2) * bytesPerPixel; + int8_t *pData = pInData + (x * stride) + (y * stride) * inDataPitch1; + int8_t *pCoeff = pCoeffData + outCh; + + /* Initialize accumulators with bias values */ + phvecBias = (xb_vecN_2x32v *) (pBiasData + 
outCh); + xb_vec2Nx24 daccSum1, daccSum2, daccSum3, daccSum4; + ACC_INIT_BIAS(phvecBias, remainingOutCh, daccSum1, daccSum2, daccSum3, daccSum4); + + /* Boolean Vectors for Predicate Gather with corner cases */ + /* handled for Out Width and Height being odd numbers */ + vboolN vb1 = IVP_ORBN(IVP_LTRNI(12), IVP_LTRSN(20 * numX)); + vboolN vb2 = IVP_ORBN(IVP_LTRNI(8), IVP_LTRSN(20 * numX)); + vboolN vb3 = IVP_ANDBN(vb1, IVP_LTRSN(20 * numY)); + vboolN vb4 = IVP_ANDBN(vb2, IVP_LTRSN(20 * numY)); + + for (ky = 0; ky < 5; ky++) /* Kernel Height */ + { + /* Pointer for Input Data Load */ + pData1 = pData + ky * dilation * inDataPitch1; + pData2 = pData1 + (stride * inDataPitch1 * numY); + /* Assign valid address for predicated false lines */ + vecGatherOff00 = IVP_MOVNX16UT(vecGatherOff1, 0, vb1); + vecGatherOff01 = IVP_MOVNX16UT(vecGatherOff2, 0, vb2); + vecGatherOff10 = IVP_MOVNX16UT(vecGatherOff1, 0, vb3); + vecGatherOff11 = IVP_MOVNX16UT(vecGatherOff2, 0, vb4); + + /* Pointer for Coefficient Load */ + pdvecCoeff = (xb_vec2Nx8 *) (pCoeff + ky * coeffPitch3); + + for (inCh = 0; inCh < numInCh - 3; inCh += 4) /* Input Channels */ + { + /* Gather Load of Input Data */ + xb_gsr gather1 = IVP_GATHERANX8S(pData1, vecGatherOff00); + xb_vec2Nx8 dvecData1 = IVP_GATHERD2NX8_L(gather1); + xb_gsr gather2 = IVP_GATHERANX8S(pData1, vecGatherOff01); + xb_vec2Nx8 dvecData2 = IVP_GATHERD2NX8_L(gather2); + xb_gsr gather3 = IVP_GATHERANX8S(pData2, vecGatherOff10); + xb_vec2Nx8 dvecData3 = IVP_GATHERD2NX8_L(gather3); + xb_gsr gather4 = IVP_GATHERANX8S(pData2, vecGatherOff11); + xb_vec2Nx8 dvecData4 = IVP_GATHERD2NX8_L(gather4); + + pData1 += 4 * inDataPitch2; + pData2 += 4 * inDataPitch2; + + /* kx = 1 */ + /* Extracting scalars for QMULs */ + int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData1)), 0); + int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData1)), 3); + int32_t qmulScalar3 = 
IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData3)), 0); + int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData3)), 3); + + /* 4 Aligned Vector Loads of coefficients */ + xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1); + xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1); + xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1); + xb_vec2Nx8 dvecCoeff4; IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch2 - 3 * \ + coeffPitch1); + + IVP_MULQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1); + IVP_MULQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2); + IVP_MULQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3); + IVP_MULQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4); + + /* kx = 2 */ + /* Extracting scalars for QMULs */ + qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), \ + 1); + qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), \ + 4); + qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData3)), \ + 1); + qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData3)), \ + 4); + + /* 4 Aligned Vector Loads of coefficients */ + IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1); + IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1); + IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1); + IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch2 - 3 * coeffPitch1); + + IVP_MULQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1); + IVP_MULQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2); + IVP_MULQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3); + IVP_MULQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, 
qmulScalar4); + + /* kx = 3 */ + /* Extracting scalars for QMULs */ + qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), \ + 2); + qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \ + 2); + qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData3)), \ + 2); + qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData4)), \ + 2); + + /* 4 Aligned Vector Loads of coefficients */ + IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1); + IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1); + IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1); + IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch2 - 3 * coeffPitch1); + + IVP_MULQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1); + IVP_MULQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2); + IVP_MULQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3); + IVP_MULQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4); + + /* kx = 4 */ + /* Extracting scalars for QMULs */ + qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \ + 0); + qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \ + 3); + qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData4)), \ + 0); + qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData4)), \ + 3); + + /* 4 Aligned Vector Loads of coefficients */ + IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1); + IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1); + IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1); + IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch2 - 3 * coeffPitch1); + + IVP_MULQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1); + IVP_MULQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2); 
+ IVP_MULQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3); + IVP_MULQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4); + + /* kx = 5 */ + /* Extracting scalars for QMULs */ + qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \ + 1); + qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \ + 4); + qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData4)), \ + 1); + qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData4)), \ + 4); + + /* 4 Aligned Vector Loads of coefficients */ + IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1); + IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1); + IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1); + IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch1 - 4 * coeffPitch2); + + IVP_MULQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1); + IVP_MULQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2); + IVP_MULQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3); + IVP_MULQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4); + } /* End Input Channels */ + + /* Handling Corner cases of Number of Input Channels not being multiple of 4 */ + if (remInCh) + { + vboolN vbRemInCh = IVP_LTNX16(IVP_ANDNX16(IVP_SEQNX16(), 3), remInCh); + /* Assign valid address for predicated false lines */ + vecGatherOff00 = IVP_MOVNX16UT(vecGatherOff1, 0, IVP_ANDBN(vbRemInCh, vb1)); + vecGatherOff01 = IVP_MOVNX16UT(vecGatherOff2, 0, IVP_ANDBN(vbRemInCh, vb2)); + vecGatherOff10 = IVP_MOVNX16UT(vecGatherOff1, 0, IVP_ANDBN(vbRemInCh, vb3)); + vecGatherOff11 = IVP_MOVNX16UT(vecGatherOff2, 0, IVP_ANDBN(vbRemInCh, vb4)); + + /* Gather Input Data */ + xb_vec2Nx8 dvecData1 = 0; + xb_vec2Nx8 dvecData2 = 0; + xb_vec2Nx8 dvecData3 = 0; + xb_vec2Nx8 dvecData4 = 0; + xb_gsr gather1 = 
IVP_GATHERANX8S(pData1, vecGatherOff00); + dvecData1 = IVP_GATHERD2NX8_L(gather1); + xb_gsr gather2 = IVP_GATHERANX8S(pData1, vecGatherOff01); + dvecData2 = IVP_GATHERD2NX8_L(gather2); + xb_gsr gather3 = IVP_GATHERANX8S(pData2, vecGatherOff10); + dvecData3 = IVP_GATHERD2NX8_L(gather3); + xb_gsr gather4 = IVP_GATHERANX8S(pData2, vecGatherOff11); + dvecData4 = IVP_GATHERD2NX8_L(gather4); + + /* kx = 1 */ + /* Extracting scalars for QMULs */ + int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData1)), 0); + int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData1)), 3); + int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData3)), 0); + int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData3)), 3); + + /* Aligned Vector Loads of coefficients */ + xb_vec2Nx8 dvecCoeff1; + IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1 * XT_SALT(2, remInCh + 1)); + xb_vec2Nx8 dvecCoeff2; + IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1 * XT_SALT(3, \ + remInCh + 1)); + xb_vec2Nx8 dvecCoeff3; + IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch2 - coeffPitch1 * (XT_SALT(2, remInCh + 1) + XT_SALT(3, remInCh + 1))); + + /* Masking the qmulScalar values to avoid accumulation with unintended values */ + IVP_MULQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1 & sumMask); + IVP_MULQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2 & sumMask); + IVP_MULQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3 & sumMask); + IVP_MULQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4 & sumMask); + + /* kx = 2 */ + /* Extracting scalars for QMULs */ + qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), \ + 1); + qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), \ + 4); + qmulScalar3 = 
IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData3)), \ + 1); + qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData3)), \ + 4); + + /* Aligned Vector Loads of coefficients */ + IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1 * XT_SALT(2, remInCh + 1)); + IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1 * XT_SALT(3, remInCh + 1)); + IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch2 - coeffPitch1 * (XT_SALT(2, remInCh + 1) + XT_SALT(3, remInCh + 1))); + + /* Masking the qmulScalar values to avoid accumulation with unintended values */ + IVP_MULQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1 & sumMask); + IVP_MULQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2 & sumMask); + IVP_MULQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3 & sumMask); + IVP_MULQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4 & sumMask); + + /* kx = 3 */ + /* Extracting scalars for QMULs */ + qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), \ + 2); + qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \ + 2); + qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData3)), \ + 2); + qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData4)), \ + 2); + + /* Aligned Vector Loads of coefficients */ + IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1 * XT_SALT(2, remInCh + 1)); + IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1 * XT_SALT(3, remInCh + 1)); + IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch2 - coeffPitch1 * (XT_SALT(2, remInCh + 1) + XT_SALT(3, remInCh + 1))); + + /* Masking the qmulScalar values to avoid accumulation with unintended values */ + IVP_MULQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1 & sumMask); + IVP_MULQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2 & sumMask); + 
IVP_MULQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3 & sumMask); + IVP_MULQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4 & sumMask); + + /* kx = 4 */ + /* Extracting scalars for QMULs */ + qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \ + 0); + qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \ + 3); + qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData4)), \ + 0); + qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData4)), \ + 3); + + /* Aligned Vector Loads of coefficients */ + IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1 * XT_SALT(2, remInCh + 1)); + IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1 * XT_SALT(3, remInCh + 1)); + IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch2 - (coeffPitch1 * (XT_SALT(2, remInCh + 1) + XT_SALT(3, remInCh + 1)))); + + /* Masking the qmulScalar values to avoid accumulation with unintended values */ + IVP_MULQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1 & sumMask); + IVP_MULQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2 & sumMask); + IVP_MULQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3 & sumMask); + IVP_MULQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4 & sumMask); + + /* kx = 5 */ + /* Extracting scalars for QMULs */ + qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \ + 1); + qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \ + 4); + qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData4)), \ + 1); + qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData4)), \ + 4); + + /* Aligned Vector Loads of coefficients */ + IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1 * XT_SALT(2, remInCh + 1)); + IVP_LV2NX8_XP(dvecCoeff2, 
pdvecCoeff, coeffPitch1 * XT_SALT(3, remInCh + 1)); + IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, 0); + + /* Masking the qmulScalar values to avoid accumulation with unintended values */ + IVP_MULQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1 & sumMask); + IVP_MULQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2 & sumMask); + IVP_MULQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3 & sumMask); + IVP_MULQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4 & sumMask); + } /* End Input Channels corner case handling */ + } /* End Kernel Height */ + + /* Pack, Output Scale, Output Shift and clamping */ + xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L; + xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H; +#ifdef DILATED_VQ_CONV + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); +#else + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); +#endif + /* 
Store the output dvecOut1 along the output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOut + outCh * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh); + IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Store the output dvecOut2 along the output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1 * numX) * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numX); + IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numX); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Store the output dvecOut3 along the output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch2 * numY) * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numY); + IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numY); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Store the output dvecOut4 along the output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1 * numX + outDataPitch2 * numY) * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numX * \ + numY); + IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numX * numY); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + } /* End image width */ + } /* End image height */ + } /* End Output Channels */ + return(XAI_ERROR_STATUS()); +} + +/***************************************************************************** +* xaiConvolvedVQ3D_S_7x7_S8S8IXCa2_MOD_WHD_DWH +* **************************************************************************/ + +/****************************************************************************/ +/* Description : P6 
optimized generic implementation for 7x7 MOD_WHD_DWH */ +/* 3D convolution. Based on pre-processor specifiers. Code */ +/* implementation is generated during preprocessing stage. */ +/* This method can be used to generate 7x7 MOD_WHD_DWH 3D */ +/* dilated convolution function and 7x7 MOD_WHD_DWH 3D VQ */ +/* dilated convolution function */ +/* Inputs : Input Data Tile, Coeff Data Tile, Bias Array, */ +/* Output scale array, CNN convolution params structure */ +/* Outputs : XI Error Code */ +/* InOuts : Output Tile */ +/* Assumptions : InData, CoeffData are S8 */ +/* biasArray is signed 32b, value not exceeding signed 24b */ +/* Output scale array is U16 */ +/* OutData is S8 / U8 / S16 */ +/* Kernel Size is 7x7xDxN */ +/* Input is in WHD and Output is in DWH format */ +/* Coeff is in NDWH format */ +/* CoeffDim1Pitch is aligned to 2N (Ca2) */ +/****************************************************************************/ + +#ifdef DILATED_VQ_CONV +XAI_ERR_TYPE xaiConvolvedVQ3D_S_7x7_S8S8IXCa2_MOD_WHD_DWH( + const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param + ) +#else +XAI_ERR_TYPE xaiConvolved3D_S_7x7_S8S8IXCa2_MOD_WHD_DWH( + const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param + ) +#endif +{ + /* Error Checks */ + XAI_ERROR_CHECKS() + { + XAI_CHECK_TILE3D_S8(inTile); + XAI_CHECK_CONV_OUTPUT_TILE3D(outTile); + XAI_CHECK_TILE4D_S8(coeffTile); + XAI_CHECK_TILE3D_FITS_IN_SINGLE_DRAM(inTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile); + XAI_CHECK_TILE4D_IN_DRAM_BOUNDARY(coeffTile); + XAI_CHECK_POINTER(param); + XAI_CHECK_ARRAY_S32(biasArray); + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTile); + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(coeffTile, outTile); + XAI_CHECK_KERNEL_SIZE(coeffTile, 7); + XAI_CHECK_ERROR((XAI_CNN_CONV_GET_STRIDEX(param) == 
XAI_CNN_CONV_GET_STRIDEY(param)), \ + XAI_ERR_BADARG, "Stride along width = %hhu and height = %hhu\nStride along width and height should be equal", \ + XAI_CNN_CONV_GET_STRIDEX(param), XAI_CNN_CONV_GET_STRIDEY(param)); + XAI_CHECK_ERROR((XAI_CNN_CONV_GET_STRIDE(param) == 1) || \ + (XAI_CNN_CONV_GET_STRIDE(param) == 2) || \ + (XAI_CNN_CONV_GET_STRIDE(param) == 4), XAI_ERR_BADARG, \ + "\nStride = %hhu, value should be 1, 2 or 4", XAI_CNN_CONV_GET_STRIDE(param)); + XAI_CHECK_ERROR((XAI_CNN_CONV_GET_DILATION(param) == 1) || \ + ((XAI_CNN_CONV_GET_DILATION(param) >= 1) && \ + (XAI_CNN_CONV_GET_STRIDE(param) == 1)), XAI_ERR_BADARG, \ + "\nDilation = %hhu\nDilation should be 1. It can be greater than 1 only when stride is equal to 1", \ + XAI_CNN_CONV_GET_DILATION(param)); + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_DILATIONX(param) == XAI_CNN_CONV_GET_DILATIONY(param), \ + XAI_ERR_BADARG, "\nDilation along width = %hhu and height = %hhu\nDilation along width and height should be equal", \ + XAI_CNN_CONV_GET_DILATIONX(param), XAI_CNN_CONV_GET_DILATIONY(param)); + XAI_CHECK_TILE4D_IALIGNMENT_2NX8(coeffTile); + XAI_CHECK_TILE3D_DATA_ORDER(inTile, XAI_WHD); + XAI_CHECK_TILE3D_DATA_ORDER(outTile, XAI_DWH); + XAI_CHECK_TILE4D_DATA_ORDER(coeffTile, XAI_NDWH); + XAI_CHECK_TILE3D_EDGE(inTile, 3 + 3 * (XAI_CNN_CONV_GET_DILATION(param) - 1)); + XAI_CHECK_CONSISTENCY_MOD_WHD_DWH(inTile, coeffTile, biasArray, outTile, param); + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_OUTPUT_SHIFT(param) < 32, \ + XAI_ERR_NORM, "\nThe output shift = %hhu, value should be less than 32", \ + XAI_CNN_CONV_GET_OUTPUT_SHIFT(param)); + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_ACCUM_SHIFT(param) < 24, \ + XAI_ERR_NORM, "\nThe accumulator shift = %hhu, value should be less than 24", \ + XAI_CNN_CONV_GET_ACCUM_SHIFT(param)); + XAI_CHECK_CONV_RELU_LIMITS_IX(param, outTile); +#ifdef DILATED_VQ_CONV + XAI_CHECK_ARRAY_U16(outputScaleArray); + XAI_CHECK_ERROR(XAI_ARRAY_GET_WIDTH(outputScaleArray) >= XAI_TILE4D_GET_DIM1(coeffTile), \ + 
XAI_ERR_DATASIZE, "\nWidth of Output Scale Array = %d, Number of Kernels = %d\nWidth of Output Scale Array should be greater than or equal to Number of Kernels", \ + XAI_ARRAY_GET_WIDTH(outputScaleArray), XAI_TILE4D_GET_DIM1(coeffTile)); +#endif + } + +#ifndef DILATED_VQ_CONV + if (XAI_CNN_CONV_GET_OUTPUT_SCALE(param) == 0) + { + int32_t fillValue; + int32_t reluFlag = XAI_CNN_CONV_GET_FLAG_RELU(param); + fillValue = reluFlag ? (CLAMP(0, XAI_CNN_CONV_GET_RELU_MIN(param), XAI_CNN_CONV_GET_RELU_MAX(param))) : 0; + return(xaiFillTile3D(outTile, fillValue, 0)); + } +#endif + /* Getting parameters from the tile structures */ + const int32_t outW = XAI_TILE3D_GET_DIM2(outTile); + const int32_t outH = XAI_TILE3D_GET_DIM3(outTile); + const int32_t numInCh = XAI_TILE3D_GET_DIM3(inTile); + const int32_t numOutCh = XAI_TILE3D_GET_DIM1(outTile); + + XAI_ERROR_CHECKS_CONTINUE() + { + if (numInCh > 1) + { + /* Max value of Gather Offset is (min(numInCh-1,3)*inDataPitch2 + stride + 6*dilation) */ + XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM2_PITCH(inTile) < \ + ((USHRT_MAX - XAI_CNN_CONV_GET_STRIDE(param) - 6 * XAI_CNN_CONV_GET_DILATION(param)) / XT_MIN(numInCh - 1, 3)), \ + XAI_ERR_BADARG, "dim2Pitch value of inTile = %d, should be less than Gather Offset(16-bit limit) - %d", \ + XAI_TILE3D_GET_DIM2_PITCH(inTile), \ + ((USHRT_MAX - XAI_CNN_CONV_GET_STRIDE(param) - 6 * XAI_CNN_CONV_GET_DILATION(param)) / XT_MIN(numInCh - 1, 3))); + } + } + + /* Kernel Size (NDWH) */ + const int32_t kSizeU = XAI_TILE4D_GET_DIM3(coeffTile); + + /* CNN convolution parameters */ + const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param); + const uint8_t outShiftU = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param); + const uint8_t enableReLu = XAI_CNN_CONV_GET_FLAG_RELU(param); + const uint8_t stride = XAI_CNN_CONV_GET_STRIDE(param); + const uint8_t dilation = XAI_CNN_CONV_GET_DILATION(param); + + /* Data Pointers of input, output, coefficient and bias data */ + int8_t *pInData = (int8_t *) 
XAI_TILE3D_GET_DATA_PTR(inTile); + int8_t *pOutData = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile); + int8_t *pCoeffData = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile); + int32_t *pBiasData = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray); +#ifdef DILATED_VQ_CONV + xb_vecNx16U* restrict pOutScaleData = (xb_vecNx16U *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray); +#else + const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param); +#endif + + /* Pitches of Coefficient Data (NDWH) in dim1, dim2 and dim3 */ + const int32_t coeffPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTile); + const int32_t coeffPitch2 = XAI_TILE4D_GET_DIM2_PITCH(coeffTile); + const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile); + + /* Pitches of Input Data (WHD) in dim1 and dim2 */ + const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile); + const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile); + + /* Pitch of Output Data (DWH) in dim1 and dim2 */ + const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile); + const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile); + + /* Effective Kernel size = dilation(KernelSize - 1) + 1 */ + /* Effective kernel size is used for calculating the min required edge */ + int32_t dilatedKSizeU = dilation * (kSizeU - 1) + 1; + + /* Move pointer to the start of the data (including edge) */ + pInData = &pInData[-((dilatedKSizeU / 2) * inDataPitch1 + (dilatedKSizeU / 2))]; + + /* Setting the limits for output data according to ReLu Flag and outTileType */ + int32_t minLim, maxLim; + if (enableReLu) + { + minLim = XAI_CNN_CONV_GET_RELU_MIN(param); + maxLim = XAI_CNN_CONV_GET_RELU_MAX(param); + } + else + { + minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \ + SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0); + maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \ + : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ?
SCHAR_MAX : UCHAR_MAX); + } + const int8_t typeFlag = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0; + const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile); + + /* Variable Declarations */ + int32_t inCh, outCh, x, y, ky; + + /* 4 Gathers are being used to Load Input Data. Many common elements + * will be loaded in separate Gathers, especially in the case of + * stride 1 and 2. To take the advantage of having common offsets in + * a single Gather, 2 Gather Patterns are generated as given below. + * For example, in the Gather Patterns generated below, + * if stride is 1 and dilation = 1, then 12 offsets are common and + * if stride is 2 and dilation = 1, 8 offsets are common in each Gather. + */ + /* Gather Index Calculations */ + xb_vecNx16 vecGather = IVP_MULNX16PACKL(dilation, IVP_SRLINX16(IVP_SEQNX16(), 2)); + IVP_MULANX16PACKL(vecGather, inDataPitch2, IVP_ANDNX16(IVP_SEQNX16(), 3)); + xb_vecNx16 vecGather1 = IVP_ADDNX16(vecGather, stride); + + xb_vecNx16 vecSelIdx1 = IVP_SEQNX16(); + IVP_ADDNX16T(vecSelIdx1, vecSelIdx1, (XCHAL_IVPN_SIMD_WIDTH - 16), IVP_NOTBN(IVP_LTRNI(16))); + xb_vecNx16U vecGatherOff1 = IVP_SELNX16(vecGather1, vecGather, vecSelIdx1); + xb_vecNx16 vecSelIdx2 = IVP_ADDNX16(IVP_SEQNX16(), 16); + IVP_ADDNX16T(vecSelIdx2, vecSelIdx2, (XCHAL_IVPN_SIMD_WIDTH - 16), IVP_NOTBN(IVP_LTRNI(12))); + xb_vecNx16U vecGatherOff2 = IVP_SELNX16(vecGather1, vecGather, vecSelIdx2); + + /* Index Pattern of vecGatherOff1 is - + * 0 P2 2*P2 3*P2 1*d P2+1*d 2*P2+1*d 3*P2+1*d 2*d P2+2*d 2*P2+2*d 3*P2+2*d 3*d P2+3*d 2*P2+3*d 3*P2+3*d + * s s+P2 s+2*P2 s+3*P2 s+1*d s+1*d+P2 s+1*d+2*P2 s+1*d+3*P2 s+2*d s+2*d+P2 s+2*d+2*P2 s+2*d+3*P2 */ + + /* Index Pattern of vecGatherOff2 is - + * 4*d P2+4*d 2*P2+4*d 3*P2+4*d 5*d P2+5*d 2*P2+5*d 3*P2+5*d 6*d P2+6*d 2*P2+6*d 3*P2+6*d s+3*d s+3*d+P2 s+3*d+2*P2 s+3*d+3*P2 + * s+4*d s+4*d+P2 s+4*d+2*P2 s+4*d+3*P2 s+5*d s+5*d+P2 s+5*d+2*P2 s+5*d+3*P2 s+6*d s+6*d+P2 s+6*d+2*P2 s+6*d+3*P2 */ + + valign vaOutData = 
IVP_ZALIGN(); + + xb_vecN_2x32v* restrict phvecBias; + xb_vec2Nx8* restrict pdvecCoeff; + xb_vec2Nx8* restrict pdvecOut; + int8_t* restrict pData1; + int8_t* restrict pData2; + + int32_t remInCh = numInCh & 3; + + const uint32_t maskLut[3] = { 0xff, 0xff00, 0xff0000 }; + + uint8_t remCh1 = XT_SALT(2, remInCh + 1); + uint8_t remCh2 = XT_SALT(3, remInCh + 1); + + uint32_t sumMask = maskLut[0] + maskLut[1] * remCh1 + maskLut[2] * remCh2; + +#ifdef __XCC__ + XT_MEMW(); /* Adding Memory Wait as Gather and Normal Load/Stores are not synchronized */ +#endif + + /* 4 Gathers are used for Input Data Load corresponding to 4 */ + /* Output Vectors. Loop unrolled along Output Width and Height by 2. */ + /* Also unrolled along Input Channels by 4 and Kernel Width. */ + + /* Loops Start */ + for (outCh = 0; outCh < numOutCh; outCh += 2 * XCHAL_IVPN_SIMD_WIDTH) /* Output Channels */ + { /* walk across the kernels */ + /* To handle corner case when number of output channels + * is not a multiple of 2 * XCHAL_IVPN_SIMD_WIDTH*/ + int32_t remainingOutCh = numOutCh - outCh; +#ifdef DILATED_VQ_CONV + xb_vecNx16U outScaleDataEven, outScaleDataOdd; + /*Load output scale values*/ + VQ_INIT_OUTSCALE(pOutScaleData, remainingOutCh, outScaleDataEven, outScaleDataOdd); +#endif + for (y = 0; y < outH; y += 2) /* Image Height */ + { /* walk down the rows */ + /* Variable used for corner case handling of Out Height odd */ + int32_t numY = XT_MIN(1, outH - y - 1); + for (x = 0; x < outW; x += 2) /* Image Width */ + { /* walk across the columns */ + xb_vecNx16U vecGatherOff00; + xb_vecNx16U vecGatherOff01; + xb_vecNx16U vecGatherOff10; + xb_vecNx16U vecGatherOff11; + /* Variable used for corner case handling of Out Width odd */ + int32_t numX = XT_MIN(1, outW - x - 1); + + /* Output, Input and Coefficient Data Pointers */ + int8_t *pOut = pOutData + (x * outDataPitch1 + y * outDataPitch2) * bytesPerPixel; + int8_t *pData = pInData + (x * stride) + (y * stride) * inDataPitch1; + int8_t *pCoeff = 
pCoeffData + outCh; + + /* Initialize accumulators with bias values */ + xb_vec2Nx24 daccSum1, daccSum2, daccSum3, daccSum4; + phvecBias = (xb_vecN_2x32v *) (pBiasData + outCh); + ACC_INIT_BIAS(phvecBias, remainingOutCh, daccSum1, daccSum2, daccSum3, daccSum4); + + + /* Boolean Vectors for Predicate Gather with corner cases */ + /* handled for Out Width and Height being odd numbers */ + vboolN vb1 = IVP_ORBN(IVP_LTRNI(16), IVP_LTRSN(28 * numX)); + vboolN vb2 = IVP_ORBN(IVP_LTRNI(12), IVP_LTRSN(28 * numX)); + vboolN vb3 = IVP_ANDBN(vb1, IVP_LTRSN(28 * numY)); + vboolN vb4 = IVP_ANDBN(vb2, IVP_LTRSN(28 * numY)); + + for (ky = 0; ky < 7; ky++) /* Kernel Height */ + { + /* Pointer for Input Data Load */ + pData1 = pData + ky * inDataPitch1 * dilation; + pData2 = pData1 + (stride * inDataPitch1 * numY); + + /* Pointer for Coefficient Load */ + pdvecCoeff = (xb_vec2Nx8 *) (pCoeff + ky * coeffPitch3); + /* Assign valid address for predicated false lines */ + vecGatherOff00 = IVP_MOVNX16UT(vecGatherOff1, 0, vb1); + vecGatherOff01 = IVP_MOVNX16UT(vecGatherOff2, 0, vb2); + vecGatherOff10 = IVP_MOVNX16UT(vecGatherOff1, 0, vb3); + vecGatherOff11 = IVP_MOVNX16UT(vecGatherOff2, 0, vb4); + + for (inCh = 0; inCh < numInCh - 3; inCh += 4) /* Number of Input Channels */ + { + /* Gathers for Input Loads */ + xb_gsr gather1 = IVP_GATHERANX8S(pData1, vecGatherOff00); + xb_vec2Nx8 dvecData1 = IVP_GATHERD2NX8_L(gather1); + xb_gsr gather2 = IVP_GATHERANX8S(pData1, vecGatherOff01); + xb_vec2Nx8 dvecData2 = IVP_GATHERD2NX8_L(gather2); + xb_gsr gather3 = IVP_GATHERANX8S(pData2, vecGatherOff10); + xb_vec2Nx8 dvecData3 = IVP_GATHERD2NX8_L(gather3); + xb_gsr gather4 = IVP_GATHERANX8S(pData2, vecGatherOff11); + xb_vec2Nx8 dvecData4 = IVP_GATHERD2NX8_L(gather4); + + pData1 += 4 * inDataPitch2; + pData2 += 4 * inDataPitch2; + + /* kx = 1 */ + /* Extracting Scalars for QMULs */ + int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData1)), 0); + int32_t 
qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData1)), 4); + int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData3)), 0); + int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData3)), 4); + + /* 4 Aligned Vector Loads of coefficients */ + xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1); + xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1); + xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1); + xb_vec2Nx8 dvecCoeff4; IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch2 - 3 * coeffPitch1); + + IVP_MULQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1); + IVP_MULQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2); + IVP_MULQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3); + IVP_MULQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4); + + /* kx = 2 */ + /* Extracting Scalars for QMULs */ + qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), 1); + qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), 5); + qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData3)), 1); + qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData3)), 5); + + /* 4 Aligned Vector Loads of coefficients */ + IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1); + IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1); + IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1); + IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch2 - 3 * coeffPitch1); + + IVP_MULQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1); + IVP_MULQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2); + IVP_MULQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, 
dvecCoeff1, qmulScalar3); + IVP_MULQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4); + + /* kx = 3 */ + /* Extracting Scalars for QMULs */ + qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), 2); + qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), 6); + qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData3)), 2); + qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData3)), 6); + + /* 4 Aligned Vector Loads of coefficients */ + IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1); + IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1); + IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1); + IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch2 - 3 * coeffPitch1); + + IVP_MULQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1); + IVP_MULQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2); + IVP_MULQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3); + IVP_MULQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4); + + /* kx = 4 */ + /* Extracting Scalars for QMULs */ + qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), 3); + qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), 3); + qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData3)), 3); + qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData4)), 3); + + /* 4 Aligned Vector Loads of coefficients */ + IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1); + IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1); + IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1); + IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch2 - 3 * coeffPitch1); + + IVP_MULQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1); + 
IVP_MULQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2); + IVP_MULQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3); + IVP_MULQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4); + + /* kx = 5 */ + /* Extracting Scalars for QMULs */ + qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), 0); + qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), 4); + qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData4)), 0); + qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData4)), 4); + + /* 4 Aligned Vector Loads of coefficients */ + IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1); + IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1); + IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1); + IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch2 - 3 * coeffPitch1); + + IVP_MULQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1); + IVP_MULQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2); + IVP_MULQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3); + IVP_MULQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4); + + /* kx = 6 */ + /* Extracting Scalars for QMULs */ + qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), 1); + qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), 5); + qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData4)), 1); + qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData4)), 5); + + /* 4 Aligned Vector Loads of coefficients */ + IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1); + IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1); + IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1); + IVP_LV2NX8_XP(dvecCoeff4, 
pdvecCoeff, coeffPitch2 - 3 * coeffPitch1); + + IVP_MULQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1); + IVP_MULQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2); + IVP_MULQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3); + IVP_MULQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4); + + /* kx = 7 */ + /* Extracting Scalars for QMULs */ + qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), 2); + qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), 6); + qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData4)), 2); + qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData4)), 6); + + /* 4 Aligned Vector Loads of coefficients */ + IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1); + IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1); + IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1); + IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch1 - 6 * coeffPitch2); + + IVP_MULQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1); + IVP_MULQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2); + IVP_MULQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3); + IVP_MULQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4); + } /* End Input Channels */ + + /* Handling Corner cases of Number of Input Channels not being multiple of 4 */ + if (remInCh) + { + vboolN vbRemInCh = IVP_LTNX16(IVP_ANDNX16(IVP_SEQNX16(), 3), remInCh); + /* Assign valid address for predicated false lines */ + vecGatherOff00 = IVP_MOVNX16UT(vecGatherOff1, 0, IVP_ANDBN(vbRemInCh, vb1)); + vecGatherOff01 = IVP_MOVNX16UT(vecGatherOff2, 0, IVP_ANDBN(vbRemInCh, vb2)); + vecGatherOff10 = IVP_MOVNX16UT(vecGatherOff1, 0, IVP_ANDBN(vbRemInCh, vb3)); + vecGatherOff11 = 
IVP_MOVNX16UT(vecGatherOff2, 0, IVP_ANDBN(vbRemInCh, vb4)); + + /* Gather Input Data */ + xb_vec2Nx8 dvecData1 = 0; + xb_vec2Nx8 dvecData2 = 0; + xb_vec2Nx8 dvecData3 = 0; + xb_vec2Nx8 dvecData4 = 0; + xb_gsr gather1 = IVP_GATHERANX8S(pData1, vecGatherOff00); + dvecData1 = IVP_GATHERD2NX8_L(gather1); + xb_gsr gather2 = IVP_GATHERANX8S(pData1, vecGatherOff01); + dvecData2 = IVP_GATHERD2NX8_L(gather2); + xb_gsr gather3 = IVP_GATHERANX8S(pData2, vecGatherOff10); + dvecData3 = IVP_GATHERD2NX8_L(gather3); + xb_gsr gather4 = IVP_GATHERANX8S(pData2, vecGatherOff11); + dvecData4 = IVP_GATHERD2NX8_L(gather4); + + + /* kx = 1 */ + /* Extracting scalars for QMULs */ + int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData1)), 0); + int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData1)), 4); + int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData3)), 0); + int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData3)), 4); + + /* Aligned Vector Loads of coefficients */ + xb_vec2Nx8 dvecCoeff1; + IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1 * remCh1); + xb_vec2Nx8 dvecCoeff2; + IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1 * remCh2); + xb_vec2Nx8 dvecCoeff3; + IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch2 - (coeffPitch1 * (remCh1 + remCh2))); + + /* Masking the qmulScalar values to avoid accumulation with unintended values */ + IVP_MULQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1 & sumMask); + IVP_MULQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2 & sumMask); + IVP_MULQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3 & sumMask); + IVP_MULQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4 & sumMask); + + /* kx = 2 */ + /* Extracting scalars for QMULs */ + qmulScalar1 = 
IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), 1); + qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), 5); + qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData3)), 1); + qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData3)), 5); + + /* Aligned Vector Loads of coefficients */ + IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1 * remCh1); + IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1 * remCh2); + IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch2 - (coeffPitch1 * (remCh1 + remCh2))); + + /* Masking the qmulScalar values to avoid accumulation with unintended values */ + IVP_MULQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1 & sumMask); + IVP_MULQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2 & sumMask); + IVP_MULQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3 & sumMask); + IVP_MULQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4 & sumMask); + + /* kx = 3 */ + /* Extracting scalars for QMULs */ + qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), 2); + qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), 6); + qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData3)), 2); + qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData3)), 6); + + /* Aligned Vector Loads of coefficients */ + IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1 * remCh1); + IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1 * remCh2); + IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch2 - (coeffPitch1 * (remCh1 + remCh2))); + + /* Masking the qmulScalar values to avoid accumulation with unintended values */ + IVP_MULQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1 & sumMask); + IVP_MULQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, 
qmulScalar2 & sumMask); + IVP_MULQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3 & sumMask); + IVP_MULQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4 & sumMask); + + /* kx = 4 */ + /* Extracting scalars for QMULs */ + qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), 3); + qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), 3); + qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData3)), 3); + qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData4)), 3); + + /* Aligned Vector Loads of coefficients */ + IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1 * remCh1); + IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1 * remCh2); + IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch2 - (coeffPitch1 * (remCh1 + remCh2))); + + /* Masking the qmulScalar values to avoid accumulation with unintended values */ + IVP_MULQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1 & sumMask); + IVP_MULQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2 & sumMask); + IVP_MULQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3 & sumMask); + IVP_MULQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4 & sumMask); + + /* kx = 5 */ + /* Extracting scalars for QMULs */ + qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), 0); + qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), 4); + qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData4)), 0); + qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData4)), 4); + + /* Aligned Vector Loads of coefficients */ + IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1 * remCh1); + IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1 * remCh2); + IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch2 - 
(coeffPitch1 * (remCh1 + remCh2))); + + /* Masking the qmulScalar values to avoid accumulation with unintended values */ + IVP_MULQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1 & sumMask); + IVP_MULQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2 & sumMask); + IVP_MULQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3 & sumMask); + IVP_MULQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4 & sumMask); + + /* kx = 6 */ + /* Extracting scalars for QMULs */ + qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), 1); + qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), 5); + qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData4)), 1); + qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData4)), 5); + + /* Aligned Vector Loads of coefficients */ + IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1 * remCh1); + IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1 * remCh2); + IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch2 - (coeffPitch1 * (remCh1 + remCh2))); + + /* Masking the qmulScalar values to avoid accumulation with unintended values */ + IVP_MULQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1 & sumMask); + IVP_MULQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2 & sumMask); + IVP_MULQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3 & sumMask); + IVP_MULQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4 & sumMask); + + /* kx = 7 */ + /* Extracting scalars for QMULs */ + qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), 2); + qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), 6); + qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData4)), 2); + qmulScalar4 = 
IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData4)), 6); + + /* Aligned Vector Loads of coefficients */ + IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1 * remCh1); + IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1 * remCh2); + IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, 0); + + /* Masking the qmulScalar values to avoid accumulation with unintended values */ + IVP_MULQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1 & sumMask); + IVP_MULQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2 & sumMask); + IVP_MULQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3 & sumMask); + IVP_MULQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4 & sumMask); + } /* End Input Channels corner case handling */ + } /* End Kernel Height */ + + /* Pack, Output Scale, Output Shift and clamping */ + xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L; + xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H; +#ifdef DILATED_VQ_CONV + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); +#else + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \ 
+ outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); +#endif + /* Store the output dvecOut1 along the output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOut + outCh * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh); + IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Store the output dvecOut2 along the output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1 * numX) * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numX); + IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numX); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Store the output dvecOut3 along the output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch2 * numY) * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numY); + IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numY); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Store the output dvecOut4 along the output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1 * numX + outDataPitch2 * numY) * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numX * numY); + IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numX * numY); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + } /* End image width */ + } /* End image height */ + } /* End Output Channels */ + return(XAI_ERROR_STATUS()); +} + +/***************************************************************************** +* xaiConvolvedVQ3D_S_MxN_S8S8IXCa2_MOD_WHD_DWH 
+* **************************************************************************/ + +/****************************************************************************/ +/* Description : P6 optimized generic implementation for MxN MOD_WHD_DWH */ +/* 3D convolution. Based on pre-processor specifiers. Code */ +/* implementation is generated during preprocessing stage. */ +/* This method can be used to generate MxN MOD_WHD_DWH 3D */ +/* dilated convolution function and MxN MOD_WHD_DWH 3D VQ */ +/* dilated convolution function */ +/* Inputs : Input Data Tile, Coeff Data Tile, Bias Array, */ +/* Output scale array, CNN convolution params structur */ +/* Outputs : XI Error Code */ +/* InOuts : Output Tile */ +/* Assumptions : InData, CoeffData are S8 */ +/* biasArray is signed 32b, value not exceeding signed 24b */ +/* Output scale array is U16 */ +/* OutData is S8 / U8 / S16 */ +/* Kernel Size is MxNxDxN */ +/* Input is in WHD and Output is in DWH format */ +/* Coeff is in NDWH format */ +/* CoeffDim1Pitch is aligned to 2N (Ca2) */ +/****************************************************************************/ + +#ifdef DILATED_VQ_CONV +XAI_ERR_TYPE xaiConvolvedVQ3D_S_MxN_S8S8IXCa2_MOD_WHD_DWH( + const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param + ) +#else +XAI_ERR_TYPE xaiConvolved3D_S_MxN_S8S8IXCa2_MOD_WHD_DWH( + const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param + ) +#endif +{ + /* Error Checks */ + XAI_ERROR_CHECKS() + { + XAI_CHECK_TILE3D_S8(inTile); + XAI_CHECK_CONV_OUTPUT_TILE3D(outTile); + XAI_CHECK_TILE4D_S8(coeffTile); + XAI_CHECK_TILE3D_FITS_IN_SINGLE_DRAM(inTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile); + XAI_CHECK_TILE4D_IN_DRAM_BOUNDARY(coeffTile); + XAI_CHECK_POINTER(param); + XAI_CHECK_ARRAY_S32(biasArray); + 
XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTile); + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(coeffTile, outTile); + XAI_CHECK_ERROR((XAI_CNN_CONV_GET_STRIDEX(param) == XAI_CNN_CONV_GET_STRIDEY(param)), \ + XAI_ERR_BADARG, "Stride along width = %hhu and height = %hhu\nStride along width and height should be equal", \ + XAI_CNN_CONV_GET_STRIDEX(param), XAI_CNN_CONV_GET_STRIDEY(param)); + XAI_CHECK_ERROR((XAI_TILE4D_GET_DIM3(coeffTile) <= 16) && \ + (XAI_TILE4D_GET_DIM4(coeffTile) <= 16), \ + XAI_ERR_KSIZE, "\nKernel height = %d and width = %d\nKernel width and height should be less than or equal to 16", \ + XAI_TILE4D_GET_DIM4(coeffTile), XAI_TILE4D_GET_DIM3(coeffTile)); + XAI_CHECK_EDGES_MOD_WHD(inTile, coeffTile, param); + XAI_CHECK_ERROR((XAI_CNN_CONV_GET_STRIDE(param) == 1) || \ + (XAI_CNN_CONV_GET_STRIDE(param) == 2) || \ + (XAI_CNN_CONV_GET_STRIDE(param) == 4), XAI_ERR_BADARG, \ + "\nStride = %hhu, value should be 1, 2 or 4", XAI_CNN_CONV_GET_STRIDE(param)); + XAI_CHECK_ERROR((XAI_CNN_CONV_GET_DILATIONX(param) == 1) || \ + ((XAI_CNN_CONV_GET_DILATIONX(param) >= 1) && \ + (XAI_CNN_CONV_GET_STRIDE(param) == 1)), XAI_ERR_BADARG, \ + "\nDilationX = %hhu\nDilationX should be 1. It can be greater than 1 only when stride is equal to 1", \ + XAI_CNN_CONV_GET_DILATIONX(param)); + XAI_CHECK_ERROR((XAI_CNN_CONV_GET_DILATIONY(param) == 1) || \ + ((XAI_CNN_CONV_GET_DILATIONY(param) >= 1) && \ + (XAI_CNN_CONV_GET_STRIDE(param) == 1)), XAI_ERR_BADARG, \ + "\nDilationY = %hhu\nDilationY should be 1. 
It can be greater than 1 only when stride is equal to 1", \ + XAI_CNN_CONV_GET_DILATIONY(param)); + XAI_CHECK_TILE4D_IALIGNMENT_2NX8(coeffTile); + XAI_CHECK_TILE3D_DATA_ORDER(inTile, XAI_WHD); + XAI_CHECK_TILE3D_DATA_ORDER(outTile, XAI_DWH); + XAI_CHECK_TILE4D_DATA_ORDER(coeffTile, XAI_NDWH); + XAI_CHECK_CONSISTENCY_MOD_WHD_DWH(inTile, coeffTile, biasArray, outTile, param); + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_OUTPUT_SHIFT(param) < 32, \ + XAI_ERR_NORM, "\nThe output shift = %hhu, value should be less than 32", \ + XAI_CNN_CONV_GET_OUTPUT_SHIFT(param)); + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_ACCUM_SHIFT(param) < 24, \ + XAI_ERR_NORM, "\nThe accumulator shift = %hhu, value should be less than 24", \ + XAI_CNN_CONV_GET_ACCUM_SHIFT(param)); + XAI_CHECK_CONV_RELU_LIMITS_IX(param, outTile); +#ifdef DILATED_VQ_CONV + XAI_CHECK_ARRAY_U16(outputScaleArray); + XAI_CHECK_ERROR(XAI_ARRAY_GET_WIDTH(outputScaleArray) >= XAI_TILE4D_GET_DIM1(coeffTile), \ + XAI_ERR_DATASIZE, "\nWidth of Output Scale Array = %d, Number of Kernels = %d\nWidth of Output Scale Array should be greater than or equal to Number of Kernels", \ + XAI_ARRAY_GET_WIDTH(outputScaleArray), XAI_TILE4D_GET_DIM1(coeffTile)); +#endif + } + +#ifndef DILATED_VQ_CONV + if (XAI_CNN_CONV_GET_OUTPUT_SCALE(param) == 0) + { + int32_t fillValue; + int32_t reluFlag = XAI_CNN_CONV_GET_FLAG_RELU(param); + fillValue = reluFlag ? 
(CLAMP(0, XAI_CNN_CONV_GET_RELU_MIN(param), XAI_CNN_CONV_GET_RELU_MAX(param))) : 0; + return(xaiFillTile3D(outTile, fillValue, 0)); + } +#endif + /* Getting parameters from the tile structures */ + const int32_t outW = XAI_TILE3D_GET_DIM2(outTile); + const int32_t outH = XAI_TILE3D_GET_DIM3(outTile); + const int32_t numInCh = XAI_TILE3D_GET_DIM3(inTile); + const int32_t numOutCh = XAI_TILE3D_GET_DIM1(outTile); + const uint8_t dilationX = XAI_CNN_CONV_GET_DILATIONX(param); + const uint8_t dilationY = XAI_CNN_CONV_GET_DILATIONY(param); + + XAI_ERROR_CHECKS_CONTINUE() + { + if (numInCh > 1) + { + /* Max value of Gather Offset is (min(numInCh-1,7)*inDataPitch + stride*min(3,outWidth-1)) */ + XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM2_PITCH(inTile) < \ + ((USHRT_MAX - XAI_CNN_CONV_GET_STRIDE(param) * XT_MIN(3, outW - 1)) / XT_MIN(numInCh - 1, 7)), \ + XAI_ERR_BADARG, "\ndim2Pitch value of inTile = %d, should be less than Gather Offset(16-bit limit) - %d", \ + XAI_TILE3D_GET_DIM2_PITCH(inTile), \ + ((USHRT_MAX - XAI_CNN_CONV_GET_STRIDE(param) * XT_MIN(3, outW - 1)) / XT_MIN(numInCh - 1, 7))); + } + } + + /* Kernel Size (NDWH) */ + const int32_t kWidthU = XAI_TILE4D_GET_DIM3(coeffTile); + const int32_t kHeightU = XAI_TILE4D_GET_DIM4(coeffTile); + int32_t dilatedkWidthU = dilationX * (kWidthU - 1) + 1; + int32_t dilatedkHeightU = dilationY * (kHeightU - 1) + 1; + + /* CNN convolution parameters */ + const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param); + const uint8_t outShiftU = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param); + const uint8_t enableReLu = XAI_CNN_CONV_GET_FLAG_RELU(param); + const uint8_t strideU = XAI_CNN_CONV_GET_STRIDE(param); + const uint8_t leftEdgeFlag = XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param); + const uint8_t topEdgeFlag = XAI_CNN_CONV_GET_FLAG_TOPEDGE(param); + + /* Data Pointers of input, output, coefficient and bias data */ + int8_t *pInData = (int8_t *) XAI_TILE3D_GET_DATA_PTR(inTile); + int8_t *pOutData = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile); 
+ int8_t *pCoeffData = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile); + int32_t *pBiasData = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray); +#ifdef DILATED_VQ_CONV + xb_vecNx16U* restrict pOutScaleData = (xb_vecNx16U *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray); +#else + const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param); +#endif + /* Pitches of Coefficient Data (NDWH) in dim1, dim2 and dim3 */ + const int32_t coeffPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTile); + const int32_t coeffPitch2 = XAI_TILE4D_GET_DIM2_PITCH(coeffTile); + const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile); + + /* Pitches of Input Data (DWH) in dim1 and dim2 */ + const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile); + const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile); + + /* Pitch of Output Data (DWH) in dim1 and dim2 */ + const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile); + const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile); + + int32_t leftEdge, topEdge; + if ((dilatedkWidthU % 2) != 0) + { + leftEdge = dilatedkWidthU / 2; + } + else + { + leftEdge = leftEdgeFlag ? (dilatedkWidthU / 2) : ((dilatedkWidthU / 2) - 1); + } + + if ((dilatedkHeightU % 2) != 0) + { + topEdge = dilatedkHeightU / 2; + } + else + { + topEdge = topEdgeFlag ? (dilatedkHeightU / 2) : ((dilatedkHeightU / 2) - 1); + } + + /* Move pointer to the start of the active data (including edge) */ + pInData = &pInData[-(topEdge * inDataPitch1 + leftEdge)]; + + /* Setting the limits for output data according to ReLu Flag and outTileType */ + int32_t minLim, maxLim; + if (enableReLu) + { + minLim = XAI_CNN_CONV_GET_RELU_MIN(param); + maxLim = XAI_CNN_CONV_GET_RELU_MAX(param); + } + else + { + minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \ + SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0); + maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \ + : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? 
SCHAR_MAX : UCHAR_MAX); + } + const int8_t typeFlag = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0; + const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile); + + /* Variable Declarations */ + int32_t inCh, outCh, x, y; + int32_t k; + + xb_vecN_2x32v* restrict phvecBias; + xb_vec2Nx8* restrict pdvecCoeff; + xb_vec2Nx8* restrict pdvecOut; + +#ifdef __XCC__ + XT_MEMW(); /* Adding Memory Wait as Gather and Normal Load/Stores are not synchronized */ +#endif + + /* The loop across kernel width and kernel height can be combined. In this */ + /* case the address offsets for input and coefficient need to be derived */ + /* from vector registers. These vector registers are initialized as follows */ + + xb_vecN_2x32v hvecCoeffAddrOffInit = IVP_PACKVRNRN_2X64W(IVP_MULN_2X16X32_0 \ + (IVP_MOVNX16_FROMN_2X32(IVP_SEQN_2X32()), coeffPitch2), 0); + + xb_vecN_2x32v hvecInAddrOffInit = IVP_PACKVRNRN_2X64W(IVP_MULHN_2X16X32_1 \ + ((xb_vecNx16) dilationX, IVP_SEQN_2X32()), 16); + + /* This implementation uses one gather operation to load 4 bytes of data each from 8 channels */ + + /***** Gather Offset Computation (used inside InCh for-loop) *****/ + /* InCh for-loop is executed when inCh>8 */ + /* */ + /* offset = pitch*[0 1 2 3 4 5 6 7 ... 0 1 2 3 4 5 6 7] + */ + /* stride*[0 0 0 0 0 0 0 0 ... 3 3 3 3 3 3 3 3] */ + /* where [0 0 0 0 0 0 0 0 ... 3 3 3 3 3 3 3 3] =>> column indices */ + /* [0 1 2 3 4 5 6 7 ... 0 1 2 3 4 5 6 7] =>> channel indices */ + xb_vecNx16U vecOffsets0 = IVP_ADDNX16(IVP_MULNX16PACKL(IVP_ANDNX16(7, IVP_SEQNX16()), inDataPitch2), \ + IVP_MULNX16PACKL(IVP_SRLINX16(IVP_SEQNX16(), 3), strideU)); + + /******* Gather Offset Computation and Coeff Mask (outside InCh for-loop) ********/ + + /* ((numInCh>>3)<<3) = largest multiple of 8 less numInCh-8 */ + /* Loop across inCh is executed only when numInCh > 8 */ + int32_t remainingInCh = (numInCh - ((numInCh >> 3) << 3)); + remainingInCh = remainingInCh != 0 ? 
remainingInCh : 8; + + /* Generating Coefficient mask such that coefficient load happens only for valid channel number*/ + /* Coefficient mask entries for channels greater than the remainingInCh are set to 0 */ + uint8_t remCh1 = XT_SALT(1, remainingInCh); + uint8_t remCh2 = XT_SALT(2, remainingInCh); + uint8_t remCh3 = XT_SALT(3, remainingInCh); + uint8_t remCh4 = XT_SALT(4, remainingInCh); + uint8_t remCh5 = XT_SALT(5, remainingInCh); + uint8_t remCh6 = XT_SALT(6, remainingInCh); + uint8_t remCh7 = XT_SALT(7, remainingInCh); + + /*Generation of maskLut for handling cases when remainingInCh is not equal to 0 */ + /*eg. if remainingInCh is equal to 2 then sumMask1 is 00FFFFFF and sumMask2 is 0 */ + /* if remainingInCh is equal to 3 then sumMask1 is FFFFFFFF and sumMask2 is 0 */ + /* if remainingInCh is equal to 4 then sumMask1 is FFFFFFFF and sumMask2 is FF */ + const uint32_t maskLut[4] = { 0xff, 0xff00, 0xff0000, 0xff000000 }; + + int32_t sumMask1 = maskLut[0] + maskLut[1] * remCh1 + maskLut[2] * remCh2 + maskLut[3] * remCh3; + int32_t sumMask2 = maskLut[0] * remCh4 + maskLut[1] * remCh5 + maskLut[2] * remCh6 + maskLut[3] * remCh7; + + /* Finding the gather offset such that valid memory locations are accessed */ + /* [0 1 2 3 4 5 6 7 ... 
0 1 2 3 4 5 6 7] in offset calculation is modified such */ + /* that columns greater than (remainingInCh-1) are set to (remainingInCh-1) */ + xb_vecNx16 vecRemainingInChIdx = IVP_MINNX16(IVP_ANDNX16(7, IVP_SEQNX16()), remainingInCh - 1); + xb_vecNx16U vecOffsets1 = IVP_ADDNX16(IVP_MULNX16PACKL(vecRemainingInChIdx, inDataPitch2), \ + IVP_MULNX16PACKL(IVP_SRLINX16(IVP_SEQNX16(), 3), strideU)); + + /** Output width is unrolled by 4 and Input Channels is unrolled by 8 **/ + + /********* Loop Starts ************/ + for (outCh = 0; outCh < numOutCh; outCh += 2 * XCHAL_IVPN_SIMD_WIDTH) /* Along output channels*/ + { + /* To handle corner case when number of output channels + * is not a multiple of 2 * XCHAL_IVPN_SIMD_WIDTH*/ + int32_t remainingOutCh = numOutCh - outCh; +#ifdef DILATED_VQ_CONV + xb_vecNx16U outScaleDataEven, outScaleDataOdd; + /*Load output scale values*/ + VQ_INIT_OUTSCALE(pOutScaleData, remainingOutCh, outScaleDataEven, outScaleDataOdd); +#endif + for (y = 0; y < outH; y++) /* Along output height*/ + { + xb_vecNx16U vecOffsets2; + xb_vecNx16U vecOffsets3; + for (x = 0; x < outW; x += 4) /*Along output width*/ + { + /* For corner case handling */ + int32_t remainingX = XT_MIN(4, outW - x); + vboolN vbOffsetMask = IVP_LTRSN(8 * remainingX); /* 8 channels*/ + /* Assign valid address for predicated false lines */ + vecOffsets2 = IVP_MOVNX16UT(vecOffsets0, 0, vbOffsetMask); + vecOffsets3 = IVP_MOVNX16UT(vecOffsets1, 0, vbOffsetMask); + + /* Output pointer */ + int8_t* pOut = &pOutData[(y * outDataPitch2 + x * outDataPitch1) * bytesPerPixel]; + + /* Loading bias and initializing sum with bias*/ + xb_vec2Nx24 dvecSum0 = 0, dvecSum1 = 0, dvecSum2 = 0, dvecSum3 = 0; + phvecBias = (xb_vecN_2x32v *) (pBiasData + outCh); + ACC_INIT_BIAS(phvecBias, remainingOutCh, dvecSum0, dvecSum1, dvecSum2, dvecSum3); + + /* Input Data and Coeff Data Pointers */ + int8_t *pSrc1 = pInData + x * strideU + y * strideU * inDataPitch1; + int8_t *pCoeff = pCoeffData + outCh; + + 
xb_vecN_2x32v hvecInAddrOff = hvecInAddrOffInit; + xb_vecN_2x32v hvecCoeffAddrOff = hvecCoeffAddrOffInit; + xb_vecN_2x32v hvecLaneIdx = 0; + int32_t index, inAddrOff, coeffAddrOff; + + for (k = 0; k < kHeightU * kWidthU; k++) /* Kernel Height * Kernel Width */ + { + /* Condition checks performed to get the Input and Coefficient */ + /* Pointer Offsets after combining the Kernel Width and Height Loops */ + vboolN_2 vbN_2 = IVP_EQN_2X32(hvecLaneIdx, kWidthU); + /* hvecLaneIdx will be reset to zero after every kWidth */ + hvecLaneIdx = IVP_MOVN_2X32T(0, hvecLaneIdx, vbN_2); + /* InPitch added after every kWidth */ + IVP_ADDN_2X32T(hvecInAddrOff, hvecInAddrOff, inDataPitch1 * dilationY, vbN_2); + /* CoeffPitch added after every kWidth */ + IVP_ADDN_2X32T(hvecCoeffAddrOff, hvecCoeffAddrOff, coeffPitch3, vbN_2); + index = IVP_EXTRN_2X32(hvecLaneIdx, 0); + /* Extracting Input and Coefficient address offsets */ + inAddrOff = IVP_EXTRVRN_2X32(hvecInAddrOff, 4 * index); + coeffAddrOff = IVP_EXTRVRN_2X32(hvecCoeffAddrOff, 4 * index); + hvecLaneIdx = IVP_ADDN_2X32(hvecLaneIdx, 1); + + /* Pointers for Input Data Loads */ + int8_t *pSrc = (pSrc1 + inAddrOff); + + /* Pointer for Coefficient Load */ +#ifdef IS_VISION_130 + pdvecCoeff = (xb_vec2Nx8 *) (pCoeff + coeffAddrOff); + xb_vec2Nx8* pdvecCoeff1 = (xb_vec2Nx8 *) (pCoeff + 4 * coeffPitch1 + coeffAddrOff); + + for (inCh = 0; inCh < (numInCh - 8); inCh += 8) + { + /* Gather Operation to load 8 channels of 1x4 block of input . dvecIn will contain data */ + /* from 8 channels corresponding to same x and y value in consecutive positions. 
*/ + xb_gsr gatherReg = IVP_GATHERANX8S(pSrc + inCh * inDataPitch2, vecOffsets2); + xb_vec2Nx8 dvecIn = IVP_GATHERD2NX8_L(gatherReg); /* LSB 8 bits of gatherReg contain the desired data*/ + + /* 8 Coefficient Vector Loads */ + /* Load Coefficients to vector - coefficients already aligned */ + xb_vec2Nx8 dvecCoeff0; + IVP_L2U2NX8_XP(dvecCoeff0, pdvecCoeff, coeffPitch1); + + xb_vec2Nx8 dvecCoeff1; + IVP_L2U2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1); + + xb_vec2Nx8 dvecCoeff2; + IVP_L2U2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1); + + xb_vec2Nx8 dvecCoeff3; + IVP_L2U2NX8_XP(dvecCoeff3, pdvecCoeff, 5 * coeffPitch1); + + xb_vec2Nx8 dvecCoeff4; + IVP_L2U2NX8_XP(dvecCoeff4, pdvecCoeff1, coeffPitch1); + + xb_vec2Nx8 dvecCoeff5; + IVP_L2U2NX8_XP(dvecCoeff5, pdvecCoeff1, coeffPitch1); + + xb_vec2Nx8 dvecCoeff6; + IVP_L2U2NX8_XP(dvecCoeff6, pdvecCoeff1, coeffPitch1); + + xb_vec2Nx8 dvecCoeff7; + IVP_L2U2NX8_XP(dvecCoeff7, pdvecCoeff1, 5 * coeffPitch1); + + /* Load 4 bytes of input data along the depth to int32_t scalar */ + int32_t scalarInData0 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecIn)), 0); + int32_t scalarInData1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecIn)), 1); + + int32_t scalarInData2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecIn)), 2); + int32_t scalarInData3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecIn)), 3); + + int32_t scalarInData4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecIn)), 4); + int32_t scalarInData5 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecIn)), 5); + + int32_t scalarInData6 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecIn)), 6); + int32_t scalarInData7 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecIn)), 7); + + /* Multiply and accumulate */ + IVP_MULQA2N8XR8(dvecSum0, dvecCoeff3, dvecCoeff2, dvecCoeff1, dvecCoeff0, scalarInData0); + 
IVP_MULQA2N8XR8(dvecSum1, dvecCoeff3, dvecCoeff2, dvecCoeff1, dvecCoeff0, scalarInData2); + IVP_MULQA2N8XR8(dvecSum2, dvecCoeff3, dvecCoeff2, dvecCoeff1, dvecCoeff0, scalarInData4); + IVP_MULQA2N8XR8(dvecSum3, dvecCoeff3, dvecCoeff2, dvecCoeff1, dvecCoeff0, scalarInData6); + + IVP_MULQA2N8XR8(dvecSum0, dvecCoeff7, dvecCoeff6, dvecCoeff5, dvecCoeff4, scalarInData1); + IVP_MULQA2N8XR8(dvecSum1, dvecCoeff7, dvecCoeff6, dvecCoeff5, dvecCoeff4, scalarInData3); + IVP_MULQA2N8XR8(dvecSum2, dvecCoeff7, dvecCoeff6, dvecCoeff5, dvecCoeff4, scalarInData5); + IVP_MULQA2N8XR8(dvecSum3, dvecCoeff7, dvecCoeff6, dvecCoeff5, dvecCoeff4, scalarInData7); + } /* end of for(inCh = 0; inCh < (numInCh-8); inCh+=8)*/ + + /*Gather Operation to load remainingCh number of channels corresponding to 1x4 block */ + /*of input. The channels to be loaded are handled by vecOffsets1 */ + xb_gsr gatherReg = IVP_GATHERANX8S(pSrc + inCh * inDataPitch2, vecOffsets3); + xb_vec2Nx8 dvecIn = IVP_GATHERD2NX8_L(gatherReg); /* LSB 8 bits of gatherReg contain the desired data*/ + + /* Load 4 bytes of input data along the depth to int32_t scalar */ + int32_t scalarInData0 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecIn)), 0); + int32_t scalarInData1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecIn)), 1); + + int32_t scalarInData2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecIn)), 2); + int32_t scalarInData3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecIn)), 3); + + int32_t scalarInData4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecIn)), 4); + int32_t scalarInData5 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecIn)), 5); + + int32_t scalarInData6 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecIn)), 6); + int32_t scalarInData7 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecIn)), 7); + + /* 8 Coefficient Vector Loads */ + 
/* Load Coefficients to vector - coefficients already aligned */ + xb_vec2Nx8 dvecCoeff0; + IVP_LV2NX8_XP(dvecCoeff0, pdvecCoeff, coeffPitch1 * remCh1); + + xb_vec2Nx8 dvecCoeff1; + IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1 * remCh2); + + xb_vec2Nx8 dvecCoeff2; + IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1 * remCh3); + + xb_vec2Nx8 dvecCoeff3; + IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1 * remCh4); + + xb_vec2Nx8 dvecCoeff4; + IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch1 * remCh5); + + xb_vec2Nx8 dvecCoeff5; + IVP_LV2NX8_XP(dvecCoeff5, pdvecCoeff, coeffPitch1 * remCh6); + + xb_vec2Nx8 dvecCoeff6; + IVP_LV2NX8_XP(dvecCoeff6, pdvecCoeff, coeffPitch1 * remCh7); + + xb_vec2Nx8 dvecCoeff7; + IVP_LV2NX8_XP(dvecCoeff7, pdvecCoeff, coeffPitch1); + + /* Multiply and accumulate */ + IVP_MULQA2N8XR8(dvecSum0, dvecCoeff3, dvecCoeff2, dvecCoeff1, dvecCoeff0, scalarInData0 & sumMask1); + IVP_MULQA2N8XR8(dvecSum1, dvecCoeff3, dvecCoeff2, dvecCoeff1, dvecCoeff0, scalarInData2 & sumMask1); + IVP_MULQA2N8XR8(dvecSum2, dvecCoeff3, dvecCoeff2, dvecCoeff1, dvecCoeff0, scalarInData4 & sumMask1); + IVP_MULQA2N8XR8(dvecSum3, dvecCoeff3, dvecCoeff2, dvecCoeff1, dvecCoeff0, scalarInData6 & sumMask1); + + IVP_MULQA2N8XR8(dvecSum0, dvecCoeff7, dvecCoeff6, dvecCoeff5, dvecCoeff4, scalarInData1 & sumMask2); + IVP_MULQA2N8XR8(dvecSum1, dvecCoeff7, dvecCoeff6, dvecCoeff5, dvecCoeff4, scalarInData3 & sumMask2); + IVP_MULQA2N8XR8(dvecSum2, dvecCoeff7, dvecCoeff6, dvecCoeff5, dvecCoeff4, scalarInData5 & sumMask2); + IVP_MULQA2N8XR8(dvecSum3, dvecCoeff7, dvecCoeff6, dvecCoeff5, dvecCoeff4, scalarInData7 & sumMask2); + } /* end of for (k = 0; k < kHeightU * kWidthU; k++)*/ + +#else + pdvecCoeff = (xb_vec2Nx8 *) (pCoeff + coeffAddrOff); + + for (inCh = 0; inCh < (numInCh - 8); inCh += 8) + { + /* Gather Operation to load 8 channels of 1x4 block of input . dvecIn will contain data */ + /* from 8 channels corresponding to same x and y value in consecutive positions. 
*/ + xb_gsr gatherReg = IVP_GATHERANX8S(pSrc + inCh * inDataPitch2, vecOffsets2); + xb_vec2Nx8 dvecIn = IVP_GATHERD2NX8_L(gatherReg); /* LSB 8 bits of gatherReg contain the desired data*/ + + /* 8 Coefficient Vector Loads */ + /* Load Coefficients to vector - coefficients already aligned */ + xb_vec2Nx8 dvecCoeff0; + IVP_LV2NX8_XP(dvecCoeff0, pdvecCoeff, coeffPitch1); + + xb_vec2Nx8 dvecCoeff1; + IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1); + + xb_vec2Nx8 dvecCoeff2; + IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1); + + xb_vec2Nx8 dvecCoeff3; + IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1); + + xb_vec2Nx8 dvecCoeff4; + IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch1); + + xb_vec2Nx8 dvecCoeff5; + IVP_LV2NX8_XP(dvecCoeff5, pdvecCoeff, coeffPitch1); + + xb_vec2Nx8 dvecCoeff6; + IVP_LV2NX8_XP(dvecCoeff6, pdvecCoeff, coeffPitch1); + + xb_vec2Nx8 dvecCoeff7; + IVP_LV2NX8_XP(dvecCoeff7, pdvecCoeff, coeffPitch1); + + /* Load 4 bytes of input data along the depth to int32_t scalar */ + int32_t scalarInData0 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecIn)), 0); + int32_t scalarInData1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecIn)), 1); + + int32_t scalarInData2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecIn)), 2); + int32_t scalarInData3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecIn)), 3); + + int32_t scalarInData4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecIn)), 4); + int32_t scalarInData5 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecIn)), 5); + + int32_t scalarInData6 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecIn)), 6); + int32_t scalarInData7 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecIn)), 7); + + /* Multiply and accumulate */ + IVP_MULQA2N8XR8(dvecSum0, dvecCoeff3, dvecCoeff2, dvecCoeff1, dvecCoeff0, scalarInData0); + 
IVP_MULQA2N8XR8(dvecSum1, dvecCoeff3, dvecCoeff2, dvecCoeff1, dvecCoeff0, scalarInData2); + IVP_MULQA2N8XR8(dvecSum2, dvecCoeff3, dvecCoeff2, dvecCoeff1, dvecCoeff0, scalarInData4); + IVP_MULQA2N8XR8(dvecSum3, dvecCoeff3, dvecCoeff2, dvecCoeff1, dvecCoeff0, scalarInData6); + + IVP_MULQA2N8XR8(dvecSum0, dvecCoeff7, dvecCoeff6, dvecCoeff5, dvecCoeff4, scalarInData1); + IVP_MULQA2N8XR8(dvecSum1, dvecCoeff7, dvecCoeff6, dvecCoeff5, dvecCoeff4, scalarInData3); + IVP_MULQA2N8XR8(dvecSum2, dvecCoeff7, dvecCoeff6, dvecCoeff5, dvecCoeff4, scalarInData5); + IVP_MULQA2N8XR8(dvecSum3, dvecCoeff7, dvecCoeff6, dvecCoeff5, dvecCoeff4, scalarInData7); + } /* end of for(inCh = 0; inCh < (numInCh-8); inCh+=8)*/ + + /*Gather Operation to load remainingCh number of channels corresponding to 1x4 block */ + /*of input. The channels to be loaded are handled by vecOffsets1 */ + xb_gsr gatherReg = IVP_GATHERANX8S(pSrc + inCh * inDataPitch2, vecOffsets3); + xb_vec2Nx8 dvecIn = IVP_GATHERD2NX8_L(gatherReg); /* LSB 8 bits of gatherReg contain the desired data*/ + + /* Load 4 bytes of input data along the depth to int32_t scalar */ + int32_t scalarInData0 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecIn)), 0); + int32_t scalarInData1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecIn)), 1); + + int32_t scalarInData2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecIn)), 2); + int32_t scalarInData3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecIn)), 3); + + int32_t scalarInData4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecIn)), 4); + int32_t scalarInData5 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecIn)), 5); + + int32_t scalarInData6 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecIn)), 6); + int32_t scalarInData7 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecIn)), 7); + + /* 8 Coefficient Vector Loads */ + 
/* Load Coefficients to vector - coefficients already aligned */ + xb_vec2Nx8 dvecCoeff0; + IVP_LV2NX8_XP(dvecCoeff0, pdvecCoeff, coeffPitch1 * remCh1); + + xb_vec2Nx8 dvecCoeff1; + IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1 * remCh2); + + xb_vec2Nx8 dvecCoeff2; + IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1 * remCh3); + + xb_vec2Nx8 dvecCoeff3; + IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1 * remCh4); + + xb_vec2Nx8 dvecCoeff4; + IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch1 * remCh5); + + xb_vec2Nx8 dvecCoeff5; + IVP_LV2NX8_XP(dvecCoeff5, pdvecCoeff, coeffPitch1 * remCh6); + + xb_vec2Nx8 dvecCoeff6; + IVP_LV2NX8_XP(dvecCoeff6, pdvecCoeff, coeffPitch1 * remCh7); + + xb_vec2Nx8 dvecCoeff7; + IVP_LV2NX8_XP(dvecCoeff7, pdvecCoeff, coeffPitch1); + + /* Multiply and accumulate */ + IVP_MULQA2N8XR8(dvecSum0, dvecCoeff3, dvecCoeff2, dvecCoeff1, dvecCoeff0, scalarInData0 & sumMask1); + IVP_MULQA2N8XR8(dvecSum1, dvecCoeff3, dvecCoeff2, dvecCoeff1, dvecCoeff0, scalarInData2 & sumMask1); + IVP_MULQA2N8XR8(dvecSum2, dvecCoeff3, dvecCoeff2, dvecCoeff1, dvecCoeff0, scalarInData4 & sumMask1); + IVP_MULQA2N8XR8(dvecSum3, dvecCoeff3, dvecCoeff2, dvecCoeff1, dvecCoeff0, scalarInData6 & sumMask1); + + IVP_MULQA2N8XR8(dvecSum0, dvecCoeff7, dvecCoeff6, dvecCoeff5, dvecCoeff4, scalarInData1 & sumMask2); + IVP_MULQA2N8XR8(dvecSum1, dvecCoeff7, dvecCoeff6, dvecCoeff5, dvecCoeff4, scalarInData3 & sumMask2); + IVP_MULQA2N8XR8(dvecSum2, dvecCoeff7, dvecCoeff6, dvecCoeff5, dvecCoeff4, scalarInData5 & sumMask2); + IVP_MULQA2N8XR8(dvecSum3, dvecCoeff7, dvecCoeff6, dvecCoeff5, dvecCoeff4, scalarInData7 & sumMask2); + } /* end of for (k = 0; k < kHeightU * kWidthU; k++)*/ +#endif + + /* Storing output vector to memory */ + xb_vec2Nx8 dvecOutData0L = 0, dvecOutData1L = 0, dvecOutData2L = 0, dvecOutData3L = 0; + xb_vec2Nx8 dvecOutData0H = 0, dvecOutData1H = 0, dvecOutData2H = 0, dvecOutData3H = 0; +#ifdef DILATED_VQ_CONV + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOutData0L, 
dvecOutData0H, dvecSum0, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOutData1L, dvecOutData1H, dvecSum1, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOutData2L, dvecOutData2H, dvecSum2, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOutData3L, dvecOutData3H, dvecSum3, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); +#else + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOutData0L, dvecOutData0H, dvecSum0, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOutData1L, dvecOutData1H, dvecSum1, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOutData2L, dvecOutData2H, dvecSum2, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOutData3L, dvecOutData3H, dvecSum3, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); +#endif + pdvecOut = (xb_vec2Nx8 *) &pOut[outCh * bytesPerPixel]; + valign vaOutData = IVP_ZALIGN(); + IVP_SAV2NX8_XP(dvecOutData0L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh); + IVP_SAV2NX8_XP(dvecOutData0H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + pdvecOut = (xb_vec2Nx8 *) &pOut[(outCh + outDataPitch1) * bytesPerPixel * XT_SALT(0, remainingX - 1)]; + IVP_SAV2NX8_XP(dvecOutData1L, vaOutData, pdvecOut, bytesPerPixel * \ + remainingOutCh * XT_SALT(0, remainingX - 1)); + IVP_SAV2NX8_XP(dvecOutData1H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * XT_SALT(0, remainingX - 1)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + pdvecOut = (xb_vec2Nx8 *) &pOut[(outCh + 2 * outDataPitch1) 
* bytesPerPixel * XT_SALT(0, remainingX - 2)]; + IVP_SAV2NX8_XP(dvecOutData2L, vaOutData, pdvecOut, bytesPerPixel * \ + remainingOutCh * XT_SALT(0, remainingX - 2)); + IVP_SAV2NX8_XP(dvecOutData2H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * XT_SALT(0, remainingX - 2)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + pdvecOut = (xb_vec2Nx8 *) &pOut[(outCh + 3 * outDataPitch1) * bytesPerPixel * XT_SALT(0, remainingX - 3)]; + IVP_SAV2NX8_XP(dvecOutData3L, vaOutData, pdvecOut, bytesPerPixel * \ + remainingOutCh * XT_SALT(0, remainingX - 3)); + IVP_SAV2NX8_XP(dvecOutData3H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * XT_SALT(0, remainingX - 3)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + } /* end of for(x = 0; x < outW; x+=4)*/ + } /* end of for(y = 0; y < outH; y++)*/ + } /* end of for(outCh = 0; outCh < numOutCh; outCh+=2*XCHAL_IVPN_SIMD_WIDTH)*/ + return(XAI_ERROR_STATUS()); +} + +/****************************************************************************************** +* MOD DWH variants +******************************************************************************************/ + +/****************************************************************************/ +/* Description : P6 optimized implementation of 3D convolution for handling */ +/* cases where kwidth * numInch is a multiple of 4 */ +/* Inputs : Input Data Tile, Coeff Data Tile, Bias Array, */ +/* Output scale array, CNN convolution params structure */ +/* InOuts : Output Tile */ +/* Assumptions : InData, CoeffData are S8 */ +/* biasArray is signed 32b, value not exceeding signed 24b */ +/* Output scale array is U16 */ +/* OutData is S8 / U8 / S16 */ +/* Kernel Size is MxNxDxNk. M and N sizes are less than or */ +/* equal to 15. */ +/* Input and Output are in DWH format */ +/* Coeff is in NDWH format */ +/* CoeffDim1Pitch is aligned to 2N (Ca2) */ +/* Edges along Depth dimension in inTile and coeffTile */ +/* are zero. 
*/
+/****************************************************************************/
+
+#ifdef DILATED_VQ_CONV
+static _XAI_INLINE_ void convolvedVQ3D_S_MxN_S8S8IXCa2_MOD_DWH_contiguous_depth_x4(
+  const xai_pTile3D inTile,
+  const xai_pTile4D coeffTile,
+  const xai_pArray biasArray,
+  const xai_pArray outputScaleArray,
+  xai_pTile3D outTile,
+  const xai_cnn_conv_params *param
+  )
+#else
+static _XAI_INLINE_ void convolved3D_S_MxN_S8S8IXCa2_MOD_DWH_contiguous_depth_x4(
+  const xai_pTile3D inTile,
+  const xai_pTile4D coeffTile,
+  const xai_pArray biasArray,
+  xai_pTile3D outTile,
+  const xai_cnn_conv_params *param
+  )
+#endif
+{
+  /* Purpose: MxN convolution of an int8 input tile with int8 coefficients
+   * that walks every kernel row as one flat run of kWidthU * numInCh input
+   * bytes, consuming 4 bytes per inner-loop step. The inner loop has no
+   * remainder iteration, so kWidthU * numInCh must be a multiple of 4.
+   * Each pass of the x/y loops produces a 2x2 block of output pixels for up
+   * to 2 * XCHAL_IVPN_SIMD_WIDTH output channels.
+   *
+   * Parameters:
+   *   inTile           - input tile; data is read through an int8_t pointer.
+   *   coeffTile        - coefficient tile; data is read as int8_t.
+   *   biasArray        - bias values, read as int32_t, indexed by output
+   *                      channel.
+   *   outputScaleArray - only when DILATED_VQ_CONV is defined: output scale
+   *                      values handed to VQ_INIT_OUTSCALE. Otherwise a
+   *                      single scale is taken from param.
+   *   outTile          - output tile, written in place.
+   *   param            - shifts, strides, ReLU and edge flags.
+   * Returns nothing.
+   *
+   * Getting parameters from the tile structures */
+  const int32_t outW = XAI_TILE3D_GET_DIM2(outTile);
+  const int32_t outH = XAI_TILE3D_GET_DIM3(outTile);
+  const int32_t numInCh = XAI_TILE3D_GET_DIM1(inTile);
+  const int32_t numOutCh = XAI_TILE3D_GET_DIM1(outTile);
+
+  /* Kernel Size (NDWH) */
+  const int32_t kWidthU = XAI_TILE4D_GET_DIM3(coeffTile);
+  const int32_t kHeightU = XAI_TILE4D_GET_DIM4(coeffTile);
+
+  /* CNN convolution parameters */
+  const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param);
+  const uint8_t outShiftU = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param);
+  const uint8_t enableReLu = XAI_CNN_CONV_GET_FLAG_RELU(param);
+  const uint8_t strideX = XAI_CNN_CONV_GET_STRIDEX(param);
+  const uint8_t strideY = XAI_CNN_CONV_GET_STRIDEY(param);
+  const uint8_t leftEdgeFlag = XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param);
+  const uint8_t topEdgeFlag = XAI_CNN_CONV_GET_FLAG_TOPEDGE(param);
+
+  /* Data Pointers of input, output, coefficient and bias data */
+  int8_t *pInData = (int8_t *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  int8_t *pOutData = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  int8_t *pCoeffData = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile);
+  int32_t *pBiasData = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray);
+#ifdef DILATED_VQ_CONV
+  xb_vecNx16U* restrict pOutScaleData = (xb_vecNx16U *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray);
+#else
+  const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param);
+#endif
+  /* Pitches of Coefficient Data (NDWH) in dim1 and dim3. coeffPitch1 is the
+   * step between consecutive coefficient vectors in the inner loop and
+   * coeffPitch3 is the step between kernel rows. */
+  const int32_t coeffPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTile);
+  const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile);
+
+  /* Pitches of Input Data (DWH) in dim1 and dim2. inDataPitch1 steps one
+   * pixel along the width and inDataPitch2 steps one row. */
+  const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile);
+  const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile);
+
+  /* Pitch of Output Data (DWH) in dim1 and dim2 */
+  const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+
+  int32_t leftEdge, topEdge; /* Odd kernel sizes use an edge of k / 2. Even
+                              * sizes use k / 2 when the matching edge flag
+                              * is set and k / 2 - 1 otherwise. */
+
+  if ((kWidthU % 2) != 0)
+  {
+    leftEdge = kWidthU / 2;
+  }
+  else
+  {
+    leftEdge = leftEdgeFlag ? (kWidthU / 2) : ((kWidthU / 2) - 1);
+  }
+
+  if ((kHeightU % 2) != 0)
+  {
+    topEdge = kHeightU / 2;
+  }
+  else
+  {
+    topEdge = topEdgeFlag ? (kHeightU / 2) : ((kHeightU / 2) - 1);
+  }
+
+  /* Move pointer to the start of the data (including edge): back up by
+   * leftEdge pixels and topEdge rows so that pInData addresses the first
+   * kernel tap of output pixel (0, 0). */
+  pInData = &pInData[-((leftEdge) * inDataPitch1 + (topEdge) * inDataPitch2)];
+
+
+  /* Setting the limits for output data according to ReLu Flag and outTileType.
+   * Without ReLU the limits are the full range of the output type: S16, S8,
+   * or 0..UCHAR_MAX for anything else. */
+  int32_t minLim, maxLim;
+  if (enableReLu)
+  {
+    minLim = XAI_CNN_CONV_GET_RELU_MIN(param);
+    maxLim = XAI_CNN_CONV_GET_RELU_MAX(param);
+  }
+  else
+  {
+    minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \
+             SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0);
+    maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \
+             : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX);
+  }
+  const int8_t typeFlag = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0; /* 1 only for S16 output; it scales the
+                                                                              * byte count of every high-half store
+                                                                              * below, which is 0 for 8-bit output. */
+  const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile);
+
+  /* Variable Declarations */
+  int32_t outCh, x, y, ky, k;
+  valign vaOutData = IVP_ZALIGN(); /* alignment register shared by all output stores */
+  int32_t numIter = kWidthU * numInCh; /* input bytes consumed per kernel row */
+
+  xb_vecN_2x32v* restrict phvecBias;
+  xb_vec2Nx8* restrict pdvecCoeff;
+  xb_vec2Nx8* restrict pdvecData1;
+  xb_vec2Nx8* restrict pdvecData2;
+  xb_vec2Nx8* restrict pdvecData3;
+  xb_vec2Nx8* restrict pdvecData4;
+  xb_vec2Nx8* restrict pdvecOut;
+
+  /*
+   * inCh and kWidth loops are combined. Assumed that the
+   * edges along Depth dimension of input data is zero and also
+   * edges along depth dimension of coefficient data is zero.
+   * NOTE(review): the flat walk over kWidthU * numInCh bytes presumably
+   * also requires inDataPitch1 == numInCh (no padding between pixels);
+   * confirm against the caller that selects this variant.
+   */
+
+  /* Loops Start */
+  for (outCh = 0; outCh < numOutCh; outCh += 2 * XCHAL_IVPN_SIMD_WIDTH)
+  { /* walk across the kernels */
+    /* To handle corner case when number of output channels
+     * is not a multiple of 2 * XCHAL_IVPN_SIMD_WIDTH*/
+    int32_t remainingOutCh = numOutCh - outCh;
+#ifdef DILATED_VQ_CONV
+    xb_vecNx16U outScaleDataEven, outScaleDataOdd;
+    /*Load output scale values*/
+    VQ_INIT_OUTSCALE(pOutScaleData, remainingOutCh, outScaleDataEven, outScaleDataOdd);
+#endif
+#ifdef __XCC__
+#pragma loop_count min=1
+#endif
+    for (y = 0; y < outH; y += 2) /* Image Height */
+    { /* walk down the rows */
+      /* Variable to handle corner case when height is odd: numY is 1 when a
+       * second output row exists in this 2x2 block and 0 on the last row of
+       * an odd-height tile. It is used as a multiplier on offsets and store
+       * sizes below. */
+      int32_t numY = XT_MIN(1, outH - y - 1);
+
+#ifdef __XCC__
+#pragma loop_count min=1
+#endif
+      for (x = 0; x < outW; x += 2) /* Image Width */
+      { /* walk across the columns */
+        /* Variable to handle corner case when width is odd: numX is 1 when a
+         * second output column exists in this 2x2 block, else 0. */
+        int32_t numX = XT_MIN(1, outW - x - 1);
+
+        /* Output Data pointer */
+        int8_t *pOut = pOutData + (x * outDataPitch1 + y * outDataPitch2) * bytesPerPixel;
+
+        /* Initialize accumulators with bias values. Accumulators 1..4 belong
+         * to output pixels (x, y), (x+1, y), (x, y+1) and (x+1, y+1). */
+        xb_vec2Nx24 daccSum1, daccSum2, daccSum3, daccSum4;
+        phvecBias = (xb_vecN_2x32v *) (pBiasData + outCh);
+        ACC_INIT_BIAS(phvecBias, remainingOutCh, daccSum1, daccSum2, daccSum3, daccSum4);
+
+        /* Input Data and Coeff Data Pointers */
+        int8_t *pData = pInData + x * strideX * inDataPitch1 + y * strideY * inDataPitch2;
+        int8_t *pCoeff = pCoeffData + outCh;
+
+#ifdef __XCC__
+#pragma loop_count min=1
+#endif
+        for (ky = 0; ky < kHeightU; ky++) /* Kernel Height */
+        {
+          /* Pointers for Input Data Loads, one per pixel of the 2x2 block.
+           * When a neighbour does not exist (numX or numY is 0) its offset
+           * collapses to zero, so the pointer aliases pixel 1 and stays
+           * inside valid input; that result is never stored. */
+          pdvecData1 = (xb_vec2Nx8 *) (pData + ky * inDataPitch2);
+          pdvecData2 = (xb_vec2Nx8 *) (pData + ky * inDataPitch2 + strideX * inDataPitch1 * numX);
+          pdvecData3 = (xb_vec2Nx8 *) (pData + ky * inDataPitch2 + strideY * inDataPitch2 * numY);
+          pdvecData4 = (xb_vec2Nx8 *) (pData + ky * inDataPitch2 + (strideX * inDataPitch1 + strideY * inDataPitch2) * numX * numY);
+
+          /* Pointer for Coefficient Load */
+          pdvecCoeff = (xb_vec2Nx8 *) (pCoeff + ky * coeffPitch3);
+
+          /* Primes for Aligning Load */
+          valign vaData1 = IVP_LA2NX8_PP(pdvecData1);
+          valign vaData2 = IVP_LA2NX8_PP(pdvecData2);
+          valign vaData3 = IVP_LA2NX8_PP(pdvecData3);
+          valign vaData4 = IVP_LA2NX8_PP(pdvecData4);
+
+#ifdef __XCC__
+#pragma loop_count min=1
+#endif
+          for (k = 0; k < numIter; k += 4) /* (Input Channels * kWidth) loops combined; no tail
+                                            * iteration, so numIter must be a multiple of 4 */
+          {
+            /* Aligning variable vector load of pixels (4 bytes requested per
+             * pixel per step) */
+            xb_vec2Nx8 dvecData1; IVP_LAV2NX8_XP(dvecData1, vaData1, pdvecData1, 4);
+            xb_vec2Nx8 dvecData2; IVP_LAV2NX8_XP(dvecData2, vaData2, pdvecData2, 4);
+            xb_vec2Nx8 dvecData3; IVP_LAV2NX8_XP(dvecData3, vaData3, pdvecData3, 4);
+            xb_vec2Nx8 dvecData4; IVP_LAV2NX8_XP(dvecData4, vaData4, pdvecData4, 4);
+
+            /* Extracting first 4 bytes of vector into a 32-bit scalar */
+            /* Scalar integers to be used for QMUL */
+            int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                 (IVP_MOVNX16_FROM2NX8(dvecData1)), 0);
+            int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                 (IVP_MOVNX16_FROM2NX8(dvecData2)), 0);
+            int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                 (IVP_MOVNX16_FROM2NX8(dvecData3)), 0);
+            int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                 (IVP_MOVNX16_FROM2NX8(dvecData4)), 0);
+
+            /* Aligned Vector Loads of coefficients: four vectors, one per
+             * input byte of this step, spaced coeffPitch1 apart */
+            xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1);
+            xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1);
+            xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1);
+            xb_vec2Nx8 dvecCoeff4; IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch1);
+
+            IVP_MULQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1);
+            IVP_MULQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2);
+            IVP_MULQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3);
+            IVP_MULQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4);
+          } /* End Input Channels */
+        } /* End Kernel Height */
+
+        /* Pack, Output Scale, Output Shift and clamping. The L vectors are
+         * stored first; the H vectors are stored only for S16 output (see
+         * typeFlag). */
+        xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L;
+        xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H;
+#ifdef DILATED_VQ_CONV
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+#else
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+#endif
+        /* Store the output dvecOut1 along the output depth. The low store is
+         * given bytesPerPixel * remainingOutCh bytes; the high store gets a
+         * non-zero count only for S16 output.
+         * NOTE(review): the high count is negative when remainingOutCh is
+         * below XCHAL_IVPN_SIMD_WIDTH; presumably the variable-length store
+         * treats that as zero bytes - confirm in the ISA reference. */
+        pdvecOut = (xb_vec2Nx8 *) (pOut + outCh * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh);
+        IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH));
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Store the output dvecOut2 along the output depth. Address offset
+         * and byte counts are multiplied by numX, so with no second column
+         * the pointer falls back to pOut and zero bytes are requested. */
+        pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1) * bytesPerPixel * numX);
+        IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numX);
+        IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numX);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Store the output dvecOut3 along the output depth (second row;
+         * gated by numY in the same way) */
+        pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch2) * bytesPerPixel * numY);
+        IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numY);
+        IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numY);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Store the output dvecOut4 along the output depth (diagonal pixel;
+         * needs both a second column and a second row, hence numX * numY) */
+        pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1 + outDataPitch2) * bytesPerPixel * numX * numY);
+        IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numX * numY);
+        IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numX * numY);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+      } /* End image width */
+    } /* End image height */
+  } /* End Output Channels */
+}
+
+/****************************************************************************/
+/* Description : P6 optimized implementation of 3D convolution */
+/* Inputs : Input Data Tile, Coeff Data Tile, Bias Array, */
+/* CNN convolution params structure */ +/* InOuts : Output Tile */ +/* Assumptions : InData, CoeffData are S8 */ +/* biasArray is signed 32b, value not exceeding signed 24b */ +/* OutData is S8 / U8 / S16 */ +/* Kernel Size is MxNxDxNk. M and N sizes are less than or */ +/* equal to 15. */ +/* Input and Output are in DWH format */ +/* Coeff is in NDWH format */ +/* CoeffDim1Pitch is aligned to 2N (Ca2) */ +/* Edges along Depth dimension in inTile and coeffTile */ +/* are zero. */ +/****************************************************************************/ + +#ifdef DILATED_VQ_CONV +static _XAI_INLINE_ void convolvedVQ3D_S_MxN_S8S8IXCa2_MOD_DWH_contiguous_depth( + const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param + ) +#else +static _XAI_INLINE_ void convolved3D_S_MxN_S8S8IXCa2_MOD_DWH_contiguous_depth( + const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param + ) +#endif +{ + /* Getting parameters from the tile structures */ + const int32_t outW = XAI_TILE3D_GET_DIM2(outTile); + const int32_t outH = XAI_TILE3D_GET_DIM3(outTile); + const int32_t numInCh = XAI_TILE3D_GET_DIM1(inTile); + const int32_t numOutCh = XAI_TILE3D_GET_DIM1(outTile); + + /* Kernel Size (NDWH) */ + const int32_t kWidthU = XAI_TILE4D_GET_DIM3(coeffTile); + const int32_t kHeightU = XAI_TILE4D_GET_DIM4(coeffTile); + + /* CNN convolution parameters */ + const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param); + const uint8_t outShiftU = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param); + const uint8_t enableReLu = XAI_CNN_CONV_GET_FLAG_RELU(param); + const uint8_t strideX = XAI_CNN_CONV_GET_STRIDEX(param); + const uint8_t strideY = XAI_CNN_CONV_GET_STRIDEY(param); + const uint8_t leftEdgeFlag = XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param); + const uint8_t topEdgeFlag = 
XAI_CNN_CONV_GET_FLAG_TOPEDGE(param); + + /* Data Pointers of input, output, coefficient and bias data */ + int8_t *pInData = (int8_t *) XAI_TILE3D_GET_DATA_PTR(inTile); + int8_t *pOutData = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile); + int8_t *pCoeffData = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile); + int32_t *pBiasData = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray); +#ifdef DILATED_VQ_CONV + xb_vecNx16U* restrict pOutScaleData = (xb_vecNx16U *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray); +#else + const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param); +#endif + /* Pitches of Coefficient Data (NDWH) in dim1, dim2 and dim3 */ + const int32_t coeffPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTile); + const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile); + + /* Pitches of Input Data (DWH) in dim1 and dim2 */ + const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile); + const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile); + + /* Pitch of Output Data (DWH) in dim1 and dim2 */ + const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile); + const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile); + + int32_t numIter = kWidthU * numInCh; + + int32_t leftEdge, topEdge; + if ((kWidthU % 2) != 0) + { + leftEdge = kWidthU / 2; + } + else + { + leftEdge = leftEdgeFlag ? (kWidthU / 2) : ((kWidthU / 2) - 1); + } + + if ((kHeightU % 2) != 0) + { + topEdge = kHeightU / 2; + } + else + { + topEdge = topEdgeFlag ? (kHeightU / 2) : ((kHeightU / 2) - 1); + } + + /* Move pointer to the start of the data (including edge) */ + pInData = &pInData[-((leftEdge) * inDataPitch1 + (topEdge) * inDataPitch2)]; + + /* Setting the limits for output data according to ReLu Flag and outTileType */ + int32_t minLim, maxLim; + if (enableReLu) + { + minLim = XAI_CNN_CONV_GET_RELU_MIN(param); + maxLim = XAI_CNN_CONV_GET_RELU_MAX(param); + } + else + { + minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? 
\ + SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0); + maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \ + : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX); + } + const int8_t typeFlag = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0; + const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile); + + /* Variable Declarations */ + int32_t outCh, x, y, ky, k; + valign vaOutData = IVP_ZALIGN(); + + xb_vecN_2x32v* restrict phvecBias; + xb_vec2Nx8* restrict pdvecCoeff; + xb_vec2Nx8* restrict pdvecData1; + xb_vec2Nx8* restrict pdvecData2; + xb_vec2Nx8* restrict pdvecData3; + xb_vec2Nx8* restrict pdvecData4; + xb_vec2Nx8* restrict pdvecOut; + + /* + * inCh and kWidth loops are combined. Assumed that the + * edges along Depth dimension of input data is zero and also + * edges along depth dimension of coefficient data is zero. + */ + + /* Loops Start */ + for (outCh = 0; outCh < numOutCh; outCh += 2 * XCHAL_IVPN_SIMD_WIDTH) + { /* walk across the kernels */ + /* To handle corner case when number of output channels + * is not a multiple of 2 * XCHAL_IVPN_SIMD_WIDTH*/ + int32_t remainingOutCh = numOutCh - outCh; +#ifdef DILATED_VQ_CONV + xb_vecNx16U outScaleDataEven, outScaleDataOdd; + /*Load output scale values*/ + VQ_INIT_OUTSCALE(pOutScaleData, remainingOutCh, outScaleDataEven, outScaleDataOdd); +#endif + for (y = 0; y < outH; y += 2) /* Image Height */ + { /* walk down the rows */ + /* Variable to handle corner case when height is odd */ + int32_t numY = XT_MIN(1, outH - y - 1); + + for (x = 0; x < outW; x += 2) /* Image Width */ + { /* walk across the columns */ + /* Variable to handle corner case when width is odd */ + int32_t numX = XT_MIN(1, outW - x - 1); + + /* Output Data pointer */ + int8_t *pOut = pOutData + (x * outDataPitch1 + y * outDataPitch2) * bytesPerPixel; + + /* Initialize accumulators with bias values */ + xb_vec2Nx24 daccSum1, daccSum2, daccSum3, daccSum4; + phvecBias = (xb_vecN_2x32v *) 
(pBiasData + outCh); + ACC_INIT_BIAS(phvecBias, remainingOutCh, daccSum1, daccSum2, daccSum3, daccSum4); + + /* Input Data and Coeff Data Pointers */ + int8_t *pData = pInData + x * strideX * inDataPitch1 + y * strideY * inDataPitch2; + int8_t *pCoeff = pCoeffData + outCh; + +#ifdef __XCC__ +#pragma loop_count min=1 +#endif + for (ky = 0; ky < kHeightU; ky++) /* Kernel Height */ + { + /* Pointers for Input Data Loads */ + pdvecData1 = (xb_vec2Nx8 *) (pData + ky * inDataPitch2); + pdvecData2 = (xb_vec2Nx8 *) (pData + ky * inDataPitch2 + strideX * inDataPitch1 * numX); + pdvecData3 = (xb_vec2Nx8 *) (pData + ky * inDataPitch2 + strideY * inDataPitch2 * numY); + pdvecData4 = (xb_vec2Nx8 *) (pData + ky * inDataPitch2 + (strideX * inDataPitch1 + strideY * inDataPitch2) * numX * numY); + + /* Pointer for Coefficient Load */ + pdvecCoeff = (xb_vec2Nx8 *) (pCoeff + ky * coeffPitch3); + + /* Primes for Aligning Load */ + valign vaData1 = IVP_LA2NX8_PP(pdvecData1); + valign vaData2 = IVP_LA2NX8_PP(pdvecData2); + valign vaData3 = IVP_LA2NX8_PP(pdvecData3); + valign vaData4 = IVP_LA2NX8_PP(pdvecData4); + + for (k = 0; k < numIter - 3; k += 4) /* (Input Channels * kWidth) loops combined */ + { + /* Aligning variable vector load of pixels */ + xb_vec2Nx8 dvecData1; IVP_LAV2NX8_XP(dvecData1, vaData1, pdvecData1, 4); + xb_vec2Nx8 dvecData2; IVP_LAV2NX8_XP(dvecData2, vaData2, pdvecData2, 4); + xb_vec2Nx8 dvecData3; IVP_LAV2NX8_XP(dvecData3, vaData3, pdvecData3, 4); + xb_vec2Nx8 dvecData4; IVP_LAV2NX8_XP(dvecData4, vaData4, pdvecData4, 4); + + /* Extracting first 4 bytes of vector into address register */ + /* Scalar integers to be used for QMUL */ + int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData1)), 0); + int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData2)), 0); + int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData3)), 0); + int32_t qmulScalar4 = 
IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData4)), 0); + + /* Aligned Vector Loads of coefficients */ + xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1); + xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1); + xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1); + xb_vec2Nx8 dvecCoeff4; IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch1); + + + IVP_MULQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1); + IVP_MULQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2); + IVP_MULQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3); + IVP_MULQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4); + } /* End Input Channels */ + + /* Corner case handling as numIter is not a multiple of 4 */ + { + int32_t remInCh = numIter - k; + + /* Aligning variable vector load of pixels */ + xb_vec2Nx8 dvecData1; IVP_LAV2NX8_XP(dvecData1, vaData1, pdvecData1, remInCh); + xb_vec2Nx8 dvecData2; IVP_LAV2NX8_XP(dvecData2, vaData2, pdvecData2, remInCh); + xb_vec2Nx8 dvecData3; IVP_LAV2NX8_XP(dvecData3, vaData3, pdvecData3, remInCh); + xb_vec2Nx8 dvecData4; IVP_LAV2NX8_XP(dvecData4, vaData4, pdvecData4, remInCh); + + /* Extracting first 4 bytes of vector into address register */ + /* Scalar integers to be used for QMUL */ + int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData1)), 0); + int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData2)), 0); + int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData3)), 0); + int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData4)), 0); + /* For conditional coefficient loads */ + int32_t enable2 = XT_SALT(1, remInCh); /* Will be 1 if remInCh > 1 */ + int32_t enable3 = XT_SALT(2, remInCh); /* Will 
be 1 if remInCh > 2 */ + + /* Aligned Vector Loads of coefficients */ + xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1 * enable2); + xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1 * enable3); + xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1); + + + IVP_MULQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1); + IVP_MULQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2); + IVP_MULQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3); + IVP_MULQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4); + } /* End Input Channels */ + } /* End Kernel Height * Width */ + + /* Pack, Output Scale, Output Shift and clamping */ + xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L; + xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H; +#ifdef DILATED_VQ_CONV + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); +#else + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, 
daccSum4, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); +#endif + /* Store the output dvecOut1 along the output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOut + outCh * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh); + IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Store the output dvecOut2 along the output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1 * numX) * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numX); + IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numX); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Store the output dvecOut3 along the output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch2 * numY) * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numY); + IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numY); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Store the output dvecOut4 along the output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1 * numX + outDataPitch2 * numY) * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * \ + remainingOutCh * numX * numY); + IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numX * numY); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + } /* End image width */ + } /* End image height */ + } /* End Output Channels */ +} + +/****************************************************************************************** +* MOD DWH variants +******************************************************************************************/ + 
+/****************************************************************************/
+/* Description : P6 optimized implementation of 3D convolution for handling */
+/*               cases where kwidth * numInch is a multiple of 4.
+ *               Each pass of the x/y loops accumulates a 2x2 patch of
+ *               output pixels; output channels are walked in steps of
+ *               2 * XCHAL_IVPN_SIMD_WIDTH. The combined (kWidth * numInCh)
+ *               loop consumes 4 input bytes per step and has no remainder
+ *               handling, so the multiple-of-4 condition is a precondition
+ *               the caller must guarantee.                                 */
+/* Inputs      : Input Data Tile, Coeff Data Tile, Bias Array,              */
+/*               Output scale array (DILATED_VQ_CONV builds only),
+ *               CNN convolution params structure                           */
+/* InOuts      : Output Tile                                                */
+/* Assumptions : InData is U8, CoeffData is S8                              */
+/*               biasArray is signed 32b, value not exceeding signed 24b    */
+/*               Output scale array is U16                                  */
+/*               OutData is S8 / U8 / S16                                   */
+/*               Kernel Size is MxNxDxNk. M and N sizes are less than or    */
+/*               equal to 15.                                               */
+/*               Input and Output are in DWH format                         */
+/*               Coeff is in NDWH format                                    */
+/*               CoeffDim1Pitch is aligned to 2N (Ca2)                      */
+/*               Edges along Depth dimension in inTile and coeffTile        */
+/*               are zero.
+ *               The whole definition is compiled only when the
+ *               IVP_MULSUQA2N8XR8 intrinsic macro is defined.              */
+/****************************************************************************/
+#ifdef IVP_MULSUQA2N8XR8
+#ifdef DILATED_VQ_CONV
+static _XAI_INLINE_ void convolvedVQ3D_S_MxN_U8S8IXCa2_MOD_DWH_contiguous_depth_x4(
+  const xai_pTile3D inTile,
+  const xai_pTile4D coeffTile,
+  const xai_pArray biasArray,
+  const xai_pArray outputScaleArray,
+  xai_pTile3D outTile,
+  const xai_cnn_conv_params *param
+  )
+#else
+static _XAI_INLINE_ void convolved3D_S_MxN_U8S8IXCa2_MOD_DWH_contiguous_depth_x4(
+  const xai_pTile3D inTile,
+  const xai_pTile4D coeffTile,
+  const xai_pArray biasArray,
+  xai_pTile3D outTile,
+  const xai_cnn_conv_params *param
+  )
+#endif
+{
+  /* Getting parameters from the tile structures.
+   * As read here: dim1 = channels (depth), dim2 = width, dim3 = height. */
+  const int32_t outW     = XAI_TILE3D_GET_DIM2(outTile);
+  const int32_t outH     = XAI_TILE3D_GET_DIM3(outTile);
+  const int32_t numInCh  = XAI_TILE3D_GET_DIM1(inTile);
+  const int32_t numOutCh = XAI_TILE3D_GET_DIM1(outTile);
+
+  /* Kernel Size (NDWH): dim3 = kernel width, dim4 = kernel height */
+  const int32_t kWidthU  = XAI_TILE4D_GET_DIM3(coeffTile);
+  const int32_t kHeightU = XAI_TILE4D_GET_DIM4(coeffTile);
+
+  /* CNN convolution parameters */
+  const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param);
+  const uint8_t outShiftU     = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param);
+  const uint8_t enableReLu    = XAI_CNN_CONV_GET_FLAG_RELU(param);
+  const uint8_t strideX       = XAI_CNN_CONV_GET_STRIDEX(param);
+  const uint8_t strideY       = XAI_CNN_CONV_GET_STRIDEY(param);
+  const uint8_t leftEdgeFlag  = XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param);
+  const uint8_t topEdgeFlag   = XAI_CNN_CONV_GET_FLAG_TOPEDGE(param);
+
+  /* Data Pointers of input, output, coefficient and bias data.
+   * pOutData is kept as a byte pointer for every output type; the element
+   * size is applied explicitly through bytesPerPixel further down. */
+  uint8_t *pInData   = (uint8_t *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  int8_t *pOutData   = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  int8_t *pCoeffData = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile);
+  int32_t *pBiasData = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray);
+#ifdef DILATED_VQ_CONV
+  xb_vecNx16U* restrict pOutScaleData = (xb_vecNx16U *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray);
+#else
+  const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param);
+#endif
+  /* Pitches of Coefficient Data (NDWH) in dim1 and dim3.
+   * Only these two pitches are read: dim1 steps through the combined
+   * (kWidth * numInCh) loop, dim3 steps between kernel rows. */
+  const int32_t coeffPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTile);
+  const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile);
+
+  /* Pitches of Input Data (DWH) in dim1 and dim2 */
+  const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile);
+  const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile);
+
+  /* Pitch of Output Data (DWH) in dim1 and dim2 */
+  const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+
+  int32_t leftEdge, topEdge; /* how far the input window starts left of / above the tile data pointer */
+
+  if ((kWidthU % 2) != 0)
+  {
+    leftEdge = kWidthU / 2; /* odd kernel width: symmetric edge */
+  }
+  else
+  {
+    leftEdge = leftEdgeFlag ? (kWidthU / 2) : ((kWidthU / 2) - 1); /* even width: the flag picks which side gets the larger edge */
+  }
+
+  if ((kHeightU % 2) != 0)
+  {
+    topEdge = kHeightU / 2; /* odd kernel height: symmetric edge */
+  }
+  else
+  {
+    topEdge = topEdgeFlag ? (kHeightU / 2) : ((kHeightU / 2) - 1); /* even height: same rule as for the width */
+  }
+
+  /* Move pointer to the start of the data (including edge):
+   * back by leftEdge steps of inDataPitch1 and topEdge steps of inDataPitch2 */
+  pInData = &pInData[-((leftEdge) * inDataPitch1 + (topEdge) * inDataPitch2)];
+
+
+  /* Setting the limits for output data according to ReLu Flag and outTileType.
+   * With ReLU the limits come from the params structure; otherwise they are
+   * the natural range of the output type (S16, S8, or 0..UCHAR_MAX). */
+  int32_t minLim, maxLim;
+  if (enableReLu)
+  {
+    minLim = XAI_CNN_CONV_GET_RELU_MIN(param);
+    maxLim = XAI_CNN_CONV_GET_RELU_MAX(param);
+  }
+  else
+  {
+    minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \
+             SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0);
+    maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \
+             : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX);
+  }
+  const int8_t typeFlag       = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0; /* 1 only for S16 output; gates the high-half stores below */
+  const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile);
+
+  /* Variable Declarations */
+  int32_t outCh, x, y, ky, k;
+  valign vaOutData = IVP_ZALIGN();
+  int32_t numIter  = kWidthU * numInCh; /* trip count (in bytes) of the combined kWidth x numInCh loop; must be a multiple of 4 */
+
+  xb_vecN_2x32v* restrict phvecBias;
+  xb_vec2Nx8* restrict pdvecCoeff;
+  xb_vec2Nx8U* restrict pdvecData1;
+  xb_vec2Nx8U* restrict pdvecData2;
+  xb_vec2Nx8U* restrict pdvecData3;
+  xb_vec2Nx8U* restrict pdvecData4;
+  xb_vec2Nx8* restrict pdvecOut;
+
+  /*
+   * inCh and kWidth loops are combined. Assumed that the
+   * edges along Depth dimension of input data is zero and also
+   * edges along depth dimension of coefficient data is zero.
+   */
+
+  /* Loops Start */
+  for (outCh = 0; outCh < numOutCh; outCh += 2 * XCHAL_IVPN_SIMD_WIDTH)
+  { /* walk across the kernels */
+    /* To handle corner case when number of output channels
+     * is not a multiple of  2 * XCHAL_IVPN_SIMD_WIDTH*/
+    int32_t remainingOutCh = numOutCh - outCh;
+#ifdef DILATED_VQ_CONV
+    xb_vecNx16U outScaleDataEven, outScaleDataOdd;
+    /*Load output scale values*/
+    VQ_INIT_OUTSCALE(pOutScaleData, remainingOutCh, outScaleDataEven, outScaleDataOdd);
+#endif
+#ifdef __XCC__
+#pragma loop_count min=1
+#endif
+    for (y = 0; y < outH; y += 2) /* Image Height */
+    { /* walk down the rows */
+      /* Variable to handle corner case when height is odd:
+       * numY is 1 while a second output row exists and 0 on the last row of
+       * an odd-height tile (assuming XT_MIN returns the smaller argument). */
+      int32_t numY = XT_MIN(1, outH - y - 1);
+
+#ifdef __XCC__
+#pragma loop_count min=1
+#endif
+      for (x = 0; x < outW; x += 2) /* Image Width */
+      { /* walk across the columns */
+        /* Variable to handle corner case when width is odd
+         * (same 1 / 0 convention as numY) */
+        int32_t numX = XT_MIN(1, outW - x - 1);
+
+        /* Output Data pointer (byte address of output pixel (x, y), channel 0) */
+        int8_t *pOut = pOutData + (x * outDataPitch1 + y * outDataPitch2) * bytesPerPixel;
+
+        /* Initialize accumulators with bias values.
+         * One accumulator per pixel of the 2x2 output patch:
+         * 1 = (x, y), 2 = (x+1, y), 3 = (x, y+1), 4 = (x+1, y+1). */
+        xb_vec2Nx24 daccSum1, daccSum2, daccSum3, daccSum4;
+        phvecBias = (xb_vecN_2x32v *) (pBiasData + outCh);
+        ACC_INIT_BIAS(phvecBias, remainingOutCh, daccSum1, daccSum2, daccSum3, daccSum4);
+
+        /* Input Data and Coeff Data Pointers */
+        uint8_t *pData = ((uint8_t *) pInData + x * strideX * inDataPitch1 + y * strideY * inDataPitch2);
+        int8_t *pCoeff = pCoeffData + outCh;
+
+#ifdef __XCC__
+#pragma loop_count min=1
+#endif
+        for (ky = 0; ky < kHeightU; ky++) /* Kernel Height */
+        {
+          /* Pointers for Input Data Loads.
+           * When numX and/or numY is 0 the corresponding offset is multiplied
+           * away, so pixels 2-4 re-read an address that is already in use
+           * instead of reading past the tile; their results are never stored
+           * with a non-zero length in that case (see the stores below). */
+          pdvecData1 = (xb_vec2Nx8U *) (pData + ky * inDataPitch2);
+          pdvecData2 = (xb_vec2Nx8U *) (pData + ky * inDataPitch2 + strideX * inDataPitch1 * numX);
+          pdvecData3 = (xb_vec2Nx8U *) (pData + ky * inDataPitch2 + strideY * inDataPitch2 * numY);
+          pdvecData4 = (xb_vec2Nx8U *) (pData + ky * inDataPitch2 + (strideX * inDataPitch1 + strideY * inDataPitch2) * numX * numY);
+
+          /* Pointer for Coefficient Load (kernel row ky, current output-channel group) */
+          pdvecCoeff = (xb_vec2Nx8 *) (pCoeff + ky * coeffPitch3);
+
+          /* Primes for Aligning Load */
+          valign vaData1 = IVP_LA2NX8U_PP(pdvecData1);
+          valign vaData2 = IVP_LA2NX8U_PP(pdvecData2);
+          valign vaData3 = IVP_LA2NX8U_PP(pdvecData3);
+          valign vaData4 = IVP_LA2NX8U_PP(pdvecData4);
+
+#ifdef __XCC__
+#pragma loop_count min=1
+#endif
+          for (k = 0; k < numIter; k += 4) /* (Input Channels * kWidth) loops combined */
+          {
+            /* Aligning variable vector load of pixels.
+             * The length argument is the constant 4, which is why numIter
+             * must be a multiple of 4: there is no tail iteration here. */
+            xb_vec2Nx8U dvecData1; IVP_LAV2NX8U_XP(dvecData1, vaData1, pdvecData1, 4);
+            xb_vec2Nx8U dvecData2; IVP_LAV2NX8U_XP(dvecData2, vaData2, pdvecData2, 4);
+            xb_vec2Nx8U dvecData3; IVP_LAV2NX8U_XP(dvecData3, vaData3, pdvecData3, 4);
+            xb_vec2Nx8U dvecData4; IVP_LAV2NX8U_XP(dvecData4, vaData4, pdvecData4, 4);
+
+            /* Extracting first 4 bytes of vector into address register */
+            /* Scalar integers to be used for QMUL                      */
+            int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8U(dvecData1)), 0);
+            int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8U(dvecData2)), 0);
+            int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8U(dvecData3)), 0);
+            int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                   (IVP_MOVNX16_FROM2NX8U(dvecData4)), 0);
+
+            /* Aligned Vector Loads of coefficients:
+             * four consecutive coefficient vectors, coeffPitch1 bytes apart,
+             * one per input byte packed in the qmulScalar values */
+            xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1);
+            xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1);
+            xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1);
+            xb_vec2Nx8 dvecCoeff4; IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch1);
+
+#ifdef IVP_MULSUQA2N8XR8
+            IVP_MULSUQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1);
+            IVP_MULSUQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2);
+            IVP_MULSUQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3);
+            IVP_MULSUQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4);
+#else /* NOTE(review): unreachable - the enclosing function is itself wrapped in #ifdef IVP_MULSUQA2N8XR8, so this fallback is never compiled */
+            xb_vec2Nx8U dvecS1 = IVP_MOV2NX8U_FROMNX16(IVP_MOVNX16_FROMN_2X32(IVP_MOVVA32(qmulScalar1)));
+            xb_vec2Nx8U dvecS2 = IVP_MOV2NX8U_FROMNX16(IVP_MOVNX16_FROMN_2X32(IVP_MOVVA32(qmulScalar2)));
+            xb_vec2Nx8U dvecS3 = IVP_MOV2NX8U_FROMNX16(IVP_MOVNX16_FROMN_2X32(IVP_MOVVA32(qmulScalar3)));
+            xb_vec2Nx8U dvecS4 = IVP_MOV2NX8U_FROMNX16(IVP_MOVNX16_FROMN_2X32(IVP_MOVVA32(qmulScalar4)));
+
+            IVP_MULUSPA2NX8(daccSum1, IVP_REP2NX8U(dvecS1, 0), dvecCoeff1, IVP_REP2NX8U(dvecS1, 1), dvecCoeff2);
+            IVP_MULUSPA2NX8(daccSum1, IVP_REP2NX8U(dvecS1, 2), dvecCoeff3, IVP_REP2NX8U(dvecS1, 3), dvecCoeff4);
+            IVP_MULUSPA2NX8(daccSum2, IVP_REP2NX8U(dvecS2, 0), dvecCoeff1, IVP_REP2NX8U(dvecS2, 1), dvecCoeff2);
+            IVP_MULUSPA2NX8(daccSum2, IVP_REP2NX8U(dvecS2, 2), dvecCoeff3, IVP_REP2NX8U(dvecS2, 3), dvecCoeff4);
+            IVP_MULUSPA2NX8(daccSum3, IVP_REP2NX8U(dvecS3, 0), dvecCoeff1, IVP_REP2NX8U(dvecS3, 1), dvecCoeff2);
+            IVP_MULUSPA2NX8(daccSum3, IVP_REP2NX8U(dvecS3, 2), dvecCoeff3, IVP_REP2NX8U(dvecS3, 3), dvecCoeff4);
+            IVP_MULUSPA2NX8(daccSum4, IVP_REP2NX8U(dvecS4, 0), dvecCoeff1, IVP_REP2NX8U(dvecS4, 1), dvecCoeff2);
+            IVP_MULUSPA2NX8(daccSum4, IVP_REP2NX8U(dvecS4, 2), dvecCoeff3, IVP_REP2NX8U(dvecS4, 3), dvecCoeff4);
+#endif
+          } /* End Input Channels */
+        } /* End Kernel Height * Width */
+
+        /* Pack, Output Scale, Output Shift and clamping.
+         * Each accumulator yields a low (L) and a high (H) output vector;
+         * the VQ build uses the per-channel scale vectors loaded above, the
+         * non-VQ build uses the single outScale from the params structure. */
+        xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L;
+        xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H;
+#ifdef DILATED_VQ_CONV
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+#else
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+#endif
+        /* Store the output dvecOut1 along the output depth.
+         * The low half is stored with a byte count of
+         * bytesPerPixel * remainingOutCh; the high-half count is scaled by
+         * typeFlag, so it is 0 unless the output is S16.
+         * NOTE(review): both counts can exceed one vector, and the high-half
+         * count can be negative when remainingOutCh < XCHAL_IVPN_SIMD_WIDTH;
+         * presumably the variable-length store clamps its count - confirm
+         * against the ISA reference. */
+        pdvecOut = (xb_vec2Nx8 *) (pOut + outCh * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh);
+        IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH));
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Store the output dvecOut2 along the output depth.
+         * NOTE(review): numX multiplies the whole offset here, including
+         * outCh, whereas the MxN kernel earlier in this file uses
+         * (outCh + outDataPitch1 * numX) * bytesPerPixel. The two forms agree
+         * when numX == 1; when numX == 0 this form addresses pOut itself, but
+         * the byte counts passed to the stores are also 0. Presumably a
+         * zero-length store writes nothing - confirm before unifying. */
+        pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1) * bytesPerPixel * numX);
+        IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numX);
+        IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numX);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Store the output dvecOut3 along the output depth
+         * (same addressing pattern as dvecOut2, gated by numY) */
+        pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch2) * bytesPerPixel * numY);
+        IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numY);
+        IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numY);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Store the output dvecOut4 along the output depth
+         * (diagonal pixel: gated by numX * numY, so it is written only when
+         * both a second column and a second row exist) */
+        pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1 + outDataPitch2) * bytesPerPixel * numX * numY);
+        IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numX * numY);
+        IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numX * numY);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+      } /* End image width */
+    } /* End image height */
+  } /* End Output Channels */
+}
+#endif /* IVP_MULSUQA2N8XR8 */
+
+/****************************************************************************/
+/* Description : P6 optimized implementation of 3D convolution              */
+/* Inputs      : Input Data Tile, Coeff Data Tile, Bias Array,              */
+/*               CNN convolution params structure                           */
+/* InOuts      : Output Tile                                                */
+/* Assumptions : InData is U8, CoeffData is S8                              */
+/*               biasArray is signed 32b, value not exceeding signed 24b    */
+/*               OutData is S8 / U8 / S16                                   */
+/*               Kernel Size is MxNxDxNk. M and N sizes are less than or    */
+/*               equal to 15.                                               */
+/*               Input and Output are in DWH format                         */
+/*               Coeff is in NDWH format                                    */
+/*               CoeffDim1Pitch is aligned to 2N (Ca2)                      */
+/*               Edges along Depth dimension in inTile and coeffTile        */
+/*               are zero.
*/ +/****************************************************************************/ +#ifdef IVP_MULSUQA2N8XR8 +#ifdef DILATED_VQ_CONV +static _XAI_INLINE_ void convolvedVQ3D_S_MxN_U8S8IXCa2_MOD_DWH_contiguous_depth( + const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param + ) +#else +static _XAI_INLINE_ void convolved3D_S_MxN_U8S8IXCa2_MOD_DWH_contiguous_depth( + const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param + ) +#endif +{ + /* Getting parameters from the tile structures */ + const int32_t outW = XAI_TILE3D_GET_DIM2(outTile); + const int32_t outH = XAI_TILE3D_GET_DIM3(outTile); + const int32_t numInCh = XAI_TILE3D_GET_DIM1(inTile); + const int32_t numOutCh = XAI_TILE3D_GET_DIM1(outTile); + + /* Kernel Size (NDWH) */ + const int32_t kWidthU = XAI_TILE4D_GET_DIM3(coeffTile); + const int32_t kHeightU = XAI_TILE4D_GET_DIM4(coeffTile); + + /* CNN convolution parameters */ + const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param); + const uint8_t outShiftU = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param); + const uint8_t enableReLu = XAI_CNN_CONV_GET_FLAG_RELU(param); + const uint8_t strideX = XAI_CNN_CONV_GET_STRIDEX(param); + const uint8_t strideY = XAI_CNN_CONV_GET_STRIDEY(param); + const uint8_t leftEdgeFlag = XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param); + const uint8_t topEdgeFlag = XAI_CNN_CONV_GET_FLAG_TOPEDGE(param); + + /* Data Pointers of input, output, coefficient and bias data */ + uint8_t *pInData = (uint8_t *) XAI_TILE3D_GET_DATA_PTR(inTile); + int8_t *pOutData = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile); + int8_t *pCoeffData = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile); + int32_t *pBiasData = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray); +#ifdef DILATED_VQ_CONV + xb_vecNx16U* restrict pOutScaleData = (xb_vecNx16U *) 
XAI_ARRAY_GET_DATA_PTR(outputScaleArray); +#else + const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param); +#endif + /* Pitches of Coefficient Data (NDWH) in dim1, dim2 and dim3 */ + const int32_t coeffPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTile); + const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile); + + /* Pitches of Input Data (DWH) in dim1 and dim2 */ + const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile); + const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile); + + /* Pitch of Output Data (DWH) in dim1 and dim2 */ + const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile); + const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile); + + int32_t numIter = kWidthU * numInCh; + + int32_t leftEdge, topEdge; + if ((kWidthU % 2) != 0) + { + leftEdge = kWidthU / 2; + } + else + { + leftEdge = leftEdgeFlag ? (kWidthU / 2) : ((kWidthU / 2) - 1); + } + + if ((kHeightU % 2) != 0) + { + topEdge = kHeightU / 2; + } + else + { + topEdge = topEdgeFlag ? (kHeightU / 2) : ((kHeightU / 2) - 1); + } + + /* Move pointer to the start of the data (including edge) */ + pInData = &pInData[-((leftEdge) * inDataPitch1 + (topEdge) * inDataPitch2)]; + + /* Setting the limits for output data according to ReLu Flag and outTileType */ + int32_t minLim, maxLim; + if (enableReLu) + { + minLim = XAI_CNN_CONV_GET_RELU_MIN(param); + maxLim = XAI_CNN_CONV_GET_RELU_MAX(param); + } + else + { + minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \ + SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0); + maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \ + : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX); + } + const int8_t typeFlag = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 
1 : 0; + const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile); + + /* Variable Declarations */ + int32_t outCh, x, y, ky, k; + valign vaOutData = IVP_ZALIGN(); + + xb_vecN_2x32v* restrict phvecBias; + xb_vec2Nx8* restrict pdvecCoeff; + xb_vec2Nx8U* restrict pdvecData1; + xb_vec2Nx8U* restrict pdvecData2; + xb_vec2Nx8U* restrict pdvecData3; + xb_vec2Nx8U* restrict pdvecData4; + xb_vec2Nx8* restrict pdvecOut; + + /* + * inCh and kWidth loops are combined. Assumed that the + * edges along Depth dimension of input data is zero and also + * edges along depth dimension of coefficient data is zero. + */ + + /* Loops Start */ + for (outCh = 0; outCh < numOutCh; outCh += 2 * XCHAL_IVPN_SIMD_WIDTH) + { /* walk across the kernels */ + /* To handle corner case when number of output channels + * is not a multiple of 2 * XCHAL_IVPN_SIMD_WIDTH*/ + int32_t remainingOutCh = numOutCh - outCh; +#ifdef DILATED_VQ_CONV + xb_vecNx16U outScaleDataEven, outScaleDataOdd; + /*Load output scale values*/ + VQ_INIT_OUTSCALE(pOutScaleData, remainingOutCh, outScaleDataEven, outScaleDataOdd); +#endif + for (y = 0; y < outH; y += 2) /* Image Height */ + { /* walk down the rows */ + /* Variable to handle corner case when height is odd */ + int32_t numY = XT_MIN(1, outH - y - 1); + + for (x = 0; x < outW; x += 2) /* Image Width */ + { /* walk across the columns */ + /* Variable to handle corner case when width is odd */ + int32_t numX = XT_MIN(1, outW - x - 1); + + /* Output Data pointer */ + int8_t *pOut = pOutData + (x * outDataPitch1 + y * outDataPitch2) * bytesPerPixel; + + /* Initialize accumulators with bias values */ + xb_vec2Nx24 daccSum1, daccSum2, daccSum3, daccSum4; + phvecBias = (xb_vecN_2x32v *) (pBiasData + outCh); + ACC_INIT_BIAS(phvecBias, remainingOutCh, daccSum1, daccSum2, daccSum3, daccSum4); + + /* Input Data and Coeff Data Pointers */ + uint8_t *pData = ((uint8_t *) pInData + x * strideX * inDataPitch1 + y * strideY * inDataPitch2); + int8_t *pCoeff = pCoeffData + 
outCh; + +#ifdef __XCC__ +#pragma loop_count min=1 +#endif + for (ky = 0; ky < kHeightU; ky++) /* Kernel Height */ + { + /* Pointers for Input Data Loads */ + pdvecData1 = (xb_vec2Nx8U *) (pData + ky * inDataPitch2); + pdvecData2 = (xb_vec2Nx8U *) (pData + ky * inDataPitch2 + strideX * inDataPitch1 * numX); + pdvecData3 = (xb_vec2Nx8U *) (pData + ky * inDataPitch2 + strideY * inDataPitch2 * numY); + pdvecData4 = (xb_vec2Nx8U *) (pData + ky * inDataPitch2 + (strideX * inDataPitch1 + strideY * inDataPitch2) * numX * numY); + + /* Pointer for Coefficient Load */ + pdvecCoeff = (xb_vec2Nx8 *) (pCoeff + ky * coeffPitch3); + + /* Primes for Aligning Load */ + valign vaData1 = IVP_LA2NX8U_PP(pdvecData1); + valign vaData2 = IVP_LA2NX8U_PP(pdvecData2); + valign vaData3 = IVP_LA2NX8U_PP(pdvecData3); + valign vaData4 = IVP_LA2NX8U_PP(pdvecData4); + + for (k = 0; k < numIter - 3; k += 4) /* (Input Channels * kWidth) loops combined */ + { + /* Aligning variable vector load of pixels */ + xb_vec2Nx8U dvecData1; IVP_LAV2NX8U_XP(dvecData1, vaData1, pdvecData1, 4); + xb_vec2Nx8U dvecData2; IVP_LAV2NX8U_XP(dvecData2, vaData2, pdvecData2, 4); + xb_vec2Nx8U dvecData3; IVP_LAV2NX8U_XP(dvecData3, vaData3, pdvecData3, 4); + xb_vec2Nx8U dvecData4; IVP_LAV2NX8U_XP(dvecData4, vaData4, pdvecData4, 4); + + /* Extracting first 4 bytes of vector into address register */ + /* Scalar integers to be used for QMUL */ + int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8U(dvecData1)), 0); + int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8U(dvecData2)), 0); + int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8U(dvecData3)), 0); + int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8U(dvecData4)), 0); + + /* Aligned Vector Loads of coefficients */ + xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1); + xb_vec2Nx8 dvecCoeff2; 
IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1); + xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1); + xb_vec2Nx8 dvecCoeff4; IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch1); + + +#ifdef IVP_MULSUQA2N8XR8 + IVP_MULSUQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1); + IVP_MULSUQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2); + IVP_MULSUQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3); + IVP_MULSUQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4); +#else + xb_vec2Nx8U dvecS1 = IVP_MOV2NX8U_FROMNX16(IVP_MOVNX16_FROMN_2X32(IVP_MOVVA32(qmulScalar1))); + xb_vec2Nx8U dvecS2 = IVP_MOV2NX8U_FROMNX16(IVP_MOVNX16_FROMN_2X32(IVP_MOVVA32(qmulScalar2))); + xb_vec2Nx8U dvecS3 = IVP_MOV2NX8U_FROMNX16(IVP_MOVNX16_FROMN_2X32(IVP_MOVVA32(qmulScalar3))); + xb_vec2Nx8U dvecS4 = IVP_MOV2NX8U_FROMNX16(IVP_MOVNX16_FROMN_2X32(IVP_MOVVA32(qmulScalar4))); + + IVP_MULUSPA2NX8(daccSum1, IVP_REP2NX8U(dvecS1, 0), dvecCoeff1, IVP_REP2NX8U(dvecS1, 1), dvecCoeff2); + IVP_MULUSPA2NX8(daccSum1, IVP_REP2NX8U(dvecS1, 2), dvecCoeff3, IVP_REP2NX8U(dvecS1, 3), dvecCoeff4); + IVP_MULUSPA2NX8(daccSum2, IVP_REP2NX8U(dvecS2, 0), dvecCoeff1, IVP_REP2NX8U(dvecS2, 1), dvecCoeff2); + IVP_MULUSPA2NX8(daccSum2, IVP_REP2NX8U(dvecS2, 2), dvecCoeff3, IVP_REP2NX8U(dvecS2, 3), dvecCoeff4); + IVP_MULUSPA2NX8(daccSum3, IVP_REP2NX8U(dvecS3, 0), dvecCoeff1, IVP_REP2NX8U(dvecS3, 1), dvecCoeff2); + IVP_MULUSPA2NX8(daccSum3, IVP_REP2NX8U(dvecS3, 2), dvecCoeff3, IVP_REP2NX8U(dvecS3, 3), dvecCoeff4); + IVP_MULUSPA2NX8(daccSum4, IVP_REP2NX8U(dvecS4, 0), dvecCoeff1, IVP_REP2NX8U(dvecS4, 1), dvecCoeff2); + IVP_MULUSPA2NX8(daccSum4, IVP_REP2NX8U(dvecS4, 2), dvecCoeff3, IVP_REP2NX8U(dvecS4, 3), dvecCoeff4); +#endif + } /* End Input Channels */ + + /* Corner case handling as numIter is not a multiple of 4 */ + { + int32_t remInCh = numIter - k; + + /* Aligning variable vector load of 
pixels */ + xb_vec2Nx8U dvecData1; IVP_LAV2NX8U_XP(dvecData1, vaData1, pdvecData1, remInCh); + xb_vec2Nx8U dvecData2; IVP_LAV2NX8U_XP(dvecData2, vaData2, pdvecData2, remInCh); + xb_vec2Nx8U dvecData3; IVP_LAV2NX8U_XP(dvecData3, vaData3, pdvecData3, remInCh); + xb_vec2Nx8U dvecData4; IVP_LAV2NX8U_XP(dvecData4, vaData4, pdvecData4, remInCh); + + /* Extracting first 4 bytes of vector into address register */ + /* Scalar integers to be used for QMUL */ + int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8U(dvecData1)), 0); + int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8U(dvecData2)), 0); + int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8U(dvecData3)), 0); + int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8U(dvecData4)), 0); + /* For conditional coefficient loads */ + int32_t enable2 = XT_SALT(1, remInCh); /* Will be 1 if remInCh > 1 */ + int32_t enable3 = XT_SALT(2, remInCh); /* Will be 1 if remInCh > 2 */ + + /* Aligned Vector Loads of coefficients */ + xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1 * enable2); + xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1 * enable3); + xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1); + + +#ifdef IVP_MULSUQA2N8XR8 + IVP_MULSUQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1); + IVP_MULSUQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2); + IVP_MULSUQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3); + IVP_MULSUQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4); +#else + xb_vec2Nx8U dvecS1 = IVP_MOV2NX8U_FROMNX16(IVP_MOVNX16_FROMN_2X32(IVP_MOVVA32(qmulScalar1))); + xb_vec2Nx8U dvecS2 = IVP_MOV2NX8U_FROMNX16(IVP_MOVNX16_FROMN_2X32(IVP_MOVVA32(qmulScalar2))); + xb_vec2Nx8U dvecS3 = 
IVP_MOV2NX8U_FROMNX16(IVP_MOVNX16_FROMN_2X32(IVP_MOVVA32(qmulScalar3))); + xb_vec2Nx8U dvecS4 = IVP_MOV2NX8U_FROMNX16(IVP_MOVNX16_FROMN_2X32(IVP_MOVVA32(qmulScalar4))); + + IVP_MULUSPA2NX8(daccSum1, IVP_REP2NX8U(dvecS1, 0), dvecCoeff1, IVP_REP2NX8U(dvecS1, 1), dvecCoeff2); + IVP_MULUSPA2NX8(daccSum1, IVP_REP2NX8U(dvecS1, 2), dvecCoeff3, IVP_REP2NX8U(dvecS1, 3), 0); + IVP_MULUSPA2NX8(daccSum2, IVP_REP2NX8U(dvecS2, 0), dvecCoeff1, IVP_REP2NX8U(dvecS2, 1), dvecCoeff2); + IVP_MULUSPA2NX8(daccSum2, IVP_REP2NX8U(dvecS2, 2), dvecCoeff3, IVP_REP2NX8U(dvecS2, 3), 0); + IVP_MULUSPA2NX8(daccSum3, IVP_REP2NX8U(dvecS3, 0), dvecCoeff1, IVP_REP2NX8U(dvecS3, 1), dvecCoeff2); + IVP_MULUSPA2NX8(daccSum3, IVP_REP2NX8U(dvecS3, 2), dvecCoeff3, IVP_REP2NX8U(dvecS3, 3), 0); + IVP_MULUSPA2NX8(daccSum4, IVP_REP2NX8U(dvecS4, 0), dvecCoeff1, IVP_REP2NX8U(dvecS4, 1), dvecCoeff2); + IVP_MULUSPA2NX8(daccSum4, IVP_REP2NX8U(dvecS4, 2), dvecCoeff3, IVP_REP2NX8U(dvecS4, 3), 0); +#endif + } /* End Input Channels */ + } /* End Kernel Height * Width */ + + /* Pack, Output Scale, Output Shift and clamping */ + xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L; + xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H; +#ifdef DILATED_VQ_CONV + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); +#else + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); 
+ PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); +#endif + /* Store the output dvecOut1 along the output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOut + outCh * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh); + IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Store the output dvecOut2 along the output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1 * numX) * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numX); + IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numX); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Store the output dvecOut3 along the output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch2 * numY) * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numY); + IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numY); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Store the output dvecOut4 along the output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1 * numX + outDataPitch2 * numY) * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * \ + remainingOutCh * numX * numY); + IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numX * numY); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + } /* End image width */ + } 
/* End image height */ + } /* End Output Channels */ +} +#endif + +/***************************************************************************** +* convolvedVQ3D_S_1x1_S8S8IXCa2_MOD_DWH_x4 +* **************************************************************************/ + +/****************************************************************************/ +/* Description : P6 optimized implementation of 3D convolution for handling */ +/* cases where kwidth * numInch is a multiple of 4 (the kernel is 1x1, so input channels are consumed 4 per iteration and there is no remainder pass) */ +/* Inputs : Input Data Tile, Coeff Data Tile, Bias Array, */ +/* Output scale array (DILATED_VQ_CONV builds only), CNN convolution params structure */ +/* InOuts : Output Tile */ +/* Assumptions : InData, CoeffData are S8 */ +/* biasArray is signed 32b, value not exceeding signed 24b */ +/* Output scale array is U16 */ +/* OutData is S8 / U8 / S16 */ +/* Kernel Size is 1x1xDxN */ +/* Input and Output are in DWH format */ +/* Coeff is in NDWH format */ +/* CoeffDim1Pitch is aligned to 2N (Ca2) */ +/****************************************************************************/ + +#ifdef DILATED_VQ_CONV +static _XAI_INLINE_ void convolvedVQ3D_S_1x1_S8S8IXCa2_MOD_DWH_x4( + const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param + ) +#else +static _XAI_INLINE_ void convolved3D_S_1x1_S8S8IXCa2_MOD_DWH_x4( + const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param + ) +#endif +{ + /* Getting parameters from the tile structures: dim1 is the channel count, dim2 and dim3 of the output tile are its width and height */ + const int32_t outW = XAI_TILE3D_GET_DIM2(outTile); + const int32_t outH = XAI_TILE3D_GET_DIM3(outTile); + const int32_t numInCh = XAI_TILE3D_GET_DIM1(inTile); + const int32_t numOutCh = XAI_TILE3D_GET_DIM1(outTile); + + /* CNN convolution parameters */ + const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param); + const uint8_t outShiftU = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param); + 
const uint8_t enableReLu = XAI_CNN_CONV_GET_FLAG_RELU(param); + const uint8_t stride = XAI_CNN_CONV_GET_STRIDE(param); + + /* Data Pointers of input, output, coefficient and bias data */ + int8_t *pInData = (int8_t *) XAI_TILE3D_GET_DATA_PTR(inTile); + int8_t *pOutData = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile); + int8_t *pCoeffData = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile); + int32_t *pBiasData = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray); +#ifdef DILATED_VQ_CONV + xb_vecNx16U* restrict pOutScaleData = (xb_vecNx16U *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray); +#else + const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param); +#endif + /* Pitch of Coefficient Data (NDWH) in dim1; it is the only coefficient pitch this kernel reads */ + const int32_t coeffPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTile); + + /* Pitches of Input Data (DWH) in dim1 and dim2 */ + const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile); + const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile); + + /* Pitch of Output Data (DWH) in dim1 and dim2 */ + const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile); + const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile); + + /* Setting the limits for output data according to ReLu Flag and outTileType: the ReLU min/max from param when ReLU is enabled, otherwise the full range of the output type (S16, S8, else U8) */ + int32_t minLim, maxLim; + if (enableReLu) + { + minLim = XAI_CNN_CONV_GET_RELU_MIN(param); + maxLim = XAI_CNN_CONV_GET_RELU_MAX(param); + } + else + { + minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \ + SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0); + maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \ + : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX); + } + const int8_t typeFlag = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 
1 : 0; + const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile); + + /* Variable Declarations */ + int32_t inCh, outCh, x, y; + + xb_vecN_2x32v* restrict phvecBias; + xb_vec2Nx8* restrict pdvecCoeff; + xb_vec2Nx8* restrict pdvecData1; + xb_vec2Nx8* restrict pdvecData2; + xb_vec2Nx8* restrict pdvecData3; + xb_vec2Nx8* restrict pdvecData4; + xb_vec2Nx8* restrict pdvecOut; + + /* Unrolled by 2 along both Output Width and Height. + * Inner loop unrolled by 4 along the Input number of Channels. + * This _x4 variant has no remainder loop: the channel loop below + * always steps by 4 input channels. + */ + + /* Loops Start */ + for (outCh = 0; outCh < numOutCh; outCh += 2 * XCHAL_IVPN_SIMD_WIDTH) /* Output Channels */ + { /* walk across the kernels */ + /* To handle corner case when number of output channels + * is not a multiple of 2 * XCHAL_IVPN_SIMD_WIDTH*/ + int32_t remainingOutCh = (numOutCh - outCh); +#ifdef DILATED_VQ_CONV + xb_vecNx16U outScaleDataEven, outScaleDataOdd; + /*Load output scale values*/ + VQ_INIT_OUTSCALE(pOutScaleData, remainingOutCh, outScaleDataEven, outScaleDataOdd); +#endif +#ifdef __XCC__ +#pragma loop_count min=1 +#endif + for (y = 0; y < outH; y += 2) /* Image Height */ + { /* walk down the rows */ + /* Corner case Handling if height is odd: numY is 1 while a second row exists and 0 on the last odd row */ + int32_t numY = XT_MIN(1, outH - y - 1); +#ifdef __XCC__ +#pragma loop_count min=1 +#endif + for (x = 0; x < outW; x += 2) /* Image Width */ + { /* walk across the columns */ + /* Corner case Handling if width is odd: numX is 1 while a second column exists and 0 on the last odd column */ + int32_t numX = XT_MIN(1, outW - x - 1); + + /* Initialize accumulators with bias values */ + xb_vec2Nx24 daccSum1, daccSum2, daccSum3, daccSum4; + phvecBias = (xb_vecN_2x32v *) (pBiasData + outCh); + ACC_INIT_BIAS(phvecBias, remainingOutCh, daccSum1, daccSum2, daccSum3, daccSum4); + + /* Pointer for Coefficient Load */ + int8_t *pCoeff = pCoeffData + outCh; + pdvecCoeff = (xb_vec2Nx8 *) pCoeff; + + /* Input Data Pointers for the 2x2 output patch; the numX and numY factors collapse an offset back to pData when that neighbour does not exist */ + int8_t *pData = pInData + (x * stride) * inDataPitch1 + (y * stride) * 
inDataPitch2; + pdvecData1 = (xb_vec2Nx8 *) pData; + pdvecData2 = (xb_vec2Nx8 *) (pData + stride * inDataPitch1 * numX); + pdvecData3 = (xb_vec2Nx8 *) (pData + stride * inDataPitch2 * numY); + pdvecData4 = (xb_vec2Nx8 *) (pData + stride * (inDataPitch1 + inDataPitch2) * numX * numY); + + valign vaData1 = IVP_LA2NX8_PP(pdvecData1); + valign vaData2 = IVP_LA2NX8_PP(pdvecData2); + valign vaData3 = IVP_LA2NX8_PP(pdvecData3); + valign vaData4 = IVP_LA2NX8_PP(pdvecData4); + +#ifdef __XCC__ +#pragma loop_count min=1 +#endif + for (inCh = 0; inCh < numInCh; inCh += 4) /* Input Channels */ + { + /* Aligning variable vector load of pixels (a length of 4 is requested for each of the four pixels) */ + xb_vec2Nx8 dvecData1; IVP_LAV2NX8_XP(dvecData1, vaData1, pdvecData1, 4); + xb_vec2Nx8 dvecData2; IVP_LAV2NX8_XP(dvecData2, vaData2, pdvecData2, 4); + xb_vec2Nx8 dvecData3; IVP_LAV2NX8_XP(dvecData3, vaData3, pdvecData3, 4); + xb_vec2Nx8 dvecData4; IVP_LAV2NX8_XP(dvecData4, vaData4, pdvecData4, 4); + + /* Extracting first 4 bytes of vector into address register */ + /* Scalar integers to be used for QMUL */ + int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData1)), 0); + int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData2)), 0); + int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData3)), 0); + int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData4)), 0); + + /* 4 Aligned Vector Loads of coefficients, each advancing the pointer by coeffPitch1 */ + xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1); + xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1); + xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1); + xb_vec2Nx8 dvecCoeff4; IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch1); + + /* Quad Muls: the same four coefficient vectors are applied to each pixel's packed scalar */ + IVP_MULQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1); + IVP_MULQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, 
dvecCoeff2, dvecCoeff1, qmulScalar2); + IVP_MULQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3); + IVP_MULQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4); + } /* End Input Channels */ + + /* Pack, Output Scale, Output Shift and clamping (scale vectors from VQ_INIT_OUTSCALE under DILATED_VQ_CONV, the single outScale otherwise) */ + xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L; + xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H; +#ifdef DILATED_VQ_CONV + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); +#else + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); +#endif + /* Store the output dvecOut1 (pixel x, y) along the output depth */ + int8_t *pOut = pOutData + (outCh + x * outDataPitch1 + y * outDataPitch2) * bytesPerPixel; + pdvecOut = (xb_vec2Nx8 *) pOut; + valign vaOutData = IVP_ZALIGN(); + IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh); + IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * 2 * \ + 
(remainingOutCh - XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Store the output dvecOut2 (next column) along the output depth; the offset and byte counts are scaled by numX, so they are zero when that column does not exist */ + pOut = pOutData + (outCh + (x + 1) * outDataPitch1 + y * outDataPitch2) * bytesPerPixel * numX; + pdvecOut = (xb_vec2Nx8 *) pOut; + IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numX); + IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numX); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Store the output dvecOut3 (next row) along the output depth; the offset and byte counts are scaled by numY, so they are zero when that row does not exist */ + pOut = pOutData + (outCh + x * outDataPitch1 + (y + 1) * outDataPitch2) * bytesPerPixel * numY; + pdvecOut = (xb_vec2Nx8 *) pOut; + IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numY); + IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numY); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Store the output dvecOut4 (next column and next row) along the output depth; the offset and byte counts are scaled by numX * numY */ + pOut = pOutData + (outCh + (x + 1) * outDataPitch1 + (y + 1) * outDataPitch2) * bytesPerPixel * numX * numY; + pdvecOut = (xb_vec2Nx8 *) pOut; + IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numX * numY); + IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numX * numY); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + } /* End image width */ + } /* End image height */ + } /* End Output Channels */ +} + +/***************************************************************************** +* convolvedVQ3D_S_1x1_U8S8IXCa2_MOD_DWH_x4 +* **************************************************************************/ +/****************************************************************************/ +/* Description : P6 optimized implementation of 3D convolution for handling */ +/* cases where kwidth * numInch is a multiple of 4 */ +/* Inputs : Input Data Tile, Coeff Data Tile, Bias Array, */ +/* 
Output scale array, CNN convolution params structure */ +/* InOuts : Output Tile */ +/* Assumptions : InData is U8, CoeffData is S8 */ +/* biasArray is signed 32b, value not exceeding signed 24b */ +/* Output scale array is U16 (it is only a parameter in DILATED_VQ_CONV builds) */ +/* OutData is S8 / U8 / S16 */ +/* Kernel Size is 1x1xDxN */ +/* Input and Output are in DWH format */ +/* Coeff is in NDWH format */ +/* CoeffDim1Pitch is aligned to 2N (Ca2) */ +/****************************************************************************/ + +#ifdef DILATED_VQ_CONV +static _XAI_INLINE_ void convolvedVQ3D_S_1x1_U8S8IXCa2_MOD_DWH_x4( + const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param + ) +#else +static _XAI_INLINE_ void convolved3D_S_1x1_U8S8IXCa2_MOD_DWH_x4( + const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param + ) +#endif +{ + /* Getting parameters from the tile structures: dim1 is the channel count, dim2 and dim3 of the output tile are its width and height */ + const int32_t outW = XAI_TILE3D_GET_DIM2(outTile); + const int32_t outH = XAI_TILE3D_GET_DIM3(outTile); + const int32_t numInCh = XAI_TILE3D_GET_DIM1(inTile); + const int32_t numOutCh = XAI_TILE3D_GET_DIM1(outTile); + + /* CNN convolution parameters */ + const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param); + const uint8_t outShiftU = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param); + const uint8_t enableReLu = XAI_CNN_CONV_GET_FLAG_RELU(param); + const uint8_t stride = XAI_CNN_CONV_GET_STRIDE(param); + + /* Data Pointers of input (unsigned 8-bit here), output, coefficient and bias data */ + uint8_t *pInData = (uint8_t *) XAI_TILE3D_GET_DATA_PTR(inTile); + int8_t *pOutData = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile); + int8_t *pCoeffData = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile); + int32_t *pBiasData = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray); +#ifdef DILATED_VQ_CONV + xb_vecNx16U* restrict pOutScaleData = (xb_vecNx16U *) 
XAI_ARRAY_GET_DATA_PTR(outputScaleArray); +#else + const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param); +#endif + /* Pitch of Coefficient Data (NDWH) in dim1; it is the only coefficient pitch this kernel reads */ + const int32_t coeffPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTile); + + /* Pitches of Input Data (DWH) in dim1 and dim2 */ + const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile); + const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile); + + /* Pitch of Output Data (DWH) in dim1 and dim2 */ + const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile); + const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile); + + /* Setting the limits for output data according to ReLu Flag and outTileType: the ReLU min/max from param when ReLU is enabled, otherwise the full range of the output type (S16, S8, else U8) */ + int32_t minLim, maxLim; + if (enableReLu) + { + minLim = XAI_CNN_CONV_GET_RELU_MIN(param); + maxLim = XAI_CNN_CONV_GET_RELU_MAX(param); + } + else + { + minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \ + SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0); + maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \ + : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX); + } + const int8_t typeFlag = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0; + const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile); + + /* Variable Declarations */ + int32_t outCh, k, x, y; + + xb_vecN_2x32v* restrict phvecBias; + xb_vec2Nx8* restrict pdvecCoeff; + xb_vec2Nx8U* restrict pdvecData1; + xb_vec2Nx8U* restrict pdvecData2; + xb_vec2Nx8U* restrict pdvecData3; + xb_vec2Nx8U* restrict pdvecData4; + xb_vec2Nx8* restrict pdvecOut; + + /* Vector data registers */ + xb_vec2Nx8U dvecInData1, dvecInData2, dvecInData3, dvecInData4; + valign vaIn1, vaIn2, vaIn3, vaIn4; + + /* Unrolled by 2 along both Output Width and Height. + * Inner loop unrolled by 4 along the Input number of Channels. + * This _x4 variant has no remainder loop: the channel loop below + * always steps by 4 input channels. 
+ */ + + /* Loops Start */ + for (outCh = 0; outCh < numOutCh; outCh += 2 * XCHAL_IVPN_SIMD_WIDTH) /* Output Channels */ + { /* walk across the kernels */ + /* To handle corner case when number of output channels + * is not a multiple of 2 * XCHAL_IVPN_SIMD_WIDTH*/ + int32_t remainingOutCh = (numOutCh - outCh); +#ifdef DILATED_VQ_CONV + xb_vecNx16U outScaleDataEven, outScaleDataOdd; + /*Load output scale values*/ + VQ_INIT_OUTSCALE(pOutScaleData, remainingOutCh, outScaleDataEven, outScaleDataOdd); +#endif +#ifdef __XCC__ +#pragma loop_count min=1 +#endif + for (y = 0; y < outH; y += 2) /* Image Height */ + { /* walk down the rows */ + /* Corner case Handling if height is odd: numY is 1 while a second row exists and 0 on the last odd row */ + int32_t numY = XT_MIN(1, outH - y - 1); +#ifdef __XCC__ +#pragma loop_count min=1 +#endif + for (x = 0; x < outW; x += 2) /* Image Width */ + { /* walk across the columns */ + /* Corner case Handling if width is odd: numX is 1 while a second column exists and 0 on the last odd column */ + int32_t numX = XT_MIN(1, outW - x - 1); + + /* Initialize accumulators with bias values */ + xb_vec2Nx24 daccSum1, daccSum2, daccSum3, daccSum4; + phvecBias = (xb_vecN_2x32v *) (pBiasData + outCh); + ACC_INIT_BIAS(phvecBias, remainingOutCh, daccSum1, daccSum2, daccSum3, daccSum4); + + /* Pointer for Coefficient Load */ + int8_t *pCoeff = pCoeffData + outCh; + pdvecCoeff = (xb_vec2Nx8 *) pCoeff; + + /* Input Data Pointers for the 2x2 output patch; the numX and numY factors collapse an offset back to pData when that neighbour does not exist */ + uint8_t *pData = pInData + (x * stride) * inDataPitch1 + (y * stride) * inDataPitch2; + pdvecData1 = (xb_vec2Nx8U *) pData; + pdvecData2 = (xb_vec2Nx8U *) (pData + stride * inDataPitch1 * numX); + pdvecData3 = (xb_vec2Nx8U *) (pData + stride * inDataPitch2 * numY); + pdvecData4 = (xb_vec2Nx8U *) (pData + stride * (inDataPitch1 + inDataPitch2) * numX * numY); + + vaIn1 = IVP_LA2NX8U_PP(pdvecData1); + vaIn2 = IVP_LA2NX8U_PP(pdvecData2); + vaIn3 = IVP_LA2NX8U_PP(pdvecData3); + vaIn4 = IVP_LA2NX8U_PP(pdvecData4); + +#ifdef __XCC__ +#pragma loop_count min=1 +#endif + for (k = 0; k < numInCh; k += 4) /* (Input Channels * kWidth) loops combined */ + { + /* 
Load 4 bytes of input data for each of the four pixels */ + IVP_LAV2NX8U_XP(dvecInData1, vaIn1, pdvecData1, 4); + IVP_LAV2NX8U_XP(dvecInData2, vaIn2, pdvecData2, 4); + IVP_LAV2NX8U_XP(dvecInData3, vaIn3, pdvecData3, 4); + IVP_LAV2NX8U_XP(dvecInData4, vaIn4, pdvecData4, 4); + +#ifdef IVP_MULSUQA2N8XR8 + /* Extracting first 4 bytes of vector into address register */ + /* Scalar integers to be used for QMUL */ + int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8U(dvecInData1)), 0); + int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8U(dvecInData2)), 0); + int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8U(dvecInData3)), 0); + int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8U(dvecInData4)), 0); +#else + xb_vec2Nx8U dvecData1, dvecData2, dvecData3, dvecData4; + xb_vec2Nx8U dvecData5, dvecData6, dvecData7, dvecData8; + xb_vec2Nx8U dvecData9, dvecData10, dvecData11, dvecData12; + xb_vec2Nx8U dvecData13, dvecData14, dvecData15, dvecData16; + xb_vecNx16 vecData1, vecData2; + xb_vecNx16 vecData3, vecData4; + xb_vecNx16 vecData5, vecData6; + xb_vecNx16 vecData7, vecData8; + xb_vecNx16 vecTemp1, vecTemp2; + + /* Custom select pattern for DSELs (fallback path used when the IVP_MULSUQA2N8XR8 macro is not defined) */ + int16_t sel1 = ((XCHAL_IVPN_SIMD_WIDTH << 8)); + xb_vec2Nx8 vecSel1 = IVP_MOV2NX8_FROMNX16(sel1); + int16_t sel2 = (((XCHAL_IVPN_SIMD_WIDTH + 1) << 8) | 1); + xb_vec2Nx8 vecSel2 = IVP_MOV2NX8_FROMNX16(sel2); + + /* Broadcast a0, a1, a2, a3.... | b0, b1, b2, b3.... using DSELs into a0, a1, a0, a1.... | b0, b1, b0, b1.... 
*/ + IVP_DSELNX16(vecData2, vecData1, IVP_MOVNX16_FROM2NX8U(dvecInData2), IVP_MOVNX16_FROM2NX8U(dvecInData1), vecSel1); + IVP_DSELNX16(vecData4, vecData3, IVP_MOVNX16_FROM2NX8U(dvecInData4), IVP_MOVNX16_FROM2NX8U(dvecInData3), vecSel1); + IVP_DSELNX16(vecData6, vecData5, IVP_MOVNX16_FROM2NX8U(dvecInData2), IVP_MOVNX16_FROM2NX8U(dvecInData1), vecSel2); + IVP_DSELNX16(vecData8, vecData7, IVP_MOVNX16_FROM2NX8U(dvecInData4), IVP_MOVNX16_FROM2NX8U(dvecInData3), vecSel2); + + /* Splitting 8 DSELI operations into 4 DSELIs and 8 SELIs for balancing loop schedule */ + /* Separate a0, a1, a0, a1 using SELIs into a0, a0, a0... (vecData1 to vecData4 feed dvecData1 to dvecData8) */ + dvecData1 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData1), IVP_MOV2NX8U_FROMNX16(vecData1), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0); + dvecData2 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData1), IVP_MOV2NX8U_FROMNX16(vecData1), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_1); + dvecData3 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData2), IVP_MOV2NX8U_FROMNX16(vecData2), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0); + dvecData4 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData2), IVP_MOV2NX8U_FROMNX16(vecData2), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_1); + dvecData5 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData3), IVP_MOV2NX8U_FROMNX16(vecData3), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0); + dvecData6 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData3), IVP_MOV2NX8U_FROMNX16(vecData3), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_1); + dvecData7 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData4), IVP_MOV2NX8U_FROMNX16(vecData4), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0); + dvecData8 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData4), IVP_MOV2NX8U_FROMNX16(vecData4), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_1); + + /* De-interleave a b a b a b... and move to a a a a... and b b b b... 
(vecData5 to vecData8 feed dvecData9 to dvecData16) */ + IVP_DSELNX16I(vecTemp2, vecTemp1, vecData5, vecData5, IVP_DSELI_8B_DEINTERLEAVE_1); + dvecData9 = IVP_MOV2NX8U_FROMNX16(vecTemp1); dvecData10 = IVP_MOV2NX8U_FROMNX16(vecTemp2); + IVP_DSELNX16I(vecTemp2, vecTemp1, vecData6, vecData6, IVP_DSELI_8B_DEINTERLEAVE_1); + dvecData11 = IVP_MOV2NX8U_FROMNX16(vecTemp1); dvecData12 = IVP_MOV2NX8U_FROMNX16(vecTemp2); + IVP_DSELNX16I(vecTemp2, vecTemp1, vecData7, vecData7, IVP_DSELI_8B_DEINTERLEAVE_1); + dvecData13 = IVP_MOV2NX8U_FROMNX16(vecTemp1); dvecData14 = IVP_MOV2NX8U_FROMNX16(vecTemp2); + IVP_DSELNX16I(vecTemp2, vecTemp1, vecData8, vecData8, IVP_DSELI_8B_DEINTERLEAVE_1); + dvecData15 = IVP_MOV2NX8U_FROMNX16(vecTemp1); dvecData16 = IVP_MOV2NX8U_FROMNX16(vecTemp2); +#endif + /* 4 Aligned Vector Loads of coefficients, each advancing the pointer by coeffPitch1 */ + xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1); + xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1); + xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1); + xb_vec2Nx8 dvecCoeff4; IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch1); + +#ifdef IVP_MULSUQA2N8XR8 + IVP_MULSUQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1); + IVP_MULSUQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2); + IVP_MULSUQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3); + IVP_MULSUQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4); +#else + /* Multiply unsigned x signed and accumulate to 24-bits: the first four calls pair dvecData1..8 with dvecCoeff1 and dvecCoeff2, the last four pair dvecData9..16 with dvecCoeff3 and dvecCoeff4 */ + IVP_MULUSPA2NX8(daccSum1, dvecData1, dvecCoeff1, dvecData2, dvecCoeff2); + IVP_MULUSPA2NX8(daccSum2, dvecData3, dvecCoeff1, dvecData4, dvecCoeff2); + IVP_MULUSPA2NX8(daccSum3, dvecData5, dvecCoeff1, dvecData6, dvecCoeff2); + IVP_MULUSPA2NX8(daccSum4, dvecData7, dvecCoeff1, dvecData8, dvecCoeff2); + IVP_MULUSPA2NX8(daccSum1, dvecData9, dvecCoeff3, dvecData10, dvecCoeff4); + IVP_MULUSPA2NX8(daccSum2, dvecData11, dvecCoeff3, dvecData12, 
dvecCoeff4); + IVP_MULUSPA2NX8(daccSum3, dvecData13, dvecCoeff3, dvecData14, dvecCoeff4); + IVP_MULUSPA2NX8(daccSum4, dvecData15, dvecCoeff3, dvecData16, dvecCoeff4); +#endif + } /* End Input Channels */ + + + /* Pack, Output Scale, Output Shift and clamping (scale vectors from VQ_INIT_OUTSCALE under DILATED_VQ_CONV, the single outScale otherwise) */ + xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L; + xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H; +#ifdef DILATED_VQ_CONV + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); +#else + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); +#endif + /* Store the output dvecOut1 (pixel x, y) along the output depth */ + int8_t *pOut = pOutData + (outCh + x * outDataPitch1 + y * outDataPitch2) * bytesPerPixel; + pdvecOut = (xb_vec2Nx8 *) pOut; + valign vaOutData = IVP_ZALIGN(); + IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh); + IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH)); + 
IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Store the output dvecOut2 (next column) along the output depth; the offset and byte counts are scaled by numX, so they are zero when that column does not exist */ + pOut = pOutData + (outCh + (x + 1) * outDataPitch1 + y * outDataPitch2) * bytesPerPixel * numX; + pdvecOut = (xb_vec2Nx8 *) pOut; + IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numX); + IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numX); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Store the output dvecOut3 (next row) along the output depth; the offset and byte counts are scaled by numY, so they are zero when that row does not exist */ + pOut = pOutData + (outCh + x * outDataPitch1 + (y + 1) * outDataPitch2) * bytesPerPixel * numY; + pdvecOut = (xb_vec2Nx8 *) pOut; + IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numY); + IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numY); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Store the output dvecOut4 (next column and next row) along the output depth; the offset and byte counts are scaled by numX * numY */ + pOut = pOutData + (outCh + (x + 1) * outDataPitch1 + (y + 1) * outDataPitch2) * bytesPerPixel * numX * numY; + pdvecOut = (xb_vec2Nx8 *) pOut; + IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numX * numY); + IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numX * numY); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + } /* End image width */ + } /* End image height */ + } /* End Output Channels */ +} + +/***************************************************************************** +* xaiConvolvedVQ3D_S_1x1_S8S8IXCa2_MOD_DWH +* **************************************************************************/ + +/****************************************************************************/ +/* Description : P6 optimized generic implementation for 1x1 MOD_DWH */ +/* 3D convolution. Based on pre-processor specifiers. Code */ +/* implementation is generated during preprocessing stage. 
*/ +/* This method can be used to generate 1x1 MOD_DWH 3D */ +/* dilated convolution function and 1x1 MOD_DWH 3D VQ */ +/* dilated convolution function */ +/* stride equal to 1 */ +/* Inputs : Input Data Tile, Coeff Data Tile, Bias Array, */ +/* Output scale array, CNN convolution params structure */ +/* Outputs : XI Error Code */ +/* InOuts : Output Tile */ +/* Assumptions : InData, CoeffData are S8 */ +/* biasArray is signed 32b, value not exceeding signed 24b */ +/* Output scale array is U16 */ +/* OutData is S8 / U8 / S16 */ +/* Kernel Size is 1x1xDxN */ +/* Input and Output are in DWH format */ +/* Coeff is in NDWH format */ +/* CoeffDim1Pitch is aligned to 2N (Ca2) */ +/****************************************************************************/ +#ifdef DILATED_VQ_CONV +XAI_ERR_TYPE xaiConvolvedVQ3D_S_1x1_S8S8IXCa2_MOD_DWH( + const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param + ) +#else +XAI_ERR_TYPE xaiConvolved3D_S_1x1_S8S8IXCa2_MOD_DWH( + const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param + ) +#endif +{ + /* Error Checks */ + XAI_ERROR_CHECKS() + { + XAI_CHECK_TILE3D_S8(inTile); + XAI_CHECK_CONV_OUTPUT_TILE3D(outTile); + XAI_CHECK_TILE4D_S8(coeffTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile); + XAI_CHECK_TILE4D_IN_DRAM_BOUNDARY(coeffTile); + XAI_CHECK_POINTER(param); + XAI_CHECK_ARRAY_S32(biasArray); + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTile); + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(coeffTile, outTile); + XAI_CHECK_KERNEL_SIZE(coeffTile, 1); + XAI_CHECK_ERROR((XAI_CNN_CONV_GET_STRIDE(param) == 1) || \ + (XAI_CNN_CONV_GET_STRIDE(param) == 2) || \ + (XAI_CNN_CONV_GET_STRIDE(param) == 4), XAI_ERR_BADARG, \ + "Stride = %hhu, value should be 1, 2 or 4", 
XAI_CNN_CONV_GET_STRIDE(param)); + XAI_CHECK_ERROR((XAI_CNN_CONV_GET_STRIDEX(param) == XAI_CNN_CONV_GET_STRIDEY(param)), \ + XAI_ERR_BADARG, "\nStride along width = %hhu and height = %hhu\nStride along width and height should be equal", \ + XAI_CNN_CONV_GET_STRIDEX(param), XAI_CNN_CONV_GET_STRIDEY(param)); + XAI_CHECK_ERROR((XAI_CNN_CONV_GET_DILATIONX(param) > 0 && XAI_CNN_CONV_GET_DILATIONY(param) > 0), \ + XAI_ERR_BADARG, "dilation parameter has to be >= 1"); + XAI_CHECK_TILE4D_IALIGNMENT_2NX8(coeffTile); + XAI_CHECK_TILE3D_DATA_ORDER(inTile, XAI_DWH); + XAI_CHECK_TILE3D_DATA_ORDER(outTile, XAI_DWH); + XAI_CHECK_TILE4D_DATA_ORDER(coeffTile, XAI_NDWH); + XAI_CHECK_CONSISTENCY_MOD_DWH(inTile, coeffTile, biasArray, outTile, param); + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_ACCUM_SHIFT(param) < 24, \ + XAI_ERR_NORM, "\nThe accumulator shift = %hhu, value should be less than 24", \ + XAI_CNN_CONV_GET_ACCUM_SHIFT(param)); + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_OUTPUT_SHIFT(param) < 32, \ + XAI_ERR_NORM, "\nThe output shift = %hhu, value should be less than 32", \ + XAI_CNN_CONV_GET_OUTPUT_SHIFT(param)); + XAI_CHECK_CONV_RELU_LIMITS_IX(param, outTile); +#ifdef DILATED_VQ_CONV + XAI_CHECK_ARRAY_U16(outputScaleArray); + XAI_CHECK_ERROR(XAI_ARRAY_GET_WIDTH(outputScaleArray) >= XAI_TILE4D_GET_DIM1(coeffTile), \ + XAI_ERR_DATASIZE, "\nWidth of Output Scale Array = %d, Number of Kernels = %d\nWidth of Output Scale Array should be greater than or equal to Number of Kernels", \ + XAI_ARRAY_GET_WIDTH(outputScaleArray), XAI_TILE4D_GET_DIM1(coeffTile)); +#endif + } +#ifndef DILATED_VQ_CONV + if (XAI_CNN_CONV_GET_OUTPUT_SCALE(param) == 0) + { + int32_t fillValue; + int32_t reluFlag = XAI_CNN_CONV_GET_FLAG_RELU(param); + fillValue = reluFlag ? 
(CLAMP(0, XAI_CNN_CONV_GET_RELU_MIN(param), XAI_CNN_CONV_GET_RELU_MAX(param))) : 0; + return(xaiFillTile3D(outTile, fillValue, 0)); + } +#endif + /* Getting parameters from the tile structures */ + const int32_t outW = XAI_TILE3D_GET_DIM2(outTile); + const int32_t outH = XAI_TILE3D_GET_DIM3(outTile); + const int32_t numInCh = XAI_TILE3D_GET_DIM1(inTile); + const int32_t numOutCh = XAI_TILE3D_GET_DIM1(outTile); + + if (numInCh % 4 == 0) + { +#ifdef DILATED_VQ_CONV + convolvedVQ3D_S_1x1_S8S8IXCa2_MOD_DWH_x4(inTile, coeffTile, biasArray, outputScaleArray, outTile, param); +#else + convolved3D_S_1x1_S8S8IXCa2_MOD_DWH_x4(inTile, coeffTile, biasArray, outTile, param); +#endif + return(XAI_ERROR_STATUS()); + } + + /* CNN convolution parameters */ + const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param); + const uint8_t outShiftU = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param); + const uint8_t enableReLu = XAI_CNN_CONV_GET_FLAG_RELU(param); + const uint8_t stride = XAI_CNN_CONV_GET_STRIDE(param); + + /* Data Pointers of input, output, coefficient and bias data */ + int8_t *pInData = (int8_t *) XAI_TILE3D_GET_DATA_PTR(inTile); + int8_t *pOutData = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile); + int8_t *pCoeffData = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile); + int32_t *pBiasData = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray); +#ifdef DILATED_VQ_CONV + xb_vecNx16U* restrict pOutScaleData = (xb_vecNx16U *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray); +#else + const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param); +#endif + /* Pitches of Coefficient Data (NDWH) in dim1, dim2 and dim3 */ + const int32_t coeffPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTile); + + /* Pitches of Input Data (DWH) in dim1 and dim2 */ + const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile); + const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile); + + /* Pitch of Output Data (DWH) in dim1 and dim2 */ + const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile); + const 
int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile); + + /* Setting the limits for output data according to ReLu Flag and outTileType */ + int32_t minLim, maxLim; + if (enableReLu) + { + minLim = XAI_CNN_CONV_GET_RELU_MIN(param); + maxLim = XAI_CNN_CONV_GET_RELU_MAX(param); + } + else + { + minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \ + SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0); + maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \ + : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX); + } + + const int8_t typeFlag = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0; + const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile); + + /* Variable Declarations */ + int32_t inCh, outCh, x, y; + + xb_vecN_2x32v* restrict phvecBias; + xb_vec2Nx8* restrict pdvecCoeff; + xb_vec2Nx8* restrict pdvecData1; + xb_vec2Nx8* restrict pdvecData2; + xb_vec2Nx8* restrict pdvecData3; + xb_vec2Nx8* restrict pdvecData4; + xb_vec2Nx8* restrict pdvecOut; + + /* Unrolled by 2 along both Output Width and Height. + * Inner loop unrolled by 4 along the Input number of Channels. + * Input Number of Channels less than 4 handled in a + * separate loop. 
+ */ + + /* Loops Start */ + for (outCh = 0; outCh < numOutCh; outCh += 2 * XCHAL_IVPN_SIMD_WIDTH) /* Output Channels */ + { /* walk across the kernels */ + /* To handle corner case when number of output channels + * is not a multiple of 2 * XCHAL_IVPN_SIMD_WIDTH*/ + int32_t remainingOutCh = (numOutCh - outCh); +#ifdef DILATED_VQ_CONV + xb_vecNx16U outScaleDataEven, outScaleDataOdd; + /*Load output scale values*/ + VQ_INIT_OUTSCALE(pOutScaleData, remainingOutCh, outScaleDataEven, outScaleDataOdd); +#endif + for (y = 0; y < outH; y += 2) /* Image Height */ + { /* walk down the rows */ + /* Corner case Handling if height is odd */ + int32_t numY = XT_MIN(1, outH - y - 1); + for (x = 0; x < outW; x += 2) /* Image Width */ + { /* walk across the columns */ + /* Corner case Handling if width is odd */ + int32_t numX = XT_MIN(1, outW - x - 1); + + /* Initialize accumulators with bias values */ + xb_vec2Nx24 daccSum1, daccSum2, daccSum3, daccSum4; + phvecBias = (xb_vecN_2x32v *) (pBiasData + outCh); + ACC_INIT_BIAS(phvecBias, remainingOutCh, daccSum1, daccSum2, daccSum3, daccSum4); + + /* Pointer for Coefficient Load */ + int8_t *pCoeff = pCoeffData + outCh; + pdvecCoeff = (xb_vec2Nx8 *) pCoeff; + + /* Input Data Pointers */ + int8_t *pData = pInData + (x * stride) * inDataPitch1 + (y * stride) * inDataPitch2; + pdvecData1 = (xb_vec2Nx8 *) pData; + pdvecData2 = (xb_vec2Nx8 *) (pData + stride * inDataPitch1 * numX); + pdvecData3 = (xb_vec2Nx8 *) (pData + stride * inDataPitch2 * numY); + pdvecData4 = (xb_vec2Nx8 *) (pData + stride * (inDataPitch1 + inDataPitch2) * numX * numY); + + valign vaData1 = IVP_LA2NX8_PP(pdvecData1); + valign vaData2 = IVP_LA2NX8_PP(pdvecData2); + valign vaData3 = IVP_LA2NX8_PP(pdvecData3); + valign vaData4 = IVP_LA2NX8_PP(pdvecData4); + for (inCh = 0; inCh < numInCh - 3; inCh += 4) /* Input Channels */ + { + /* Aligning variable vector load of pixels */ + xb_vec2Nx8 dvecData1; IVP_LAV2NX8_XP(dvecData1, vaData1, pdvecData1, 4); + xb_vec2Nx8 
dvecData2; IVP_LAV2NX8_XP(dvecData2, vaData2, pdvecData2, 4); + xb_vec2Nx8 dvecData3; IVP_LAV2NX8_XP(dvecData3, vaData3, pdvecData3, 4); + xb_vec2Nx8 dvecData4; IVP_LAV2NX8_XP(dvecData4, vaData4, pdvecData4, 4); + + /* Extracting first 4 bytes of vector into address register */ + /* Scalar integers to be used for QMUL */ + int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData1)), 0); + int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData2)), 0); + int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData3)), 0); + int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData4)), 0); + + /* 4 Aligned Vector Loads of coefficients */ + xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1); + xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1); + xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1); + xb_vec2Nx8 dvecCoeff4; IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch1); + + /* Quad Muls */ + IVP_MULQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1); + IVP_MULQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2); + IVP_MULQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3); + IVP_MULQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4); + } /* End Input Channels */ + + /* Corner Case Handling as No. 
of Input Channels not multiple of 4 */ + { + int32_t remInCh = numInCh - inCh; + + /* Aligning variable vector load of pixels */ + xb_vec2Nx8 dvecData1; IVP_LAV2NX8_XP(dvecData1, vaData1, pdvecData1, remInCh); + xb_vec2Nx8 dvecData2; IVP_LAV2NX8_XP(dvecData2, vaData2, pdvecData2, remInCh); + xb_vec2Nx8 dvecData3; IVP_LAV2NX8_XP(dvecData3, vaData3, pdvecData3, remInCh); + xb_vec2Nx8 dvecData4; IVP_LAV2NX8_XP(dvecData4, vaData4, pdvecData4, remInCh); + + /* Extracting first 4 bytes of vector into address register */ + /* Scalar integers to be used for QMUL */ + int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData1)), 0); + int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData2)), 0); + int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData3)), 0); + int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData4)), 0); + + /* For conditional coefficient loads */ + int32_t enable2 = XT_SALT(1, remInCh); /* Will be 1 if remInCh > 1 */ + int32_t enable3 = XT_SALT(2, remInCh); /* Will be 1 if remInCh > 2 */ + + /* Coefficient Loads */ + xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1 * enable2); + xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1 * enable3); + xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1); + + IVP_MULQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1); + IVP_MULQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2); + IVP_MULQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3); + IVP_MULQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4); + } /* End Corner case handling */ + + /* Pack, Output Scale, Output Shift and clamping */ + xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L; + xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H; +#ifdef 
DILATED_VQ_CONV + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); +#else + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); +#endif + /* Store the output dvecOut1 along the output depth */ + int8_t *pOut = pOutData + (outCh + x * outDataPitch1 + y * outDataPitch2) * bytesPerPixel; + pdvecOut = (xb_vec2Nx8 *) pOut; + valign vaOutData = IVP_ZALIGN(); + IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh); + IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Store the output dvecOut2 along the output depth */ + pOut = pOutData + (outCh + (x + 1) * outDataPitch1 + y * outDataPitch2) * bytesPerPixel * numX; + pdvecOut = (xb_vec2Nx8 *) pOut; + IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numX); + IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * 2 * \ + 
(remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numX); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Store the output dvecOut3 along the output depth */ + pOut = pOutData + (outCh + x * outDataPitch1 + (y + 1) * outDataPitch2) * bytesPerPixel * numY; + pdvecOut = (xb_vec2Nx8 *) pOut; + IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numY); + IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numY); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Store the output dvecOut4 along the output depth */ + pOut = pOutData + (outCh + (x + 1) * outDataPitch1 + (y + 1) * outDataPitch2) * bytesPerPixel * numX * numY; + pdvecOut = (xb_vec2Nx8 *) pOut; + IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numX * numY); + IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numX * numY); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + } /* End image width */ + } /* End image height */ + } /* End Output Channels */ + + return(XAI_ERROR_STATUS()); +} + +/***************************************************************************** +* xaiConvolvedVQ3D_S_1x1_U8S8IXCa2_MOD_DWH +* **************************************************************************/ + +/****************************************************************************/ +/* Description : P6 optimized generic implementation for 1x1 MOD_DWH */ +/* 3D convolution. Based on pre-processor specifiers. Code */ +/* implementation is generated during preprocessing stage. 
*/ +/* This method can be used to generate 1x1 MOD_DWH 3D */ +/* dilated convolution function and 1x1 MOD_DWH 3D VQ */ +/* dilated convolution function */ +/* stride equal to 1 */ +/* Inputs : Input Data Tile, Coeff Data Tile, Bias Array, */ +/* Output scale array, CNN convolution params structure */ +/* Outputs : XI Error Code */ +/* InOuts : Output Tile */ +/* Assumptions : InData is U8, CoeffData is S8 */ +/* biasArray is signed 32b, value not exceeding signed 24b */ +/* Output scale array is U16 */ +/* OutData is S8 / U8 / S16 */ +/* Kernel Size is 1x1xDxN */ +/* Input and Output are in DWH format */ +/* Coeff is in NDWH format */ +/* CoeffDim1Pitch is aligned to 2N (Ca2) */ +/****************************************************************************/ +#ifdef DILATED_VQ_CONV +XAI_ERR_TYPE xaiConvolvedVQ3D_S_1x1_U8S8IXCa2_MOD_DWH( + const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param + ) +#else +XAI_ERR_TYPE xaiConvolved3D_S_1x1_U8S8IXCa2_MOD_DWH( + const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param + ) +#endif +{ + /* Error Checks */ + XAI_ERROR_CHECKS() + { + XAI_CHECK_TILE3D_U8(inTile); + XAI_CHECK_CONV_OUTPUT_TILE3D(outTile); + XAI_CHECK_TILE4D_S8(coeffTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile); + XAI_CHECK_TILE4D_IN_DRAM_BOUNDARY(coeffTile); + XAI_CHECK_POINTER(param); + XAI_CHECK_ARRAY_S32(biasArray); + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTile); + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(coeffTile, outTile); + XAI_CHECK_KERNEL_SIZE(coeffTile, 1); + XAI_CHECK_ERROR((XAI_CNN_CONV_GET_STRIDE(param) == 1) || \ + (XAI_CNN_CONV_GET_STRIDE(param) == 2) || \ + (XAI_CNN_CONV_GET_STRIDE(param) == 4), XAI_ERR_BADARG, \ + "Stride = %hhu, value should be 1, 2 or 4", 
XAI_CNN_CONV_GET_STRIDE(param)); + XAI_CHECK_ERROR((XAI_CNN_CONV_GET_STRIDEX(param) == XAI_CNN_CONV_GET_STRIDEY(param)), \ + XAI_ERR_BADARG, "\nStride along width = %hhu and height = %hhu\nStride along width and height should be equal", \ + XAI_CNN_CONV_GET_STRIDEX(param), XAI_CNN_CONV_GET_STRIDEY(param)); + XAI_CHECK_ERROR((XAI_CNN_CONV_GET_DILATIONX(param) > 0 && XAI_CNN_CONV_GET_DILATIONY(param) > 0), \ + XAI_ERR_BADARG, "dilation parameter has to be >= 1"); + XAI_CHECK_TILE4D_IALIGNMENT_2NX8(coeffTile); + XAI_CHECK_TILE3D_DATA_ORDER(inTile, XAI_DWH); + XAI_CHECK_TILE3D_DATA_ORDER(outTile, XAI_DWH); + XAI_CHECK_TILE4D_DATA_ORDER(coeffTile, XAI_NDWH); + XAI_CHECK_CONSISTENCY_MOD_DWH(inTile, coeffTile, biasArray, outTile, param); + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_ACCUM_SHIFT(param) < 24, \ + XAI_ERR_NORM, "\nThe accumulator shift = %hhu, value should be less than 24", \ + XAI_CNN_CONV_GET_ACCUM_SHIFT(param)); + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_OUTPUT_SHIFT(param) < 32, \ + XAI_ERR_NORM, "\nThe output shift = %hhu, value should be less than 32", \ + XAI_CNN_CONV_GET_OUTPUT_SHIFT(param)); + XAI_CHECK_CONV_RELU_LIMITS_IX(param, outTile); +#ifdef DILATED_VQ_CONV + XAI_CHECK_ARRAY_U16(outputScaleArray); + XAI_CHECK_ERROR(XAI_ARRAY_GET_WIDTH(outputScaleArray) >= XAI_TILE4D_GET_DIM1(coeffTile), \ + XAI_ERR_DATASIZE, "\nWidth of Output Scale Array = %d, Number of Kernels = %d\nWidth of Output Scale Array should be greater than or equal to Number of Kernels", \ + XAI_ARRAY_GET_WIDTH(outputScaleArray), XAI_TILE4D_GET_DIM1(coeffTile)); +#endif + } +#ifndef DILATED_VQ_CONV + if (XAI_CNN_CONV_GET_OUTPUT_SCALE(param) == 0) + { + int32_t fillValue; + int32_t reluFlag = XAI_CNN_CONV_GET_FLAG_RELU(param); + fillValue = reluFlag ? 
(CLAMP(0, XAI_CNN_CONV_GET_RELU_MIN(param), XAI_CNN_CONV_GET_RELU_MAX(param))) : 0; + return(xaiFillTile3D(outTile, fillValue, 0)); + } +#endif + /* Getting parameters from the tile structures */ + const int32_t outW = XAI_TILE3D_GET_DIM2(outTile); + const int32_t outH = XAI_TILE3D_GET_DIM3(outTile); + const int32_t numInCh = XAI_TILE3D_GET_DIM1(inTile); + const int32_t numOutCh = XAI_TILE3D_GET_DIM1(outTile); + + if (numInCh % 4 == 0) + { +#ifdef DILATED_VQ_CONV + convolvedVQ3D_S_1x1_U8S8IXCa2_MOD_DWH_x4(inTile, coeffTile, biasArray, outputScaleArray, outTile, param); +#else + convolved3D_S_1x1_U8S8IXCa2_MOD_DWH_x4(inTile, coeffTile, biasArray, outTile, param); +#endif + return(XAI_ERROR_STATUS()); + } + + /* CNN convolution parameters */ + const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param); + const uint8_t outShiftU = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param); + const uint8_t enableReLu = XAI_CNN_CONV_GET_FLAG_RELU(param); + const uint8_t stride = XAI_CNN_CONV_GET_STRIDE(param); + + /* Data Pointers of input, output, coefficient and bias data */ + uint8_t *pInData = (uint8_t *) XAI_TILE3D_GET_DATA_PTR(inTile); + int8_t *pOutData = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile); + int8_t *pCoeffData = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile); + int32_t *pBiasData = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray); +#ifdef DILATED_VQ_CONV + xb_vecNx16U* restrict pOutScaleData = (xb_vecNx16U *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray); +#else + const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param); +#endif + /* Pitches of Coefficient Data (NDWH) in dim1, dim2 and dim3 */ + const int32_t coeffPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTile); + + /* Pitches of Input Data (DWH) in dim1 and dim2 */ + const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile); + const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile); + + /* Pitch of Output Data (DWH) in dim1 and dim2 */ + const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile); + const 
int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile); + + /* Setting the limits for output data according to ReLu Flag and outTileType */ + int32_t minLim, maxLim; + if (enableReLu) + { + minLim = XAI_CNN_CONV_GET_RELU_MIN(param); + maxLim = XAI_CNN_CONV_GET_RELU_MAX(param); + } + else + { + minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \ + SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0); + maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \ + : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX); + } + + const int8_t typeFlag = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0; + const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile); + + /* Variable Declarations */ + int32_t outCh, k, x, y; + + xb_vecN_2x32v* restrict phvecBias; + xb_vec2Nx8* restrict pdvecCoeff; + xb_vec2Nx8U* restrict pdvecData1; + xb_vec2Nx8U* restrict pdvecData2; + xb_vec2Nx8U* restrict pdvecData3; + xb_vec2Nx8U* restrict pdvecData4; + xb_vec2Nx8* restrict pdvecOut; + + /* Vector data registers */ + xb_vec2Nx8U dvecInData1, dvecInData2, dvecInData3, dvecInData4; + valign vaIn1, vaIn2, vaIn3, vaIn4; + + /* Unrolled by 2 along both Output Width and Height. + * Inner loop unrolled by 4 along the Input number of Channels. + * Input Number of Channels less than 4 handled in a + * separate loop. 
+ */ + + /* Loops Start */ + for (outCh = 0; outCh < numOutCh; outCh += 2 * XCHAL_IVPN_SIMD_WIDTH) /* Output Channels */ + { /* walk across the kernels */ + /* To handle corner case when number of output channels + * is not a multiple of 2 * XCHAL_IVPN_SIMD_WIDTH*/ + int32_t remainingOutCh = (numOutCh - outCh); +#ifdef DILATED_VQ_CONV + xb_vecNx16U outScaleDataEven, outScaleDataOdd; + /*Load output scale values*/ + VQ_INIT_OUTSCALE(pOutScaleData, remainingOutCh, outScaleDataEven, outScaleDataOdd); +#endif + for (y = 0; y < outH; y += 2) /* Image Height */ + { /* walk down the rows */ + /* Corner case Handling if height is odd */ + int32_t numY = XT_MIN(1, outH - y - 1); + for (x = 0; x < outW; x += 2) /* Image Width */ + { /* walk across the columns */ + /* Corner case Handling if width is odd */ + int32_t numX = XT_MIN(1, outW - x - 1); + + /* Initialize accumulators with bias values */ + xb_vec2Nx24 daccSum1, daccSum2, daccSum3, daccSum4; + phvecBias = (xb_vecN_2x32v *) (pBiasData + outCh); + ACC_INIT_BIAS(phvecBias, remainingOutCh, daccSum1, daccSum2, daccSum3, daccSum4); + + /* Pointer for Coefficient Load */ + int8_t *pCoeff = pCoeffData + outCh; + pdvecCoeff = (xb_vec2Nx8 *) pCoeff; + + /* Input Data Pointers */ + uint8_t *pData = pInData + (x * stride) * inDataPitch1 + (y * stride) * inDataPitch2; + pdvecData1 = (xb_vec2Nx8U *) pData; + pdvecData2 = (xb_vec2Nx8U *) (pData + stride * inDataPitch1 * numX); + pdvecData3 = (xb_vec2Nx8U *) (pData + stride * inDataPitch2 * numY); + pdvecData4 = (xb_vec2Nx8U *) (pData + stride * (inDataPitch1 + inDataPitch2) * numX * numY); + + vaIn1 = IVP_LA2NX8U_PP(pdvecData1); + vaIn2 = IVP_LA2NX8U_PP(pdvecData2); + vaIn3 = IVP_LA2NX8U_PP(pdvecData3); + vaIn4 = IVP_LA2NX8U_PP(pdvecData4); + + for (k = 0; k < numInCh - 3; k += 4) /* Input Channels */ + { + /* Aligning variable vector load of pixels */ + IVP_LAV2NX8U_XP(dvecInData1, vaIn1, pdvecData1, 4); + IVP_LAV2NX8U_XP(dvecInData2, vaIn2, pdvecData2, 4); + 
IVP_LAV2NX8U_XP(dvecInData3, vaIn3, pdvecData3, 4); + IVP_LAV2NX8U_XP(dvecInData4, vaIn4, pdvecData4, 4); + +#ifdef IVP_MULSUQA2N8XR8 + /* Extracting first 4 bytes of vector into address register */ + /* Scalar integers to be used for QMUL */ + int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8U(dvecInData1)), 0); + int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8U(dvecInData2)), 0); + int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8U(dvecInData3)), 0); + int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8U(dvecInData4)), 0); +#else + xb_vec2Nx8U dvecData1, dvecData2, dvecData3, dvecData4; + xb_vec2Nx8U dvecData5, dvecData6, dvecData7, dvecData8; + xb_vec2Nx8U dvecData9, dvecData10, dvecData11, dvecData12; + xb_vec2Nx8U dvecData13, dvecData14, dvecData15, dvecData16; + xb_vecNx16 vecData1, vecData2; + xb_vecNx16 vecData3, vecData4; + xb_vecNx16 vecData5, vecData6; + xb_vecNx16 vecData7, vecData8; + xb_vecNx16 vecTemp1, vecTemp2; + + /* Custom select pattern for DSELs */ + int16_t sel1 = ((XCHAL_IVPN_SIMD_WIDTH << 8)); + xb_vec2Nx8 vecSel1 = IVP_MOV2NX8_FROMNX16(sel1); + int16_t sel2 = (((XCHAL_IVPN_SIMD_WIDTH + 1) << 8) | 1); + xb_vec2Nx8 vecSel2 = IVP_MOV2NX8_FROMNX16(sel2); + + /* Broadcast a0, a1, a2, a3.... | b0, b1, b2, b3.... using DSELs into a0, a1, a0, a1.... | b0, b1, b0, b1.... 
*/ + IVP_DSELNX16(vecData2, vecData1, IVP_MOVNX16_FROM2NX8U(dvecInData2), IVP_MOVNX16_FROM2NX8U(dvecInData1), vecSel1); + IVP_DSELNX16(vecData4, vecData3, IVP_MOVNX16_FROM2NX8U(dvecInData4), IVP_MOVNX16_FROM2NX8U(dvecInData3), vecSel1); + IVP_DSELNX16(vecData6, vecData5, IVP_MOVNX16_FROM2NX8U(dvecInData2), IVP_MOVNX16_FROM2NX8U(dvecInData1), vecSel2); + IVP_DSELNX16(vecData8, vecData7, IVP_MOVNX16_FROM2NX8U(dvecInData4), IVP_MOVNX16_FROM2NX8U(dvecInData3), vecSel2); + + /* Splitting 8 DSELI operations into 4 DSELIs and 8 SELIs for balancing loop schedule */ + /* Separate a0, a1, a0, a1 using SELIs into a0, a0, a0... */ + dvecData1 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData1), IVP_MOV2NX8U_FROMNX16(vecData1), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0); + dvecData2 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData1), IVP_MOV2NX8U_FROMNX16(vecData1), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_1); + dvecData3 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData2), IVP_MOV2NX8U_FROMNX16(vecData2), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0); + dvecData4 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData2), IVP_MOV2NX8U_FROMNX16(vecData2), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_1); + dvecData5 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData3), IVP_MOV2NX8U_FROMNX16(vecData3), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0); + dvecData6 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData3), IVP_MOV2NX8U_FROMNX16(vecData3), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_1); + dvecData7 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData4), IVP_MOV2NX8U_FROMNX16(vecData4), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0); + dvecData8 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData4), IVP_MOV2NX8U_FROMNX16(vecData4), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_1); + + /* De-interleave a b a b a b... and move to a a a a... and b b b b... 
*/ + IVP_DSELNX16I(vecTemp2, vecTemp1, vecData5, vecData5, IVP_DSELI_8B_DEINTERLEAVE_1); + dvecData9 = IVP_MOV2NX8U_FROMNX16(vecTemp1); dvecData10 = IVP_MOV2NX8U_FROMNX16(vecTemp2); + IVP_DSELNX16I(vecTemp2, vecTemp1, vecData6, vecData6, IVP_DSELI_8B_DEINTERLEAVE_1); + dvecData11 = IVP_MOV2NX8U_FROMNX16(vecTemp1); dvecData12 = IVP_MOV2NX8U_FROMNX16(vecTemp2); + IVP_DSELNX16I(vecTemp2, vecTemp1, vecData7, vecData7, IVP_DSELI_8B_DEINTERLEAVE_1); + dvecData13 = IVP_MOV2NX8U_FROMNX16(vecTemp1); dvecData14 = IVP_MOV2NX8U_FROMNX16(vecTemp2); + IVP_DSELNX16I(vecTemp2, vecTemp1, vecData8, vecData8, IVP_DSELI_8B_DEINTERLEAVE_1); + dvecData15 = IVP_MOV2NX8U_FROMNX16(vecTemp1); dvecData16 = IVP_MOV2NX8U_FROMNX16(vecTemp2); +#endif + /* 4 Aligned Vector Loads of coefficients */ + xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1); + xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1); + xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1); + xb_vec2Nx8 dvecCoeff4; IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch1); + +#ifdef IVP_MULSUQA2N8XR8 + IVP_MULSUQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1); + IVP_MULSUQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2); + IVP_MULSUQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3); + IVP_MULSUQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4); +#else + /* Multiply unsigned x signed and accumulate to 24-bits */ + IVP_MULUSPA2NX8(daccSum1, dvecData1, dvecCoeff1, dvecData2, dvecCoeff2); + IVP_MULUSPA2NX8(daccSum2, dvecData3, dvecCoeff1, dvecData4, dvecCoeff2); + IVP_MULUSPA2NX8(daccSum3, dvecData5, dvecCoeff1, dvecData6, dvecCoeff2); + IVP_MULUSPA2NX8(daccSum4, dvecData7, dvecCoeff1, dvecData8, dvecCoeff2); + IVP_MULUSPA2NX8(daccSum1, dvecData9, dvecCoeff3, dvecData10, dvecCoeff4); + IVP_MULUSPA2NX8(daccSum2, dvecData11, dvecCoeff3, dvecData12, 
dvecCoeff4); + IVP_MULUSPA2NX8(daccSum3, dvecData13, dvecCoeff3, dvecData14, dvecCoeff4); + IVP_MULUSPA2NX8(daccSum4, dvecData15, dvecCoeff3, dvecData16, dvecCoeff4); +#endif + } /* End Input Channels */ + + /* Corner Case Handling as No. of Input Channels not multiple of 4 */ + { + int32_t remInCh = numInCh - k; + + /* Aligning variable vector load of pixels */ + IVP_LAV2NX8U_XP(dvecInData1, vaIn1, pdvecData1, remInCh); + IVP_LAV2NX8U_XP(dvecInData2, vaIn2, pdvecData2, remInCh); + IVP_LAV2NX8U_XP(dvecInData3, vaIn3, pdvecData3, remInCh); + IVP_LAV2NX8U_XP(dvecInData4, vaIn4, pdvecData4, remInCh); + +#ifdef IVP_MULSUQA2N8XR8 + /* Extracting first 4 bytes of vector into address register */ + /* Scalar integers to be used for QMUL */ + int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8U(dvecInData1)), 0); + int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8U(dvecInData2)), 0); + int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8U(dvecInData3)), 0); + int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8U(dvecInData4)), 0); +#else + xb_vec2Nx8U dvecData1, dvecData2, dvecData3, dvecData4; + xb_vec2Nx8U dvecData5, dvecData6, dvecData7, dvecData8; + xb_vec2Nx8U dvecData9, dvecData10, dvecData11, dvecData12; + xb_vec2Nx8U dvecData13, dvecData14, dvecData15, dvecData16; + xb_vecNx16 vecData1, vecData2; + xb_vecNx16 vecData3, vecData4; + xb_vecNx16 vecData5, vecData6; + xb_vecNx16 vecData7, vecData8; + xb_vecNx16 vecTemp1, vecTemp2; + + /* Custom select pattern for DSELs */ + int16_t sel1 = ((XCHAL_IVPN_SIMD_WIDTH << 8)); + xb_vec2Nx8 vecSel1 = IVP_MOV2NX8_FROMNX16(sel1); + int16_t sel2 = (((XCHAL_IVPN_SIMD_WIDTH + 1) << 8) | 1); + xb_vec2Nx8 vecSel2 = IVP_MOV2NX8_FROMNX16(sel2); + + /* Broadcast a0, a1, a2, a3.... | b0, b1, b2, b3.... using DSELs into a0, a1, a0, a1.... | b0, b1, b0, b1.... 
*/ + IVP_DSELNX16(vecData2, vecData1, IVP_MOVNX16_FROM2NX8U(dvecInData2), IVP_MOVNX16_FROM2NX8U(dvecInData1), vecSel1); + IVP_DSELNX16(vecData4, vecData3, IVP_MOVNX16_FROM2NX8U(dvecInData4), IVP_MOVNX16_FROM2NX8U(dvecInData3), vecSel1); + IVP_DSELNX16(vecData6, vecData5, IVP_MOVNX16_FROM2NX8U(dvecInData2), IVP_MOVNX16_FROM2NX8U(dvecInData1), vecSel2); + IVP_DSELNX16(vecData8, vecData7, IVP_MOVNX16_FROM2NX8U(dvecInData4), IVP_MOVNX16_FROM2NX8U(dvecInData3), vecSel2); + + /* Splitting 8 DSELI operations into 4 DSELIs and 8 SELIs for balancing loop schedule */ + /* Separate a0, a1, a0, a1 using SELIs into a0, a0, a0... */ + dvecData1 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData1), IVP_MOV2NX8U_FROMNX16(vecData1), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0); + dvecData2 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData1), IVP_MOV2NX8U_FROMNX16(vecData1), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_1); + dvecData3 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData2), IVP_MOV2NX8U_FROMNX16(vecData2), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0); + dvecData4 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData2), IVP_MOV2NX8U_FROMNX16(vecData2), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_1); + dvecData5 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData3), IVP_MOV2NX8U_FROMNX16(vecData3), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0); + dvecData6 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData3), IVP_MOV2NX8U_FROMNX16(vecData3), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_1); + dvecData7 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData4), IVP_MOV2NX8U_FROMNX16(vecData4), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0); + dvecData8 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData4), IVP_MOV2NX8U_FROMNX16(vecData4), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_1); + + /* De-interleave a b a b a b... and move to a a a a... and b b b b... 
*/ + IVP_DSELNX16I(vecTemp2, vecTemp1, vecData5, vecData5, IVP_DSELI_8B_DEINTERLEAVE_1); + dvecData9 = IVP_MOV2NX8U_FROMNX16(vecTemp1); dvecData10 = IVP_MOV2NX8U_FROMNX16(vecTemp2); + IVP_DSELNX16I(vecTemp2, vecTemp1, vecData6, vecData6, IVP_DSELI_8B_DEINTERLEAVE_1); + dvecData11 = IVP_MOV2NX8U_FROMNX16(vecTemp1); dvecData12 = IVP_MOV2NX8U_FROMNX16(vecTemp2); + IVP_DSELNX16I(vecTemp2, vecTemp1, vecData7, vecData7, IVP_DSELI_8B_DEINTERLEAVE_1); + dvecData13 = IVP_MOV2NX8U_FROMNX16(vecTemp1); dvecData14 = IVP_MOV2NX8U_FROMNX16(vecTemp2); + IVP_DSELNX16I(vecTemp2, vecTemp1, vecData8, vecData8, IVP_DSELI_8B_DEINTERLEAVE_1); + dvecData15 = IVP_MOV2NX8U_FROMNX16(vecTemp1); dvecData16 = IVP_MOV2NX8U_FROMNX16(vecTemp2); +#endif + /* For conditional coefficient loads */ + int32_t enable2 = XT_SALT(1, remInCh); /* Will be 1 if remInCh > 1 */ + int32_t enable3 = XT_SALT(2, remInCh); /* Will be 1 if remInCh > 2 */ + + /* Coefficient Loads */ + xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1 * enable2); + xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1 * enable3); + xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1); + +#ifdef IVP_MULSUQA2N8XR8 + IVP_MULSUQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1); + IVP_MULSUQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2); + IVP_MULSUQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3); + IVP_MULSUQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4); +#else + /* Multiply unsigned x signed and accumulate to 24-bits */ + IVP_MULUSPA2NX8(daccSum1, dvecData1, dvecCoeff1, dvecData2, dvecCoeff2); + IVP_MULUSPA2NX8(daccSum2, dvecData3, dvecCoeff1, dvecData4, dvecCoeff2); + IVP_MULUSPA2NX8(daccSum3, dvecData5, dvecCoeff1, dvecData6, dvecCoeff2); + IVP_MULUSPA2NX8(daccSum4, dvecData7, dvecCoeff1, dvecData8, dvecCoeff2); + IVP_MULUSPA2NX8(daccSum1, dvecData9, dvecCoeff3, dvecData10, 0); + 
IVP_MULUSPA2NX8(daccSum2, dvecData11, dvecCoeff3, dvecData12, 0); + IVP_MULUSPA2NX8(daccSum3, dvecData13, dvecCoeff3, dvecData14, 0); + IVP_MULUSPA2NX8(daccSum4, dvecData15, dvecCoeff3, dvecData16, 0); +#endif + } /* End Corner case handling */ + + + /* Pack, Output Scale, Output Shift and clamping */ + xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L; + xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H; +#ifdef DILATED_VQ_CONV + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); +#else + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); +#endif + /* Store the output dvecOut1 along the output depth */ + int8_t *pOut = pOutData + (outCh + x * outDataPitch1 + y * outDataPitch2) * bytesPerPixel; + pdvecOut = (xb_vec2Nx8 *) pOut; + valign vaOutData = IVP_ZALIGN(); + IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh); + IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * 2 * \ + 
(remainingOutCh - XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Store the output dvecOut2 along the output depth */ + pOut = pOutData + (outCh + (x + 1) * outDataPitch1 + y * outDataPitch2) * bytesPerPixel * numX; + pdvecOut = (xb_vec2Nx8 *) pOut; + IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numX); + IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numX); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Store the output dvecOut3 along the output depth */ + pOut = pOutData + (outCh + x * outDataPitch1 + (y + 1) * outDataPitch2) * bytesPerPixel * numY; + pdvecOut = (xb_vec2Nx8 *) pOut; + IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numY); + IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numY); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Store the output dvecOut4 along the output depth */ + pOut = pOutData + (outCh + (x + 1) * outDataPitch1 + (y + 1) * outDataPitch2) * bytesPerPixel * numX * numY; + pdvecOut = (xb_vec2Nx8 *) pOut; + IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numX * numY); + IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numX * numY); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + } /* End image width */ + } /* End image height */ + } /* End Output Channels */ + + return(XAI_ERROR_STATUS()); +} + +/***************************************************************************** +* xaiConvolvedVQ3D_S_2x2_S8S8IXCa2_MOD_DWH +* **************************************************************************/ + +/****************************************************************************/ +/* Description : P6 optimized generic implementation for 2x2 MOD_DWH */ +/* 3D convolution. Based on pre-processor specifiers. 
Code */ +/* implementation is generated during preprocessing stage. */ +/* This method can be used to generate 2x2 MOD_DWH 3D */ +/* dilated convolution function and 2x2 MOD_DWH 3D VQ */ +/* dilated convolution function */ +/* stride equal to 1 */ +/* Inputs : Input Data Tile, Coeff Data Tile, Bias Array, */ +/* Output scale array, CNN convolution params structure */ +/* Outputs : XI Error Code */ +/* InOuts : Output Tile */ +/* Assumptions : InData, CoeffData are S8 */ +/* biasArray is signed 32b, value not exceeding signed 24b */ +/* Output scale array is U16 */ +/* OutData is S8 / U8 / S16 */ +/* Kernel Size is 2x2xDxN */ +/* Input and Output are in DWH format */ +/* Coeff is in NDWH format */ +/* CoeffDim1Pitch is aligned to 2N (Ca2) */ +/****************************************************************************/ + +#ifdef DILATED_VQ_CONV +XAI_ERR_TYPE xaiConvolvedVQ3D_S_2x2_S8S8IXCa2_MOD_DWH( + const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param + ) +#else +XAI_ERR_TYPE xaiConvolved3D_S_2x2_S8S8IXCa2_MOD_DWH( + const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param) +#endif +{ + /* Error Checks */ + XAI_ERROR_CHECKS() + { + XAI_CHECK_TILE3D_S8(inTile); + XAI_CHECK_CONV_OUTPUT_TILE3D(outTile); + XAI_CHECK_TILE4D_S8(coeffTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile); + XAI_CHECK_TILE4D_IN_DRAM_BOUNDARY(coeffTile); + XAI_CHECK_POINTER(param); + XAI_CHECK_ARRAY_S32(biasArray); + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTile); + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(coeffTile, outTile); + XAI_CHECK_KERNEL_SIZE(coeffTile, 2); + XAI_CHECK_ERROR((XAI_CNN_CONV_GET_STRIDE(param) == 1) || \ + (XAI_CNN_CONV_GET_STRIDE(param) == 2) || \ + (XAI_CNN_CONV_GET_STRIDE(param) == 4), XAI_ERR_BADARG, \ + "Stride 
= %hhu, value should be 1, 2 or 4", XAI_CNN_CONV_GET_STRIDE(param)); + XAI_CHECK_ERROR((XAI_CNN_CONV_GET_STRIDEX(param) == XAI_CNN_CONV_GET_STRIDEY(param)), \ + XAI_ERR_BADARG, "\nStride along width = %hhu and height = %hhu\nStride along width and height should be equal", \ + XAI_CNN_CONV_GET_STRIDEX(param), XAI_CNN_CONV_GET_STRIDEY(param)); + XAI_CHECK_ERROR((XAI_CNN_CONV_GET_DILATIONX(param) > 0 && XAI_CNN_CONV_GET_DILATIONY(param) > 0), \ + XAI_ERR_BADARG, "dilation parameter has to be >= 1"); + XAI_CHECK_TILE4D_IALIGNMENT_2NX8(coeffTile); + XAI_CHECK_TILE3D_DATA_ORDER(inTile, XAI_DWH); + XAI_CHECK_TILE3D_DATA_ORDER(outTile, XAI_DWH); + XAI_CHECK_TILE4D_DATA_ORDER(coeffTile, XAI_NDWH); + XAI_CHECK_EDGES_MOD_DWH(inTile, coeffTile, param); + XAI_CHECK_CONSISTENCY_MOD_DWH(inTile, coeffTile, biasArray, outTile, param); + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_ACCUM_SHIFT(param) < 24, \ + XAI_ERR_NORM, "\nThe accumulator shift = %hhu, value should be less than 24", \ + XAI_CNN_CONV_GET_ACCUM_SHIFT(param)); + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_OUTPUT_SHIFT(param) < 32, \ + XAI_ERR_NORM, "\nThe output shift = %hhu, value should be less than 32", \ + XAI_CNN_CONV_GET_OUTPUT_SHIFT(param)); + XAI_CHECK_CONV_RELU_LIMITS_IX(param, outTile); +#ifdef DILATED_VQ_CONV + XAI_CHECK_ARRAY_U16(outputScaleArray); + XAI_CHECK_ERROR(XAI_ARRAY_GET_WIDTH(outputScaleArray) >= XAI_TILE4D_GET_DIM1(coeffTile), \ + XAI_ERR_DATASIZE, "\nWidth of Output Scale Array = %d, Number of Kernels = %d\nWidth of Output Scale Array should be greater than or equal to Number of Kernels", \ + XAI_ARRAY_GET_WIDTH(outputScaleArray), XAI_TILE4D_GET_DIM1(coeffTile)); +#endif + } +#ifndef DILATED_VQ_CONV + if (XAI_CNN_CONV_GET_OUTPUT_SCALE(param) == 0) + { + int32_t fillValue; + int32_t reluFlag = XAI_CNN_CONV_GET_FLAG_RELU(param); + fillValue = reluFlag ? 
(CLAMP(0, XAI_CNN_CONV_GET_RELU_MIN(param), XAI_CNN_CONV_GET_RELU_MAX(param))) : 0; + return(xaiFillTile3D(outTile, fillValue, 0)); + } +#endif + const uint8_t dilationX = XAI_CNN_CONV_GET_DILATIONX(param); + const uint8_t dilationY = XAI_CNN_CONV_GET_DILATIONY(param); + + /* Calling further optimized function if dim1Size == dim1Pitch */ + if (XAI_TILE3D_GET_DIM1(inTile) == XAI_TILE3D_GET_DIM1_PITCH(inTile) && dilationX == 1 && dilationY == 1) + { + if ((XAI_TILE3D_GET_DIM1(inTile) * XAI_TILE4D_GET_DIM3(coeffTile)) % 4 == 0) + { +#ifdef DILATED_VQ_CONV + convolvedVQ3D_S_MxN_S8S8IXCa2_MOD_DWH_contiguous_depth_x4(inTile, coeffTile, biasArray, \ + outputScaleArray, outTile, param); +#else + convolved3D_S_MxN_S8S8IXCa2_MOD_DWH_contiguous_depth_x4(inTile, coeffTile, biasArray, \ + outTile, param); +#endif + } + else + { +#ifdef DILATED_VQ_CONV + convolvedVQ3D_S_MxN_S8S8IXCa2_MOD_DWH_contiguous_depth(inTile, \ + coeffTile, biasArray, outputScaleArray, outTile, param); +#else + convolved3D_S_MxN_S8S8IXCa2_MOD_DWH_contiguous_depth(inTile, \ + coeffTile, biasArray, outTile, param); +#endif + } + return(XAI_ERROR_STATUS()); + } + + /* Getting parameters from the tile structures */ + const int32_t outW = XAI_TILE3D_GET_DIM2(outTile); + const int32_t outH = XAI_TILE3D_GET_DIM3(outTile); + const int32_t numInCh = XAI_TILE3D_GET_DIM1(inTile); + const int32_t numOutCh = XAI_TILE3D_GET_DIM1(outTile); + + XAI_ERROR_CHECKS_CONTINUE() + { + XAI_CHECK_TILE3D_FITS_IN_SINGLE_DRAM(inTile); + /* Max value of Gather Offset is ((stride*min(1, outW-1) + dilation) * inDataPitch1 + + * min(3, numInCh - 1) + ((stride*min(1, outH-1) * inDataPitch2)) */ + XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM1_PITCH(inTile) < \ + ((USHRT_MAX - XT_MIN(3, numInCh - 1) - XAI_CNN_CONV_GET_STRIDE(param) * XT_MIN(1, outH - 1) * \ + XAI_TILE3D_GET_DIM2_PITCH(inTile)) / (XAI_CNN_CONV_GET_STRIDE(param) * \ + XT_MIN(1, outW - 1) + XAI_CNN_CONV_GET_DILATION(param))), \ + XAI_ERR_BADARG, "\ndim1Pitch value of inTile = %d, 
should be less than Gather Offset(16-bit limit) - %d", \ + XAI_TILE3D_GET_DIM1_PITCH(inTile), \ + ((USHRT_MAX - XT_MIN(3, numInCh - 1) - XAI_CNN_CONV_GET_STRIDE(param) * XT_MIN(1, outH - 1) * \ + XAI_TILE3D_GET_DIM2_PITCH(inTile)) / (XAI_CNN_CONV_GET_STRIDE(param) * \ + XT_MIN(1, outW - 1) + XAI_CNN_CONV_GET_DILATIONX(param)))); + } + + /* CNN convolution parameters */ + const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param); + const uint8_t outShiftU = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param); + const uint8_t enableReLu = XAI_CNN_CONV_GET_FLAG_RELU(param); + const uint8_t stride = XAI_CNN_CONV_GET_STRIDE(param); + + /* Data Pointers of input, output, coefficient and bias data */ + int8_t *pInData = (int8_t *) XAI_TILE3D_GET_DATA_PTR(inTile); + int8_t *pOutData = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile); + int8_t *pCoeffData = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile); + int32_t *pBiasData = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray); +#ifdef DILATED_VQ_CONV + xb_vecNx16U* restrict pOutScaleData = (xb_vecNx16U *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray); +#else + const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param); +#endif + /* Pitches of Coefficient Data (NDWH) in dim1, dim2 and dim3 */ + const int32_t coeffPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTile); + const int32_t coeffPitch2 = XAI_TILE4D_GET_DIM2_PITCH(coeffTile); + const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile); + + /* Pitches of Input Data (DWH) in dim1 and dim2 */ + const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile); + const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile); + + /* Pitch of Output Data (DWH) in dim1 and dim2 */ + const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile); + const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile); + + const int32_t kWidthU = XAI_TILE4D_GET_DIM3(coeffTile); + const int32_t kHeightU = XAI_TILE4D_GET_DIM4(coeffTile); + + int32_t dilatedKWidth = dilationX * (kWidthU - 1) + 1; + 
int32_t dilatedKHeight = dilationY * (kHeightU - 1) + 1; + + const uint8_t leftEdgeFlag = XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param); + const uint8_t topEdgeFlag = XAI_CNN_CONV_GET_FLAG_TOPEDGE(param); + int32_t leftEdge, topEdge; + + if ((dilatedKWidth % 2) != 0) + { + leftEdge = dilatedKWidth / 2; + } + else + { + leftEdge = leftEdgeFlag ? (dilatedKWidth / 2) : ((dilatedKWidth / 2) - 1); + } + + if ((dilatedKHeight % 2) != 0) + { + topEdge = dilatedKHeight / 2; + } + else + { + topEdge = topEdgeFlag ? (dilatedKHeight / 2) : ((dilatedKHeight / 2) - 1); + } + + /* Move pointer to the start of the active data (including edge) */ + pInData = &pInData[-(topEdge * inDataPitch2 + leftEdge * inDataPitch1)]; + + /* Setting the limits for output data according to ReLu Flag and outTileType */ + int32_t minLim, maxLim; + if (enableReLu) + { + minLim = XAI_CNN_CONV_GET_RELU_MIN(param); + maxLim = XAI_CNN_CONV_GET_RELU_MAX(param); + } + else + { + minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \ + SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0); + maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \ + : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX); + } + const int8_t typeFlag = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0; + const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile); + + /* Variable Declarations */ + int32_t inCh, outCh, x, y; + valign vaOutData = IVP_ZALIGN(); + + /* Only 1 Gather is used in this approach to get the + * Input Data for 4 Output Vectors. In every Gather, + * 32 elements are read, where first 16 of them correspond + * to two vectors of Output along the width and the other + * 16 of them correspond to two vectors of Output along the height. + * To get the index values for the Gather, the following + * calculations are made. + */ + + /* Gather Index Calculations */ + /* Sequence - 0 1 2 3 0 1 2 3 0 1 2 3 0 1 2 3 0 1 2 3 0 1 2 3 ... 
*/ + xb_vecNx16U vecGatherOff = IVP_ANDNX16(IVP_SEQNX16(), 3); + xb_vecNx16 vecSelIdx = IVP_SEQNX16(); + /* To get the Select indexes as - 0 1 2 3 4 5 6 7 32 33 34 35 36.... */ + IVP_ADDNX16T(vecSelIdx, vecSelIdx, 24, IVP_NOTBN(IVP_LTRNI(8))); + /* To get - 0 0 0 0 1 1 1 1 2 2 2 2 3 3 3 3 4 4 4 4 5 5 5 5 ... */ + xb_vecNx16 vecSeqDiv4 = IVP_SRLINX16(IVP_SEQNX16(), 2); + /* Sequence - 0 1 2 3 d*P1 d*P1+1 d*P1+2 d*P1+3 */ + IVP_MULANX16PACKL(vecGatherOff, vecSeqDiv4, dilationX * inDataPitch1); + vecGatherOff = IVP_SELNX16(IVP_ADDNX16(vecGatherOff, stride * inDataPitch1), \ + vecGatherOff, vecSelIdx); + + xb_vecNx16 vecSelIdx2 = IVP_SEQNX16(); + /* To get the Select indexes as - 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 32 33 34 35 36.... */ + IVP_ADDNX16T(vecSelIdx2, vecSelIdx2, 16, IVP_NOTBN(IVP_LTRNI(16))); + + vecGatherOff = IVP_SELNX16(IVP_ADDNX16(vecGatherOff, stride * inDataPitch2), \ + vecGatherOff, vecSelIdx2); + + /* Final Index Pattern is - + * + * First 8 elements : + * 0 1 2 3 + * d*P1 d*P1+1 d*P1+2 d*P1+3 + * + * Second 8 elements : + * s*P1 s*P1+1 s*P1+2 s*P1+3 + * (s+d)*P1 (s+d)*P1+1 (s+d)*P1+2 (s+d)*P1+3 + * + * Third 8 elements : + * 0+(s*P2) 1+(s*P2) 2+(s*P2) 3+(s*P2) + * d*P1+(s*P2) d*P1+1+(s*P2) d*P1+2+(s*P2) d*P1+3+(s*P2) + * + * Last 8 elements : + * s*P1+(s*P2) s*P1+1+(s*P2) s*P1+2+(s*P2) s*P1+3+(s*P2) + * (s+d)*P1+(s*P2) (s+d)*P1+1+(s*P2) (s+d)*P1+2+(s*P2) (s+d)*P1+3+(s*P2) + * + */ + + xb_vecN_2x32v* restrict phvecBias; + xb_vec2Nx8* restrict pdvecCoeff1; + xb_vec2Nx8* restrict pdvecCoeff2; + xb_vec2Nx8* restrict pdvecOut; + int8_t* restrict pData1; + int8_t* restrict pData2; + + int32_t remCh = numInCh & 3; + + /*Generation of maskLut for handling cases when remCh is not equal to 0 */ + /*eg. 
if remCh is equal to 2 then sumMask is 0000FFFF */ + /* if remCh is equal to 3 then sumMask is 00FFFFFF */ + const uint32_t maskLut[3] = { 0xff, 0xff00, 0xff0000 }; + + uint8_t remCh1 = XT_SALT(2, remCh + 1); + uint8_t remCh2 = XT_SALT(3, remCh + 1); + + uint32_t sumMask = maskLut[0] + maskLut[1] * remCh1 + maskLut[2] * remCh2; + +#ifdef __XCC__ + XT_MEMW(); /* Adding Memory Wait as Gather and Normal Load/Stores are not synchronized */ +#endif + + /* Unrolled by 2 along both Output Width and Output Height. + * Also, unrolled along Input Channels by 4 and completely + * along the Kernel Width. Gathers are used for loading Input Data. + */ + + /* Loops Start */ + for (outCh = 0; outCh < numOutCh; outCh += 2 * XCHAL_IVPN_SIMD_WIDTH) /* Output Channels */ + { /* walk across the kernels */ + /* To handle corner case when number of output channels + * is not a multiple of 2 * XCHAL_IVPN_SIMD_WIDTH*/ + int32_t remainingOutCh = (numOutCh - outCh); +#ifdef DILATED_VQ_CONV + xb_vecNx16U outScaleDataEven, outScaleDataOdd; + /*Load output scale values*/ + VQ_INIT_OUTSCALE(pOutScaleData, remainingOutCh, outScaleDataEven, outScaleDataOdd); +#endif + for (y = 0; y < outH; y += 2) /* Image Height */ + { /* walk down the rows */ + /* Variable used to handle the corner case of OutHeight being odd */ + int32_t numY = XT_MIN(2, outH - y) - 1; + + for (x = 0; x < outW; x += 2) /* Image Width */ + { /* walk across the columns */ + xb_vecNx16U vecGatherOff1; + + /* Variable used to handle the corner case of Output Width being odd */ + int32_t numX = XT_MIN(2, outW - x) - 1; + + /* Output, Input and Coefficient Data Pointers */ + int8_t *pOut = pOutData + (x * outDataPitch1 + y * outDataPitch2) * bytesPerPixel; + int8_t *pData = pInData + (x * stride) * inDataPitch1 + (y * stride) * inDataPitch2; + int8_t *pCoeff = pCoeffData + outCh; + + /* Initialize accumulators with bias values */ + xb_vec2Nx24 daccSum1, daccSum2, daccSum3, daccSum4; + phvecBias = (xb_vecN_2x32v *) (pBiasData + 
outCh); + ACC_INIT_BIAS(phvecBias, remainingOutCh, daccSum1, daccSum2, daccSum3, daccSum4); + + /* Boolean vectors to handle the corner cases of Out Width and Height being odd */ + vboolN vbXY = IVP_LTRSN((16 * numY) + 8 * (numX + 1)); + + /* Pointer for Coefficient Load */ + pdvecCoeff1 = (xb_vec2Nx8 *) (pCoeff); + pdvecCoeff2 = (xb_vec2Nx8 *) (pCoeff + coeffPitch3); + + /* Assign valid address for predicated false lines */ + vecGatherOff1 = IVP_MOVNX16UT(vecGatherOff, 0, vbXY); + + /* Pointer for Input Data Load corresponding to ky = 0 */ + pData1 = pData; + + /* Pointer for Input Data Load corresponding to ky = 1 */ + pData2 = pData1 + (dilationY * inDataPitch2); + + for (inCh = 0; inCh < numInCh - 3; inCh += 4) /* Input Channels Loop */ + { + /* Gather Input Data corresponding to ky = 0 */ + xb_gsr gather1 = IVP_GATHERANX8S(pData1, vecGatherOff1); + xb_vec2Nx8 dvecData1 = IVP_GATHERD2NX8_L(gather1); + + /* Gather Input Data corresponding to ky = 1 */ + xb_gsr gather2 = IVP_GATHERANX8S(pData2, vecGatherOff1); + xb_vec2Nx8 dvecData2 = IVP_GATHERD2NX8_L(gather2); + + /* ky = 0, kx = 0 */ + /* Extracting scalar integers for QMULs */ + int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData1)), 0); + int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData1)), 2); + int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData1)), 4); + int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData1)), 6); + + /* 4 Aligned Vector Loads of coefficients */ + xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff1, coeffPitch1); + xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff1, coeffPitch1); + xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff1, coeffPitch1); + xb_vec2Nx8 dvecCoeff4; IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff1, coeffPitch2 - \ + 3 * coeffPitch1); + + IVP_MULQA2N8XR8(daccSum1, dvecCoeff4, 
dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1); + IVP_MULQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2); + IVP_MULQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3); + IVP_MULQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4); + + /* ky = 0, kx = 1 */ + /* Extracting scalar integers for QMULs */ + qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), \ + 1); + qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), \ + 3); + qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), \ + 5); + qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), \ + 7); + + /* 4 Aligned Vector Loads of coefficients */ + IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff1, coeffPitch1); + IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff1, coeffPitch1); + IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff1, coeffPitch1); + IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff1, coeffPitch1 - coeffPitch2); + + IVP_MULQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1); + IVP_MULQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2); + IVP_MULQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3); + IVP_MULQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4); + + /* ky = 1, kx = 0 */ + /* Extracting scalar integers for QMULs */ + qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData2)), 0); + qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData2)), 2); + qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData2)), 4); + qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData2)), 6); + + /* 4 Aligned Vector Loads of coefficients */ + IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff2, coeffPitch1); + 
IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff2, coeffPitch1); + IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff2, coeffPitch1); + IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff2, coeffPitch2 - 3 * coeffPitch1); + + IVP_MULQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1); + IVP_MULQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2); + IVP_MULQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3); + IVP_MULQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4); + + /* ky = 1, kx = 1 */ + /* Extracting scalar integers for QMULs */ + qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \ + 1); + qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \ + 3); + qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \ + 5); + qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \ + 7); + + /* 4 Aligned Vector Loads of coefficients */ + IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff2, coeffPitch1); + IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff2, coeffPitch1); + IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff2, coeffPitch1); + IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff2, coeffPitch1 - coeffPitch2); + + IVP_MULQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1); + IVP_MULQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2); + IVP_MULQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3); + IVP_MULQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4); + + pData1 += 4; + pData2 += 4; + } /* End Input Channels */ + + /* Handling Corner cases of Number of Input Channels not being multiple of 4 */ + if (inCh < numInCh) + { + int32_t remInCh = numInCh - inCh; + vboolN vbRemInCh = IVP_LTNX16(IVP_ANDNX16(IVP_SEQNX16(), 3), remInCh); + + /* Gather Input Data */ + xb_vec2Nx8 dvecData1 = 0; + xb_vec2Nx8 
dvecData2 = 0; + + /* Pointer for Input Data Load corresponding to ky = 0 */ + pData1 = pData + inCh; + + /* Pointer for Input Data Load corresponding to ky = 1 */ + pData2 = pData1 + (dilationY * inDataPitch2); + + /* Assign valid address for predicated false lines */ + vecGatherOff1 = IVP_MOVNX16UT(vecGatherOff, 0, IVP_ANDBN(vbRemInCh, vbXY)); + + /* Gather Input Data corresponding to ky = 0*/ + xb_gsr gather1 = IVP_GATHERANX8S(pData1, vecGatherOff1); + dvecData1 = IVP_GATHERD2NX8_L(gather1); + + /* Gather Input Data corresponding to ky = 1 */ + xb_gsr gather2 = IVP_GATHERANX8S(pData2, vecGatherOff1); + dvecData2 = IVP_GATHERD2NX8_L(gather2); + + /* ky = 0, kx = 0 */ + /* Extracting scalar integers for QMULs */ + int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData1)), 0); + int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData1)), 2); + int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData1)), 4); + int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData1)), 6); + + /* Aligned Vector Loads of coefficients */ + xb_vec2Nx8 dvecCoeff1; + IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff1, coeffPitch1 * remCh1); + xb_vec2Nx8 dvecCoeff2; + IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff1, coeffPitch1 * remCh2); + xb_vec2Nx8 dvecCoeff3; + IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff1, coeffPitch2 - (coeffPitch1 * (remCh1 + remCh2))); + + IVP_MULQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1 & sumMask); + IVP_MULQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2 & sumMask); + IVP_MULQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3 & sumMask); + IVP_MULQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4 & sumMask); + + /* ky = 0, kx = 1 */ + /* Extracting scalar integers for QMULs */ + qmulScalar1 = 
IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), \ + 1); + qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), \ + 3); + qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), \ + 5); + qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), \ + 7); + + /* Aligned Vector Loads of coefficients */ + IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff1, coeffPitch1 * remCh1); + IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff1, coeffPitch1 * remCh2); + IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff1, 0); + + IVP_MULQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1 & sumMask); + IVP_MULQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2 & sumMask); + IVP_MULQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3 & sumMask); + IVP_MULQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4 & sumMask); + + /* ky = 1, kx = 0 */ + /* Extracting scalar integers for QMULs */ + qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData2)), 0); + qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData2)), 2); + qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData2)), 4); + qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData2)), 6); + + /* Aligned Vector Loads of coefficients */ + IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff2, coeffPitch1 * remCh1); + IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff2, coeffPitch1 * remCh2); + IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff2, coeffPitch2 - (coeffPitch1 * (remCh1 + remCh2))); + + + IVP_MULQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1 & sumMask); + IVP_MULQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2 & sumMask); + IVP_MULQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3 & sumMask); + IVP_MULQA2N8XR8(daccSum4, 0, dvecCoeff3, 
dvecCoeff2, dvecCoeff1, qmulScalar4 & sumMask); + + /* ky = 1, kx = 1 */ + /* Extracting scalar integers for QMULs */ + qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \ + 1); + qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \ + 3); + qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \ + 5); + qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \ + 7); + + /* Aligned Vector Loads of coefficients */ + IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff2, coeffPitch1 * remCh1); + IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff2, coeffPitch1 * remCh2); + IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff2, 0); + + IVP_MULQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1 & sumMask); + IVP_MULQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2 & sumMask); + IVP_MULQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3 & sumMask); + IVP_MULQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4 & sumMask); + } /* End Input Channels Corner case Handling */ + + /* Pack, Output Scale, Output Shift and clamping */ + xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L; + xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H; +#ifdef DILATED_VQ_CONV + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); +#else + 
PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); +#endif + /* Store the output dvecOut1 along the output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOut + outCh * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh); + IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Store the output dvecOut2 along the output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1) * bytesPerPixel * numX); + IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numX); + IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numX); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Store the output dvecOut3 along the output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch2) * bytesPerPixel * numY); + IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numY); + IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numY); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Store the output dvecOut4 along the output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1 + outDataPitch2) * bytesPerPixel * numX * numY); + IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * \ + remainingOutCh * numX * numY); + IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * 2 * \ + 
(remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numX * numY); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + } /* End image width */ + } /* End image height */ + } /* End Output Channels */ + return(XAI_ERROR_STATUS()); +} + +/***************************************************************************** +* xaiConvolvedVQ3D_S_3x3_S8S8IXCa2_MOD_DWH +* **************************************************************************/ + +/****************************************************************************/ +/* Description : P6 optimized generic implementation for 3x3 MOD_DWH */ +/* 3D convolution. Based on pre-processor specifiers. Code */ +/* implementation is generated during preprocessing stage. */ +/* This method can be used to generate 3x3 MOD_DWH 3D */ +/* dilated convolution function and 3x3 MOD_DWH 3D VQ */ +/* dilated convolution function */ +/* stride is 1, 2 or 4 and must be the same along width */ +/* and height (enforced by the error checks below) */ +/* Inputs : Input Data Tile, Coeff Data Tile, Bias Array, */ +/* Output scale array (only when DILATED_VQ_CONV is */ +/* defined), CNN convolution params structure */ +/* Outputs : XI Error Code */ +/* InOuts : Output Tile */ +/* Assumptions : InData, CoeffData are S8 */ +/* biasArray is signed 32b, value not exceeding signed 24b */ +/* Output scale array is U16 */ +/* OutData is S8 / U8 / S16 */ +/* Kernel Size is 3x3xDxN */ +/* Input and Output are in DWH format */ +/* Coeff is in NDWH format */ +/* CoeffDim1Pitch is aligned to 2N (Ca2) */ +/****************************************************************************/ + +#ifdef DILATED_VQ_CONV +XAI_ERR_TYPE xaiConvolvedVQ3D_S_3x3_S8S8IXCa2_MOD_DWH( + const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param + ) +#else +XAI_ERR_TYPE xaiConvolved3D_S_3x3_S8S8IXCa2_MOD_DWH( + const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param + ) +#endif +{ + /* Error Checks */ + XAI_ERROR_CHECKS() + { 
+ XAI_CHECK_TILE3D_S8(inTile); + XAI_CHECK_CONV_OUTPUT_TILE3D(outTile); + XAI_CHECK_TILE4D_S8(coeffTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile); + XAI_CHECK_TILE4D_IN_DRAM_BOUNDARY(coeffTile); + XAI_CHECK_POINTER(param); + XAI_CHECK_ARRAY_S32(biasArray); + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTile); + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(coeffTile, outTile); + XAI_CHECK_KERNEL_SIZE(coeffTile, 3); + XAI_CHECK_ERROR((XAI_CNN_CONV_GET_STRIDEX(param) == XAI_CNN_CONV_GET_STRIDEY(param)), \ + XAI_ERR_BADARG, "Stride along width = %hhu and height = %hhu\nStride along width and height should be equal", \ + XAI_CNN_CONV_GET_STRIDEX(param), XAI_CNN_CONV_GET_STRIDEY(param)); + XAI_CHECK_ERROR((XAI_CNN_CONV_GET_STRIDE(param) == 1) || \ + (XAI_CNN_CONV_GET_STRIDE(param) == 2) || \ + (XAI_CNN_CONV_GET_STRIDE(param) == 4), XAI_ERR_BADARG, \ + "\nStride = %hhu, value should be 1, 2 or 4", XAI_CNN_CONV_GET_STRIDE(param)); + XAI_CHECK_ERROR((XAI_CNN_CONV_GET_DILATIONX(param) > 0 && XAI_CNN_CONV_GET_DILATIONY(param) > 0), \ + XAI_ERR_BADARG, "dilation parameter has to be >= 1"); + XAI_CHECK_TILE4D_IALIGNMENT_2NX8(coeffTile); + XAI_CHECK_TILE3D_DATA_ORDER(inTile, XAI_DWH); + XAI_CHECK_TILE3D_DATA_ORDER(outTile, XAI_DWH); + XAI_CHECK_TILE4D_DATA_ORDER(coeffTile, XAI_NDWH); + XAI_CHECK_TILE3D_EDGE2(inTile, 1 + 1 * (XAI_CNN_CONV_GET_DILATIONX(param) - 1), 1 + 1 * (XAI_CNN_CONV_GET_DILATIONY(param) - 1)); + XAI_CHECK_CONSISTENCY_MOD_DWH(inTile, coeffTile, biasArray, outTile, param); + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_ACCUM_SHIFT(param) < 24, \ + XAI_ERR_NORM, "\nThe accumulator shift = %hhu, value should be less than 24", \ + XAI_CNN_CONV_GET_ACCUM_SHIFT(param)); + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_OUTPUT_SHIFT(param) < 32, \ + XAI_ERR_NORM, "\nThe output shift = %hhu, value should be less than 32", \ + XAI_CNN_CONV_GET_OUTPUT_SHIFT(param)); + XAI_CHECK_CONV_RELU_LIMITS_IX(param, outTile); +#ifdef DILATED_VQ_CONV + 
XAI_CHECK_ARRAY_U16(outputScaleArray); + XAI_CHECK_ERROR(XAI_ARRAY_GET_WIDTH(outputScaleArray) >= XAI_TILE4D_GET_DIM1(coeffTile), XAI_ERR_DATASIZE, \ + "\nWidth of Output Scale Array = %d, Number of Kernels = %d\nWidth of Output Scale Array should be greater than or equal to Number of Kernels", \ + XAI_ARRAY_GET_WIDTH(outputScaleArray), XAI_TILE4D_GET_DIM1(coeffTile)); +#endif + } +#ifndef DILATED_VQ_CONV + /* Output scale of 0 (non-VQ build only): skip the convolution and fill outTile with a constant - 0, or 0 clamped to [reluMin, reluMax] when the ReLU flag is set */ if (XAI_CNN_CONV_GET_OUTPUT_SCALE(param) == 0) + { + int32_t fillValue; + int32_t reluFlag = XAI_CNN_CONV_GET_FLAG_RELU(param); + fillValue = reluFlag ? (CLAMP(0, XAI_CNN_CONV_GET_RELU_MIN(param), XAI_CNN_CONV_GET_RELU_MAX(param))) : 0; + return(xaiFillTile3D(outTile, fillValue, 0)); + } +#endif + const uint8_t dilationX = XAI_CNN_CONV_GET_DILATIONX(param); + const uint8_t dilationY = XAI_CNN_CONV_GET_DILATIONY(param); + + /* Calling further optimized function if dim1Size == dim1Pitch and dilation is 1 along both width and height. The _x4 variant is chosen when (input depth * kernel width) is a multiple of 4. NOTE(review): the helper's return value is not captured here; the status returned is whatever XAI_ERROR_STATUS() yields - confirm the helpers report errors through that mechanism. */ + if (XAI_TILE3D_GET_DIM1(inTile) == XAI_TILE3D_GET_DIM1_PITCH(inTile) && dilationX == 1 && dilationY == 1) + { + if ((XAI_TILE3D_GET_DIM1(inTile) * XAI_TILE4D_GET_DIM3(coeffTile)) % 4 == 0) + { +#ifdef DILATED_VQ_CONV + convolvedVQ3D_S_MxN_S8S8IXCa2_MOD_DWH_contiguous_depth_x4(inTile, coeffTile, biasArray, \ + outputScaleArray, outTile, param); +#else + convolved3D_S_MxN_S8S8IXCa2_MOD_DWH_contiguous_depth_x4(inTile, coeffTile, biasArray, \ + outTile, param); +#endif + } + else + { +#ifdef DILATED_VQ_CONV + convolvedVQ3D_S_MxN_S8S8IXCa2_MOD_DWH_contiguous_depth(inTile, \ + coeffTile, biasArray, outputScaleArray, outTile, param); +#else + convolved3D_S_MxN_S8S8IXCa2_MOD_DWH_contiguous_depth(inTile, \ + coeffTile, biasArray, outTile, param); +#endif + } + return(XAI_ERROR_STATUS()); + } + + /* Getting parameters from the tile structures */ + const int32_t outW = XAI_TILE3D_GET_DIM2(outTile); + const int32_t outH = XAI_TILE3D_GET_DIM3(outTile); + const int32_t numInCh = XAI_TILE3D_GET_DIM1(inTile); + const int32_t numOutCh = XAI_TILE3D_GET_DIM1(outTile); + + 
XAI_ERROR_CHECKS_CONTINUE() + { + XAI_CHECK_TILE3D_FITS_IN_SINGLE_DRAM(inTile); + /* Max value of Gather Offset is ((stride*min(1, outW-1) + 2 * dilationX) * inDataPitch1 + + * min(3, numInCh - 1)) */ + XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM1_PITCH(inTile) < \ + ((USHRT_MAX - XT_MIN(3, numInCh - 1)) / (XAI_CNN_CONV_GET_STRIDE(param) * \ + XT_MIN(1, outW - 1) + 2 * XAI_CNN_CONV_GET_DILATION(param))), \ + XAI_ERR_BADARG, "\ndim1Pitch value of inTile = %d, should be less than Gather Offset(16-bit limit) - %d", \ + XAI_TILE3D_GET_DIM1_PITCH(inTile), \ + ((USHRT_MAX - XT_MIN(3, numInCh - 1)) / (XAI_CNN_CONV_GET_STRIDE(param) * \ + XT_MIN(1, outW - 1) + 2 * XAI_CNN_CONV_GET_DILATION(param)))); + } + + /* CNN convolution parameters */ + const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param); + const uint8_t outShiftU = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param); + const uint8_t enableReLu = XAI_CNN_CONV_GET_FLAG_RELU(param); + const uint8_t stride = XAI_CNN_CONV_GET_STRIDE(param); + + /* Data Pointers of input, output, coefficient and bias data */ + int8_t *pInData = (int8_t *) XAI_TILE3D_GET_DATA_PTR(inTile); + int8_t *pOutData = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile); + int8_t *pCoeffData = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile); + int32_t *pBiasData = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray); +#ifdef DILATED_VQ_CONV + xb_vecNx16U* restrict pOutScaleData = (xb_vecNx16U *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray); +#else + const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param); +#endif + /* Pitches of Coefficient Data (NDWH) in dim1, dim2 and dim3 */ + const int32_t coeffPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTile); + const int32_t coeffPitch2 = XAI_TILE4D_GET_DIM2_PITCH(coeffTile); + const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile); + + /* Pitches of Input Data (DWH) in dim1 and dim2 */ + const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile); + const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile); + + /* 
Pitch of Output Data (DWH) in dim1 and dim2 */ + const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile); + const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile); + + const int32_t kWidthU = XAI_TILE4D_GET_DIM3(coeffTile); + const int32_t kHeightU = XAI_TILE4D_GET_DIM4(coeffTile); + + int32_t dilatedKWidth = dilationX * (kWidthU - 1) + 1; + int32_t dilatedKHeight = dilationY * (kHeightU - 1) + 1; + + /* move to start of edge data only when input is already padded. */ + pInData = &pInData[-(int32_t) ((dilatedKHeight / 2) * inDataPitch2 + (dilatedKWidth / 2) * inDataPitch1)]; + + /* Setting the limits for output data according to ReLu Flag and outTileType */ + int32_t minLim, maxLim; + if (enableReLu) + { + minLim = XAI_CNN_CONV_GET_RELU_MIN(param); + maxLim = XAI_CNN_CONV_GET_RELU_MAX(param); + } + else + { + minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \ + SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0); + maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \ + : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX); + } + const int8_t typeFlag = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0; + const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile); + + /* Variable Declarations */ + int32_t inCh, outCh, x, y, ky; + valign vaOutData = IVP_ZALIGN(); + + /* Only 2 Gathers are used in this approach to get the + * Input Data for 4 Output Vectors. In each Gather, + * 24 elements are read, where each 12 of them correspond + * to one vector of Output along the width. To get the + * index values for the Gather, the following calculations + * are made. + */ + + /* Gather Index Calculations */ + /* Sequence - 0 1 2 3 0 1 2 3 0 1 2 3 0 1 2 3 0 1 2 3 0 1 2 3 ... */ + xb_vecNx16U vecGatherOff = IVP_ANDNX16(IVP_SEQNX16(), 3); + xb_vecNx16 vecSelIdx = IVP_SEQNX16(); + /* To get the Select indexes as - 0 1 2 3 4...11 32 33 34 35 36.... 
*/ + IVP_ADDNX16T(vecSelIdx, vecSelIdx, 20, IVP_NOTBN(IVP_LTRNI(12))); + /* To get - 0 0 0 0 1 1 1 1 2 2 2 2 3 3 3 3 4 4 4 4 5 5 5 5 ... */ + xb_vecNx16 vecSeqDiv4 = IVP_SRLINX16(IVP_SEQNX16(), 2); + /* Sequence - 0 1 2 3 d*P1 d*P1+1 d*P1+2 d*P1+3 2.d*P1 2.d*P1+1 2.d*P1+2 2.d*P1+3 ... */ + IVP_MULANX16PACKL(vecGatherOff, vecSeqDiv4, dilationX * inDataPitch1); + vecGatherOff = IVP_SELNX16(IVP_ADDNX16(vecGatherOff, stride * inDataPitch1), \ + vecGatherOff, vecSelIdx); + /* Final Index Pattern is - + * 0 1 2 3 d*P1 d*p1+1 d*P1+2 d*P1+3 d*2*P1 d*2*P1+1 d*2*P1+2 d*2*P1+3 + * s*P1 s*P1+1 s*P1+2 s*P1+3 (s+1*d)*P1 (s+1*d)*P1+1 (s+1*d)*P1+2 (s+1*d)*P1+3 + * (s+2*d)*P1 (s+2*d)*P1+1 (s+2*d)*P1+2 (s+2*d)*P1+3 */ + + xb_vecN_2x32v* restrict phvecBias; + xb_vec2Nx8* restrict pdvecCoeff; + xb_vec2Nx8* restrict pdvecOut; + int8_t* restrict pData1; + int8_t* restrict pData2; + + int32_t remCh = numInCh & 3; + + /* Generation of sumMask from maskLut for the case when remCh (numInCh & 3) is not 0; it keeps only the bytes of the packed 32-bit QMUL scalar that hold valid input channels. */ + /* Assuming XT_SALT(a, b) evaluates to (a < b): remCh1 = (remCh >= 2), remCh2 = (remCh == 3), so */ + /* remCh == 1 gives sumMask 0x000000FF, remCh == 2 gives 0x0000FFFF, remCh == 3 gives 0x00FFFFFF. remCh1/remCh2 also gate the coefficient pointer advance in the corner-case path. */ + const uint32_t maskLut[3] = { 0xff, 0xff00, 0xff0000 }; + + uint8_t remCh1 = XT_SALT(2, remCh + 1); + uint8_t remCh2 = XT_SALT(3, remCh + 1); + + uint32_t sumMask = maskLut[0] + maskLut[1] * remCh1 + maskLut[2] * remCh2; + +#ifdef __XCC__ + XT_MEMW(); /* Adding Memory Wait as Gather and Normal Load/Stores are not synchronized */ +#endif + + /* Unrolled by 2 along both Output Width and Output Height. + * Also, unrolled along Input Channels by 4 and completely + * along the Kernel Width. Gathers are used for loading Input Data. 
+ */ + + /* Loops Start */ + for (outCh = 0; outCh < numOutCh; outCh += 2 * XCHAL_IVPN_SIMD_WIDTH) /* Output Channels */ + { /* walk across the kernels */ + /* To handle corner case when number of output channels + * is not a multiple of 2 * XCHAL_IVPN_SIMD_WIDTH*/ + int32_t remainingOutCh = (numOutCh - outCh); +#ifdef DILATED_VQ_CONV + xb_vecNx16U outScaleDataEven, outScaleDataOdd; + /*Load output scale values*/ + VQ_INIT_OUTSCALE(pOutScaleData, remainingOutCh, outScaleDataEven, outScaleDataOdd); +#endif + for (y = 0; y < outH; y += 2) /* Image Height */ + { /* walk down the rows */ + /* Variable used to handle the corner case of OutHeight being odd */ + int32_t numY = XT_MIN(2, outH - y) - 1; + + for (x = 0; x < outW; x += 2) /* Image Width */ + { /* walk across the columns */ + xb_vecNx16U vecGatherOff1; + xb_vecNx16U vecGatherOff2; + + /* Variable used to handle the corner case of Output Width being odd */ + int32_t numX = XT_MIN(2, outW - x) - 1; + + /* Output, Input and Coefficient Data Pointers */ + int8_t *pOut = pOutData + (x * outDataPitch1 + y * outDataPitch2) * bytesPerPixel; + int8_t *pData = pInData + (x * stride) * inDataPitch1 + (y * stride) * inDataPitch2; + int8_t *pCoeff = pCoeffData + outCh; + + /* Initialize accumulators with bias values */ + xb_vec2Nx24 daccSum1, daccSum2, daccSum3, daccSum4; + phvecBias = (xb_vecN_2x32v *) (pBiasData + outCh); + ACC_INIT_BIAS(phvecBias, remainingOutCh, daccSum1, daccSum2, daccSum3, daccSum4); + + /* Boolean vectors to handle the corner cases of Out Width and Height being odd */ + vboolN vbX = IVP_LTRSN(12 * (numX + 1)); + vboolN vbY = IVP_LTRSN(12 * (numX + 1) * numY); + + for (ky = 0; ky < 3; ky++) /* Kernel Height Loop */ + { + /* Pointer for Input Data Load */ + pData1 = pData + ky * dilationY * inDataPitch2; + pData2 = pData1 + (stride * inDataPitch2 * numY); + /* Assign valid address for predicated false lines */ + vecGatherOff1 = IVP_MOVNX16UT(vecGatherOff, 0, vbX); + vecGatherOff2 = 
IVP_MOVNX16UT(vecGatherOff, 0, vbY); + + /* Pointer for Coefficient Load */ + pdvecCoeff = (xb_vec2Nx8 *) (pCoeff + ky * coeffPitch3); + for (inCh = 0; inCh < numInCh - 3; inCh += 4) /* Input Channels Loop */ + { + /* Gather Input Data */ + xb_gsr gather1 = IVP_GATHERANX8S(pData1, vecGatherOff1); + xb_vec2Nx8 dvecData1 = IVP_GATHERD2NX8_L(gather1); + xb_gsr gather2 = IVP_GATHERANX8S(pData2, vecGatherOff2); + xb_vec2Nx8 dvecData2 = IVP_GATHERD2NX8_L(gather2); + + + pData1 += 4; + pData2 += 4; + + /* kx = 1 */ + /* Extracting scalar integers for QMULs */ + int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData1)), 0); + int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData1)), 3); + int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData2)), 0); + int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData2)), 3); + + /* 4 Aligned Vector Loads of coefficients */ + xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1); + xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1); + xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1); + xb_vec2Nx8 dvecCoeff4; IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch2 - 3 * \ + coeffPitch1); + + IVP_MULQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1); + IVP_MULQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2); + IVP_MULQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3); + IVP_MULQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4); + + /* kx = 2 */ + /* Extracting scalar integers for QMULs */ + qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), \ + 1); + qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), \ + 4); + qmulScalar3 = 
IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \ + 1); + qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \ + 4); + + /* 4 Aligned Vector Loads of coefficients */ + IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1); + IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1); + IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1); + IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch2 - 3 * coeffPitch1); + + IVP_MULQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1); + IVP_MULQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2); + IVP_MULQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3); + IVP_MULQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4); + + /* kx = 3 */ + /* Extracting scalar integers for QMULs */ + qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), \ + 2); + qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), \ + 5); + qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \ + 2); + qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \ + 5); + + /* 4 Aligned Vector Loads of coefficients */ + IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1); + IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1); + IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1); + IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch1 - 2 * coeffPitch2); + + IVP_MULQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1); + IVP_MULQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2); + IVP_MULQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3); + IVP_MULQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4); + } /* End Input Channels */ + + /* Handling Corner cases of Number of Input Channels 
not being multiple of 4 */ + if (inCh < numInCh) + { + int32_t remInCh = numInCh - inCh; + vboolN vbRemInCh = IVP_LTNX16(IVP_ANDNX16(IVP_SEQNX16(), 3), remInCh); + + /* Gather Input Data */ + xb_vec2Nx8 dvecData1 = 0; + xb_vec2Nx8 dvecData2 = 0; + /* Assign valid address for predicated false lines */ + vecGatherOff1 = IVP_MOVNX16UT(vecGatherOff, 0, IVP_ANDBN(vbRemInCh, vbX)); + vecGatherOff2 = IVP_MOVNX16UT(vecGatherOff, 0, IVP_ANDBN(vbRemInCh, vbY)); + + xb_gsr gather1 = IVP_GATHERANX8S(pData1, vecGatherOff1); + dvecData1 = IVP_GATHERD2NX8_L(gather1); + xb_gsr gather2 = IVP_GATHERANX8S(pData2, vecGatherOff2); + dvecData2 = IVP_GATHERD2NX8_L(gather2); + + /* kx = 1 */ + /* Extracting scalar integers for QMULs */ + int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData1)), 0); + int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData1)), 3); + int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData2)), 0); + int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData2)), 3); + + /* Aligned Vector Loads of coefficients */ + xb_vec2Nx8 dvecCoeff1; + IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1 * remCh1); + xb_vec2Nx8 dvecCoeff2; + IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1 * remCh2); + xb_vec2Nx8 dvecCoeff3; + IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch2 - (coeffPitch1 * (remCh1 + remCh2))); + + + IVP_MULQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1 & sumMask); + IVP_MULQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2 & sumMask); + IVP_MULQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3 & sumMask); + IVP_MULQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4 & sumMask); + + /* kx = 2 */ + /* Extracting scalar integers for QMULs */ + qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), \ 
+ 1); + qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), \ + 4); + qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \ + 1); + qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \ + 4); + + /* Aligned Vector Loads of coefficients */ + IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1 * remCh1); + IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1 * remCh2); + IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch2 - coeffPitch1 * (remCh1 + remCh2)); + + IVP_MULQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1 & sumMask); + IVP_MULQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2 & sumMask); + IVP_MULQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3 & sumMask); + IVP_MULQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4 & sumMask); + + /* kx = 3 */ + /* Extracting scalar integers for QMULs */ + qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), \ + 2); + qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), \ + 5); + qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \ + 2); + qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \ + 5); + + /* Aligned Vector Loads of coefficients */ + IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1 * remCh1); + IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1 * remCh2); + IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, 0); + + IVP_MULQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1 & sumMask); + IVP_MULQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2 & sumMask); + IVP_MULQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3 & sumMask); + IVP_MULQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4 & sumMask); + } /* End Input Channels Corner case Handling */ 
+ } /* End Kernel Height Loop */ + + /* Pack, Output Scale, Output Shift and clamping */ + xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L; + xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H; +#ifdef DILATED_VQ_CONV + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); +#else + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); +#endif + /* Store the output dvecOut1 along the output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOut + outCh * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh); + IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Store the output dvecOut2 along the output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1) * bytesPerPixel * numX); + IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numX); + 
IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numX); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Store the output dvecOut3 along the output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch2) * bytesPerPixel * numY); + IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numY); + IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numY); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Store the output dvecOut4 along the output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1 + outDataPitch2) * bytesPerPixel * numX * numY); + IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numX * \ + numY); + IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * 2 * + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numX * numY); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + } /* End image width */ + } /* End image height */ + } /* End Output Channels */ + return(XAI_ERROR_STATUS()); +} + +/***************************************************************************** +* xaiConvolvedVQ3D_S_3x3_U8S8IXCa2_MOD_DWH +* **************************************************************************/ + +/****************************************************************************/ +/* Description : P6 optimized generic implementation for 3x3 MOD_DWH */ +/* 3D convolution. Based on pre-processor specifiers. Code */ +/* implementation is generated during preprocessing stage. 
*/ +/* This method can be used to generate 3x3 MOD_DWH 3D */ +/* dilated convolution function and 3x3 MOD_DWH 3D VQ */ +/* dilated convolution function */ +/* stride equal to 1 */ +/* Inputs : Input Data Tile, Coeff Data Tile, Bias Array, */ +/* Output scale array, CNN convolution params structure */ +/* Outputs : XI Error Code */ +/* InOuts : Output Tile */ +/* Assumptions : InData is U8, CoeffData is S8 */ +/* biasArray is signed 32b, value not exceeding signed 24b */ +/* Output scale array is U16 */ +/* OutData is S8 / U8 / S16 */ +/* Kernel Size is 3x3xDxN */ +/* Input and Output are in DWH format */ +/* Coeff is in NDWH format */ +/* CoeffDim1Pitch is aligned to 2N (Ca2) */ +/****************************************************************************/ + +#ifdef DILATED_VQ_CONV +XAI_ERR_TYPE xaiConvolvedVQ3D_S_3x3_U8S8IXCa2_MOD_DWH( + const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param + ) +#else +XAI_ERR_TYPE xaiConvolved3D_S_3x3_U8S8IXCa2_MOD_DWH( + const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param + ) +#endif +{ + /* Error Checks */ + XAI_ERROR_CHECKS() + { + XAI_CHECK_TILE3D_U8(inTile); + XAI_CHECK_CONV_OUTPUT_TILE3D(outTile); + XAI_CHECK_TILE4D_S8(coeffTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile); + XAI_CHECK_TILE4D_IN_DRAM_BOUNDARY(coeffTile); + XAI_CHECK_POINTER(param); + XAI_CHECK_ARRAY_S32(biasArray); + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTile); + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(coeffTile, outTile); + XAI_CHECK_KERNEL_SIZE(coeffTile, 3); + XAI_CHECK_ERROR((XAI_CNN_CONV_GET_STRIDEX(param) == XAI_CNN_CONV_GET_STRIDEY(param)), \ + XAI_ERR_BADARG, "Stride along width = %hhu and height = %hhu\nStride along width and height should be equal", \ + 
XAI_CNN_CONV_GET_STRIDEX(param), XAI_CNN_CONV_GET_STRIDEY(param)); + XAI_CHECK_ERROR((XAI_CNN_CONV_GET_STRIDE(param) == 1) || \ + (XAI_CNN_CONV_GET_STRIDE(param) == 2) || \ + (XAI_CNN_CONV_GET_STRIDE(param) == 4), XAI_ERR_BADARG, \ + "\nStride = %hhu, value should be 1, 2 or 4", XAI_CNN_CONV_GET_STRIDE(param)); + XAI_CHECK_ERROR((XAI_CNN_CONV_GET_DILATIONX(param) > 0 && XAI_CNN_CONV_GET_DILATIONY(param) > 0), \ + XAI_ERR_BADARG, "dilation parameter has to be >= 1"); + XAI_CHECK_TILE4D_IALIGNMENT_2NX8(coeffTile); + XAI_CHECK_TILE3D_DATA_ORDER(inTile, XAI_DWH); + XAI_CHECK_TILE3D_DATA_ORDER(outTile, XAI_DWH); + XAI_CHECK_TILE4D_DATA_ORDER(coeffTile, XAI_NDWH); + XAI_CHECK_TILE3D_EDGE2(inTile, 1 + 1 * (XAI_CNN_CONV_GET_DILATIONX(param) - 1), 1 + 1 * (XAI_CNN_CONV_GET_DILATIONY(param) - 1)); + XAI_CHECK_CONSISTENCY_MOD_DWH(inTile, coeffTile, biasArray, outTile, param); + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_ACCUM_SHIFT(param) < 24, \ + XAI_ERR_NORM, "\nThe accumulator shift = %hhu, value should be less than 24", \ + XAI_CNN_CONV_GET_ACCUM_SHIFT(param)); + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_OUTPUT_SHIFT(param) < 32, \ + XAI_ERR_NORM, "\nThe output shift = %hhu, value should be less than 32", \ + XAI_CNN_CONV_GET_OUTPUT_SHIFT(param)); + XAI_CHECK_CONV_RELU_LIMITS_IX(param, outTile); +#ifdef DILATED_VQ_CONV + XAI_CHECK_ARRAY_U16(outputScaleArray); + XAI_CHECK_ERROR(XAI_ARRAY_GET_WIDTH(outputScaleArray) >= XAI_TILE4D_GET_DIM1(coeffTile), XAI_ERR_DATASIZE, \ + "\nWidth of Output Scale Array = %d, Number of Kernels = %d\nWidth of Output Scale Array should be greater than or equal to Number of Kernels", \ + XAI_ARRAY_GET_WIDTH(outputScaleArray), XAI_TILE4D_GET_DIM1(coeffTile)); +#endif + } +#ifndef DILATED_VQ_CONV + if (XAI_CNN_CONV_GET_OUTPUT_SCALE(param) == 0) + { + int32_t fillValue; + int32_t reluFlag = XAI_CNN_CONV_GET_FLAG_RELU(param); + fillValue = reluFlag ? 
(CLAMP(0, XAI_CNN_CONV_GET_RELU_MIN(param), XAI_CNN_CONV_GET_RELU_MAX(param))) : 0; + return(xaiFillTile3D(outTile, fillValue, 0)); + } +#endif + const uint8_t dilationX = XAI_CNN_CONV_GET_DILATIONX(param); + const uint8_t dilationY = XAI_CNN_CONV_GET_DILATIONY(param); + +#ifdef IVP_MULSUQA2N8XR8 + /* Calling further optimized function if dim1Size == dim1Pitch */ + if (XAI_TILE3D_GET_DIM1(inTile) == XAI_TILE3D_GET_DIM1_PITCH(inTile) && dilationX == 1 && dilationY == 1) + { + if ((XAI_TILE3D_GET_DIM1(inTile) * XAI_TILE4D_GET_DIM3(coeffTile)) % 4 == 0) + { +#ifdef DILATED_VQ_CONV + convolvedVQ3D_S_MxN_U8S8IXCa2_MOD_DWH_contiguous_depth_x4(inTile, coeffTile, biasArray, \ + outputScaleArray, outTile, param); +#else + convolved3D_S_MxN_U8S8IXCa2_MOD_DWH_contiguous_depth_x4(inTile, coeffTile, biasArray, \ + outTile, param); +#endif + } + else + { +#ifdef DILATED_VQ_CONV + convolvedVQ3D_S_MxN_U8S8IXCa2_MOD_DWH_contiguous_depth(inTile, \ + coeffTile, biasArray, outputScaleArray, outTile, param); +#else + convolved3D_S_MxN_U8S8IXCa2_MOD_DWH_contiguous_depth(inTile, \ + coeffTile, biasArray, outTile, param); +#endif + } + return(XAI_ERROR_STATUS()); + } +#endif //#ifdef IVP_MULSUQA2N8XR8 + /* Getting parameters from the tile structures */ + const int32_t outW = XAI_TILE3D_GET_DIM2(outTile); + const int32_t outH = XAI_TILE3D_GET_DIM3(outTile); + const int32_t numInCh = XAI_TILE3D_GET_DIM1(inTile); + const int32_t numOutCh = XAI_TILE3D_GET_DIM1(outTile); + + XAI_ERROR_CHECKS_CONTINUE() + { + XAI_CHECK_TILE3D_FITS_IN_SINGLE_DRAM(inTile); + /* Max value of Gather Offset is ((stride*min(1, outW-1) + 2 * dilationX) * inDataPitch1 + + * min(3, numInCh - 1)) */ + XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM1_PITCH(inTile) < \ + ((USHRT_MAX - XT_MIN(3, numInCh - 1)) / (XAI_CNN_CONV_GET_STRIDE(param) * \ + XT_MIN(1, outW - 1) + 2 * XAI_CNN_CONV_GET_DILATION(param))), \ + XAI_ERR_BADARG, "\ndim1Pitch value of inTile = %d, should be less than Gather Offset(16-bit limit) - %d", \ + 
XAI_TILE3D_GET_DIM1_PITCH(inTile), \ + ((USHRT_MAX - XT_MIN(3, numInCh - 1)) / (XAI_CNN_CONV_GET_STRIDE(param) * \ + XT_MIN(1, outW - 1) + 2 * XAI_CNN_CONV_GET_DILATION(param)))); + } + + /* CNN convolution parameters */ + const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param); + const uint8_t outShiftU = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param); + const uint8_t enableReLu = XAI_CNN_CONV_GET_FLAG_RELU(param); + const uint8_t stride = XAI_CNN_CONV_GET_STRIDE(param); + + /* Data Pointers of input, output, coefficient and bias data */ + uint8_t *pInData = (uint8_t *) XAI_TILE3D_GET_DATA_PTR(inTile); + int8_t *pOutData = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile); + int8_t *pCoeffData = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile); + int32_t *pBiasData = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray); +#ifdef DILATED_VQ_CONV + xb_vecNx16U* restrict pOutScaleData = (xb_vecNx16U *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray); +#else + const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param); +#endif + /* Pitches of Coefficient Data (NDWH) in dim1, dim2 and dim3 */ + const int32_t coeffPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTile); + const int32_t coeffPitch2 = XAI_TILE4D_GET_DIM2_PITCH(coeffTile); + const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile); + + /* Pitches of Input Data (DWH) in dim1 and dim2 */ + const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile); + const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile); + + /* Pitch of Output Data (DWH) in dim1 and dim2 */ + const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile); + const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile); + + const int32_t kWidthU = XAI_TILE4D_GET_DIM3(coeffTile); + const int32_t kHeightU = XAI_TILE4D_GET_DIM4(coeffTile); + + int32_t dilatedKWidth = dilationX * (kWidthU - 1) + 1; + int32_t dilatedKHeight = dilationY * (kHeightU - 1) + 1; + + /* move to start of edge data only when input is already padded. 
*/ + pInData = &pInData[-(int32_t) ((dilatedKHeight / 2) * inDataPitch2 + (dilatedKWidth / 2) * inDataPitch1)]; + + /* Setting the limits for output data according to ReLu Flag and outTileType */ + int32_t minLim, maxLim; + if (enableReLu) + { + minLim = XAI_CNN_CONV_GET_RELU_MIN(param); + maxLim = XAI_CNN_CONV_GET_RELU_MAX(param); + } + else + { + minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \ + SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0); + maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \ + : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX); + } + const int8_t typeFlag = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0; + const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile); + + /* Variable Declarations */ + int32_t inCh, outCh, x, y, ky; + valign vaOutData = IVP_ZALIGN(); + + /* Only 2 Gathers are used in this approach to get the + * Input Data for 4 Output Vectors. In each Gather, + * 24 elements are read, where each 12 of them correspond + * to one vector of Output along the width. To get the + * index values for the Gather, the following calculations + * are made. + */ + + /* Gather Index Calculations */ + /* Sequence - 0 1 2 3 0 1 2 3 0 1 2 3 0 1 2 3 0 1 2 3 0 1 2 3 ... */ + xb_vecNx16U vecGatherOff = IVP_ANDNX16(IVP_SEQNX16(), 3); + xb_vecNx16 vecSelIdx = IVP_SEQNX16(); + /* To get the Select indexes as - 0 1 2 3 4...11 32 33 34 35 36.... */ + IVP_ADDNX16T(vecSelIdx, vecSelIdx, 20, IVP_NOTBN(IVP_LTRNI(12))); + /* To get - 0 0 0 0 1 1 1 1 2 2 2 2 3 3 3 3 4 4 4 4 5 5 5 5 ... */ + xb_vecNx16 vecSeqDiv4 = IVP_SRLINX16(IVP_SEQNX16(), 2); + /* Sequence - 0 1 2 3 d*P1 d*P1+1 d*P1+2 d*P1+3 2.d*P1 2.d*P1+1 2.d*P1+2 2.d*P1+3 ... 
*/ + IVP_MULANX16PACKL(vecGatherOff, vecSeqDiv4, dilationX * inDataPitch1); + vecGatherOff = IVP_SELNX16(IVP_ADDNX16(vecGatherOff, stride * inDataPitch1), \ + vecGatherOff, vecSelIdx); + /* Final Index Pattern is - + * 0 1 2 3 d*P1 d*P1+1 d*P1+2 d*P1+3 d*2*P1 d*2*P1+1 d*2*P1+2 d*2*P1+3 + * s*P1 s*P1+1 s*P1+2 s*P1+3 (s+1*d)*P1 (s+1*d)*P1+1 (s+1*d)*P1+2 (s+1*d)*P1+3 + * (s+2*d)*P1 (s+2*d)*P1+1 (s+2*d)*P1+2 (s+2*d)*P1+3 */ + + xb_vecN_2x32v* restrict phvecBias; + xb_vec2Nx8* restrict pdvecCoeff; + xb_vec2Nx8* restrict pdvecOut; + uint8_t* restrict pData1; + uint8_t* restrict pData2; + + int32_t remCh = numInCh & 3; + + /*Generation of maskLut for handling cases when remInCh is not equal to 0 */ + /*eg. if remInCh is equal to 1 then sumMask is 000000FF */ + /* if remInCh is equal to 2 then sumMask is 0000FFFF, if 3 then 00FFFFFF */ + const uint32_t maskLut[3] = { 0xff, 0xff00, 0xff0000 }; + uint8_t remCh1 = XT_SALT(2, remCh + 1); + uint8_t remCh2 = XT_SALT(3, remCh + 1); + uint32_t sumMask = maskLut[0] + maskLut[1] * remCh1 + maskLut[2] * remCh2; + + +#ifdef __XCC__ + XT_MEMW(); /* Adding Memory Wait as Gather and Normal Load/Stores are not synchronized */ +#endif + + /* Unrolled by 2 along both Output Width and Output Height. + * Also, unrolled along Input Channels by 4 and completely + * along the Kernel Width. Gathers are used for loading Input Data. 
+ */ + + /* Loops Start */ + for (outCh = 0; outCh < numOutCh; outCh += 2 * XCHAL_IVPN_SIMD_WIDTH) /* Output Channels */ + { /* walk across the kernels */ + /* To handle corner case when number of output channels + * is not a multiple of 2 * XCHAL_IVPN_SIMD_WIDTH*/ + int32_t remainingOutCh = (numOutCh - outCh); +#ifdef DILATED_VQ_CONV + xb_vecNx16U outScaleDataEven, outScaleDataOdd; + /*Load output scale values*/ + VQ_INIT_OUTSCALE(pOutScaleData, remainingOutCh, outScaleDataEven, outScaleDataOdd); +#endif + for (y = 0; y < outH; y += 2) /* Image Height */ + { /* walk down the rows */ + /* Variable used to handle the corner case of OutHeight being odd */ + int32_t numY = XT_MIN(2, outH - y) - 1; + + for (x = 0; x < outW; x += 2) /* Image Width */ + { /* walk across the columns */ + xb_vecNx16U vecGatherOff1; + xb_vecNx16U vecGatherOff2; + + /* Variable used to handle the corner case of Output Width being odd */ + int32_t numX = XT_MIN(2, outW - x) - 1; + + /* Output, Input and Coefficient Data Pointers */ + int8_t *pOut = pOutData + (x * outDataPitch1 + y * outDataPitch2) * bytesPerPixel; + uint8_t *pData = pInData + (x * stride) * inDataPitch1 + (y * stride) * inDataPitch2; + int8_t *pCoeff = pCoeffData + outCh; + + /* Initialize accumulators with bias values */ + xb_vec2Nx24 daccSum1, daccSum2, daccSum3, daccSum4; + phvecBias = (xb_vecN_2x32v *) (pBiasData + outCh); + ACC_INIT_BIAS(phvecBias, remainingOutCh, daccSum1, daccSum2, daccSum3, daccSum4); + + /* Boolean vectors to handle the corner cases of Out Width and Height being odd */ + vboolN vbX = IVP_LTRSN(12 * (numX + 1)); + vboolN vbY = IVP_LTRSN(12 * (numX + 1) * numY); + + for (ky = 0; ky < 3; ky++) /* Kernel Height Loop */ + { + /* Pointer for Input Data Load */ + pData1 = pData + ky * dilationY * inDataPitch2; + pData2 = pData1 + (stride * inDataPitch2 * numY); + /* Assign valid address for predicated false lines */ + vecGatherOff1 = IVP_MOVNX16UT(vecGatherOff, 0, vbX); + vecGatherOff2 = 
IVP_MOVNX16UT(vecGatherOff, 0, vbY); + + /* Pointer for Coefficient Load */ + pdvecCoeff = (xb_vec2Nx8 *) (pCoeff + ky * coeffPitch3); + for (inCh = 0; inCh < numInCh - 3; inCh += 4) /* Input Channels Loop */ + { + /* Gather Input Data */ + xb_gsr gather1 = IVP_GATHERANX8U(pData1, vecGatherOff1); + xb_vec2Nx8U dvecData1 = IVP_GATHERD2NX8U_L(gather1); + xb_gsr gather2 = IVP_GATHERANX8U(pData2, vecGatherOff2); + xb_vec2Nx8U dvecData2 = IVP_GATHERD2NX8U_L(gather2); + + + pData1 += 4; + pData2 += 4; + + /* kx = 1 */ + /* Extracting scalar integers for QMULs */ + int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8U(dvecData1)), 0); + int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8U(dvecData1)), 3); + int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8U(dvecData2)), 0); + int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8U(dvecData2)), 3); + + /* 4 Aligned Vector Loads of coefficients */ + xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1); + xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1); + xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1); + xb_vec2Nx8 dvecCoeff4; IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch2 - 3 * \ + coeffPitch1); + +#ifdef IVP_MULSUQA2N8XR8 + IVP_MULSUQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1); + IVP_MULSUQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2); + IVP_MULSUQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3); + IVP_MULSUQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4); +#else + xb_vec2Nx8U dvecS1 = IVP_MOV2NX8U_FROMNX16(IVP_MOVNX16_FROMN_2X32(IVP_MOVVA32(qmulScalar1))); + xb_vec2Nx8U dvecS2 = IVP_MOV2NX8U_FROMNX16(IVP_MOVNX16_FROMN_2X32(IVP_MOVVA32(qmulScalar2))); + xb_vec2Nx8U dvecS3 = 
IVP_MOV2NX8U_FROMNX16(IVP_MOVNX16_FROMN_2X32(IVP_MOVVA32(qmulScalar3))); + xb_vec2Nx8U dvecS4 = IVP_MOV2NX8U_FROMNX16(IVP_MOVNX16_FROMN_2X32(IVP_MOVVA32(qmulScalar4))); + + IVP_MULUSPA2NX8(daccSum1, IVP_REP2NX8U(dvecS1, 0), dvecCoeff1, IVP_REP2NX8U(dvecS1, 1), dvecCoeff2); + IVP_MULUSPA2NX8(daccSum1, IVP_REP2NX8U(dvecS1, 2), dvecCoeff3, IVP_REP2NX8U(dvecS1, 3), dvecCoeff4); + IVP_MULUSPA2NX8(daccSum2, IVP_REP2NX8U(dvecS2, 0), dvecCoeff1, IVP_REP2NX8U(dvecS2, 1), dvecCoeff2); + IVP_MULUSPA2NX8(daccSum2, IVP_REP2NX8U(dvecS2, 2), dvecCoeff3, IVP_REP2NX8U(dvecS2, 3), dvecCoeff4); + IVP_MULUSPA2NX8(daccSum3, IVP_REP2NX8U(dvecS3, 0), dvecCoeff1, IVP_REP2NX8U(dvecS3, 1), dvecCoeff2); + IVP_MULUSPA2NX8(daccSum3, IVP_REP2NX8U(dvecS3, 2), dvecCoeff3, IVP_REP2NX8U(dvecS3, 3), dvecCoeff4); + IVP_MULUSPA2NX8(daccSum4, IVP_REP2NX8U(dvecS4, 0), dvecCoeff1, IVP_REP2NX8U(dvecS4, 1), dvecCoeff2); + IVP_MULUSPA2NX8(daccSum4, IVP_REP2NX8U(dvecS4, 2), dvecCoeff3, IVP_REP2NX8U(dvecS4, 3), dvecCoeff4); +#endif + + /* kx = 2 */ + /* Extracting scalar integers for QMULs */ + qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8U(dvecData1)), \ + 1); + qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8U(dvecData1)), \ + 4); + qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8U(dvecData2)), \ + 1); + qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8U(dvecData2)), \ + 4); + + /* 4 Aligned Vector Loads of coefficients */ + IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1); + IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1); + IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1); + IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch2 - 3 * coeffPitch1); + +#ifdef IVP_MULSUQA2N8XR8 + IVP_MULSUQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1); + IVP_MULSUQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2); + IVP_MULSUQA2N8XR8(daccSum3, dvecCoeff4, 
dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3); + IVP_MULSUQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4); +#else + dvecS1 = IVP_MOV2NX8U_FROMNX16(IVP_MOVNX16_FROMN_2X32(IVP_MOVVA32(qmulScalar1))); + dvecS2 = IVP_MOV2NX8U_FROMNX16(IVP_MOVNX16_FROMN_2X32(IVP_MOVVA32(qmulScalar2))); + dvecS3 = IVP_MOV2NX8U_FROMNX16(IVP_MOVNX16_FROMN_2X32(IVP_MOVVA32(qmulScalar3))); + dvecS4 = IVP_MOV2NX8U_FROMNX16(IVP_MOVNX16_FROMN_2X32(IVP_MOVVA32(qmulScalar4))); + + IVP_MULUSPA2NX8(daccSum1, IVP_REP2NX8U(dvecS1, 0), dvecCoeff1, IVP_REP2NX8U(dvecS1, 1), dvecCoeff2); + IVP_MULUSPA2NX8(daccSum1, IVP_REP2NX8U(dvecS1, 2), dvecCoeff3, IVP_REP2NX8U(dvecS1, 3), dvecCoeff4); + IVP_MULUSPA2NX8(daccSum2, IVP_REP2NX8U(dvecS2, 0), dvecCoeff1, IVP_REP2NX8U(dvecS2, 1), dvecCoeff2); + IVP_MULUSPA2NX8(daccSum2, IVP_REP2NX8U(dvecS2, 2), dvecCoeff3, IVP_REP2NX8U(dvecS2, 3), dvecCoeff4); + IVP_MULUSPA2NX8(daccSum3, IVP_REP2NX8U(dvecS3, 0), dvecCoeff1, IVP_REP2NX8U(dvecS3, 1), dvecCoeff2); + IVP_MULUSPA2NX8(daccSum3, IVP_REP2NX8U(dvecS3, 2), dvecCoeff3, IVP_REP2NX8U(dvecS3, 3), dvecCoeff4); + IVP_MULUSPA2NX8(daccSum4, IVP_REP2NX8U(dvecS4, 0), dvecCoeff1, IVP_REP2NX8U(dvecS4, 1), dvecCoeff2); + IVP_MULUSPA2NX8(daccSum4, IVP_REP2NX8U(dvecS4, 2), dvecCoeff3, IVP_REP2NX8U(dvecS4, 3), dvecCoeff4); +#endif + + /* kx = 3 */ + /* Extracting scalar integers for QMULs */ + qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8U(dvecData1)), \ + 2); + qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8U(dvecData1)), \ + 5); + qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8U(dvecData2)), \ + 2); + qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8U(dvecData2)), \ + 5); + + /* 4 Aligned Vector Loads of coefficients */ + IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1); + IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1); + IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1); + 
IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch1 - 2 * coeffPitch2); + +#ifdef IVP_MULSUQA2N8XR8 + IVP_MULSUQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1); + IVP_MULSUQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2); + IVP_MULSUQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3); + IVP_MULSUQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4); +#else + dvecS1 = IVP_MOV2NX8U_FROMNX16(IVP_MOVNX16_FROMN_2X32(IVP_MOVVA32(qmulScalar1))); + dvecS2 = IVP_MOV2NX8U_FROMNX16(IVP_MOVNX16_FROMN_2X32(IVP_MOVVA32(qmulScalar2))); + dvecS3 = IVP_MOV2NX8U_FROMNX16(IVP_MOVNX16_FROMN_2X32(IVP_MOVVA32(qmulScalar3))); + dvecS4 = IVP_MOV2NX8U_FROMNX16(IVP_MOVNX16_FROMN_2X32(IVP_MOVVA32(qmulScalar4))); + + IVP_MULUSPA2NX8(daccSum1, IVP_REP2NX8U(dvecS1, 0), dvecCoeff1, IVP_REP2NX8U(dvecS1, 1), dvecCoeff2); + IVP_MULUSPA2NX8(daccSum1, IVP_REP2NX8U(dvecS1, 2), dvecCoeff3, IVP_REP2NX8U(dvecS1, 3), dvecCoeff4); + IVP_MULUSPA2NX8(daccSum2, IVP_REP2NX8U(dvecS2, 0), dvecCoeff1, IVP_REP2NX8U(dvecS2, 1), dvecCoeff2); + IVP_MULUSPA2NX8(daccSum2, IVP_REP2NX8U(dvecS2, 2), dvecCoeff3, IVP_REP2NX8U(dvecS2, 3), dvecCoeff4); + IVP_MULUSPA2NX8(daccSum3, IVP_REP2NX8U(dvecS3, 0), dvecCoeff1, IVP_REP2NX8U(dvecS3, 1), dvecCoeff2); + IVP_MULUSPA2NX8(daccSum3, IVP_REP2NX8U(dvecS3, 2), dvecCoeff3, IVP_REP2NX8U(dvecS3, 3), dvecCoeff4); + IVP_MULUSPA2NX8(daccSum4, IVP_REP2NX8U(dvecS4, 0), dvecCoeff1, IVP_REP2NX8U(dvecS4, 1), dvecCoeff2); + IVP_MULUSPA2NX8(daccSum4, IVP_REP2NX8U(dvecS4, 2), dvecCoeff3, IVP_REP2NX8U(dvecS4, 3), dvecCoeff4); +#endif + } /* End Input Channels */ + + /* Handling Corner cases of Number of Input Channels not being multiple of 4 */ + if (inCh < numInCh) + { + int32_t remInCh = numInCh - inCh; + vboolN vbRemInCh = IVP_LTNX16(IVP_ANDNX16(IVP_SEQNX16(), 3), remInCh); + + /* Gather Input Data */ + xb_vec2Nx8U dvecData1 = 0; + xb_vec2Nx8U dvecData2 = 0; + /* Assign valid 
address for predicated false lines */ + vecGatherOff1 = IVP_MOVNX16UT(vecGatherOff, 0, IVP_ANDBN(vbRemInCh, vbX)); + vecGatherOff2 = IVP_MOVNX16UT(vecGatherOff, 0, IVP_ANDBN(vbRemInCh, vbY)); + + xb_gsr gather1 = IVP_GATHERANX8U(pData1, vecGatherOff1); + dvecData1 = IVP_GATHERD2NX8U_L(gather1); + xb_gsr gather2 = IVP_GATHERANX8U(pData2, vecGatherOff2); + dvecData2 = IVP_GATHERD2NX8U_L(gather2); + + /* kx = 1 */ + /* Extracting scalar integers for QMULs */ + int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8U(dvecData1)), 0); + int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8U(dvecData1)), 3); + int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8U(dvecData2)), 0); + int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8U(dvecData2)), 3); + + /* Aligned Vector Loads of coefficients */ + xb_vec2Nx8 dvecCoeff1; + IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1 * remCh1); + xb_vec2Nx8 dvecCoeff2; + IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1 * remCh2); + xb_vec2Nx8 dvecCoeff3; + IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch2 - (coeffPitch1 * (remCh1 + remCh2))); + +#ifdef IVP_MULSUQA2N8XR8 + IVP_MULSUQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1 & sumMask); + IVP_MULSUQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2 & sumMask); + IVP_MULSUQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3 & sumMask); + IVP_MULSUQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4 & sumMask); +#else + xb_vec2Nx8U dvecS1 = IVP_MOV2NX8U_FROMNX16(IVP_MOVNX16_FROMN_2X32(IVP_MOVVA32(qmulScalar1 & sumMask))); + xb_vec2Nx8U dvecS2 = IVP_MOV2NX8U_FROMNX16(IVP_MOVNX16_FROMN_2X32(IVP_MOVVA32(qmulScalar2 & sumMask))); + xb_vec2Nx8U dvecS3 = IVP_MOV2NX8U_FROMNX16(IVP_MOVNX16_FROMN_2X32(IVP_MOVVA32(qmulScalar3 & sumMask))); + xb_vec2Nx8U dvecS4 = 
IVP_MOV2NX8U_FROMNX16(IVP_MOVNX16_FROMN_2X32(IVP_MOVVA32(qmulScalar4 & sumMask))); + + IVP_MULUSPA2NX8(daccSum1, IVP_REP2NX8U(dvecS1, 0), dvecCoeff1, IVP_REP2NX8U(dvecS1, 1), dvecCoeff2); + IVP_MULUSPA2NX8(daccSum1, IVP_REP2NX8U(dvecS1, 2), dvecCoeff3, IVP_REP2NX8U(dvecS1, 3), 0); + IVP_MULUSPA2NX8(daccSum2, IVP_REP2NX8U(dvecS2, 0), dvecCoeff1, IVP_REP2NX8U(dvecS2, 1), dvecCoeff2); + IVP_MULUSPA2NX8(daccSum2, IVP_REP2NX8U(dvecS2, 2), dvecCoeff3, IVP_REP2NX8U(dvecS2, 3), 0); + IVP_MULUSPA2NX8(daccSum3, IVP_REP2NX8U(dvecS3, 0), dvecCoeff1, IVP_REP2NX8U(dvecS3, 1), dvecCoeff2); + IVP_MULUSPA2NX8(daccSum3, IVP_REP2NX8U(dvecS3, 2), dvecCoeff3, IVP_REP2NX8U(dvecS3, 3), 0); + IVP_MULUSPA2NX8(daccSum4, IVP_REP2NX8U(dvecS4, 0), dvecCoeff1, IVP_REP2NX8U(dvecS4, 1), dvecCoeff2); + IVP_MULUSPA2NX8(daccSum4, IVP_REP2NX8U(dvecS4, 2), dvecCoeff3, IVP_REP2NX8U(dvecS4, 3), 0); +#endif + + /* kx = 2 */ + /* Extracting scalar integers for QMULs */ + qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8U(dvecData1)), \ + 1); + qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8U(dvecData1)), \ + 4); + qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8U(dvecData2)), \ + 1); + qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8U(dvecData2)), \ + 4); + + /* Aligned Vector Loads of coefficients */ + IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1 * remCh1); + IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1 * remCh2); + IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch2 - coeffPitch1 * (remCh1 + remCh2)); + +#ifdef IVP_MULSUQA2N8XR8 + IVP_MULSUQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1 & sumMask); + IVP_MULSUQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2 & sumMask); + IVP_MULSUQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3 & sumMask); + IVP_MULSUQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4 & sumMask); +#else 
+ dvecS1 = IVP_MOV2NX8U_FROMNX16(IVP_MOVNX16_FROMN_2X32(IVP_MOVVA32(qmulScalar1 & sumMask))); + dvecS2 = IVP_MOV2NX8U_FROMNX16(IVP_MOVNX16_FROMN_2X32(IVP_MOVVA32(qmulScalar2 & sumMask))); + dvecS3 = IVP_MOV2NX8U_FROMNX16(IVP_MOVNX16_FROMN_2X32(IVP_MOVVA32(qmulScalar3 & sumMask))); + dvecS4 = IVP_MOV2NX8U_FROMNX16(IVP_MOVNX16_FROMN_2X32(IVP_MOVVA32(qmulScalar4 & sumMask))); + + IVP_MULUSPA2NX8(daccSum1, IVP_REP2NX8U(dvecS1, 0), dvecCoeff1, IVP_REP2NX8U(dvecS1, 1), dvecCoeff2); + IVP_MULUSPA2NX8(daccSum1, IVP_REP2NX8U(dvecS1, 2), dvecCoeff3, IVP_REP2NX8U(dvecS1, 3), 0); + IVP_MULUSPA2NX8(daccSum2, IVP_REP2NX8U(dvecS2, 0), dvecCoeff1, IVP_REP2NX8U(dvecS2, 1), dvecCoeff2); + IVP_MULUSPA2NX8(daccSum2, IVP_REP2NX8U(dvecS2, 2), dvecCoeff3, IVP_REP2NX8U(dvecS2, 3), 0); + IVP_MULUSPA2NX8(daccSum3, IVP_REP2NX8U(dvecS3, 0), dvecCoeff1, IVP_REP2NX8U(dvecS3, 1), dvecCoeff2); + IVP_MULUSPA2NX8(daccSum3, IVP_REP2NX8U(dvecS3, 2), dvecCoeff3, IVP_REP2NX8U(dvecS3, 3), 0); + IVP_MULUSPA2NX8(daccSum4, IVP_REP2NX8U(dvecS4, 0), dvecCoeff1, IVP_REP2NX8U(dvecS4, 1), dvecCoeff2); + IVP_MULUSPA2NX8(daccSum4, IVP_REP2NX8U(dvecS4, 2), dvecCoeff3, IVP_REP2NX8U(dvecS4, 3), 0); +#endif + + /* kx = 3 */ + /* Extracting scalar integers for QMULs */ + qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8U(dvecData1)), \ + 2); + qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8U(dvecData1)), \ + 5); + qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8U(dvecData2)), \ + 2); + qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8U(dvecData2)), \ + 5); + + /* Aligned Vector Loads of coefficients */ + IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1 * remCh1); + IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1 * remCh2); + IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, 0); + +#ifdef IVP_MULSUQA2N8XR8 + IVP_MULSUQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1 & sumMask); + IVP_MULSUQA2N8XR8(daccSum2, 0, 
dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2 & sumMask); + IVP_MULSUQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3 & sumMask); + IVP_MULSUQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4 & sumMask); +#else + dvecS1 = IVP_MOV2NX8U_FROMNX16(IVP_MOVNX16_FROMN_2X32(IVP_MOVVA32(qmulScalar1 & sumMask))); + dvecS2 = IVP_MOV2NX8U_FROMNX16(IVP_MOVNX16_FROMN_2X32(IVP_MOVVA32(qmulScalar2 & sumMask))); + dvecS3 = IVP_MOV2NX8U_FROMNX16(IVP_MOVNX16_FROMN_2X32(IVP_MOVVA32(qmulScalar3 & sumMask))); + dvecS4 = IVP_MOV2NX8U_FROMNX16(IVP_MOVNX16_FROMN_2X32(IVP_MOVVA32(qmulScalar4 & sumMask))); + + IVP_MULUSPA2NX8(daccSum1, IVP_REP2NX8U(dvecS1, 0), dvecCoeff1, IVP_REP2NX8U(dvecS1, 1), dvecCoeff2); + IVP_MULUSPA2NX8(daccSum1, IVP_REP2NX8U(dvecS1, 2), dvecCoeff3, IVP_REP2NX8U(dvecS1, 3), 0); + IVP_MULUSPA2NX8(daccSum2, IVP_REP2NX8U(dvecS2, 0), dvecCoeff1, IVP_REP2NX8U(dvecS2, 1), dvecCoeff2); + IVP_MULUSPA2NX8(daccSum2, IVP_REP2NX8U(dvecS2, 2), dvecCoeff3, IVP_REP2NX8U(dvecS2, 3), 0); + IVP_MULUSPA2NX8(daccSum3, IVP_REP2NX8U(dvecS3, 0), dvecCoeff1, IVP_REP2NX8U(dvecS3, 1), dvecCoeff2); + IVP_MULUSPA2NX8(daccSum3, IVP_REP2NX8U(dvecS3, 2), dvecCoeff3, IVP_REP2NX8U(dvecS3, 3), 0); + IVP_MULUSPA2NX8(daccSum4, IVP_REP2NX8U(dvecS4, 0), dvecCoeff1, IVP_REP2NX8U(dvecS4, 1), dvecCoeff2); + IVP_MULUSPA2NX8(daccSum4, IVP_REP2NX8U(dvecS4, 2), dvecCoeff3, IVP_REP2NX8U(dvecS4, 3), 0); +#endif + } /* End Input Channels Corner case Handling */ + } /* End Kernel Height Loop */ + + /* Pack, Output Scale, Output Shift and clamping */ + xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L; + xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H; +#ifdef DILATED_VQ_CONV + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, 
minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); +#else + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); +#endif + /* Store the output dvecOut1 along the output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOut + outCh * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh); + IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Store the output dvecOut2 along the output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1) * bytesPerPixel * numX); + IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numX); + IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numX); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Store the output dvecOut3 along the output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch2) * bytesPerPixel * numY); + IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numY); + IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numY); + 
IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Store the output dvecOut4 along the output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1 + outDataPitch2) * bytesPerPixel * numX * numY); + IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numX * \ + numY); + IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * 2 * + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numX * numY); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + } /* End image width */ + } /* End image height */ + } /* End Output Channels */ + return(XAI_ERROR_STATUS()); +} + +/***************************************************************************** +* xaiConvolvedVQ3D_S_4x4_S8S8IXCa2_MOD_DWH +* **************************************************************************/ + +/****************************************************************************/ +/* Description : P6 optimized generic implementation for 4x4 MOD_DWH */ +/* 3D convolution. Based on pre-processor specifiers. Code */ +/* implementation is generated during preprocessing stage. 
*/ +/* This method can be used to generate 4x4 MOD_DWH 3D */ +/* dilated convolution function and 4x4 MOD_DWH 3D VQ */ +/* dilated convolution function */ +/* stride equal to 1 */ +/* Inputs : Input Data Tile, Coeff Data Tile, Bias Array, */ +/* Output scale array, CNN convolution params structure */ +/* Outputs : XI Error Code */ +/* InOuts : Output Tile */ +/* Assumptions : InData, CoeffData are S8 */ +/* biasArray is signed 32b, value not exceeding signed 24b */ +/* Output scale array is U16 */ +/* OutData is S8 / U8 / S16 */ +/* Kernel Size is 4x4xDxN */ +/* Input and Output are in DWH format */ +/* Coeff is in NDWH format */ +/* CoeffDim1Pitch is aligned to 2N (Ca2) */ +/****************************************************************************/ + +#ifdef DILATED_VQ_CONV +XAI_ERR_TYPE xaiConvolvedVQ3D_S_4x4_S8S8IXCa2_MOD_DWH( + const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param + ) +#else +XAI_ERR_TYPE xaiConvolved3D_S_4x4_S8S8IXCa2_MOD_DWH( + const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param + ) +#endif +{ + /* Error Checks */ + XAI_ERROR_CHECKS() + { + XAI_CHECK_TILE3D_S8(inTile); + XAI_CHECK_CONV_OUTPUT_TILE3D(outTile); + XAI_CHECK_TILE4D_S8(coeffTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile); + XAI_CHECK_TILE4D_IN_DRAM_BOUNDARY(coeffTile); + XAI_CHECK_POINTER(param); + XAI_CHECK_ARRAY_S32(biasArray); + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTile); + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(coeffTile, outTile); + XAI_CHECK_KERNEL_SIZE(coeffTile, 4); + XAI_CHECK_ERROR((XAI_CNN_CONV_GET_STRIDE(param) == 1) || \ + (XAI_CNN_CONV_GET_STRIDE(param) == 2) || \ + (XAI_CNN_CONV_GET_STRIDE(param) == 4), XAI_ERR_BADARG, \ + "Stride = %hhu, value should be 1, 2 or 4", 
XAI_CNN_CONV_GET_STRIDE(param)); + XAI_CHECK_ERROR((XAI_CNN_CONV_GET_STRIDEX(param) == XAI_CNN_CONV_GET_STRIDEY(param)), \ + XAI_ERR_BADARG, "\nStride along width = %hhu and height = %hhu\nStride along width and height should be equal", \ + XAI_CNN_CONV_GET_STRIDEX(param), XAI_CNN_CONV_GET_STRIDEY(param)); + + XAI_CHECK_ERROR((XAI_CNN_CONV_GET_DILATIONX(param) > 0 && XAI_CNN_CONV_GET_DILATIONY(param) > 0), \ + XAI_ERR_BADARG, "dilation parameter has to be >= 1"); + XAI_CHECK_TILE4D_IALIGNMENT_2NX8(coeffTile); + XAI_CHECK_TILE3D_DATA_ORDER(inTile, XAI_DWH); + XAI_CHECK_TILE3D_DATA_ORDER(outTile, XAI_DWH); + XAI_CHECK_TILE4D_DATA_ORDER(coeffTile, XAI_NDWH); + XAI_CHECK_EDGES_MOD_DWH(inTile, coeffTile, param); + XAI_CHECK_CONSISTENCY_MOD_DWH(inTile, coeffTile, biasArray, outTile, param); + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_ACCUM_SHIFT(param) < 24, \ + XAI_ERR_NORM, "\nThe accumulator shift = %hhu, value should be less than 24", \ + XAI_CNN_CONV_GET_ACCUM_SHIFT(param)); + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_OUTPUT_SHIFT(param) < 32, \ + XAI_ERR_NORM, "\nThe output shift = %hhu, value should be less than 32", \ + XAI_CNN_CONV_GET_OUTPUT_SHIFT(param)); + XAI_CHECK_CONV_RELU_LIMITS_IX(param, outTile); +#ifdef DILATED_VQ_CONV + XAI_CHECK_ARRAY_U16(outputScaleArray); + XAI_CHECK_ERROR(XAI_ARRAY_GET_WIDTH(outputScaleArray) >= XAI_TILE4D_GET_DIM1(coeffTile), XAI_ERR_DATASIZE, \ + "\nWidth of Output Scale Array = %d, Number of Kernels = %d\nWidth of Output Scale Array should be greater than or equal to Number of Kernels", \ + XAI_ARRAY_GET_WIDTH(outputScaleArray), XAI_TILE4D_GET_DIM1(coeffTile)); +#endif + } +#ifndef DILATED_VQ_CONV + if (XAI_CNN_CONV_GET_OUTPUT_SCALE(param) == 0) + { + int32_t fillValue; + int32_t reluFlag = XAI_CNN_CONV_GET_FLAG_RELU(param); + fillValue = reluFlag ? 
(CLAMP(0, XAI_CNN_CONV_GET_RELU_MIN(param), XAI_CNN_CONV_GET_RELU_MAX(param))) : 0; + return(xaiFillTile3D(outTile, fillValue, 0)); + } +#endif + const uint8_t dilationX = XAI_CNN_CONV_GET_DILATIONX(param); + const uint8_t dilationY = XAI_CNN_CONV_GET_DILATIONY(param); + + /* Calling further optimized function if dim1Size == dim1Pitch */ + if (XAI_TILE3D_GET_DIM1(inTile) == XAI_TILE3D_GET_DIM1_PITCH(inTile) && dilationX == 1 && dilationY == 1) + { +#ifdef DILATED_VQ_CONV + convolvedVQ3D_S_MxN_S8S8IXCa2_MOD_DWH_contiguous_depth_x4(inTile, + coeffTile, + biasArray, + outputScaleArray, + outTile, + param); +#else + convolved3D_S_MxN_S8S8IXCa2_MOD_DWH_contiguous_depth_x4(inTile, + coeffTile, + biasArray, + outTile, + param); +#endif + return(XAI_ERROR_STATUS()); + } + + /* Getting parameters from the tile structures */ + const int32_t outW = XAI_TILE3D_GET_DIM2(outTile); + const int32_t outH = XAI_TILE3D_GET_DIM3(outTile); + const int32_t numInCh = XAI_TILE3D_GET_DIM1(inTile); + const int32_t numOutCh = XAI_TILE3D_GET_DIM1(outTile); + + XAI_ERROR_CHECKS_CONTINUE() + { + XAI_CHECK_TILE3D_FITS_IN_SINGLE_DRAM(inTile); + + /* Max value of Gather Offset is ((stride*min(outW-1, 1) + 3 * dilationX) * inDataPitch1 + + * min(3, numInCh - 1)) */ + XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM1_PITCH(inTile) < \ + ((USHRT_MAX - XT_MIN(3, numInCh - 1)) / (XAI_CNN_CONV_GET_STRIDE(param) * \ + XT_MIN(1, outW - 1) + 3 * XAI_CNN_CONV_GET_DILATION(param))), XAI_ERR_BADARG, \ + "\ndim1Pitch value of inTile = %d, should be less than Gather Offset(16-bit limit) - %d", \ + XAI_TILE3D_GET_DIM1_PITCH(inTile), \ + ((USHRT_MAX - XT_MIN(3, numInCh - 1)) / (XAI_CNN_CONV_GET_STRIDE(param) * \ + XT_MIN(1, outW - 1) + 3 * XAI_CNN_CONV_GET_DILATION(param)))); + } + + /* CNN convolution parameters */ + const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param); + + const uint8_t outShiftU = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param); + const uint8_t enableReLu = XAI_CNN_CONV_GET_FLAG_RELU(param); + const 
uint8_t stride = XAI_CNN_CONV_GET_STRIDE(param); + + /* Data Pointers of input, output, coefficient and bias data */ + int8_t *pInData = (int8_t *) XAI_TILE3D_GET_DATA_PTR(inTile); + int8_t *pOutData = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile); + int8_t *pCoeffData = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile); + int32_t *pBiasData = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray); +#ifdef DILATED_VQ_CONV + xb_vecNx16U* restrict pOutScaleData = (xb_vecNx16U *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray); +#else + const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param); +#endif + /* Pitches of Coefficient Data (NDWH) in dim1, dim2 and dim3 */ + const int32_t coeffPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTile); + const int32_t coeffPitch2 = XAI_TILE4D_GET_DIM2_PITCH(coeffTile); + const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile); + + /* Pitches of Input Data (DWH) in dim1 and dim2 */ + const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile); + const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile); + + /* Pitch of Output Data (DWH) in dim1 and dim2 */ + const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile); + const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile); + + const int32_t kWidthU = XAI_TILE4D_GET_DIM3(coeffTile); + const int32_t kHeightU = XAI_TILE4D_GET_DIM4(coeffTile); + + int32_t dilatedKWidth = dilationX * (kWidthU - 1) + 1; + int32_t dilatedKHeight = dilationY * (kHeightU - 1) + 1; + + const uint8_t leftEdgeFlag = XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param); + const uint8_t topEdgeFlag = XAI_CNN_CONV_GET_FLAG_TOPEDGE(param); + int32_t leftEdge, topEdge; + + if ((dilatedKWidth % 2) != 0) + { + leftEdge = dilatedKWidth / 2; + } + else + { + leftEdge = leftEdgeFlag ? (dilatedKWidth / 2) : ((dilatedKWidth / 2) - 1); + } + + if ((dilatedKHeight % 2) != 0) + { + topEdge = dilatedKHeight / 2; + } + else + { + topEdge = topEdgeFlag ? 
(dilatedKHeight / 2) : ((dilatedKHeight / 2) - 1); + } + + /* Move pointer to the start of the active data (including edge) */ + pInData = &pInData[-(topEdge * inDataPitch2 + leftEdge * inDataPitch1)]; + + /* Setting the limits for output data according to ReLu Flag and outTileType */ + int32_t minLim, maxLim; + if (enableReLu) + { + minLim = XAI_CNN_CONV_GET_RELU_MIN(param); + maxLim = XAI_CNN_CONV_GET_RELU_MAX(param); + } + else + { + minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \ + SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0); + maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \ + : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX); + } + const int8_t typeFlag = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0; + const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile); + + /* Variable Declarations */ + int32_t inCh, outCh, x, y, ky; + valign vaOutData = IVP_ZALIGN(); + + /* Only 2 Gathers are used in this approach to get the + * Input Data for 4 Output Vectors. In each Gather, + * 32 elements are read, where each 16 of them correspond + * to one vector of Output along the width. To get the + * index values for the Gather, the following calculations + * are made. + */ + + /* Gather Index Calculations */ + /* Sequence - 0 1 2 3 0 1 2 3 0 1 2 3 0 1 2 3 0 1 2 3 0 1 2 3 ... */ + xb_vecNx16U vecGatherOff = IVP_ANDNX16(IVP_SEQNX16(), 3); + xb_vecNx16 vecSelIdx = IVP_SEQNX16(); + /* To get the Select indexes as - 0 1 2 3 4...11 12 13 14 15 32 33 34 35 36.... */ + IVP_ADDNX16T(vecSelIdx, vecSelIdx, 16, IVP_NOTBN(IVP_LTRNI(16))); + /* To get - 0 0 0 0 1 1 1 1 2 2 2 2 3 3 3 3 4 4 4 4 5 5 5 5 ... */ + xb_vecNx16 vecSeqDiv4 = IVP_SRLINX16(IVP_SEQNX16(), 2); + /* Sequence - 0 1 2 3 d*P1 d*P1+1 d*P1+2 d*P1+3 2.d*P1 2.d*P1+1 2.d*P1+2 2.d*P1+3 + 3.d*P1 3.d*P1+1 3.d*P1+2 3.d*P1+3 ... 
*/ + IVP_MULANX16PACKL(vecGatherOff, vecSeqDiv4, dilationX * inDataPitch1); + vecGatherOff = IVP_SELNX16(IVP_ADDNX16(vecGatherOff, stride * inDataPitch1), \ + vecGatherOff, vecSelIdx); + + /* Final Index Pattern is : + * + * First 16 elements : + * 0 1 2 3 + * d*1*P1 d*1*P1+1 d*1*P1+2 d*1*P1+3 + * d*2*P1 d*2*P1+1 d*2*P1+2 d*2*P1+3 + * d*3*P1 d*3*P1+1 d*3*P1+2 d*3*P1+3 + * + * Last 16 elements : + * s*P1 s*P1+1 s*P1+2 s*P1+3 + * (s+1*d)*P1 (s+1*d)*P1+1 (s+1*d)*P1+2 (s+1*d)*P1+3 + * (s+2*d)*P1 (s+2*d)*P1+1 (s+2*d)*P1+2 (s+2*d)*P1+3 + * (s+3*d)*P1 (s+3*d)*P1+1 (s+3*d)*P1+2 (s+3*d)*P1+3 + * + */ + + xb_vecN_2x32v* restrict phvecBias; + xb_vec2Nx8* restrict pdvecCoeff1; + xb_vec2Nx8* restrict pdvecCoeff2; + xb_vec2Nx8* restrict pdvecCoeff3; + xb_vec2Nx8* restrict pdvecCoeff4; + xb_vec2Nx8* restrict pdvecOut; + int8_t* restrict pData1; + int8_t* restrict pData2; + + int32_t remCh = numInCh & 3; + + /*Generation of maskLut for handling cases when remInCh is not equal to 0 */ + /*eg. if remInCh is equal to 1 then sumMask is 0000FFFF */ + /* if remInCh is equal to 2 then sumMask is 00FFFFFF */ + const uint32_t maskLut[3] = { 0xff, 0xff00, 0xff0000 }; + + uint8_t remCh1 = XT_SALT(2, remCh + 1); + uint8_t remCh2 = XT_SALT(3, remCh + 1); + + uint32_t sumMask = maskLut[0] + maskLut[1] * remCh1 + maskLut[2] * remCh2; + +#ifdef __XCC__ + XT_MEMW(); /* Adding Memory Wait as Gather and Normal Load/Stores are not synchronized */ +#endif + + /* Unrolled by 2 along both Output Width and Output Height. + * Also, unrolled along Input Channels by 4 and completely + * along the Kernel Width. Gathers are used for loading Input Data. 
+ */ + + /* Loops Start */ + for (outCh = 0; outCh < numOutCh; outCh += 2 * XCHAL_IVPN_SIMD_WIDTH) /* Output Channels */ + { /* walk across the kernels */ + /* To handle corner case when number of output channels + * is not a multiple of 2 * XCHAL_IVPN_SIMD_WIDTH*/ + int32_t remainingOutCh = (numOutCh - outCh); +#ifdef DILATED_VQ_CONV + xb_vecNx16U outScaleDataEven, outScaleDataOdd; + /*Load output scale values*/ + VQ_INIT_OUTSCALE(pOutScaleData, remainingOutCh, outScaleDataEven, outScaleDataOdd); +#endif + for (y = 0; y < outH; y += 2) /* Image Height */ + { /* walk down the rows */ + /* Variable used to handle the corner case of OutHeight being odd */ + int32_t numY = XT_MIN(2, outH - y) - 1; + + for (x = 0; x < outW; x += 2) /* Image Width */ + { /* walk across the columns */ + xb_vecNx16U vecGatherOff1; + xb_vecNx16U vecGatherOff2; + + /* Variable used to handle the corner case of Output Width being odd */ + int32_t numX = XT_MIN(2, outW - x) - 1; + + /* Output, Input and Coefficient Data Pointers */ + int8_t *pOut = pOutData + (x * outDataPitch1 + y * outDataPitch2) * bytesPerPixel; + int8_t *pData = pInData + (x * stride) * inDataPitch1 + (y * stride) * inDataPitch2; + int8_t *pCoeff = pCoeffData + outCh; + + /* Initialize accumulators with bias values */ + xb_vec2Nx24 daccSum1, daccSum2, daccSum3, daccSum4; + phvecBias = (xb_vecN_2x32v *) (pBiasData + outCh); + ACC_INIT_BIAS(phvecBias, remainingOutCh, daccSum1, daccSum2, daccSum3, daccSum4); + + /* Boolean vectors to handle the corner cases of Out Width and Height being odd */ + vboolN vbX = IVP_LTRSN(16 * (numX + 1)); + vboolN vbY = IVP_LTRSN(16 * (numX + 1) * numY); + + for (ky = 0; ky < 4; ky++) /* Kernel Height Loop */ + { + /* Pointer for Input Data Load */ + pData1 = pData + ky * dilationY * inDataPitch2; + pData2 = pData1 + (stride * inDataPitch2 * numY); + /* Assign valid address for predicated false lines */ + vecGatherOff1 = IVP_MOVNX16UT(vecGatherOff, 0, vbX); + vecGatherOff2 = 
IVP_MOVNX16UT(vecGatherOff, 0, vbY); + + /* Pointer for Coefficient Load */ + pdvecCoeff1 = (xb_vec2Nx8 *) (pCoeff + ky * coeffPitch3); + pdvecCoeff2 = (xb_vec2Nx8 *) (pCoeff + ky * coeffPitch3 + coeffPitch2); + pdvecCoeff3 = (xb_vec2Nx8 *) (pCoeff + ky * coeffPitch3 + 2 * coeffPitch2); + pdvecCoeff4 = (xb_vec2Nx8 *) (pCoeff + ky * coeffPitch3 + 3 * coeffPitch2); + for (inCh = 0; inCh < numInCh - 3; inCh += 4) /* Input Channels Loop */ + { + /* Gather Input Data */ + xb_gsr gather1 = IVP_GATHERANX8S(pData1, vecGatherOff1); + xb_vec2Nx8 dvecData1 = IVP_GATHERD2NX8_L(gather1); + xb_gsr gather2 = IVP_GATHERANX8S(pData2, vecGatherOff2); + xb_vec2Nx8 dvecData2 = IVP_GATHERD2NX8_L(gather2); + + + pData1 += 4; + pData2 += 4; + + /* kx = 1 */ + /* Extracting scalar integers for QMULs */ + int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData1)), 0); + int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData1)), 4); + int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData2)), 0); + int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData2)), 4); + + /* 4 Aligned Vector Loads of coefficients */ + xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff1, coeffPitch1); + xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff1, coeffPitch1); + xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff1, coeffPitch1); + xb_vec2Nx8 dvecCoeff4; IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff1, coeffPitch1); + + IVP_MULQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1); + IVP_MULQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2); + IVP_MULQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3); + IVP_MULQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4); + + /* kx = 2 */ + /* Extracting scalar integers for QMULs */ + 
qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), \ + 1); + qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), \ + 5); + qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \ + 1); + qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \ + 5); + + /* 4 Aligned Vector Loads of coefficients */ + IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff2, coeffPitch1); + IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff2, coeffPitch1); + IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff2, coeffPitch1); + IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff2, coeffPitch1); + + IVP_MULQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1); + IVP_MULQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2); + IVP_MULQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3); + IVP_MULQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4); + + /* kx = 3 */ + /* Extracting scalar integers for QMULs */ + qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), \ + 2); + qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), \ + 6); + qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \ + 2); + qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \ + 6); + + /* 4 Aligned Vector Loads of coefficients */ + IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff3, coeffPitch1); + IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff3, coeffPitch1); + IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff3, coeffPitch1); + IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff3, coeffPitch1); + + IVP_MULQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1); + IVP_MULQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2); + IVP_MULQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3); 
+ IVP_MULQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4); + + /* kx = 4 */ + /* Extracting scalar integers for QMULs */ + qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), \ + 3); + qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), \ + 7); + qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \ + 3); + qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \ + 7); + + /* 4 Aligned Vector Loads of coefficients */ + IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff4, coeffPitch1); + IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff4, coeffPitch1); + IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff4, coeffPitch1); + IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff4, coeffPitch1); + + IVP_MULQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1); + IVP_MULQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2); + IVP_MULQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3); + IVP_MULQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4); + } /* End Input Channels */ + + /* Handling Corner cases of Number of Input Channels not being multiple of 4 */ + if (inCh < numInCh) + { + int32_t remInCh = numInCh - inCh; + vboolN vbRemInCh = IVP_LTNX16(IVP_ANDNX16(IVP_SEQNX16(), 3), remInCh); + + /* Gather Input Data */ + xb_vec2Nx8 dvecData1 = 0; + xb_vec2Nx8 dvecData2 = 0; + /* Assign valid address for predicated false lines */ + vecGatherOff1 = IVP_MOVNX16UT(vecGatherOff, 0, IVP_ANDBN(vbRemInCh, vbX)); + vecGatherOff2 = IVP_MOVNX16UT(vecGatherOff, 0, IVP_ANDBN(vbRemInCh, vbY)); + + xb_gsr gather1 = IVP_GATHERANX8S(pData1, vecGatherOff1); + dvecData1 = IVP_GATHERD2NX8_L(gather1); + xb_gsr gather2 = IVP_GATHERANX8S(pData2, vecGatherOff2); + dvecData2 = IVP_GATHERD2NX8_L(gather2); + + /* kx = 1 */ + /* Extracting scalar integers for QMULs */ + int32_t qmulScalar1 = 
IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData1)), 0); + int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData1)), 4); + int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData2)), 0); + int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData2)), 4); + + /* Aligned Vector Loads of coefficients */ + xb_vec2Nx8 dvecCoeff1; + IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff1, coeffPitch1 * remCh1); + xb_vec2Nx8 dvecCoeff2; + IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff1, coeffPitch1 * remCh2); + xb_vec2Nx8 dvecCoeff3; + IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff1, 0); + + IVP_MULQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1 & sumMask); + IVP_MULQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2 & sumMask); + IVP_MULQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3 & sumMask); + IVP_MULQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4 & sumMask); + + /* kx = 2 */ + /* Extracting scalar integers for QMULs */ + qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), \ + 1); + qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), \ + 5); + qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \ + 1); + qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \ + 5); + + /* Aligned Vector Loads of coefficients */ + IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff2, coeffPitch1 * remCh1); + IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff2, coeffPitch1 * remCh2); + IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff2, 0); + + IVP_MULQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1 & sumMask); + IVP_MULQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2 & sumMask); + IVP_MULQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3 & sumMask); + 
IVP_MULQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4 & sumMask); + + /* kx = 3 */ + /* Extracting scalar integers for QMULs */ + qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), \ + 2); + qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), \ + 6); + qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \ + 2); + qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \ + 6); + + /* Aligned Vector Loads of coefficients */ + IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff3, coeffPitch1 * remCh1); + IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff3, coeffPitch1 * remCh2); + IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff3, 0); + + IVP_MULQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1 & sumMask); + IVP_MULQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2 & sumMask); + IVP_MULQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3 & sumMask); + IVP_MULQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4 & sumMask); + + /* kx = 4 */ + /* Extracting scalar integers for QMULs */ + qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), \ + 3); + qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), \ + 7); + qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \ + 3); + qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \ + 7); + + /* Aligned Vector Loads of coefficients */ + IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff4, coeffPitch1 * remCh1); + IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff4, coeffPitch1 * remCh2); + IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff4, 0); + + IVP_MULQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1 & sumMask); + IVP_MULQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2 & sumMask); + IVP_MULQA2N8XR8(daccSum3, 
0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3 & sumMask); + IVP_MULQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4 & sumMask); + } /* End Input Channels Corner case Handling */ + } /* End Kernel Height Loop */ + + /* Pack, Output Scale, Output Shift and clamping */ + xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L; + xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H; +#ifdef DILATED_VQ_CONV + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); +#else + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); +#endif + /* Store the output dvecOut1 along the output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOut + outCh * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh); + IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Store the output dvecOut2 along the 
output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1) * bytesPerPixel * numX); + IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numX); + IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numX); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Store the output dvecOut3 along the output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch2) * bytesPerPixel * numY); + IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numY); + IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numY); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Store the output dvecOut4 along the output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1 + outDataPitch2) * bytesPerPixel * numX * numY); + IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numX * \ + numY); + IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * 2 * + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numX * numY); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + } /* End image width */ + } /* End image height */ + } /* End Output Channels */ + return(XAI_ERROR_STATUS()); +} + +/***************************************************************************** +* xaiConvolvedVQ3D_S_5x5_S8S8IXCa2_MOD_DWH +* **************************************************************************/ + +/****************************************************************************/ +/* Description : P6 optimized generic implementation for 5x5 MOD_DWH */ +/* 3D convolution. Based on pre-processor specifiers. Code */ +/* implementation is generated during preprocessing stage. 
*/ +/* This method can be used to generate 5x5 MOD_DWH 3D */ +/* dilated convolution function and 5x5 MOD_DWH 3D VQ */ +/* dilated convolution function */ +/* Stride must be 1, 2 or 4, equal along width and height */ +/* Inputs : Input Data Tile, Coeff Data Tile, Bias Array, */ +/* Output scale array, CNN convolution params structure */ +/* Outputs : XI Error Code */ +/* InOuts : Output Tile */ +/* Assumptions : InData, CoeffData are S8 */ +/* biasArray is signed 32b, value not exceeding signed 24b */ +/* Output scale array is U16 */ +/* OutData is S8 / U8 / S16 */ +/* Kernel Size is 5x5xDxN */ +/* Input and Output are in DWH format */ +/* Coeff is in NDWH format */ +/* CoeffDim1Pitch is aligned to 2N (Ca2) */ +/****************************************************************************/ + +#ifdef DILATED_VQ_CONV +XAI_ERR_TYPE xaiConvolvedVQ3D_S_5x5_S8S8IXCa2_MOD_DWH( + const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param + ) +#else +XAI_ERR_TYPE xaiConvolved3D_S_5x5_S8S8IXCa2_MOD_DWH( + const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param + ) +#endif +{ + /* Error Checks */ + XAI_ERROR_CHECKS() + { + XAI_CHECK_TILE3D_S8(inTile); + XAI_CHECK_CONV_OUTPUT_TILE3D(outTile); + XAI_CHECK_TILE4D_S8(coeffTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile); + XAI_CHECK_TILE4D_IN_DRAM_BOUNDARY(coeffTile); + XAI_CHECK_POINTER(param); + XAI_CHECK_ARRAY_S32(biasArray); + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTile); + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(coeffTile, outTile); + XAI_CHECK_KERNEL_SIZE(coeffTile, 5); + XAI_CHECK_ERROR((XAI_CNN_CONV_GET_STRIDEX(param) == XAI_CNN_CONV_GET_STRIDEY(param)), \ + XAI_ERR_BADARG, "Stride along width = %hhu and height = %hhu\nStride along width and height should be equal", \ + XAI_CNN_CONV_GET_STRIDEX(param), 
XAI_CNN_CONV_GET_STRIDEY(param)); + XAI_CHECK_ERROR((XAI_CNN_CONV_GET_STRIDE(param) == 1) || \ + (XAI_CNN_CONV_GET_STRIDE(param) == 2) || \ + (XAI_CNN_CONV_GET_STRIDE(param) == 4), XAI_ERR_BADARG, \ + "\nStride = %hhu, value should be 1, 2 or 4", XAI_CNN_CONV_GET_STRIDE(param)); + XAI_CHECK_ERROR((XAI_CNN_CONV_GET_DILATIONX(param) > 0 && XAI_CNN_CONV_GET_DILATIONY(param) > 0), \ + XAI_ERR_BADARG, "dilation parameter has to be >= 1"); + XAI_CHECK_TILE4D_IALIGNMENT_2NX8(coeffTile); + XAI_CHECK_TILE3D_DATA_ORDER(inTile, XAI_DWH); + XAI_CHECK_TILE3D_DATA_ORDER(outTile, XAI_DWH); + XAI_CHECK_TILE4D_DATA_ORDER(coeffTile, XAI_NDWH); + XAI_CHECK_TILE3D_EDGE2(inTile, 2 + 2 * (XAI_CNN_CONV_GET_DILATIONX(param) - 1), 2 + 2 * (XAI_CNN_CONV_GET_DILATIONY(param) - 1)); + XAI_CHECK_CONSISTENCY_MOD_DWH(inTile, coeffTile, biasArray, outTile, param); + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_ACCUM_SHIFT(param) < 24, \ + XAI_ERR_NORM, "\nThe accumulator shift = %hhu, value should be less than 24", \ + XAI_CNN_CONV_GET_ACCUM_SHIFT(param)); + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_OUTPUT_SHIFT(param) < 32, \ + XAI_ERR_NORM, "\nThe output shift = %hhu, value should be less than 32", \ + XAI_CNN_CONV_GET_OUTPUT_SHIFT(param)); + XAI_CHECK_CONV_RELU_LIMITS_IX(param, outTile); +#ifdef DILATED_VQ_CONV + XAI_CHECK_ARRAY_U16(outputScaleArray); + XAI_CHECK_ERROR(XAI_ARRAY_GET_WIDTH(outputScaleArray) >= XAI_TILE4D_GET_DIM1(coeffTile), XAI_ERR_DATASIZE, \ + "\nWidth of Output Scale Array = %d, Number of Kernels = %d\nWidth of Output Scale Array should be greater than or equal to Number of Kernels", \ + XAI_ARRAY_GET_WIDTH(outputScaleArray), XAI_TILE4D_GET_DIM1(coeffTile)); +#endif + } + +#ifndef DILATED_VQ_CONV + if (XAI_CNN_CONV_GET_OUTPUT_SCALE(param) == 0) + { + int32_t fillValue; + int32_t reluFlag = XAI_CNN_CONV_GET_FLAG_RELU(param); + fillValue = reluFlag ? 
(CLAMP(0, XAI_CNN_CONV_GET_RELU_MIN(param), XAI_CNN_CONV_GET_RELU_MAX(param))) : 0; + return(xaiFillTile3D(outTile, fillValue, 0)); + } +#endif + + + /* Calling further optimized function if dim1Size == dim1Pitch */ + if (XAI_TILE3D_GET_DIM1(inTile) == XAI_TILE3D_GET_DIM1_PITCH(inTile) && \ + (XAI_CNN_CONV_GET_DILATIONX(param) == 1) && (XAI_CNN_CONV_GET_DILATIONY(param) == 1)) + { + if ((XAI_TILE3D_GET_DIM1(inTile) * XAI_TILE4D_GET_DIM3(coeffTile)) % 4 == 0) + { +#ifdef DILATED_VQ_CONV + convolvedVQ3D_S_MxN_S8S8IXCa2_MOD_DWH_contiguous_depth_x4(inTile, coeffTile, biasArray, \ + outputScaleArray, outTile, param); +#else + convolved3D_S_MxN_S8S8IXCa2_MOD_DWH_contiguous_depth_x4(inTile, coeffTile, biasArray, \ + outTile, param); +#endif + } + else + { +#ifdef DILATED_VQ_CONV + convolvedVQ3D_S_MxN_S8S8IXCa2_MOD_DWH_contiguous_depth(inTile, \ + coeffTile, biasArray, outputScaleArray, outTile, param); +#else + convolved3D_S_MxN_S8S8IXCa2_MOD_DWH_contiguous_depth(inTile, \ + coeffTile, biasArray, outTile, param); +#endif + } + return(XAI_ERROR_STATUS()); + } + + /* Getting parameters from the tile structures */ + const int32_t outW = XAI_TILE3D_GET_DIM2(outTile); + const int32_t outH = XAI_TILE3D_GET_DIM3(outTile); + const int32_t numInCh = XAI_TILE3D_GET_DIM1(inTile); + const int32_t numOutCh = XAI_TILE3D_GET_DIM1(outTile); + const uint8_t dilationX = XAI_CNN_CONV_GET_DILATIONX(param); + const uint8_t dilationY = XAI_CNN_CONV_GET_DILATIONY(param); + + XAI_ERROR_CHECKS_CONTINUE() + { + XAI_CHECK_TILE3D_FITS_IN_SINGLE_DRAM(inTile); + /* Max value of Gather Offset is ((stride*min(outW-1, 1) + 4 * dilationX) * inDataPitch1 + + * min(3, numInCh - 1)) */ + XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM1_PITCH(inTile) < \ + ((USHRT_MAX - XT_MIN(3, numInCh - 1)) / (XAI_CNN_CONV_GET_STRIDE(param) * \ + XT_MIN(1, outW - 1) + 4 * XAI_CNN_CONV_GET_DILATION(param))), \ + XAI_ERR_BADARG, "dim1Pitch value of inTile = %d, should be less than Gather Offset(16-bit limit) - %d", \ + 
XAI_TILE3D_GET_DIM1_PITCH(inTile), \ + ((USHRT_MAX - XT_MIN(3, numInCh - 1)) / (XAI_CNN_CONV_GET_STRIDE(param) * \ + XT_MIN(1, outW - 1) + 4 * XAI_CNN_CONV_GET_DILATION(param)))); + } + + /* CNN convolution parameters */ + const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param); + const uint8_t outShiftU = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param); + const uint8_t enableReLu = XAI_CNN_CONV_GET_FLAG_RELU(param); + const uint8_t stride = XAI_CNN_CONV_GET_STRIDE(param); + + /* Data Pointers of input, output, coefficient and bias data */ + int8_t *pInData = (int8_t *) XAI_TILE3D_GET_DATA_PTR(inTile); + int8_t *pOutData = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile); + int8_t *pCoeffData = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile); + int32_t *pBiasData = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray); +#ifdef DILATED_VQ_CONV + xb_vecNx16U* restrict pOutScaleData = (xb_vecNx16U *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray); +#else + const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param); +#endif + /* Pitches of Coefficient Data (NDWH) in dim1, dim2 and dim3 */ + const int32_t coeffPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTile); + const int32_t coeffPitch2 = XAI_TILE4D_GET_DIM2_PITCH(coeffTile); + const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile); + + /* Pitches of Input Data (DWH) in dim1 and dim2 */ + const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile); + const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile); + + /* Pitch of Output Data (DWH) in dim1 and dim2 */ + const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile); + const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile); + + const int32_t kWidthU = XAI_TILE4D_GET_DIM3(coeffTile); + const int32_t kHeightU = XAI_TILE4D_GET_DIM4(coeffTile); + + int32_t dilatedKWidth = dilationX * (kWidthU - 1) + 1; + int32_t dilatedKHeight = dilationY * (kHeightU - 1) + 1; + + /* move to start of edge data only when input is already padded. 
*/ + pInData = &pInData[-(int32_t) ((dilatedKHeight / 2) * inDataPitch2 + (dilatedKWidth / 2) * inDataPitch1)]; + + /* Setting the limits for output data according to ReLu Flag and outTileType */ + int32_t minLim, maxLim; + if (enableReLu) + { + minLim = XAI_CNN_CONV_GET_RELU_MIN(param); + maxLim = XAI_CNN_CONV_GET_RELU_MAX(param); + } + else + { + minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \ + SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0); + maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \ + : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX); + } + const int8_t typeFlag = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0; + const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile); + + /* Variable Declarations */ + int32_t inCh, outCh, x, y, ky; + + + /* 4 Gathers are being used to Load Input Data. Many common elements + * will be loaded in separate Gathers, especially in the case of + * stride 1 and 2. To take the advantage of having common offsets in + * a single Gather, 2 Gather Patterns are generated as given below. + * For example, in the Gather Patterns generated below, + * if stride is 1 and dilation equal to 1, then 8 offsets are common and if stride is 2, 4 offsets + * are common in each Gather. 
+ */ + /* Gather Index Calculations */ + xb_vecNx16 vecGather = IVP_ANDNX16(IVP_SEQNX16(), 3); + IVP_MULANX16PACKL(vecGather, inDataPitch1 * dilationX, IVP_SRLINX16(IVP_SEQNX16(), 2)); + xb_vecNx16 vecGather1 = IVP_ADDNX16(vecGather, stride * inDataPitch1); + + xb_vecNx16 vecSelIdx1 = IVP_SEQNX16(); + IVP_ADDNX16T(vecSelIdx1, vecSelIdx1, 20, IVP_NOTBN(IVP_LTRNI(12))); + xb_vecNx16U vecGatherOff1 = IVP_SELNX16(vecGather1, vecGather, vecSelIdx1); + xb_vecNx16 vecSelIdx2 = IVP_ADDNX16(IVP_SEQNX16(), 12); + IVP_ADDNX16T(vecSelIdx2, vecSelIdx2, 20, IVP_NOTBN(IVP_LTRNI(8))); + xb_vecNx16U vecGatherOff2 = IVP_SELNX16(vecGather1, vecGather, vecSelIdx2); + /* Index Pattern of vecGatherOff1 is - + * 0 1 2 3 d*P1 d*P1+1 d*P1+2 d*P1+3 d*2*P1 d*2*P1+1 d*2*P1+2 d*2*P1+3 + * s*P1 s*P1+1 s*P1+2 s*dP1+3 (s+1*d)*P1 (s+1*d)*P1+1 (s+1*d)*P1+2 (s+1*d)*P1+3 */ + + /* Index Pattern of vecGatherOff2 is - + * d*3*P1 d*3*P1+1 d*3*P1+2 d*3*P1+3 d*4*P1 d*4*P1+1 d*4*P1+2 d*4*P1+3 + * (s+2*d)*P1 (s+2*d)*P1+1 (s+2*d)*P1+2 (s+2*d)*P1+3 + * (s+3*d)*P1 (s+3*d)*P1+1 (s+3*d)*P1+2 (s+3*d)*P1+3 + * (s+4*d)*P1 (s+4*d)*P1+1 (s+4*d)*P1+2 (s+4*d)*P1+3 */ + + valign vaOutData = IVP_ZALIGN(); + + xb_vecN_2x32v* restrict phvecBias; + xb_vec2Nx8* restrict pdvecCoeff; + xb_vec2Nx8* restrict pdvecOut; + int8_t* restrict pData1; + int8_t* restrict pData2; + + int32_t remCh = numInCh & 3; + + /*Generation of maskLut for handling cases when remInCh is not equal to 0 */ + /*eg. if remInCh is equal to 1 then sumMask is 0000FFFF */ + /* if remInCh is equal to 2 then sumMask is 00FFFFFF */ + const uint32_t maskLut[3] = { 0xff, 0xff00, 0xff0000 }; + + uint8_t remCh1 = XT_SALT(2, remCh + 1); + uint8_t remCh2 = XT_SALT(3, remCh + 1); + + uint32_t sumMask = maskLut[0] + maskLut[1] * remCh1 + maskLut[2] * remCh2; + +#ifdef __XCC__ + XT_MEMW(); /* Adding Memory Wait as Gather and Normal Load/Stores are not synchronized */ +#endif + + /* 4 Gathers used for Input Data Load. Unrolled along */ + /* Output Width and Height by 2. 
Also, unrolled along */ + /* Input Channels by 4 and Kernel Width. */ + + /* Loops Start */ + for (outCh = 0; outCh < numOutCh; outCh += 2 * XCHAL_IVPN_SIMD_WIDTH) /* Out Channels */ + { /* walk across the kernels */ + /* To handle corner case when number of output channels + * is not a multiple of 2 * XCHAL_IVPN_SIMD_WIDTH*/ + int32_t remainingOutCh = numOutCh - outCh; +#ifdef DILATED_VQ_CONV + xb_vecNx16U outScaleDataEven, outScaleDataOdd; + /*Load output scale values*/ + VQ_INIT_OUTSCALE(pOutScaleData, remainingOutCh, outScaleDataEven, outScaleDataOdd); +#endif + for (y = 0; y < outH; y += 2) /* Image Height */ + { /* walk down the rows */ + /* Variable used for corner case handling of Out Height odd */ + int32_t numY = XT_MIN(2, outH - y) - 1; + for (x = 0; x < outW; x += 2) /* Image Width */ + { /* walk across the columns */ + xb_vecNx16U vecGatherOff00; + xb_vecNx16U vecGatherOff01; + xb_vecNx16U vecGatherOff10; + xb_vecNx16U vecGatherOff11; + /* Variable used for corner case handling of Out Width odd */ + int32_t numX = XT_MIN(2, outW - x) - 1; + + /* Output, Input and Coefficient Data Pointers */ + int8_t *pOut = pOutData + (x * outDataPitch1 + y * outDataPitch2) * bytesPerPixel; + int8_t *pData = pInData + (x * stride) * inDataPitch1 + (y * stride) * inDataPitch2; + int8_t *pCoeff = pCoeffData + outCh; + + /* Initialize accumulators with bias values */ + xb_vec2Nx24 daccSum1, daccSum2, daccSum3, daccSum4; + phvecBias = (xb_vecN_2x32v *) (pBiasData + outCh); + ACC_INIT_BIAS(phvecBias, remainingOutCh, daccSum1, daccSum2, daccSum3, daccSum4); + + /* Boolean Vectors for Predicate Gather with corner cases */ + /* handled for Out Width and Height being odd numbers */ + vboolN vb1 = IVP_ORBN(IVP_LTRNI(12), IVP_LTRSN(20 * numX)); + vboolN vb2 = IVP_ORBN(IVP_LTRNI(8), IVP_LTRSN(20 * numX)); + vboolN vb3 = IVP_ANDBN(vb1, IVP_LTRSN(20 * numY)); + vboolN vb4 = IVP_ANDBN(vb2, IVP_LTRSN(20 * numY)); + + for (ky = 0; ky < 5; ky++) /* Kernel Height */ + { + /* Pointer for 
Input Data Load */ + pData1 = pData + ky * dilationY * inDataPitch2; + pData2 = pData1 + (stride * inDataPitch2 * numY); + + /* Pointer for Coefficient Load */ + pdvecCoeff = (xb_vec2Nx8 *) (pCoeff + ky * coeffPitch3); + /* Assign valid address for predicated false lines */ + vecGatherOff00 = IVP_MOVNX16UT(vecGatherOff1, 0, vb1); + vecGatherOff01 = IVP_MOVNX16UT(vecGatherOff2, 0, vb2); + vecGatherOff10 = IVP_MOVNX16UT(vecGatherOff1, 0, vb3); + vecGatherOff11 = IVP_MOVNX16UT(vecGatherOff2, 0, vb4); + + + for (inCh = 0; inCh < numInCh - 3; inCh += 4) /* Input Channels */ + { + /* Gather Load of Input Data */ + xb_gsr gather1 = IVP_GATHERANX8S(pData1, vecGatherOff00); + xb_vec2Nx8 dvecData1 = IVP_GATHERD2NX8_L(gather1); + xb_gsr gather2 = IVP_GATHERANX8S(pData1, vecGatherOff01); + xb_vec2Nx8 dvecData2 = IVP_GATHERD2NX8_L(gather2); + xb_gsr gather3 = IVP_GATHERANX8S(pData2, vecGatherOff10); + xb_vec2Nx8 dvecData3 = IVP_GATHERD2NX8_L(gather3); + xb_gsr gather4 = IVP_GATHERANX8S(pData2, vecGatherOff11); + xb_vec2Nx8 dvecData4 = IVP_GATHERD2NX8_L(gather4); + + + pData1 += 4; + pData2 += 4; + + /* kx = 1 */ + /* Extracting scalars for QMULs */ + int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData1)), 0); + int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData1)), 3); + int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData3)), 0); + int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData3)), 3); + + /* 4 Aligned Vector Loads of coefficients */ + xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1); + xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1); + xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1); + xb_vec2Nx8 dvecCoeff4; IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch2 - 3 * \ + coeffPitch1); + + IVP_MULQA2N8XR8(daccSum1, dvecCoeff4, 
dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1); + IVP_MULQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2); + IVP_MULQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3); + IVP_MULQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4); + + /* kx = 2 */ + /* Extracting scalars for QMULs */ + qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), \ + 1); + qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), \ + 4); + qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData3)), \ + 1); + qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData3)), \ + 4); + + /* 4 Aligned Vector Loads of coefficients */ + IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1); + IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1); + IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1); + IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch2 - 3 * coeffPitch1); + + IVP_MULQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1); + IVP_MULQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2); + IVP_MULQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3); + IVP_MULQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4); + + /* kx = 3 */ + /* Extracting scalars for QMULs */ + qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), \ + 2); + qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \ + 2); + qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData3)), \ + 2); + qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData4)), \ + 2); + + /* 4 Aligned Vector Loads of coefficients */ + IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1); + IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1); + 
IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1); + IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch2 - 3 * coeffPitch1); + + IVP_MULQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1); + IVP_MULQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2); + IVP_MULQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3); + IVP_MULQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4); + + /* kx = 4 */ + /* Extracting scalars for QMULs */ + qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \ + 0); + qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \ + 3); + qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData4)), \ + 0); + qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData4)), \ + 3); + + /* 4 Aligned Vector Loads of coefficients */ + IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1); + IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1); + IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1); + IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch2 - 3 * coeffPitch1); + + IVP_MULQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1); + IVP_MULQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2); + IVP_MULQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3); + IVP_MULQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4); + + /* kx = 5 */ + /* Extracting scalars for QMULs */ + qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \ + 1); + qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \ + 4); + qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData4)), \ + 1); + qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData4)), 
\ + 4); + + /* 4 Aligned Vector Loads of coefficients */ + IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1); + IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1); + IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1); + IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch1 - 4 * coeffPitch2); + + IVP_MULQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1); + IVP_MULQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2); + IVP_MULQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3); + IVP_MULQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4); + } /* End Input Channels */ + + /* Handling Corner cases of Number of Input Channels not being multiple of 4 */ + if (inCh < numInCh) + { + int32_t remInCh = numInCh - inCh; + vboolN vbRemInCh = IVP_LTNX16(IVP_ANDNX16(IVP_SEQNX16(), 3), remInCh); + /* Assign valid address for predicated false lines */ + vecGatherOff00 = IVP_MOVNX16UT(vecGatherOff1, 0, IVP_ANDBN(vbRemInCh, vb1)); + vecGatherOff01 = IVP_MOVNX16UT(vecGatherOff2, 0, IVP_ANDBN(vbRemInCh, vb2)); + vecGatherOff10 = IVP_MOVNX16UT(vecGatherOff1, 0, IVP_ANDBN(vbRemInCh, vb3)); + vecGatherOff11 = IVP_MOVNX16UT(vecGatherOff2, 0, IVP_ANDBN(vbRemInCh, vb4)); + + /* Gather Input Data */ + xb_vec2Nx8 dvecData1 = 0; + xb_vec2Nx8 dvecData2 = 0; + xb_vec2Nx8 dvecData3 = 0; + xb_vec2Nx8 dvecData4 = 0; + xb_gsr gather1 = IVP_GATHERANX8S(pData1, vecGatherOff00); + dvecData1 = IVP_GATHERD2NX8_L(gather1); + xb_gsr gather2 = IVP_GATHERANX8S(pData1, vecGatherOff01); + dvecData2 = IVP_GATHERD2NX8_L(gather2); + xb_gsr gather3 = IVP_GATHERANX8S(pData2, vecGatherOff10); + dvecData3 = IVP_GATHERD2NX8_L(gather3); + xb_gsr gather4 = IVP_GATHERANX8S(pData2, vecGatherOff11); + dvecData4 = IVP_GATHERD2NX8_L(gather4); + + + /* kx = 1 */ + /* Extracting scalars for QMULs */ + int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData1)), 0); + int32_t 
qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData1)), 3); + int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData3)), 0); + int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData3)), 3); + + /* Aligned Vector Loads of coefficients */ + xb_vec2Nx8 dvecCoeff1; + IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1 * remCh1); + xb_vec2Nx8 dvecCoeff2; + IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1 * remCh2); + xb_vec2Nx8 dvecCoeff3; + IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch2 - (coeffPitch1 * (remCh1 + remCh2))); + + IVP_MULQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1 & sumMask); + IVP_MULQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2 & sumMask); + IVP_MULQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3 & sumMask); + IVP_MULQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4 & sumMask); + + /* kx = 2 */ + /* Extracting scalars for QMULs */ + qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), \ + 1); + qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), \ + 4); + qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData3)), \ + 1); + qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData3)), \ + 4); + + /* Aligned Vector Loads of coefficients */ + IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1 * remCh1); + IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1 * remCh2); + IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch2 - (coeffPitch1 * (remCh1 + remCh2))); + + IVP_MULQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1 & sumMask); + IVP_MULQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2 & sumMask); + IVP_MULQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3 & sumMask); + 
IVP_MULQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4 & sumMask); + + /* kx = 3 */ + /* Extracting scalars for QMULs */ + qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), \ + 2); + qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \ + 2); + qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData3)), \ + 2); + qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData4)), \ + 2); + + /* Aligned Vector Loads of coefficients */ + IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1 * remCh1); + IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1 * remCh2); + IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch2 - (coeffPitch1 * (remCh1 + remCh2))); + + IVP_MULQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1 & sumMask); + IVP_MULQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2 & sumMask); + IVP_MULQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3 & sumMask); + IVP_MULQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4 & sumMask); + + /* kx = 4 */ + /* Extracting scalars for QMULs */ + qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \ + 0); + qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \ + 3); + qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData4)), \ + 0); + qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData4)), \ + 3); + + /* Aligned Vector Loads of coefficients */ + IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1 * remCh1); + IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1 * remCh2); + IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch2 - coeffPitch1 * (remCh1 + remCh2)); + + IVP_MULQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1 & sumMask); + IVP_MULQA2N8XR8(daccSum2, 0, dvecCoeff3, 
dvecCoeff2, dvecCoeff1, qmulScalar2 & sumMask); + IVP_MULQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3 & sumMask); + IVP_MULQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4 & sumMask); + + /* kx = 5 */ + /* Extracting scalars for QMULs */ + qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \ + 1); + qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), \ + 4); + qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData4)), \ + 1); + qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData4)), \ + 4); + + /* Aligned Vector Loads of coefficients */ + IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1 * remCh1); + IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1 * remCh2); + IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, 0); + + IVP_MULQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1 & sumMask); + IVP_MULQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2 & sumMask); + IVP_MULQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3 & sumMask); + IVP_MULQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4 & sumMask); + } /* End Input Channels corner case handling */ + } /* End Kernel Height */ + + /* Pack, Output Scale, Output Shift and clamping */ + xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L; + xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H; +#ifdef DILATED_VQ_CONV + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); 
+ PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); +#else + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); +#endif + /* Store the output dvecOut1 along the output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOut + outCh * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh); + IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Store the output dvecOut2 along the output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1) * bytesPerPixel * numX); + IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numX); + IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numX); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Store the output dvecOut3 along the output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch2) * bytesPerPixel * numY); + IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numY); + IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numY); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Store the output dvecOut4 along the output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1 + outDataPitch2) * bytesPerPixel * numX * numY); 
+ IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numX * \ + numY); + IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numX * numY); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + } /* End image width */ + } /* End image height */ + } /* End Output Channels */ + return(XAI_ERROR_STATUS()); +} + +/***************************************************************************** +* xaiConvolvedVQ3D_S_7x7_S8S8IXCa2_MOD_DWH +* **************************************************************************/ + +/****************************************************************************/ +/* Description : P6 optimized generic implementation for 7x7 MOD_DWH */ +/* 3D convolution. Based on pre-processor specifiers. Code */ +/* implementation is generated during preprocessing stage. */ +/* This method can be used to generate 7x7 MOD_DWH 3D */ +/* dilated convolution function and 7x7 MOD_DWH 3D VQ */ +/* dilated convolution function */ +/* Stride values = 1, 2 and 4 are supported. 
*/ +/* Implementation also supports dilation >= 1 for stride = 1 */ +/* and dilation = 1 for stride = 2, 4 */ +/* Inputs : Input Data Tile, Coeff Data Tile, Bias Array, */ +/* Output scale array, CNN convolution params structure */ +/* Outputs : XI Error Code */ +/* InOuts : Output Tile */ +/* Assumptions : InData, CoeffData are S8 */ +/* biasArray is signed 32b, value not exceeding signed 24b */ +/* Output scale array is U16 */ +/* OutData is S8 / U8 / S16 */ +/* Kernel Size is 7x7xDxN */ +/* Input and Output are in DWH format */ +/* Coeff is in NDWH format */ +/* CoeffDim1Pitch is aligned to 2N (Ca2) */ +/****************************************************************************/ + +#ifdef DILATED_VQ_CONV +XAI_ERR_TYPE xaiConvolvedVQ3D_S_7x7_S8S8IXCa2_MOD_DWH( + const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param + ) +#else +XAI_ERR_TYPE xaiConvolved3D_S_7x7_S8S8IXCa2_MOD_DWH( + const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param + ) +#endif +{ + /* Error Checks */ + XAI_ERROR_CHECKS() + { + XAI_CHECK_TILE3D_S8(inTile); + XAI_CHECK_CONV_OUTPUT_TILE3D(outTile); + XAI_CHECK_TILE4D_S8(coeffTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile); + XAI_CHECK_TILE4D_IN_DRAM_BOUNDARY(coeffTile); + XAI_CHECK_POINTER(param); + XAI_CHECK_ARRAY_S32(biasArray); + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTile); + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(coeffTile, outTile); + XAI_CHECK_KERNEL_SIZE(coeffTile, 7); + XAI_CHECK_ERROR((XAI_CNN_CONV_GET_STRIDEX(param) == XAI_CNN_CONV_GET_STRIDEY(param)), \ + XAI_ERR_BADARG, "Stride along width = %hhu and height = %hhu\nStride along width and height should be equal", \ + XAI_CNN_CONV_GET_STRIDEX(param), XAI_CNN_CONV_GET_STRIDEY(param)); + 
XAI_CHECK_ERROR((XAI_CNN_CONV_GET_STRIDE(param) == 1) || \ + (XAI_CNN_CONV_GET_STRIDE(param) == 2) || \ + (XAI_CNN_CONV_GET_STRIDE(param) == 4), XAI_ERR_BADARG, \ + "\nStride = %hhu, value should be 1, 2 or 4", XAI_CNN_CONV_GET_STRIDE(param)); + XAI_CHECK_ERROR((XAI_CNN_CONV_GET_DILATIONX(param) > 0 && XAI_CNN_CONV_GET_DILATIONY(param) > 0), \ + XAI_ERR_BADARG, "dilation parameter has to be >= 1"); + XAI_CHECK_TILE4D_IALIGNMENT_2NX8(coeffTile); + XAI_CHECK_TILE3D_DATA_ORDER(inTile, XAI_DWH); + XAI_CHECK_TILE3D_DATA_ORDER(outTile, XAI_DWH); + XAI_CHECK_TILE4D_DATA_ORDER(coeffTile, XAI_NDWH); + XAI_CHECK_TILE3D_EDGE(inTile, 3 + 3 * (XAI_CNN_CONV_GET_DILATION(param) - 1)); + XAI_CHECK_CONSISTENCY_MOD_DWH(inTile, coeffTile, biasArray, outTile, param); + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_ACCUM_SHIFT(param) < 24, \ + XAI_ERR_NORM, "\nThe accumulator shift = %hhu, value should be less than 24", \ + XAI_CNN_CONV_GET_ACCUM_SHIFT(param)); + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_OUTPUT_SHIFT(param) < 32, \ + XAI_ERR_NORM, "\nThe output shift = %hhu, value should be less than 32", \ + XAI_CNN_CONV_GET_OUTPUT_SHIFT(param)); + XAI_CHECK_CONV_RELU_LIMITS_IX(param, outTile); +#ifdef DILATED_VQ_CONV + XAI_CHECK_ARRAY_U16(outputScaleArray); + XAI_CHECK_ERROR(XAI_ARRAY_GET_WIDTH(outputScaleArray) >= XAI_TILE4D_GET_DIM1(coeffTile), XAI_ERR_DATASIZE, \ + "\nWidth of Output Scale Array = %d, Number of Kernels = %d\nWidth of Output Scale Array should be greater than or equal to Number of Kernels", \ + XAI_ARRAY_GET_WIDTH(outputScaleArray), XAI_TILE4D_GET_DIM1(coeffTile)); +#endif + } + +#ifndef DILATED_VQ_CONV + if (XAI_CNN_CONV_GET_OUTPUT_SCALE(param) == 0) + { + int32_t fillValue; + int32_t reluFlag = XAI_CNN_CONV_GET_FLAG_RELU(param); + fillValue = reluFlag ? 
(CLAMP(0, XAI_CNN_CONV_GET_RELU_MIN(param), XAI_CNN_CONV_GET_RELU_MAX(param))) : 0; + return(xaiFillTile3D(outTile, fillValue, 0)); + } +#endif + + /* Calling further optimized function if dim1Size == dim1Pitch */ + if (XAI_TILE3D_GET_DIM1(inTile) == XAI_TILE3D_GET_DIM1_PITCH(inTile) && \ + (XAI_CNN_CONV_GET_DILATIONX(param) == 1) && (XAI_CNN_CONV_GET_DILATIONY(param) == 1)) + { + if ((XAI_TILE3D_GET_DIM1(inTile) * XAI_TILE4D_GET_DIM3(coeffTile)) % 4 == 0) + { +#ifdef DILATED_VQ_CONV + convolvedVQ3D_S_MxN_S8S8IXCa2_MOD_DWH_contiguous_depth_x4(inTile, coeffTile, biasArray, \ + outputScaleArray, outTile, param); +#else + convolved3D_S_MxN_S8S8IXCa2_MOD_DWH_contiguous_depth_x4(inTile, coeffTile, biasArray, \ + outTile, param); +#endif + } + else + { +#ifdef DILATED_VQ_CONV + convolvedVQ3D_S_MxN_S8S8IXCa2_MOD_DWH_contiguous_depth(inTile, \ + coeffTile, biasArray, outputScaleArray, outTile, param); +#else + convolved3D_S_MxN_S8S8IXCa2_MOD_DWH_contiguous_depth(inTile, \ + coeffTile, biasArray, outTile, param); +#endif + } + return(XAI_ERROR_STATUS()); + } + + /* Getting parameters from the tile structures */ + const int32_t outW = XAI_TILE3D_GET_DIM2(outTile); + const int32_t outH = XAI_TILE3D_GET_DIM3(outTile); + const int32_t numInCh = XAI_TILE3D_GET_DIM1(inTile); + const int32_t numOutCh = XAI_TILE3D_GET_DIM1(outTile); + + XAI_ERROR_CHECKS_CONTINUE() + { + XAI_CHECK_TILE3D_FITS_IN_SINGLE_DRAM(inTile); + /* Max value of Gather Offset is ((stride*min(1,outW-1) + 6*dilationX) * inDataPitch1 + min(3,numInCh-1)) */ + XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM1_PITCH(inTile) < \ + ((USHRT_MAX - XT_MIN(3, numInCh - 1)) / \ + (XAI_CNN_CONV_GET_STRIDE(param) * XT_MIN(1, outW - 1) + 6 * XAI_CNN_CONV_GET_DILATION(param))), \ + XAI_ERR_BADARG, "\ndim1Pitch value of inTile = %d, should be less than Gather Offset(16-bit limit) - %d", \ + XAI_TILE3D_GET_DIM1_PITCH(inTile), \ + ((USHRT_MAX - XT_MIN(3, numInCh - 1)) / \ + (XAI_CNN_CONV_GET_STRIDE(param) * XT_MIN(1, outW - 1) + 6 * 
XAI_CNN_CONV_GET_DILATION(param)))); + } + + /* Kernel Size (NDWH) */ + const int32_t kWidthU = XAI_TILE4D_GET_DIM3(coeffTile); + const int32_t kHeightU = XAI_TILE4D_GET_DIM4(coeffTile); + + /* CNN convolution parameters */ + const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param); + const uint8_t outShiftU = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param); + const uint8_t enableReLu = XAI_CNN_CONV_GET_FLAG_RELU(param); + const uint8_t stride = XAI_CNN_CONV_GET_STRIDE(param); + const uint8_t dilationX = XAI_CNN_CONV_GET_DILATIONX(param); + const uint8_t dilationY = XAI_CNN_CONV_GET_DILATIONY(param); + + /* Data Pointers of input, output, coefficient and bias data */ + int8_t *pInData = (int8_t *) XAI_TILE3D_GET_DATA_PTR(inTile); + int8_t *pOutData = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile); + int8_t *pCoeffData = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile); + int32_t *pBiasData = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray); +#ifdef DILATED_VQ_CONV + xb_vecNx16U* restrict pOutScaleData = (xb_vecNx16U *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray); +#else + const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param); +#endif + /* Pitches of Coefficient Data (NDWH) in dim1, dim2 and dim3 */ + const int32_t coeffPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTile); + const int32_t coeffPitch2 = XAI_TILE4D_GET_DIM2_PITCH(coeffTile); + const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile); + + /* Pitches of Input Data (DWH) in dim1 and dim2 */ + const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile); + const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile); + + /* Pitch of Output Data (DWH) in dim1 and dim2 */ + const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile); + const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile); + + /* Effective Kernel size = dilation(KernelSize - 1) + 1 */ + /* Effective kernel size is used for calculating the min required edge */ + int32_t dilatedKWidth = dilationX * (kWidthU - 1) + 1; + int32_t 
dilatedKHeight = dilationY * (kHeightU - 1) + 1; + /* Move pointer to the start of the data (including edge) */ + pInData = &pInData[-((dilatedKWidth / 2) * inDataPitch1 + (dilatedKHeight / 2) * inDataPitch2)]; + + /* Setting the limits for output data according to ReLu Flag and outTileType */ + int32_t minLim, maxLim; + if (enableReLu) + { + minLim = XAI_CNN_CONV_GET_RELU_MIN(param); + maxLim = XAI_CNN_CONV_GET_RELU_MAX(param); + } + else + { + minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \ + SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0); + maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \ + : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX); + } + const int8_t typeFlag = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0; + const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile); + + /* Variable Declarations */ + int32_t inCh, outCh, x, y, ky; + + /* 4 Gathers are being used to Load Input Data. Many common elements + * will be loaded in separate Gathers, especially in the case of + * stride 1 and 2. To take advantage of having common offsets in + * a single Gather, 2 Gather Patterns are generated as given below. + * For example, in the Gather Patterns generated below, + * if stride is 1 and dilation = 1, then 12 offsets are common, and + * if stride is 2 and dilation = 1, then 8 offsets are common in each Gather. 
+ */ + /* Gather Index Calculations */ + xb_vecNx16 vecGather = IVP_ANDNX16(IVP_SEQNX16(), 3); + IVP_MULANX16PACKL(vecGather, inDataPitch1 * dilationX, IVP_SRLINX16(IVP_SEQNX16(), 2)); + xb_vecNx16 vecGather1 = IVP_ADDNX16(vecGather, stride * inDataPitch1); + + xb_vecNx16 vecSelIdx1 = IVP_SEQNX16(); + IVP_ADDNX16T(vecSelIdx1, vecSelIdx1, 16, IVP_NOTBN(IVP_LTRNI(16))); + xb_vecNx16U vecGatherOff1 = IVP_SELNX16(vecGather1, vecGather, vecSelIdx1); + xb_vecNx16 vecSelIdx2 = IVP_ADDNX16(IVP_SEQNX16(), 16); + IVP_ADDNX16T(vecSelIdx2, vecSelIdx2, 16, IVP_NOTBN(IVP_LTRNI(12))); + xb_vecNx16U vecGatherOff2 = IVP_SELNX16(vecGather1, vecGather, vecSelIdx2); + /* Index Pattern of vecGatherOff1 is - + * 0 1 2 3 P1*d P1*d+1 P1*d+2 P1*d+3 2*P1*d 2*P1*d+1 2*P1*d+2 2*P1*d+3 3*P1*d 3*P1*d+1 3*P1*d+2 3*P1*d+3 + * s*P1 s*P1+1 s*P1+2 s*P1+3 (s+1*d)*P1 (s+1*d)*P1+1 (s+1*d)*P1+2 (s+1*d)*P1+3 + * (s+2*d)*P1 (s+2*d)*P1+1 (s+2*d)*P1+2 (s+2*d)*P1+3 */ + + /* Index Pattern of vecGatherOff2 is - + * 4*P1*d 4*P1*d+1 4*P1*d+2 4*P1*d+3 5*P1*d 5*P1*d+1 5*P1*d+2 5*P1*d+3 6*P1*d 6*P1*d+1 6*P1*d+2 6*P1*d+3 + * (s+3*d)*P1 (s+3*d)*P1+1 (s+3*d)*P1+2 (s+3*d)*P1+3 (s+4*d)*P1 (s+4*d)*P1*d+1 (s+4*d)*P1+2 (s+4*d)*P1+3 + * (s+5*d)*P1 (s+5*d)*P1+1 (s+5*d)*P1+2 (s+5*d)*P1+3 (s+6*d)*P1 (s+6*d)*P1+1 (s+6*d)*P1+2 (s+6*d)*P1+3 */ + + valign vaOutData = IVP_ZALIGN(); + + xb_vecN_2x32v* restrict phvecBias; + xb_vec2Nx8* restrict pdvecCoeff; + xb_vec2Nx8* restrict pdvecOut; + int8_t* restrict pData1; + int8_t* restrict pData2; + + int32_t remCh = numInCh & 3; + + /*Generation of maskLut for handling cases when remCh is not equal to 0 */ + /*eg. 
if remInCh is equal to 1 then sumMask is 000000FF */ +/* if remInCh is equal to 2 then sumMask is 0000FFFF */ + const uint32_t maskLut[3] = { 0xff, 0xff00, 0xff0000 }; + + uint8_t remCh1 = XT_SALT(2, remCh + 1); + uint8_t remCh2 = XT_SALT(3, remCh + 1); + + uint32_t sumMask = maskLut[0] + maskLut[1] * remCh1 + maskLut[2] * remCh2; + +#ifdef __XCC__ + XT_MEMW(); /* Adding Memory Wait as Gather and Normal Load/Stores are not synchronized */ +#endif + + /* 4 Gathers are used for Input Data Load corresponding to 4 */ + /* Output Vectors. Loop unrolled along Output Width and Height by 2. */ + /* Also unrolled along Input Channels by 4 and Kernel Width. */ + + /* Loops Start */ + for (outCh = 0; outCh < numOutCh; outCh += 2 * XCHAL_IVPN_SIMD_WIDTH) /* Output Channels */ + { /* walk across the kernels */ + /* To handle corner case when number of output channels + * is not a multiple of 2 * XCHAL_IVPN_SIMD_WIDTH*/ + int32_t remainingOutCh = numOutCh - outCh; +#ifdef DILATED_VQ_CONV + xb_vecNx16U outScaleDataEven, outScaleDataOdd; + /*Load output scale values*/ + VQ_INIT_OUTSCALE(pOutScaleData, remainingOutCh, outScaleDataEven, outScaleDataOdd); +#endif + for (y = 0; y < outH; y += 2) /* Image Height */ + { /* walk down the rows */ + /* Variable used for corner case handling of Out Height odd */ + int32_t numY = XT_MIN(1, outH - y - 1); + for (x = 0; x < outW; x += 2) /* Image Width */ + { /* walk across the columns */ + xb_vecNx16U vecGatherOff00; + xb_vecNx16U vecGatherOff01; + xb_vecNx16U vecGatherOff10; + xb_vecNx16U vecGatherOff11; + /* Variable used for corner case handling of Out Width odd */ + int32_t numX = XT_MIN(1, outW - x - 1); + + /* Output, Input and Coefficient Data Pointers */ + int8_t *pOut = pOutData + (x * outDataPitch1 + y * outDataPitch2) * bytesPerPixel; + int8_t *pData = pInData + (x * stride) * inDataPitch1 + (y * stride) * inDataPitch2; + int8_t *pCoeff = pCoeffData + outCh; + + /* Initialize accumulators with bias values */ + xb_vec2Nx24 
daccSum1, daccSum2, daccSum3, daccSum4; + phvecBias = (xb_vecN_2x32v *) (pBiasData + outCh); + ACC_INIT_BIAS(phvecBias, remainingOutCh, daccSum1, daccSum2, daccSum3, daccSum4); + + /* Boolean Vectors for Predicate Gather with corner cases */ + /* handled for Out Width and Height being odd numbers */ + vboolN vb1 = IVP_ORBN(IVP_LTRNI(16), IVP_LTRSN(28 * numX)); + vboolN vb2 = IVP_ORBN(IVP_LTRNI(12), IVP_LTRSN(28 * numX)); + vboolN vb3 = IVP_ANDBN(vb1, IVP_LTRSN(28 * numY)); + vboolN vb4 = IVP_ANDBN(vb2, IVP_LTRSN(28 * numY)); + + for (ky = 0; ky < 7; ky++) /* Kernel Height */ + { + /* Pointer for Input Data Load */ + pData1 = pData + ky * inDataPitch2 * dilationY; + pData2 = pData1 + (stride * inDataPitch2 * numY); + /* Assign valid address for predicated false lines */ + vecGatherOff00 = IVP_MOVNX16UT(vecGatherOff1, 0, vb1); + vecGatherOff01 = IVP_MOVNX16UT(vecGatherOff2, 0, vb2); + vecGatherOff10 = IVP_MOVNX16UT(vecGatherOff1, 0, vb3); + vecGatherOff11 = IVP_MOVNX16UT(vecGatherOff2, 0, vb4); + /* Pointer for Coefficient Load */ + pdvecCoeff = (xb_vec2Nx8 *) (pCoeff + ky * coeffPitch3); + + for (inCh = 0; inCh < numInCh - 3; inCh += 4) /* Number of Input Channels */ + { + /* Gathers for Input Loads */ + xb_gsr gather1 = IVP_GATHERANX8S(pData1, vecGatherOff00); + xb_vec2Nx8 dvecData1 = IVP_GATHERD2NX8_L(gather1); + xb_gsr gather2 = IVP_GATHERANX8S(pData1, vecGatherOff01); + xb_vec2Nx8 dvecData2 = IVP_GATHERD2NX8_L(gather2); + xb_gsr gather3 = IVP_GATHERANX8S(pData2, vecGatherOff10); + xb_vec2Nx8 dvecData3 = IVP_GATHERD2NX8_L(gather3); + xb_gsr gather4 = IVP_GATHERANX8S(pData2, vecGatherOff11); + xb_vec2Nx8 dvecData4 = IVP_GATHERD2NX8_L(gather4); + + pData1 += 4; + pData2 += 4; + + /* kx = 1 */ + /* Extracting Scalars for QMULs */ + int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData1)), 0); + int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData1)), 4); + int32_t qmulScalar3 = 
IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData3)), 0); + int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData3)), 4); + + /* 4 Aligned Vector Loads of coefficients */ + xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1); + xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1); + xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1); + xb_vec2Nx8 dvecCoeff4; IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch2 - 3 * coeffPitch1); + + IVP_MULQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1); + IVP_MULQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2); + IVP_MULQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3); + IVP_MULQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4); + + /* kx = 2 */ + /* Extracting Scalars for QMULs */ + qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), 1); + qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), 5); + qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData3)), 1); + qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData3)), 5); + + /* 4 Aligned Vector Loads of coefficients */ + IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1); + IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1); + IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1); + IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch2 - 3 * coeffPitch1); + + IVP_MULQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1); + IVP_MULQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2); + IVP_MULQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3); + IVP_MULQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4); + + /* kx 
= 3 */ + /* Extracting Scalars for QMULs */ + qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), 2); + qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), 6); + qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData3)), 2); + qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData3)), 6); + + /* 4 Aligned Vector Loads of coefficients */ + IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1); + IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1); + IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1); + IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch2 - 3 * coeffPitch1); + + IVP_MULQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1); + IVP_MULQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2); + IVP_MULQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3); + IVP_MULQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4); + + /* kx = 4 */ + /* Extracting Scalars for QMULs */ + qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), 3); + qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), 3); + qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData3)), 3); + qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData4)), 3); + + /* 4 Aligned Vector Loads of coefficients */ + IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1); + IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1); + IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1); + IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch2 - 3 * coeffPitch1); + + IVP_MULQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1); + IVP_MULQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2); + IVP_MULQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, 
dvecCoeff2, dvecCoeff1, qmulScalar3); + IVP_MULQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4); + + /* kx = 5 */ + /* Extracting Scalars for QMULs */ + qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), 0); + qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), 4); + qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData4)), 0); + qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData4)), 4); + + /* 4 Aligned Vector Loads of coefficients */ + IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1); + IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1); + IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1); + IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch2 - 3 * coeffPitch1); + + IVP_MULQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1); + IVP_MULQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2); + IVP_MULQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3); + IVP_MULQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4); + + /* kx = 6 */ + /* Extracting Scalars for QMULs */ + qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), 1); + qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), 5); + qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData4)), 1); + qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData4)), 5); + + /* 4 Aligned Vector Loads of coefficients */ + IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1); + IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1); + IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1); + IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch2 - 3 * coeffPitch1); + + IVP_MULQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1); + 
IVP_MULQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2); + IVP_MULQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3); + IVP_MULQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4); + + /* kx = 7 */ + /* Extracting Scalars for QMULs */ + qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), 2); + qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), 6); + qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData4)), 2); + qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData4)), 6); + + /* 4 Aligned Vector Loads of coefficients */ + IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1); + IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1); + IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1); + IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch1 - 6 * coeffPitch2); + + IVP_MULQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1); + IVP_MULQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2); + IVP_MULQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3); + IVP_MULQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4); + } /* End Input Channels */ + + /* Handling Corner cases of Number of Input Channels not being multiple of 4 */ + if (inCh < numInCh) + { + int32_t remInCh = numInCh - inCh; + vboolN vbRemInCh = IVP_LTNX16(IVP_ANDNX16(IVP_SEQNX16(), 3), remInCh); + /* Assign valid address for predicated false lines */ + vecGatherOff00 = IVP_MOVNX16UT(vecGatherOff1, 0, IVP_ANDBN(vbRemInCh, vb1)); + vecGatherOff01 = IVP_MOVNX16UT(vecGatherOff2, 0, IVP_ANDBN(vbRemInCh, vb2)); + vecGatherOff10 = IVP_MOVNX16UT(vecGatherOff1, 0, IVP_ANDBN(vbRemInCh, vb3)); + vecGatherOff11 = IVP_MOVNX16UT(vecGatherOff2, 0, IVP_ANDBN(vbRemInCh, vb4)); + + /* Gather Input Data */ + xb_vec2Nx8 
dvecData1 = 0; + xb_vec2Nx8 dvecData2 = 0; + xb_vec2Nx8 dvecData3 = 0; + xb_vec2Nx8 dvecData4 = 0; + xb_gsr gather1 = IVP_GATHERANX8S(pData1, vecGatherOff00); + dvecData1 = IVP_GATHERD2NX8_L(gather1); + xb_gsr gather2 = IVP_GATHERANX8S(pData1, vecGatherOff01); + dvecData2 = IVP_GATHERD2NX8_L(gather2); + xb_gsr gather3 = IVP_GATHERANX8S(pData2, vecGatherOff10); + dvecData3 = IVP_GATHERD2NX8_L(gather3); + xb_gsr gather4 = IVP_GATHERANX8S(pData2, vecGatherOff11); + dvecData4 = IVP_GATHERD2NX8_L(gather4); + + + /* kx = 1 */ + /* Extracting scalars for QMULs */ + int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData1)), 0); + int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData1)), 4); + int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData3)), 0); + int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData3)), 4); + + /* Aligned Vector Loads of coefficients */ + xb_vec2Nx8 dvecCoeff1; + IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1 * remCh1); + xb_vec2Nx8 dvecCoeff2; + IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1 * remCh2); + xb_vec2Nx8 dvecCoeff3; + IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch2 - coeffPitch1 * (remCh1 + remCh2)); + + IVP_MULQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1 & sumMask); + IVP_MULQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2 & sumMask); + IVP_MULQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3 & sumMask); + IVP_MULQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4 & sumMask); + + /* kx = 2 */ + /* Extracting scalars for QMULs */ + qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), 1); + qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), 5); + qmulScalar3 = 
IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData3)), 1); + qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData3)), 5); + + /* Aligned Vector Loads of coefficients */ + IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1 * remCh1); + IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1 * remCh2); + IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch2 - (coeffPitch1 * (remCh1 + remCh2))); + + IVP_MULQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1 & sumMask); + IVP_MULQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2 & sumMask); + IVP_MULQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3 & sumMask); + IVP_MULQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4 & sumMask); + + /* kx = 3 */ + /* Extracting scalars for QMULs */ + qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), 2); + qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), 6); + qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData3)), 2); + qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData3)), 6); + + /* Aligned Vector Loads of coefficients */ + IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1 * remCh1); + IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1 * remCh2); + IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch2 - coeffPitch1 * (remCh1 + remCh2)); + + IVP_MULQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1 & sumMask); + IVP_MULQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2 & sumMask); + IVP_MULQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3 & sumMask); + IVP_MULQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4 & sumMask); + + /* kx = 4 */ + /* Extracting scalars for QMULs */ + qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData1)), 3); + 
qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), 3); + qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData3)), 3); + qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData4)), 3); + + /* Aligned Vector Loads of coefficients */ + IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1 * remCh1); + IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1 * remCh2); + IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch2 - (coeffPitch1 * (remCh2 + remCh1))); + + IVP_MULQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1 & sumMask); + IVP_MULQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2 & sumMask); + IVP_MULQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3 & sumMask); + IVP_MULQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4 & sumMask); + + /* kx = 5 */ + /* Extracting scalars for QMULs */ + qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), 0); + qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), 4); + qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData4)), 0); + qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData4)), 4); + + /* Aligned Vector Loads of coefficients */ + IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1 * remCh1); + IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1 * remCh2); + IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch2 - (remCh1 + remCh2) * coeffPitch1); + + IVP_MULQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1 & sumMask); + IVP_MULQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2 & sumMask); + IVP_MULQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3 & sumMask); + IVP_MULQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4 & sumMask); + + /* kx = 6 */ + /* Extracting scalars for 
QMULs */ + qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), 1); + qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), 5); + qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData4)), 1); + qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData4)), 5); + + /* Aligned Vector Loads of coefficients */ + IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1 * remCh1); + IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1 * remCh2); + IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch2 - (remCh1 + remCh2) * coeffPitch1); + + IVP_MULQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1 & sumMask); + IVP_MULQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2 & sumMask); + IVP_MULQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3 & sumMask); + IVP_MULQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4 & sumMask); + + /* kx = 7 */ + /* Extracting scalars for QMULs */ + qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), 2); + qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData2)), 6); + qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData4)), 2); + qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecData4)), 6); + + /* Aligned Vector Loads of coefficients */ + IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1 * remCh1); + IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1 * remCh2); + IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, 0); + + IVP_MULQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1 & sumMask); + IVP_MULQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2 & sumMask); + IVP_MULQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3 & sumMask); + IVP_MULQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4 & 
sumMask); + } /* End Input Channels corner case handling */ + } /* End Kernel Height */ + + /* Pack, Output Scale, Output Shift and clamping */ + xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L; + xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H; +#ifdef DILATED_VQ_CONV + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); +#else + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); +#endif + /* Store the output dvecOut1 along the output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOut + outCh * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh); + IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Store the output dvecOut2 along the output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1) * bytesPerPixel * numX); + IVP_SAV2NX8_XP(dvecOut2L, vaOutData, 
pdvecOut, bytesPerPixel * remainingOutCh * numX); + IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numX); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Store the output dvecOut3 along the output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch2) * bytesPerPixel * numY); + IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numY); + IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numY); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Store the output dvecOut4 along the output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1 + outDataPitch2) * bytesPerPixel * numX * numY); + IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numX * numY); + IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numX * numY); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + } /* End image width */ + } /* End image height */ + } /* End Output Channels */ + return(XAI_ERROR_STATUS()); +} + +/***************************************************************************** +* xaiConvolvedVQ3D_S_MxN_S8S8IXCa2_MOD_DWH +* **************************************************************************/ + +/****************************************************************************/ +/* Description : P6 optimized generic implementation for MxN MOD_DWH */ +/* 3D convolution. Based on pre-processor specifiers. Code */ +/* implementation is generated during preprocessing stage. 
*/ +/* This method can be used to generate MxN MOD_DWH 3D */ +/* dilated convolution function and MxN MOD_DWH 3D VQ */ +/* dilated convolution function */ +/* Stride values = 1, 2 and 4 are supported */ +/* Implementation also supports dilation >= 1 for stride = 1 */ +/* and dilation = 1 for stride = 2, 4 */ +/* Inputs : Input Data Tile, Coeff Data Tile, Bias Array, */ +/* Output scale array, CNN convolution params structure */ +/* Outputs : XI Error Code */ +/* InOuts : Output Tile */ +/* Assumptions : InData, CoeffData are S8 */ +/* biasArray is signed 32b, value not exceeding signed 24b */ +/* Output scale array is U16 */ +/* OutData is S8 / U8 / S16 */ +/* Kernel Size is MxNxDxNk. M and N sizes are less than or */ +/* equal to 15. */ +/* Input and Output are in DWH format */ +/* Coeff is in NDWH format */ +/* CoeffDim1Pitch is aligned to 2N (Ca2) */ +/****************************************************************************/ +#ifdef DILATED_VQ_CONV +XAI_ERR_TYPE xaiConvolvedVQ3D_S_MxN_S8S8IXCa2_MOD_DWH( + const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param + ) +#else +XAI_ERR_TYPE xaiConvolved3D_S_MxN_S8S8IXCa2_MOD_DWH( + const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param + ) +#endif +{ + /* Error Checks */ + XAI_ERROR_CHECKS() + { + XAI_CHECK_TILE3D_S8(inTile); + XAI_CHECK_CONV_OUTPUT_TILE3D(outTile); + XAI_CHECK_TILE4D_S8(coeffTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile); + XAI_CHECK_TILE4D_IN_DRAM_BOUNDARY(coeffTile); + XAI_CHECK_POINTER(param); + XAI_CHECK_ARRAY_S32(biasArray); + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTile); + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(coeffTile, outTile); + XAI_CHECK_ERROR((XAI_TILE4D_GET_DIM3(coeffTile) <= 64) && (XAI_TILE4D_GET_DIM4(coeffTile) <= 
64), XAI_ERR_KSIZE, \ + "\nKernel height = %d, width = %d\nKernel width and height should be less than or equal to 64", \ + XAI_TILE4D_GET_DIM4(coeffTile), XAI_TILE4D_GET_DIM3(coeffTile)); + XAI_CHECK_EDGES_MOD_DWH(inTile, coeffTile, param); + XAI_CHECK_ERROR(((XAI_CNN_CONV_GET_STRIDEX(param) > 0) && (XAI_CNN_CONV_GET_STRIDEY(param) > 0)) && \ + ((XAI_CNN_CONV_GET_STRIDEX(param) <= 64) && (XAI_CNN_CONV_GET_STRIDEY(param) <= 64)), XAI_ERR_BADARG, \ + "\nStrideX = %hhu, StrideY = %hhu\nStride along width and height should be greater than 0 and less than or equal to 64", \ + XAI_CNN_CONV_GET_STRIDEX(param), XAI_CNN_CONV_GET_STRIDEY(param)); + XAI_CHECK_ERROR((XAI_CNN_CONV_GET_DILATIONX(param) > 0 && XAI_CNN_CONV_GET_DILATIONY(param) > 0), \ + XAI_ERR_BADARG, "dilation parameter has to be >= 1"); + XAI_CHECK_TILE4D_IALIGNMENT_2NX8(coeffTile); + XAI_CHECK_TILE3D_DATA_ORDER(inTile, XAI_DWH); + XAI_CHECK_TILE3D_DATA_ORDER(outTile, XAI_DWH); + XAI_CHECK_TILE4D_DATA_ORDER(coeffTile, XAI_NDWH); + XAI_CHECK_CONSISTENCY_MOD_DWH(inTile, coeffTile, biasArray, outTile, param); + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_ACCUM_SHIFT(param) < 24, \ + XAI_ERR_NORM, "\nThe accumulator shift = %hhu, value should be less than 24", \ + XAI_CNN_CONV_GET_ACCUM_SHIFT(param)); + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_OUTPUT_SHIFT(param) < 32, \ + XAI_ERR_NORM, "\nThe output shift = %hhu, value should be less than 32", \ + XAI_CNN_CONV_GET_OUTPUT_SHIFT(param)); + XAI_CHECK_CONV_RELU_LIMITS_IX(param, outTile); +#ifdef DILATED_VQ_CONV + XAI_CHECK_ARRAY_U16(outputScaleArray); + XAI_CHECK_ERROR(XAI_ARRAY_GET_WIDTH(outputScaleArray) >= XAI_TILE4D_GET_DIM1(coeffTile), XAI_ERR_DATASIZE, + "\nWidth of Output Scale Array = %d, Number of Kernels = %d\nWidth of Output Scale Array should be greater than or equal to Number of Kernels", \ + XAI_ARRAY_GET_WIDTH(outputScaleArray), XAI_TILE4D_GET_DIM1(coeffTile)); +#endif + } +#ifndef DILATED_VQ_CONV + if (XAI_CNN_CONV_GET_OUTPUT_SCALE(param) == 0) + { + int32_t fillValue; 
+ int32_t reluFlag = XAI_CNN_CONV_GET_FLAG_RELU(param); + fillValue = reluFlag ? (CLAMP(0, XAI_CNN_CONV_GET_RELU_MIN(param), XAI_CNN_CONV_GET_RELU_MAX(param))) : 0; + return(xaiFillTile3D(outTile, fillValue, 0)); + } +#endif + + /* Calling further optimized function if dim1Size == dim1Pitch */ + if (XAI_TILE3D_GET_DIM1(inTile) == XAI_TILE3D_GET_DIM1_PITCH(inTile) && \ + (XAI_CNN_CONV_GET_DILATIONX(param) == 1) && (XAI_CNN_CONV_GET_DILATIONY(param) == 1)) + { + if ((XAI_TILE3D_GET_DIM1(inTile) * XAI_TILE4D_GET_DIM3(coeffTile)) % 4 == 0) + { +#ifdef DILATED_VQ_CONV + convolvedVQ3D_S_MxN_S8S8IXCa2_MOD_DWH_contiguous_depth_x4(inTile, coeffTile, biasArray, \ + outputScaleArray, outTile, param); +#else + convolved3D_S_MxN_S8S8IXCa2_MOD_DWH_contiguous_depth_x4(inTile, coeffTile, biasArray, \ + outTile, param); +#endif + } + else + { +#ifdef DILATED_VQ_CONV + convolvedVQ3D_S_MxN_S8S8IXCa2_MOD_DWH_contiguous_depth(inTile, \ + coeffTile, biasArray, outputScaleArray, outTile, param); +#else + convolved3D_S_MxN_S8S8IXCa2_MOD_DWH_contiguous_depth(inTile, \ + coeffTile, biasArray, outTile, param); +#endif + } + return(XAI_ERROR_STATUS()); + } + + /* Getting parameters from the tile structures */ + const int32_t outW = XAI_TILE3D_GET_DIM2(outTile); + const int32_t outH = XAI_TILE3D_GET_DIM3(outTile); + const int32_t numInCh = XAI_TILE3D_GET_DIM1(inTile); + const int32_t numOutCh = XAI_TILE3D_GET_DIM1(outTile); + const uint8_t dilationX = XAI_CNN_CONV_GET_DILATIONX(param); + const uint8_t dilationY = XAI_CNN_CONV_GET_DILATIONY(param); + + /* Kernel Size (NDWH) */ + const int32_t kWidthU = XAI_TILE4D_GET_DIM3(coeffTile); + const int32_t kHeightU = XAI_TILE4D_GET_DIM4(coeffTile); + int32_t dilatedkWidthU = dilationX * (kWidthU - 1) + 1; + int32_t dilatedkHeightU = dilationY * (kHeightU - 1) + 1; + + /* CNN convolution parameters */ + const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param); + const uint8_t outShiftU = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param); + const uint8_t 
enableReLu = XAI_CNN_CONV_GET_FLAG_RELU(param); + const uint8_t strideX = XAI_CNN_CONV_GET_STRIDEX(param); + const uint8_t strideY = XAI_CNN_CONV_GET_STRIDEY(param); + const uint8_t leftEdgeFlag = XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param); + const uint8_t topEdgeFlag = XAI_CNN_CONV_GET_FLAG_TOPEDGE(param); + + /* Data Pointers of input, output, coefficient and bias data */ + int8_t *pInData = (int8_t *) XAI_TILE3D_GET_DATA_PTR(inTile); + int8_t *pOutData = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile); + int8_t *pCoeffData = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile); + int32_t *pBiasData = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray); +#ifdef DILATED_VQ_CONV + xb_vecNx16U* restrict pOutScaleData = (xb_vecNx16U *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray); +#else + const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param); +#endif + /* Pitches of Coefficient Data (NDWH) in dim1, dim2 and dim3 */ + const int32_t coeffPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTile); + const int32_t coeffPitch2 = XAI_TILE4D_GET_DIM2_PITCH(coeffTile); + const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile); + + /* Pitches of Input Data (DWH) in dim1 and dim2 */ + const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile); + const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile); + + /* Pitch of Output Data (DWH) in dim1 and dim2 */ + const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile); + const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile); + + int32_t leftEdge, topEdge; + if ((dilatedkWidthU % 2) != 0) + { + leftEdge = dilatedkWidthU / 2; + } + else + { + leftEdge = leftEdgeFlag ? (dilatedkWidthU / 2) : ((dilatedkWidthU / 2) - 1); + } + + if ((dilatedkHeightU % 2) != 0) + { + topEdge = dilatedkHeightU / 2; + } + else + { + topEdge = topEdgeFlag ? 
(dilatedkHeightU / 2) : ((dilatedkHeightU / 2) - 1); + } + + + /* Move pointer to the start of the data (including edge) */ + pInData = &pInData[-((leftEdge) * inDataPitch1 + (topEdge) * inDataPitch2)]; + + /* Setting the limits for output data according to ReLu Flag and outTileType */ + int32_t minLim, maxLim; + if (enableReLu) + { + minLim = XAI_CNN_CONV_GET_RELU_MIN(param); + maxLim = XAI_CNN_CONV_GET_RELU_MAX(param); + } + else + { + minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \ + SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0); + maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \ + : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX); + } + const int8_t typeFlag = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0; + const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile); + + /* Variable Declarations */ + int32_t inCh, outCh, x, y, k; + valign vaOutData = IVP_ZALIGN(); + + xb_vecN_2x32v* restrict phvecBias; + xb_vec2Nx8* restrict pdvecCoeff; + xb_vec2Nx8* restrict pdvecData1; + xb_vec2Nx8* restrict pdvecData2; + xb_vec2Nx8* restrict pdvecData3; + xb_vec2Nx8* restrict pdvecData4; + xb_vec2Nx8* restrict pdvecOut; + + /* Loops Start */ + for (outCh = 0; outCh < numOutCh; outCh += 2 * XCHAL_IVPN_SIMD_WIDTH) + { /* walk across the kernels */ + /* To handle corner case when number of output channels + * is not a multiple of 2 * XCHAL_IVPN_SIMD_WIDTH*/ + int32_t remainingOutCh = numOutCh - outCh; +#ifdef DILATED_VQ_CONV + xb_vecNx16U outScaleDataEven, outScaleDataOdd; + /*Load output scale values*/ + VQ_INIT_OUTSCALE(pOutScaleData, remainingOutCh, outScaleDataEven, outScaleDataOdd); +#endif + for (y = 0; y < outH; y += 2) /* Image Height */ + { /* walk down the rows */ + /* Variable to handle corner case when height is odd */ + int32_t numY = XT_MIN(1, outH - y - 1); + for (x = 0; x < outW; x += 2) /* Image Width */ + { /* walk across the columns */ + /* Variable to handle corner case when width is odd */ 
+ int32_t numX = XT_MIN(1, outW - x - 1); + + /* Output Data pointer */ + int8_t *pOut = pOutData + (x * outDataPitch1 + y * outDataPitch2) * bytesPerPixel; + + /* Initialize accumulators with bias values */ + xb_vec2Nx24 daccSum1, daccSum2, daccSum3, daccSum4; + phvecBias = (xb_vecN_2x32v *) (pBiasData + outCh); + ACC_INIT_BIAS(phvecBias, remainingOutCh, daccSum1, daccSum2, daccSum3, daccSum4); + + /* Input Data and Coeff Data Pointers */ + int8_t *pData = pInData + x * strideX * inDataPitch1 + y * strideY * inDataPitch2; + int8_t *pCoeff = pCoeffData + outCh; + + xb_vecN_2x32v hvecInAddrOff = 0; + xb_vecN_2x32v hvecCoeffAddrOff = 0; + xb_vecN_2x32v hvecLaneIdx = 0; + int32_t inAddrOff, coeffAddrOff; + + for (k = 0; k < kHeightU * kWidthU; k++) /* Kernel Height * Kernel Width */ + { + /* Condition checks performed to get the Input and Coefficient */ + /* Pointer Offsets after combining the Kernel Width and Height Loops */ + vboolN_2 vbN_2 = IVP_EQN_2X32(hvecLaneIdx, kWidthU); + /* hvecLaneIdx will be reset to zero after every kWidth */ + hvecLaneIdx = IVP_MOVN_2X32T(0, hvecLaneIdx, vbN_2); + /* InPitch added after every kWidth */ + IVP_ADDN_2X32T(hvecInAddrOff, hvecInAddrOff, inDataPitch2 * dilationY - kWidthU * inDataPitch1 * dilationX, vbN_2); + /* CoeffPitch added after every kWidth */ + IVP_ADDN_2X32T(hvecCoeffAddrOff, hvecCoeffAddrOff, coeffPitch3 - kWidthU * coeffPitch2, vbN_2); + /* Extracting Input and Coefficient address offsets */ + inAddrOff = IVP_EXTRN_2X32(hvecInAddrOff, 0); + coeffAddrOff = IVP_EXTRN_2X32(hvecCoeffAddrOff, 0); + hvecLaneIdx = IVP_ADDN_2X32(hvecLaneIdx, 1); + hvecCoeffAddrOff = IVP_ADDN_2X32(hvecCoeffAddrOff, coeffPitch2); + hvecInAddrOff = IVP_ADDN_2X32(hvecInAddrOff, inDataPitch1 * dilationX); + + /* Pointers for Input Data Loads */ + pdvecData1 = (xb_vec2Nx8 *) (pData + inAddrOff); + pdvecData2 = (xb_vec2Nx8 *) (pData + inAddrOff + strideX * inDataPitch1 * numX); + pdvecData3 = (xb_vec2Nx8 *) (pData + inAddrOff + strideY * 
inDataPitch2 * numY); + pdvecData4 = (xb_vec2Nx8 *) (pData + inAddrOff + strideX * inDataPitch1 * numX + strideY * inDataPitch2 * numY); + + /* Pointer for Coefficient Load */ + pdvecCoeff = (xb_vec2Nx8 *) (pCoeff + coeffAddrOff); + + /* Primes registers for Aligning Load */ + valign vaData1 = IVP_LA2NX8_PP(pdvecData1); + valign vaData2 = IVP_LA2NX8_PP(pdvecData2); + valign vaData3 = IVP_LA2NX8_PP(pdvecData3); + valign vaData4 = IVP_LA2NX8_PP(pdvecData4); + + for (inCh = 0; inCh < numInCh - 3; inCh += 4) /* Input Channels */ + { + /* Aligning variable vector load of pixels */ + xb_vec2Nx8 dvecData1; IVP_LAV2NX8_XP(dvecData1, vaData1, pdvecData1, 4); + xb_vec2Nx8 dvecData2; IVP_LAV2NX8_XP(dvecData2, vaData2, pdvecData2, 4); + xb_vec2Nx8 dvecData3; IVP_LAV2NX8_XP(dvecData3, vaData3, pdvecData3, 4); + xb_vec2Nx8 dvecData4; IVP_LAV2NX8_XP(dvecData4, vaData4, pdvecData4, 4); + + /* Extracting first 4 bytes of vector into address register */ + /* Scalar integers to be used for QMUL */ + int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData1)), 0); + int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData2)), 0); + int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData3)), 0); + int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData4)), 0); + + /* Aligned Vector Loads of coefficients */ + xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1); + xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1); + xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1); + xb_vec2Nx8 dvecCoeff4; IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch1); + + IVP_MULQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1); + IVP_MULQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2); + IVP_MULQA2N8XR8(daccSum3, dvecCoeff4, 
dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3); + IVP_MULQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4); + } /* End Input Channels */ + + /* Corner Case Handling if number of input channels not multiple of 4 */ + if (inCh < numInCh) + { + int32_t remInCh = numInCh - inCh; + vaData1 = IVP_LA2NX8_PP(pdvecData1); + + /* Aligning variable vector load of pixels */ + xb_vec2Nx8 dvecData1; IVP_LAV2NX8_XP(dvecData1, vaData1, pdvecData1, remInCh); + xb_vec2Nx8 dvecData2; IVP_LAV2NX8_XP(dvecData2, vaData2, pdvecData2, remInCh); + xb_vec2Nx8 dvecData3; IVP_LAV2NX8_XP(dvecData3, vaData3, pdvecData3, remInCh); + xb_vec2Nx8 dvecData4; IVP_LAV2NX8_XP(dvecData4, vaData4, pdvecData4, remInCh); + + /* Extracting first 4 bytes of vector into address register */ + /* Scalar integers to be used for QMUL */ + int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData1)), 0); + int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData2)), 0); + int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData3)), 0); + int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData4)), 0); + + /* For conditional coefficient loads */ + int32_t enable2 = XT_SALT(1, remInCh); /* Will be 1 if remInCh > 1 */ + int32_t enable3 = XT_SALT(2, remInCh); /* Will be 1 if remInCh > 2 */ + + /* Coefficient Loads */ + xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1 * enable2); + xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1 * enable3); + xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1); + + IVP_MULQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1); + IVP_MULQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2); + IVP_MULQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3); + IVP_MULQA2N8XR8(daccSum4, 0, 
dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4); + } /* End Corner case handling */ + } /* End Kernel Height * Width */ + + /* Pack, Output Scale, Output Shift and clamping */ + xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L; + xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H; +#ifdef DILATED_VQ_CONV + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); +#else + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); +#endif + /* Store the output dvecOut1 along the output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOut + outCh * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh); + IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Store the output dvecOut2 along the output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1) * bytesPerPixel * numX); + 
IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numX); + IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numX); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Store the output dvecOut3 along the output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch2) * bytesPerPixel * numY); + IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numY); + IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numY); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Store the output dvecOut4 along the output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1 + outDataPitch2) * bytesPerPixel * numX * numY); + IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numX * numY); + IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numX * numY); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + } /* End image width */ + } /* End image height */ + } /* End Output Channels */ + return(XAI_ERROR_STATUS()); +} + +/****************************************************************************/ +/* Description : further optimized function if dim1Size == dim1Pitch */ +/* of 3D convolution for handling */ +/* cases where kwidth * numInch is a multiple of 4 */ /* Loop order: output rows (y), then blocks of up to 2 * XCHAL_IVPN_SIMD_WIDTH output channels, then 4 output columns per pass; a tail block handles the columns left over when outW is not a multiple of 4. The kernel-width and input-channel loops are fused into a single loop of numIter = kWidthU * numInCh steps taken 4 at a time with no remainder iteration. When DILATED_VQ_CONV is defined the VQ variant is built: it takes outputScaleArray and loads output scales from it starting at index outCh; otherwise one output scale is read from param. Results are written through outTile; the function returns nothing. */ +/****************************************************************************/ +#ifdef DILATED_VQ_CONV +static _XAI_INLINE_ void convolvedVQ3D_S_MxN_S8S8IXCa2_noUnrollH_MOD_DWH_contiguous_depth_x4( + const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param + ) +#else +static _XAI_INLINE_ void convolved3D_S_MxN_S8S8IXCa2_noUnrollH_MOD_DWH_contiguous_depth_x4( + const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray
biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param + ) +#endif +{ + /* Getting parameters from the tile structures */ + const int32_t outW = XAI_TILE3D_GET_DIM2(outTile); + const int32_t outH = XAI_TILE3D_GET_DIM3(outTile); + const int32_t numInCh = XAI_TILE3D_GET_DIM1(inTile); + const int32_t numOutCh = XAI_TILE3D_GET_DIM1(outTile); + + /* Kernel Size (NDWH) */ + const int32_t kWidthU = XAI_TILE4D_GET_DIM3(coeffTile); + const int32_t kHeightU = XAI_TILE4D_GET_DIM4(coeffTile); + + /* CNN convolution parameters */ + const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param); + const uint8_t outShiftU = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param); + const uint8_t enableReLu = XAI_CNN_CONV_GET_FLAG_RELU(param); + const uint8_t strideX = XAI_CNN_CONV_GET_STRIDEX(param); + const uint8_t strideY = XAI_CNN_CONV_GET_STRIDEY(param); + const uint8_t leftEdgeFlag = XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param); + const uint8_t topEdgeFlag = XAI_CNN_CONV_GET_FLAG_TOPEDGE(param); + + /* Data Pointers of input, output, coefficient and bias data */ + int8_t *pInData = (int8_t *) XAI_TILE3D_GET_DATA_PTR(inTile); + int8_t *pOutData = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile); + int8_t *pCoeffData = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile); + int32_t *pBiasData = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray); +#ifdef DILATED_VQ_CONV + uint16_t *pScale = (uint16_t *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray); + xb_vecNx16U* restrict pOutScaleData; +#else + const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param); +#endif + /* Pitches of Coefficient Data (NDWH) in dim1 and dim3 (the dim2 pitch is not read in this function) */ + const int32_t coeffPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTile); + const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile); + + /* Pitches of Input Data (DWH) in dim1 and dim2 */ + const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile); + const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile); + + /* Pitch of Output Data (DWH) in dim1 and dim2 */ +
const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile); + const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile); + + int32_t leftEdge, topEdge; + /* Edge size per axis: k / 2 for an odd kernel size; for an even kernel size it is k / 2 when the matching edge flag is set and k / 2 - 1 otherwise */ + if ((kWidthU % 2) != 0) + { + leftEdge = kWidthU / 2; + } + else + { + leftEdge = leftEdgeFlag ? (kWidthU / 2) : ((kWidthU / 2) - 1); + } + + if ((kHeightU % 2) != 0) + { + topEdge = kHeightU / 2; + } + else + { + topEdge = topEdgeFlag ? (kHeightU / 2) : ((kHeightU / 2) - 1); + } + + /* Move pointer to the start of the data (including edge) */ /* i.e. back by leftEdge columns and topEdge rows from the tile data pointer */ + pInData = &pInData[-((leftEdge) * inDataPitch1 + (topEdge) * inDataPitch2)]; + + + /* Setting the limits for output data according to ReLu Flag and outTileType */ /* With ReLU the limits come from param; without it they are the SHRT range for XAI_S16, the SCHAR range for XAI_S8 and 0..UCHAR_MAX for any other output type */ + int32_t minLim, maxLim; + if (enableReLu) + { + minLim = XAI_CNN_CONV_GET_RELU_MIN(param); + maxLim = XAI_CNN_CONV_GET_RELU_MAX(param); + } + else + { + minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \ + SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0); + maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \ + : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX); + } + const int8_t typeFlag = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0; /* 1 for an XAI_S16 output tile, 0 otherwise; also scales the byte count of the high-half stores below */ + const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile); + + /* Variable Declarations */ + int32_t outCh, x, y, ky, k; + valign vaOutData = IVP_ZALIGN(); + int32_t numIter = kWidthU * numInCh; /* trip count of the fused (kernel width x input channel) loop */ + + xb_vecN_2x32v* restrict phvecBias; + xb_vec2Nx8* restrict pdvecCoeff; + xb_vec2Nx8* restrict pdvecData1; + xb_vec2Nx8* restrict pdvecData2; + xb_vec2Nx8* restrict pdvecData3; + xb_vec2Nx8* restrict pdvecData4; + xb_vec2Nx8* restrict pdvecOut; + + /* + * inCh and kWidth loops are combined. It is assumed that the + * edges along the depth dimension of the input data are zero and also that the + * edges along the depth dimension of the coefficient data are zero.
+ */ + + /* Loops Start */ + for (y = 0; y < outH; y++) + { + for (outCh = 0; outCh < numOutCh; outCh += 2 * XCHAL_IVPN_SIMD_WIDTH) + { /* walk across the kernels */ + /* To handle corner case when number of output channels + * is not a multiple of 2 * XCHAL_IVPN_SIMD_WIDTH*/ + int32_t remainingOutCh = numOutCh - outCh; +#ifdef DILATED_VQ_CONV + xb_vecNx16U outScaleDataEven, outScaleDataOdd; + /*Load output scale values*/ + pOutScaleData = (xb_vecNx16U *) (pScale + outCh); + VQ_INIT_OUTSCALE(pOutScaleData, remainingOutCh, outScaleDataEven, outScaleDataOdd); +#endif + for (x = 0; x < (outW - 3); x += 4) /* Image Width */ /* 4 output columns per pass; leftover columns are handled after this loop */ + { /* walk across the columns */ + /* Output Data pointer */ + int8_t *pOut = pOutData + (x * outDataPitch1 + y * outDataPitch2) * bytesPerPixel; + + /* Initialize accumulators with bias values */ + xb_vec2Nx24 daccSum1, daccSum2, daccSum3, daccSum4; + phvecBias = (xb_vecN_2x32v *) (pBiasData + outCh); + ACC_INIT_BIAS(phvecBias, remainingOutCh, daccSum1, daccSum2, daccSum3, daccSum4); + + /* Input Data and Coeff Data Pointers */ + int8_t *pData = pInData + x * strideX * inDataPitch1 + y * strideY * inDataPitch2; + int8_t *pCoeff = pCoeffData + outCh; + +#ifdef __XCC__ +#pragma loop_count min=1 +#endif + for (ky = 0; ky < kHeightU; ky++) /* Kernel Height */ + { + /* Pointers for Input Data Loads */ /* one pointer per output column; consecutive columns start strideX * inDataPitch1 bytes apart in kernel row ky */ + pdvecData1 = (xb_vec2Nx8 *) (pData + ky * inDataPitch2); + pdvecData2 = (xb_vec2Nx8 *) (pData + ky * inDataPitch2 + strideX * inDataPitch1); + pdvecData3 = (xb_vec2Nx8 *) (pData + ky * inDataPitch2 + 2 * strideX * inDataPitch1); + pdvecData4 = (xb_vec2Nx8 *) (pData + ky * inDataPitch2 + 3 * strideX * inDataPitch1); + + /* Pointer for Coefficient Load */ + pdvecCoeff = (xb_vec2Nx8 *) (pCoeff + ky * coeffPitch3); + + /* Primes for Aligning Load */ + valign vaData1 = IVP_LA2NX8_PP(pdvecData1); + valign vaData2 = IVP_LA2NX8_PP(pdvecData2); + valign vaData3 = IVP_LA2NX8_PP(pdvecData3); + valign vaData4 = IVP_LA2NX8_PP(pdvecData4); + +#ifdef __XCC__ +#pragma
loop_count min=1 +#endif + for (k = 0; k < numIter; k += 4) /* (Input Channels * kWidth) loops combined */ /* 4 steps per pass and no tail iteration: relies on numIter being a multiple of 4 */ + { + /* Aligning variable vector load of pixels */ + xb_vec2Nx8 dvecData1; IVP_LAV2NX8_XP(dvecData1, vaData1, pdvecData1, 4); + xb_vec2Nx8 dvecData2; IVP_LAV2NX8_XP(dvecData2, vaData2, pdvecData2, 4); + xb_vec2Nx8 dvecData3; IVP_LAV2NX8_XP(dvecData3, vaData3, pdvecData3, 4); + xb_vec2Nx8 dvecData4; IVP_LAV2NX8_XP(dvecData4, vaData4, pdvecData4, 4); + + /* Extracting first 4 bytes of vector into address register */ + /* Scalar integers to be used for QMUL */ + int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData1)), 0); + int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData2)), 0); + int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData3)), 0); + int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData4)), 0); + + /* Aligned Vector Loads of coefficients */ + xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1); + xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1); + xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1); + xb_vec2Nx8 dvecCoeff4; IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch1); + /* The same four coefficient vectors are applied to each column's packed 4-byte input scalar. NOTE(review): presumably each call accumulates four coefficient-by-input-byte products per lane; confirm against the ISA reference */ + IVP_MULQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1); + IVP_MULQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2); + IVP_MULQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3); + IVP_MULQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4); + } /* End Input Channels */ + } /* End Kernel Height * Width */ + + /* Pack, Output Scale, Output Shift and clamping */ + xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L; + xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H; +#ifdef DILATED_VQ_CONV +
PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); +#else + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); +#endif + /* Store the output dvecOut1 along the output depth */ /* The high-half store's byte count is multiplied by typeFlag, so it is 0 unless the output tile is XAI_S16. NOTE(review): presumably a non-positive byte count stores nothing; confirm */ + pdvecOut = (xb_vec2Nx8 *) (pOut + outCh * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh); + IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Store the output dvecOut2 along the output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1) * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh); + IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Store the output dvecOut3 along the output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh +
2 * outDataPitch1) * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh); + IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Store the output dvecOut4 along the output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + 3 * outDataPitch1) * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh); + IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + } /* End image width */ + if (x < outW) /* Tail: output columns left over after the 4-wide loop; at most three columns are computed here */ + { + int32_t enable2ndWidth = XT_SALT(1, outW - x); /* NOTE(review): presumably 1 when more than 1 column remains, else 0; confirm XT_SALT semantics */ + int32_t enable3rdWidth = XT_SALT(2, outW - x); /* NOTE(review): presumably 1 when more than 2 columns remain, else 0 */ + /* Output Data pointer */ + int8_t *pOut = pOutData + (x * outDataPitch1 + y * outDataPitch2) * bytesPerPixel; + + /* Initialize accumulators with bias values */ /* daccSum4 is passed to ACC_INIT_BIAS but is not otherwise used in this tail block */ + xb_vec2Nx24 daccSum1, daccSum2, daccSum3, daccSum4; + phvecBias = (xb_vecN_2x32v *) (pBiasData + outCh); + ACC_INIT_BIAS(phvecBias, remainingOutCh, daccSum1, daccSum2, daccSum3, daccSum4); + + /* Input Data and Coeff Data Pointers */ + int8_t *pData = pInData + x * strideX * inDataPitch1 + y * strideY * inDataPitch2; + int8_t *pCoeff = pCoeffData + outCh; + +#ifdef __XCC__ +#pragma loop_count min=1 +#endif + for (ky = 0; ky < kHeightU; ky++) /* Kernel Height */ + { + /* Pointers for Input Data Loads */ /* A column's offset is multiplied by its enable flag, so when the flag is 0 its pointer equals pdvecData1 */ + pdvecData1 = (xb_vec2Nx8 *) (pData + ky * inDataPitch2); + pdvecData2 = (xb_vec2Nx8 *) (pData + ky * inDataPitch2 + strideX * inDataPitch1 * enable2ndWidth); + pdvecData3 = (xb_vec2Nx8 *) (pData + ky * inDataPitch2 + 2 * strideX * inDataPitch1 * enable3rdWidth); + + /* Pointer for Coefficient Load */ + pdvecCoeff = (xb_vec2Nx8 *) (pCoeff + ky * coeffPitch3); + + /* Primes for Aligning Load */ + valign vaData1 = IVP_LA2NX8_PP(pdvecData1); + valign vaData2 = IVP_LA2NX8_PP(pdvecData2); + valign vaData3 = IVP_LA2NX8_PP(pdvecData3);
+ +#ifdef __XCC__ +#pragma loop_count min=1 +#endif + for (k = 0; k < numIter; k += 4) /* (Input Channels * kWidth) loops combined */ /* same 4-step loop as the main path, again with no tail iteration */ + { + /* Aligning variable vector load of pixels */ + xb_vec2Nx8 dvecData1; IVP_LAV2NX8_XP(dvecData1, vaData1, pdvecData1, 4); + xb_vec2Nx8 dvecData2; IVP_LAV2NX8_XP(dvecData2, vaData2, pdvecData2, 4); + xb_vec2Nx8 dvecData3; IVP_LAV2NX8_XP(dvecData3, vaData3, pdvecData3, 4); + + /* Extracting first 4 bytes of vector into address register */ + /* Scalar integers to be used for QMUL */ + int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData1)), 0); + int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData2)), 0); + int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData3)), 0); + + /* Aligned Vector Loads of coefficients */ + xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1); + xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1); + xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1); + xb_vec2Nx8 dvecCoeff4; IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch1); + /* Only three column accumulators are updated in the tail */ + IVP_MULQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1); + IVP_MULQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2); + IVP_MULQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3); + } /* End Input Channels */ + } /* End Kernel Height * Width */ + + /* Pack, Output Scale, Output Shift and clamping */ + xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L; + xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H; +#ifdef DILATED_VQ_CONV + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim,
maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); +#else + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); +#endif + /* Store the output dvecOut1 along the output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOut + outCh * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh); + IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Store the output dvecOut2 along the output depth */ /* Both byte counts are multiplied by enable2ndWidth, so they are 0 when that flag is 0 */ + pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1) * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * enable2ndWidth); + IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * enable2ndWidth); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Store the output dvecOut3 along the output depth */ /* Both byte counts are multiplied by enable3rdWidth, so they are 0 when that flag is 0 */ + pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + 2 * outDataPitch1) * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * enable3rdWidth); + IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * enable3rdWidth); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + } + } /* End Output Channels */ + } +} + +/****************************************************************************/ +/* Description : further optimized function if dim1Size == dim1Pitch */ +/* of 3D convolution */
+/****************************************************************************/ +#ifdef DILATED_VQ_CONV +static _XAI_INLINE_ void convolvedVQ3D_S_MxN_S8S8IXCa2_noUnrollH_MOD_DWH_contiguous_depth( + const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param + ) +#else +static _XAI_INLINE_ void convolved3D_S_MxN_S8S8IXCa2_noUnrollH_MOD_DWH_contiguous_depth( + const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param + ) +#endif +{ + /* Getting parameters from the tile structures */ + const int32_t outW = XAI_TILE3D_GET_DIM2(outTile); + const int32_t outH = XAI_TILE3D_GET_DIM3(outTile); + const int32_t numInCh = XAI_TILE3D_GET_DIM1(inTile); + const int32_t numOutCh = XAI_TILE3D_GET_DIM1(outTile); + + /* Kernel Size (NDWH) */ + const int32_t kWidthU = XAI_TILE4D_GET_DIM3(coeffTile); + const int32_t kHeightU = XAI_TILE4D_GET_DIM4(coeffTile); + + /* CNN convolution parameters */ + const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param); + const uint8_t outShiftU = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param); + const uint8_t enableReLu = XAI_CNN_CONV_GET_FLAG_RELU(param); + const uint8_t strideX = XAI_CNN_CONV_GET_STRIDEX(param); + const uint8_t strideY = XAI_CNN_CONV_GET_STRIDEY(param); + const uint8_t leftEdgeFlag = XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param); + const uint8_t topEdgeFlag = XAI_CNN_CONV_GET_FLAG_TOPEDGE(param); + + /* Data Pointers of input, output, coefficient and bias data */ + int8_t *pInData = (int8_t *) XAI_TILE3D_GET_DATA_PTR(inTile); + int8_t *pOutData = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile); + int8_t *pCoeffData = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile); + int32_t *pBiasData = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray); +#ifdef DILATED_VQ_CONV + uint16_t *pScale = (uint16_t *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray); + 
xb_vecNx16U* restrict pOutScaleData; +#else + const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param); +#endif + /* Pitches of Coefficient Data (NDWH) in dim1, dim2 and dim3 */ + const int32_t coeffPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTile); + const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile); + + /* Pitches of Input Data (DWH) in dim1 and dim2 */ + const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile); + const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile); + /* Pitch of Output Data (DWH) in dim1 and dim2 */ + const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile); + const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile); + + int32_t numIter = kWidthU * numInCh; + + int32_t leftEdge, topEdge; + if ((kWidthU % 2) != 0) + { + leftEdge = kWidthU / 2; + } + else + { + leftEdge = leftEdgeFlag ? (kWidthU / 2) : ((kWidthU / 2) - 1); + } + + if ((kHeightU % 2) != 0) + { + topEdge = kHeightU / 2; + } + else + { + topEdge = topEdgeFlag ? (kHeightU / 2) : ((kHeightU / 2) - 1); + } + + /* Move pointer to the start of the data (including edge) */ + pInData = &pInData[-((leftEdge) * inDataPitch1 + (topEdge) * inDataPitch2)]; + + /* Setting the limits for output data according to ReLu Flag and outTileType */ + int32_t minLim, maxLim; + if (enableReLu) + { + minLim = XAI_CNN_CONV_GET_RELU_MIN(param); + maxLim = XAI_CNN_CONV_GET_RELU_MAX(param); + } + else + { + minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \ + SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0); + maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \ + : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX); + } + const int8_t typeFlag = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 
1 : 0; + const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile); + + /* Variable Declarations */ + int32_t outCh, x, y, ky, k; + valign vaOutData = IVP_ZALIGN(); + + xb_vecN_2x32v* restrict phvecBias; + xb_vec2Nx8* restrict pdvecCoeff; + xb_vec2Nx8* restrict pdvecData1; + xb_vec2Nx8* restrict pdvecData2; + xb_vec2Nx8* restrict pdvecData3; + xb_vec2Nx8* restrict pdvecData4; + xb_vec2Nx8* restrict pdvecOut; + + /* + * inCh and kWidth loops are combined. Assumed that the + * edges along Depth dimension of input data is zero and also + * edges along depth dimension of coefficient data is zero. + */ + + /* Loops Start */ + for (y = 0; y < outH; y++) + { + for (outCh = 0; outCh < numOutCh; outCh += 2 * XCHAL_IVPN_SIMD_WIDTH) + { /* walk across the kernels */ + /* To handle corner case when number of output channels + * is not a multiple of 2 * XCHAL_IVPN_SIMD_WIDTH*/ + int32_t remainingOutCh = numOutCh - outCh; +#ifdef DILATED_VQ_CONV + xb_vecNx16U outScaleDataEven, outScaleDataOdd; + /*Load output scale values*/ + pOutScaleData = (xb_vecNx16U *) (pScale + outCh); + VQ_INIT_OUTSCALE(pOutScaleData, remainingOutCh, outScaleDataEven, outScaleDataOdd); +#endif + for (x = 0; x < outW - 3; x += 4) /* Image Width */ + { /* walk across the columns */ + /* Output Data pointer */ + int8_t *pOut = pOutData + (x * outDataPitch1 + y * outDataPitch2) * bytesPerPixel; + + /* Initialize accumulators with bias values */ + xb_vec2Nx24 daccSum1, daccSum2, daccSum3, daccSum4; + phvecBias = (xb_vecN_2x32v *) (pBiasData + outCh); + ACC_INIT_BIAS(phvecBias, remainingOutCh, daccSum1, daccSum2, daccSum3, daccSum4); + + /* Input Data and Coeff Data Pointers */ + int8_t *pData = pInData + x * strideX * inDataPitch1 + y * strideY * inDataPitch2; + int8_t *pCoeff = pCoeffData + outCh; + +#ifdef __XCC__ +#pragma loop_count min=1 +#endif + for (ky = 0; ky < kHeightU; ky++) /* Kernel Height */ + { + /* Pointers for Input Data Loads */ + pdvecData1 = (xb_vec2Nx8 *) (pData + ky * 
inDataPitch2); + pdvecData2 = (xb_vec2Nx8 *) (pData + ky * inDataPitch2 + strideX * inDataPitch1); + pdvecData3 = (xb_vec2Nx8 *) (pData + ky * inDataPitch2 + 2 * strideX * inDataPitch1); + pdvecData4 = (xb_vec2Nx8 *) (pData + ky * inDataPitch2 + 3 * strideX * inDataPitch1); + + /* Pointer for Coefficient Load */ + pdvecCoeff = (xb_vec2Nx8 *) (pCoeff + ky * coeffPitch3); + + /* Primes for Aligning Load */ + valign vaData1 = IVP_LA2NX8_PP(pdvecData1); + valign vaData2 = IVP_LA2NX8_PP(pdvecData2); + valign vaData3 = IVP_LA2NX8_PP(pdvecData3); + valign vaData4 = IVP_LA2NX8_PP(pdvecData4); + + for (k = 0; k < numIter - 3; k += 4) /* (Input Channels * kWidth) loops combined */ + { + /* Aligning variable vector load of pixels */ + xb_vec2Nx8 dvecData1; IVP_LAV2NX8_XP(dvecData1, vaData1, pdvecData1, 4); + xb_vec2Nx8 dvecData2; IVP_LAV2NX8_XP(dvecData2, vaData2, pdvecData2, 4); + xb_vec2Nx8 dvecData3; IVP_LAV2NX8_XP(dvecData3, vaData3, pdvecData3, 4); + xb_vec2Nx8 dvecData4; IVP_LAV2NX8_XP(dvecData4, vaData4, pdvecData4, 4); + + /* Extracting first 4 bytes of vector into address register */ + /* Scalar integers to be used for QMUL */ + int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData1)), 0); + int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData2)), 0); + int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData3)), 0); + int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData4)), 0); + + /* Aligned Vector Loads of coefficients */ + xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1); + xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1); + xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1); + xb_vec2Nx8 dvecCoeff4; IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch1); + + + IVP_MULQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, 
qmulScalar1); + IVP_MULQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2); + IVP_MULQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3); + IVP_MULQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4); + } /* End Input Channels */ + + /* Corner case handling as numIter is not a multiple of 4 */ + { + int32_t remInCh = numIter - k; + + /* Aligning variable vector load of pixels */ + xb_vec2Nx8 dvecData1; IVP_LAV2NX8_XP(dvecData1, vaData1, pdvecData1, remInCh); + xb_vec2Nx8 dvecData2; IVP_LAV2NX8_XP(dvecData2, vaData2, pdvecData2, remInCh); + xb_vec2Nx8 dvecData3; IVP_LAV2NX8_XP(dvecData3, vaData3, pdvecData3, remInCh); + xb_vec2Nx8 dvecData4; IVP_LAV2NX8_XP(dvecData4, vaData4, pdvecData4, remInCh); + + /* Extracting first 4 bytes of vector into address register */ + /* Scalar integers to be used for QMUL */ + int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData1)), 0); + int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData2)), 0); + int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData3)), 0); + int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData4)), 0); + /* For conditional coefficient loads */ + int32_t enable2 = XT_SALT(1, remInCh); /* Will be 1 if remInCh > 1 */ + int32_t enable3 = XT_SALT(2, remInCh); /* Will be 1 if remInCh > 2 */ + + /* Aligned Vector Loads of coefficients */ + xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1 * enable2); + xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1 * enable3); + xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1); + + + IVP_MULQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1); + IVP_MULQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2); + IVP_MULQA2N8XR8(daccSum3, 
0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3); + IVP_MULQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4); + } /* End Input Channels */ + } /* End Kernel Height * Width */ + + /* Pack, Output Scale, Output Shift and clamping */ + xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L; + xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H; +#ifdef DILATED_VQ_CONV + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); +#else + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); +#endif + /* Store the output dvecOut1 along the output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOut + outCh * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh); + IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Store the output dvecOut2 along the output depth */ + pdvecOut = 
(xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1) * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh); + IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Store the output dvecOut3 along the output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + 2 * outDataPitch1) * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh); + IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Store the output dvecOut4 along the output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + 3 * outDataPitch1) * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * \ + remainingOutCh); + IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + } /* End image width */ + if (x < outW) + { + int32_t enable2ndWidth = XT_SALT(1, outW - x); + int32_t enable3rdWidth = XT_SALT(2, outW - x); + /* Output Data pointer */ + int8_t *pOut = pOutData + (x * outDataPitch1 + y * outDataPitch2) * bytesPerPixel; + + /* Initialize accumulators with bias values */ + xb_vec2Nx24 daccSum1, daccSum2, daccSum3, daccSum4; + phvecBias = (xb_vecN_2x32v *) (pBiasData + outCh); + ACC_INIT_BIAS(phvecBias, remainingOutCh, daccSum1, daccSum2, daccSum3, daccSum4); + + /* Input Data and Coeff Data Pointers */ + int8_t *pData = pInData + x * strideX * inDataPitch1 + y * strideY * inDataPitch2; + int8_t *pCoeff = pCoeffData + outCh; + +#ifdef __XCC__ +#pragma loop_count min=1 +#endif + for (ky = 0; ky < kHeightU; ky++) /* Kernel Height */ + { + /* Pointers for Input Data Loads */ + pdvecData1 = (xb_vec2Nx8 *) (pData + ky * inDataPitch2); + pdvecData2 = (xb_vec2Nx8 *) (pData + ky * inDataPitch2 + strideX * 
inDataPitch1 * enable2ndWidth); + pdvecData3 = (xb_vec2Nx8 *) (pData + ky * inDataPitch2 + 2 * strideX * inDataPitch1 * enable3rdWidth); + + /* Pointer for Coefficient Load */ + pdvecCoeff = (xb_vec2Nx8 *) (pCoeff + ky * coeffPitch3); + + /* Primes for Aligning Load */ + valign vaData1 = IVP_LA2NX8_PP(pdvecData1); + valign vaData2 = IVP_LA2NX8_PP(pdvecData2); + valign vaData3 = IVP_LA2NX8_PP(pdvecData3); + + for (k = 0; k < numIter - 3; k += 4) /* (Input Channels * kWidth) loops combined */ + { + /* Aligning variable vector load of pixels */ + xb_vec2Nx8 dvecData1; IVP_LAV2NX8_XP(dvecData1, vaData1, pdvecData1, 4); + xb_vec2Nx8 dvecData2; IVP_LAV2NX8_XP(dvecData2, vaData2, pdvecData2, 4); + xb_vec2Nx8 dvecData3; IVP_LAV2NX8_XP(dvecData3, vaData3, pdvecData3, 4); + + /* Extracting first 4 bytes of vector into address register */ + /* Scalar integers to be used for QMUL */ + int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData1)), 0); + int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData2)), 0); + int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData3)), 0); + + /* Aligned Vector Loads of coefficients */ + xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1); + xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1); + xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1); + xb_vec2Nx8 dvecCoeff4; IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch1); + + + IVP_MULQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1); + IVP_MULQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2); + IVP_MULQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3); + } /* End Input Channels */ + + /* Corner case handling as numIter is not a multiple of 4 */ + { + int32_t remInCh = numIter - k; + + /* Aligning variable vector load of 
pixels */ + xb_vec2Nx8 dvecData1; IVP_LAV2NX8_XP(dvecData1, vaData1, pdvecData1, remInCh); + xb_vec2Nx8 dvecData2; IVP_LAV2NX8_XP(dvecData2, vaData2, pdvecData2, remInCh); + xb_vec2Nx8 dvecData3; IVP_LAV2NX8_XP(dvecData3, vaData3, pdvecData3, remInCh); + + /* Extracting first 4 bytes of vector into address register */ + /* Scalar integers to be used for QMUL */ + int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData1)), 0); + int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData2)), 0); + int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData3)), 0); + /* For conditional coefficient loads */ + int32_t enable2 = XT_SALT(1, remInCh); /* Will be 1 if remInCh > 1 */ + int32_t enable3 = XT_SALT(2, remInCh); /* Will be 1 if remInCh > 2 */ + + /* Aligned Vector Loads of coefficients */ + xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1 * enable2); + xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1 * enable3); + xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1); + + + IVP_MULQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1); + IVP_MULQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2); + IVP_MULQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3); + } /* End Input Channels */ + } /* End Kernel Height * Width */ + + /* Pack, Output Scale, Output Shift and clamping */ + xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L; + xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H; +#ifdef DILATED_VQ_CONV + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); + 
PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); +#else + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); +#endif + /* Store the output dvecOut1 along the output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOut + outCh * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh); + IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Store the output dvecOut2 along the output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1) * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * enable2ndWidth); + IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * enable2ndWidth); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Store the output dvecOut3 along the output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + 2 * outDataPitch1) * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * enable3rdWidth); + IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * enable3rdWidth); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + } + } /* End Output Channels */ + } +} + +/***************xaiConvolvedVQ3D_S_MxN_S8S8IXCa2_noUnrollH_MOD_DWH***********/ +/***************xaiConvolve3D_S_MxN_S8S8IXCa2_noUnrollH_MOD_DWH**************/ +/* Description : P6 optimized implementation for MxN MOD_DWH 3D 
convolution.*/ +/* with loop across outTile as outermost loop. For H=1 , The */ +/* outermost loop will be executed only once */ +/* Inputs : Input Data Tile, Coeff Data Tile, Bias Array, */ +/* Output scale array, CNN convolution params structure */ +/* Outputs : XI Error Code */ +/* InOuts : Output Tile */ +/* Assumptions : InData, CoeffData are S8 */ +/* biasArray is signed 32b, value not exceeding signed 24b */ +/* Output scale array is U16 */ +/* OutData is S8 / U8 / S16 */ +/* Kernel Size is MxNxDxNk. M and N sizes are less than or */ +/* equal to 15. */ +/* Input and Output are in DWH format */ +/* Coeff is in NDWH format */ +/* CoeffDim1Pitch is aligned to 2N (Ca2) */ +/****************************************************************************/ +#ifdef DILATED_VQ_CONV +XAI_ERR_TYPE xaiConvolvedVQ3D_S_MxN_S8S8IXCa2_noUnrollH_MOD_DWH( + const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param + ) +#else +XAI_ERR_TYPE xaiConvolved3D_S_MxN_S8S8IXCa2_noUnrollH_MOD_DWH( + const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param + ) +#endif +{ + /* Error Checks */ + XAI_ERROR_CHECKS() + { + XAI_CHECK_TILE3D_S8(inTile); + XAI_CHECK_CONV_OUTPUT_TILE3D(outTile); + XAI_CHECK_TILE4D_S8(coeffTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile); + XAI_CHECK_TILE4D_IN_DRAM_BOUNDARY(coeffTile); + XAI_CHECK_POINTER(param); + XAI_CHECK_ARRAY_S32(biasArray); + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTile); + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(coeffTile, outTile); + XAI_CHECK_ERROR((XAI_TILE4D_GET_DIM3(coeffTile) <= 64) && (XAI_TILE4D_GET_DIM4(coeffTile) <= 64), XAI_ERR_KSIZE, \ + "\nKernel height = %d, width = %d\nKernel width and height should be less than or equal to 64", \ + XAI_TILE4D_GET_DIM4(coeffTile), 
XAI_TILE4D_GET_DIM3(coeffTile)); + XAI_CHECK_EDGES_MOD_DWH(inTile, coeffTile, param); + XAI_CHECK_ERROR(((XAI_CNN_CONV_GET_STRIDEX(param) > 0) && (XAI_CNN_CONV_GET_STRIDEY(param) > 0)) && \ + ((XAI_CNN_CONV_GET_STRIDEX(param) <= 64) && (XAI_CNN_CONV_GET_STRIDEY(param) <= 64)), XAI_ERR_BADARG, \ + "\nStrideX = %hhu, StrideY = %hhu\nStride along width and height should be greater than 0 and less than or equal to 64", \ + XAI_CNN_CONV_GET_STRIDEX(param), XAI_CNN_CONV_GET_STRIDEY(param)); + XAI_CHECK_ERROR((XAI_CNN_CONV_GET_DILATIONX(param) > 0 && XAI_CNN_CONV_GET_DILATIONY(param) > 0), \ + XAI_ERR_BADARG, "dilation parameter has to be >= 1"); + XAI_CHECK_TILE4D_IALIGNMENT_2NX8(coeffTile); + XAI_CHECK_TILE3D_DATA_ORDER(inTile, XAI_DWH); + XAI_CHECK_TILE3D_DATA_ORDER(outTile, XAI_DWH); + XAI_CHECK_TILE4D_DATA_ORDER(coeffTile, XAI_NDWH); + XAI_CHECK_CONSISTENCY_MOD_DWH(inTile, coeffTile, biasArray, outTile, param); + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_ACCUM_SHIFT(param) < 24, \ + XAI_ERR_NORM, "\nThe accumulator shift = %hhu, value should be less than 24", \ + XAI_CNN_CONV_GET_ACCUM_SHIFT(param)); + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_OUTPUT_SHIFT(param) < 32, \ + XAI_ERR_NORM, "\nThe output shift = %hhu, value should be less than 32", \ + XAI_CNN_CONV_GET_OUTPUT_SHIFT(param)); + XAI_CHECK_CONV_RELU_LIMITS_IX(param, outTile); +#ifdef DILATED_VQ_CONV + XAI_CHECK_ARRAY_U16(outputScaleArray); + XAI_CHECK_ERROR(XAI_ARRAY_GET_WIDTH(outputScaleArray) >= XAI_TILE4D_GET_DIM1(coeffTile), XAI_ERR_DATASIZE, + "\nWidth of Output Scale Array = %d, Number of Kernels = %d\nWidth of Output Scale Array should be greater than or equal to Number of Kernels", \ + XAI_ARRAY_GET_WIDTH(outputScaleArray), XAI_TILE4D_GET_DIM1(coeffTile)); +#endif + } +#ifndef DILATED_VQ_CONV + if (XAI_CNN_CONV_GET_OUTPUT_SCALE(param) == 0) + { + int32_t fillValue; + int32_t reluFlag = XAI_CNN_CONV_GET_FLAG_RELU(param); + fillValue = reluFlag ? 
(CLAMP(0, XAI_CNN_CONV_GET_RELU_MIN(param), XAI_CNN_CONV_GET_RELU_MAX(param))) : 0; + return(xaiFillTile3D(outTile, fillValue, 0)); + } +#endif + /* Calling further optimized function if dim1Size == dim1Pitch */ + if (XAI_TILE3D_GET_DIM1(inTile) == XAI_TILE3D_GET_DIM1_PITCH(inTile) && \ + (XAI_CNN_CONV_GET_DILATIONX(param) == 1 && XAI_CNN_CONV_GET_DILATIONY(param) == 1)) + { + if ((XAI_TILE3D_GET_DIM1(inTile) * XAI_TILE4D_GET_DIM3(coeffTile)) % 4 == 0) + { +#ifdef DILATED_VQ_CONV + convolvedVQ3D_S_MxN_S8S8IXCa2_noUnrollH_MOD_DWH_contiguous_depth_x4(inTile, coeffTile, biasArray, \ + outputScaleArray, outTile, param); +#else + convolved3D_S_MxN_S8S8IXCa2_noUnrollH_MOD_DWH_contiguous_depth_x4(inTile, coeffTile, biasArray, \ + outTile, param); +#endif + } + else + { +#ifdef DILATED_VQ_CONV + convolvedVQ3D_S_MxN_S8S8IXCa2_noUnrollH_MOD_DWH_contiguous_depth(inTile, \ + coeffTile, biasArray, outputScaleArray, outTile, param); +#else + convolved3D_S_MxN_S8S8IXCa2_noUnrollH_MOD_DWH_contiguous_depth(inTile, \ + coeffTile, biasArray, outTile, param); +#endif + } + return(XAI_ERROR_STATUS()); + } + + /* Getting parameters from the tile structures */ + const int32_t outW = XAI_TILE3D_GET_DIM2(outTile); + const int32_t outH = XAI_TILE3D_GET_DIM3(outTile); + const int32_t numInCh = XAI_TILE3D_GET_DIM1(inTile); + const int32_t numOutCh = XAI_TILE3D_GET_DIM1(outTile); + const uint8_t dilationX = XAI_CNN_CONV_GET_DILATIONX(param); + const uint8_t dilationY = XAI_CNN_CONV_GET_DILATIONY(param); + + /* Kernel Size (NDWH) */ + const int32_t kWidthU = XAI_TILE4D_GET_DIM3(coeffTile); + const int32_t kHeightU = XAI_TILE4D_GET_DIM4(coeffTile); + int32_t dilatedkWidth = dilationX * (kWidthU - 1) + 1; + int32_t dilatedkHeight = dilationY * (kHeightU - 1) + 1; + + /* CNN convolution parameters */ + const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param); + const uint8_t outShiftU = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param); + const uint8_t enableReLu = XAI_CNN_CONV_GET_FLAG_RELU(param); + 
const uint8_t strideX = XAI_CNN_CONV_GET_STRIDEX(param); + const uint8_t strideY = XAI_CNN_CONV_GET_STRIDEY(param); + const uint8_t leftEdgeFlag = XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param); + const uint8_t topEdgeFlag = XAI_CNN_CONV_GET_FLAG_TOPEDGE(param); + + /* Data Pointers of input, output, coefficient and bias data */ + int8_t *pInData = (int8_t *) XAI_TILE3D_GET_DATA_PTR(inTile); + int8_t *pOutData = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile); + int8_t *pCoeffData = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile); + int32_t *pBiasData = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray); +#ifdef DILATED_VQ_CONV + uint16_t *pScale = (uint16_t *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray); + xb_vecNx16U* restrict pOutScaleData; +#else + const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param); +#endif + /* Pitches of Coefficient Data (NDWH) in dim1, dim2 and dim3 */ + const int32_t coeffPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTile); + const int32_t coeffPitch2 = XAI_TILE4D_GET_DIM2_PITCH(coeffTile); + const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile); + + /* Pitches of Input Data (DWH) in dim1 and dim2 */ + const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile); + const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile); + + /* Pitch of Output Data (DWH) in dim1 and dim2 */ + const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile); + const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile); + + int32_t leftEdge, topEdge; + if ((dilatedkWidth % 2) != 0) + { + leftEdge = dilatedkWidth / 2; + } + else + { + leftEdge = leftEdgeFlag ? (dilatedkWidth / 2) : ((dilatedkWidth / 2) - 1); + } + + if ((dilatedkHeight % 2) != 0) + { + topEdge = dilatedkHeight / 2; + } + else + { + topEdge = topEdgeFlag ? 
(dilatedkHeight / 2) : ((dilatedkHeight / 2) - 1); + } + + + /* Move pointer to the start of the data (including edge) */ + pInData = &pInData[-((leftEdge) * inDataPitch1 + (topEdge) * inDataPitch2)]; + + /* Setting the limits for output data according to ReLu Flag and outTileType */ + int32_t minLim, maxLim; + if (enableReLu) + { + minLim = XAI_CNN_CONV_GET_RELU_MIN(param); + maxLim = XAI_CNN_CONV_GET_RELU_MAX(param); + } + else + { + minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \ + SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0); + maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \ + : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX); + } + const int8_t typeFlag = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0; + const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile); + + /* Variable Declarations */ + int32_t outCh, k, x, y; + int32_t inCh; + valign vaOutData = IVP_ZALIGN(); + + xb_vecN_2x32v* restrict phvecBias; + xb_vec2Nx8* restrict pdvecCoeff; + xb_vec2Nx8* restrict pdvecData1; + xb_vec2Nx8* restrict pdvecData2; + xb_vec2Nx8* restrict pdvecData3; + xb_vec2Nx8* restrict pdvecData4; + xb_vec2Nx8* restrict pdvecOut; + + /* Loops Start */ + for (y = 0; y < outH; y++) + { + for (outCh = 0; outCh < numOutCh; outCh += 2 * XCHAL_IVPN_SIMD_WIDTH) + { /* walk across the kernels */ + /* To handle corner case when number of output channels + * is not a multiple of 2 * XCHAL_IVPN_SIMD_WIDTH*/ + int32_t remainingOutCh = numOutCh - outCh; + +#ifdef DILATED_VQ_CONV + xb_vecNx16U outScaleDataEven, outScaleDataOdd; + /*Load output scale values*/ + pOutScaleData = (xb_vecNx16U *) (pScale + outCh); + VQ_INIT_OUTSCALE(pOutScaleData, remainingOutCh, outScaleDataEven, outScaleDataOdd); +#endif + + for (x = 0; x < outW - 3; x += 4) /* Image Width */ + { + /* Output Data pointer */ + int8_t *pOut = pOutData + (x * outDataPitch1 + y * outDataPitch2) * bytesPerPixel; + + /* Initialize accumulators with bias values */ + 
xb_vec2Nx24 daccSum1, daccSum2, daccSum3, daccSum4; + phvecBias = (xb_vecN_2x32v *) (pBiasData + outCh); + ACC_INIT_BIAS(phvecBias, remainingOutCh, daccSum1, daccSum2, daccSum3, daccSum4); + + /* Input Data and Coeff Data Pointers */ + int8_t *pData = pInData + x * strideX * inDataPitch1 + y * strideY * inDataPitch2; + int8_t *pCoeff = pCoeffData + outCh; + + xb_vecN_2x32v hvecInAddrOff = 0; + xb_vecN_2x32v hvecCoeffAddrOff = 0; + xb_vecN_2x32v hvecLaneIdx = 0; + int32_t inAddrOff = 0, coeffAddrOff = 0; + + for (k = 0; k < kWidthU * kHeightU; k++) /* Kernel Height * Kernel Width */ + { + vboolN_2 vbN_2 = IVP_EQN_2X32(hvecLaneIdx, kWidthU); + /* hvecLaneIdx will be reset to zero after every kWidth */ + hvecLaneIdx = IVP_MOVN_2X32T(0, hvecLaneIdx, vbN_2); + /* InPitch added after every kWidth */ + IVP_ADDN_2X32T(hvecInAddrOff, hvecInAddrOff, inDataPitch2 * dilationY - kWidthU * inDataPitch1 * dilationX, vbN_2); + /* CoeffPitch added after every kWidth */ + IVP_ADDN_2X32T(hvecCoeffAddrOff, hvecCoeffAddrOff, coeffPitch3 - kWidthU * coeffPitch2, vbN_2); + /* Extracting Input and Coefficient address offsets */ + inAddrOff = IVP_EXTRN_2X32(hvecInAddrOff, 0); + coeffAddrOff = IVP_EXTRN_2X32(hvecCoeffAddrOff, 0); + hvecLaneIdx = IVP_ADDN_2X32(hvecLaneIdx, 1); + hvecCoeffAddrOff = IVP_ADDN_2X32(hvecCoeffAddrOff, coeffPitch2); + hvecInAddrOff = IVP_ADDN_2X32(hvecInAddrOff, inDataPitch1 * dilationX); + + /* Pointers for Input Data Loads */ + pdvecData1 = (xb_vec2Nx8 *) (pData + inAddrOff); + pdvecData2 = (xb_vec2Nx8 *) (pData + inAddrOff + strideX * inDataPitch1); + pdvecData3 = (xb_vec2Nx8 *) (pData + inAddrOff + strideX * inDataPitch1 * 2); + pdvecData4 = (xb_vec2Nx8 *) (pData + inAddrOff + strideX * inDataPitch1 * 3); + + /* Pointer for Coefficient Load */ + pdvecCoeff = (xb_vec2Nx8 *) (pCoeff + coeffAddrOff); + + /* Primes registers for Aligning Load */ + valign vaData1 = IVP_LA2NX8_PP(pdvecData1); + valign vaData2 = IVP_LA2NX8_PP(pdvecData2); + valign vaData3 = 
IVP_LA2NX8_PP(pdvecData3); + valign vaData4 = IVP_LA2NX8_PP(pdvecData4); + + for (inCh = 0; inCh < numInCh - 3; inCh += 4) /* Input Channels */ + { + /* Aligning variable vector load of pixels */ + xb_vec2Nx8 dvecData1; IVP_LAV2NX8_XP(dvecData1, vaData1, pdvecData1, 4); + xb_vec2Nx8 dvecData2; IVP_LAV2NX8_XP(dvecData2, vaData2, pdvecData2, 4); + xb_vec2Nx8 dvecData3; IVP_LAV2NX8_XP(dvecData3, vaData3, pdvecData3, 4); + xb_vec2Nx8 dvecData4; IVP_LAV2NX8_XP(dvecData4, vaData4, pdvecData4, 4); + + /* Extracting first 4 bytes of vector into address register */ + /* Scalar integers to be used for QMUL */ + int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData1)), 0); + int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData2)), 0); + int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData3)), 0); + int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData4)), 0); + + /* Aligned Vector Loads of coefficients */ + xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1); + xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1); + xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1); + xb_vec2Nx8 dvecCoeff4; IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch1); + + IVP_MULQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1); + IVP_MULQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2); + IVP_MULQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3); + IVP_MULQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4); + } /* End Input Channels */ + if (inCh < numInCh) + { + int32_t remInCh = numInCh - inCh; + vaData1 = IVP_LA2NX8_PP(pdvecData1); + + /* Aligning variable vector load of pixels */ + xb_vec2Nx8 dvecData1; IVP_LAV2NX8_XP(dvecData1, vaData1, pdvecData1, 
remInCh); + xb_vec2Nx8 dvecData2; IVP_LAV2NX8_XP(dvecData2, vaData2, pdvecData2, remInCh); + xb_vec2Nx8 dvecData3; IVP_LAV2NX8_XP(dvecData3, vaData3, pdvecData3, remInCh); + xb_vec2Nx8 dvecData4; IVP_LAV2NX8_XP(dvecData4, vaData4, pdvecData4, remInCh); + + /* Extracting first 4 bytes of vector into address register */ + /* Scalar integers to be used for QMUL */ + int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData1)), 0); + int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData2)), 0); + int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData3)), 0); + int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData4)), 0); + + /* For conditional coefficient loads */ + int32_t enable2 = XT_SALT(1, remInCh); /* Will be 1 if remInCh > 1 */ + int32_t enable3 = XT_SALT(2, remInCh); /* Will be 1 if remInCh > 2 */ + + /* Coefficient Loads */ + xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1 * enable2); + xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1 * enable3); + xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1); + + IVP_MULQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1); + IVP_MULQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2); + IVP_MULQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3); + IVP_MULQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4); + } + } + /* Pack, Output Scale, Output Shift and clamping */ + xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L; + xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H; +#ifdef DILATED_VQ_CONV + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut2L, dvecOut2H, 
daccSum2, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); +#else + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); +#endif + /* Store the output dvecOut1 along the output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOut + outCh * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh); + IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Store the output dvecOut2 along the output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1) * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh); + IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Store the output dvecOut3 along the output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + 2 * outDataPitch1) * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh); + IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - 
XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Store the output dvecOut4 along the output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + 3 * outDataPitch1) * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh); + IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + } + if (x < outW) + { + int32_t enable2ndWidth = XT_SALT(1, outW - x); + int32_t enable3rdWidth = XT_SALT(2, outW - x); + /* Output Data pointer */ + int8_t *pOut = pOutData + (x * outDataPitch1 + y * outDataPitch2) * bytesPerPixel; + + /* Initialize accumulators with bias values */ + xb_vec2Nx24 daccSum1, daccSum2, daccSum3, daccSum4; + phvecBias = (xb_vecN_2x32v *) (pBiasData + outCh); + ACC_INIT_BIAS(phvecBias, remainingOutCh, daccSum1, daccSum2, daccSum3, daccSum4); + + /* Input Data and Coeff Data Pointers */ + int8_t *pData = pInData + x * strideX * inDataPitch1 + y * strideY * inDataPitch2; + int8_t *pCoeff = pCoeffData + outCh; + + xb_vecN_2x32v hvecInAddrOff = 0; + xb_vecN_2x32v hvecCoeffAddrOff = 0; + xb_vecN_2x32v hvecLaneIdx = 0; + int32_t inAddrOff = 0, coeffAddrOff = 0; + + for (k = 0; k < kWidthU * kHeightU; k++) /* Kernel Height * Kernel Width */ + { + vboolN_2 vbN_2 = IVP_EQN_2X32(hvecLaneIdx, kWidthU); + /* hvecLaneIdx will be reset to zero after every kWidth */ + hvecLaneIdx = IVP_MOVN_2X32T(0, hvecLaneIdx, vbN_2); + /* InPitch added after every kWidth */ + IVP_ADDN_2X32T(hvecInAddrOff, hvecInAddrOff, inDataPitch2 * dilationY - kWidthU * inDataPitch1 * dilationX, vbN_2); + /* CoeffPitch added after every kWidth */ + IVP_ADDN_2X32T(hvecCoeffAddrOff, hvecCoeffAddrOff, coeffPitch3 - kWidthU * coeffPitch2, vbN_2); + /* Extracting Input and Coefficient address offsets */ + inAddrOff = IVP_EXTRN_2X32(hvecInAddrOff, 0); + coeffAddrOff = IVP_EXTRN_2X32(hvecCoeffAddrOff, 0); + hvecLaneIdx = IVP_ADDN_2X32(hvecLaneIdx, 1); 
+ hvecCoeffAddrOff = IVP_ADDN_2X32(hvecCoeffAddrOff, coeffPitch2); + hvecInAddrOff = IVP_ADDN_2X32(hvecInAddrOff, inDataPitch1 * dilationX); + + /* Pointers for Input Data Loads */ + pdvecData1 = (xb_vec2Nx8 *) (pData + inAddrOff); + pdvecData2 = (xb_vec2Nx8 *) (pData + inAddrOff + strideX * inDataPitch1 * enable2ndWidth); + pdvecData3 = (xb_vec2Nx8 *) (pData + inAddrOff + strideX * inDataPitch1 * 2 * enable3rdWidth); + + /* Pointer for Coefficient Load */ + pdvecCoeff = (xb_vec2Nx8 *) (pCoeff + coeffAddrOff); + + /* Primes registers for Aligning Load */ + valign vaData1 = IVP_LA2NX8_PP(pdvecData1); + valign vaData2 = IVP_LA2NX8_PP(pdvecData2); + valign vaData3 = IVP_LA2NX8_PP(pdvecData3); + + for (inCh = 0; inCh < numInCh - 3; inCh += 4) /* Input Channels */ + { + /* Aligning variable vector load of pixels */ + xb_vec2Nx8 dvecData1; IVP_LAV2NX8_XP(dvecData1, vaData1, pdvecData1, 4); + xb_vec2Nx8 dvecData2; IVP_LAV2NX8_XP(dvecData2, vaData2, pdvecData2, 4); + xb_vec2Nx8 dvecData3; IVP_LAV2NX8_XP(dvecData3, vaData3, pdvecData3, 4); + + /* Extracting first 4 bytes of vector into address register */ + /* Scalar integers to be used for QMUL */ + int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData1)), 0); + int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData2)), 0); + int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData3)), 0); + + /* Aligned Vector Loads of coefficients */ + xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1); + xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1); + xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1); + xb_vec2Nx8 dvecCoeff4; IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch1); + + IVP_MULQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1); + IVP_MULQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, 
qmulScalar2); + IVP_MULQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3); + } /* End Input Channels */ + if (inCh < numInCh) + { + int32_t remInCh = numInCh - inCh; + vaData1 = IVP_LA2NX8_PP(pdvecData1); + + /* Aligning variable vector load of pixels */ + xb_vec2Nx8 dvecData1; IVP_LAV2NX8_XP(dvecData1, vaData1, pdvecData1, remInCh); + xb_vec2Nx8 dvecData2; IVP_LAV2NX8_XP(dvecData2, vaData2, pdvecData2, remInCh); + xb_vec2Nx8 dvecData3; IVP_LAV2NX8_XP(dvecData3, vaData3, pdvecData3, remInCh); + + /* Extracting first 4 bytes of vector into address register */ + /* Scalar integers to be used for QMUL */ + int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData1)), 0); + int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData2)), 0); + int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData3)), 0); + + /* For conditional coefficient loads */ + int32_t enable2 = XT_SALT(1, remInCh); /* Will be 1 if remInCh > 1 */ + int32_t enable3 = XT_SALT(2, remInCh); /* Will be 1 if remInCh > 2 */ + + /* Coefficient Loads */ + xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1 * enable2); + xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1 * enable3); + xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1); + + IVP_MULQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1); + IVP_MULQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2); + IVP_MULQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3); + } + } + /* Pack, Output Scale, Output Shift and clamping */ + xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L; + xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H; +#ifdef DILATED_VQ_CONV + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, 
typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); +#else + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); +#endif + /* Store the output dvecOut1 along the output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOut + outCh * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh); + IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Store the output dvecOut2 along the output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1) * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * enable2ndWidth); + IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Store the output dvecOut3 along the output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + 2 * outDataPitch1) * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * enable3rdWidth); + IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + } + } + } + return(XAI_ERROR_STATUS()); +} + +/****************************************************************************/ +/* 
Description : P6 optimized generic implementation for MxN MOD_DWH */ +/* 3D convolution. Based on pre-processor specifiers. Code */ +/* implementation is generated during preprocessing stage. */ +/* This method can be used to generate MxN MOD_DWH 3D */ +/* dilated convolution function and MxN MOD_DWH 3D VQ */ +/* dilated convolution function */ +/* Stride values = 1, 2 and 4 are supported */ +/* Implementation also supports dilation >= 1 for stride = 1 */ +/* and dilation = 1 for stride = 2, 4 */ +/* Inputs : Input Data Tile, Coeff Data Tile, Bias Array, */ +/* Output scale array, CNN convolution params structure */ +/* Outputs : None (this static helper returns void) */ +/* InOuts : Output Tile */ +/* Assumptions : InData is U8, CoeffData is S8 */ +/* biasArray is signed 32b, value not exceeding signed 24b */ +/* Output scale array is U16 */ +/* OutData is S8 / U8 / S16 */ +/* Kernel Size is MxNxDxNk. M and N sizes are less than or */ +/* equal to 15. */ +/* Input and Output are in DWH format */ +/* Coeff is in NDWH format */ +/* CoeffDim1Pitch is aligned to 2N (Ca2) */ +/* inChannels is a multiple of 2 */ +/* Active data pointer is aligned to 2-bytes */ +/****************************************************************************/ +#ifndef IVP_MULSUQA2N8XR8 +#ifdef DILATED_VQ_CONV +static _XAI_INLINE_ void convolvedVQ3D_S_MxN_U8S8IXCa2_depth2X_MOD_DWH( + const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param + ) +#else +static _XAI_INLINE_ void convolved3D_S_MxN_U8S8IXCa2_depth2X_MOD_DWH( + const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param + ) +#endif +{ + /* Getting parameters from the tile structures */ + const int32_t outW = XAI_TILE3D_GET_DIM2(outTile); + const int32_t outH = XAI_TILE3D_GET_DIM3(outTile); + const int32_t numInCh = XAI_TILE3D_GET_DIM1(inTile); + 
const int32_t numOutCh = XAI_TILE3D_GET_DIM1(outTile); + + /* Kernel Size (NDWH) */ + const int32_t kWidthU = XAI_TILE4D_GET_DIM3(coeffTile); + const int32_t kHeightU = XAI_TILE4D_GET_DIM4(coeffTile); + + /* CNN convolution parameters */ + const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param); + const uint8_t outShiftU = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param); + const uint8_t enableReLu = XAI_CNN_CONV_GET_FLAG_RELU(param); + const uint8_t strideX = XAI_CNN_CONV_GET_STRIDEX(param); + const uint8_t strideY = XAI_CNN_CONV_GET_STRIDEY(param); + const uint8_t leftEdgeFlag = XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param); + const uint8_t topEdgeFlag = XAI_CNN_CONV_GET_FLAG_TOPEDGE(param); + + /* Data Pointers of input, output, coefficient and bias data */ + int8_t *pInData = (int8_t *) XAI_TILE3D_GET_DATA_PTR(inTile); + int8_t *pOutData = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile); + int8_t *pCoeffData = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile); + int32_t *pBiasData = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray); +#ifdef DILATED_VQ_CONV + xb_vecNx16U* restrict pOutScaleData = (xb_vecNx16U *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray); +#else + const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param); +#endif + /* Pitches of Coefficient Data (NDWH) in dim1 and dim3. The dim2 pitch is not read: kernel width and input channels are walked as one combined axis in steps of coeffPitch1, which presumes the coefficient rows are contiguous across dim2 (confirm against the caller) */ + const int32_t coeffPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTile); + const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile); + + /* Pitches of Input Data (DWH) in dim1 and dim2 */ + const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile); + const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile); + + /* Pitch of Output Data (DWH) in dim1 and dim2 */ + const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile); + const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile); + + int32_t numIter = kWidthU * numInCh; + + int32_t leftEdge, topEdge; + if ((kWidthU % 2) != 0) + { + leftEdge = kWidthU / 2; + } + else + { + leftEdge = leftEdgeFlag ? 
(kWidthU / 2) : ((kWidthU / 2) - 1); + } + + if ((kHeightU % 2) != 0) + { + topEdge = kHeightU / 2; + } + else + { + topEdge = topEdgeFlag ? (kHeightU / 2) : ((kHeightU / 2) - 1); + } + + /* Move pointer to the start of the data (including edge). leftEdge / topEdge are half the kernel size; for even kernel sizes the edge flags choose between size / 2 and size / 2 - 1 */ + pInData = &pInData[-((leftEdge) * inDataPitch1 + (topEdge) * inDataPitch2)]; + + /* Setting the limits for output data according to ReLu Flag and outTileType. Without ReLU the limits are the full range of the output tile type (S16, S8, otherwise the unsigned 8-bit range) */ + int32_t minLim, maxLim; + if (enableReLu) + { + minLim = XAI_CNN_CONV_GET_RELU_MIN(param); + maxLim = XAI_CNN_CONV_GET_RELU_MAX(param); + } + else + { + minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \ + SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0); + maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \ + : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX); + } + const int8_t typeFlag = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0; + const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile); + + /* Variable Declarations */ + int32_t outCh, x, y, ky, k; + valign vaOutData = IVP_ZALIGN(); + + xb_vecN_2x32v* restrict phvecBias; + xb_vec2Nx8* restrict pdvecCoeff; + uint16_t* restrict pData1; + uint16_t* restrict pData2; + uint16_t* restrict pData3; + uint16_t* restrict pData4; + xb_vec2Nx8* restrict pdvecOut; + + xb_vecNx16 vecData1, vecData2, vecData3, vecData4; + xb_vec2Nx8U dvecData1, dvecData2, dvecData3, dvecData4; + xb_vec2Nx8U dvecData5, dvecData6, dvecData7, dvecData8; + xb_vecNx16 vecTemp1, vecTemp2; + + /* Loops Start */ + for (outCh = 0; outCh < numOutCh; outCh += 2 * XCHAL_IVPN_SIMD_WIDTH) + { /* walk across the kernels */ + /* To handle corner case when number of output channels + * is not a multiple of 2 * XCHAL_IVPN_SIMD_WIDTH*/ + int32_t remainingOutCh = numOutCh - outCh; +#ifdef DILATED_VQ_CONV + xb_vecNx16U outScaleDataEven, outScaleDataOdd; + /*Load output scale values*/ + VQ_INIT_OUTSCALE(pOutScaleData, remainingOutCh, outScaleDataEven, outScaleDataOdd); +#endif + for (y = 0; y < 
outH; y += 2) /* Image Height */ + { /* walk down the rows */ + /* Variable to handle corner case when height is odd: numY is 1 while row y + 1 exists and 0 on the last row of an odd-height tile; it scales the second-row input offsets and the second-row output offsets and store byte counts below */ + int32_t numY = XT_MIN(1, outH - y - 1); + + for (x = 0; x < outW; x += 2) /* Image Width */ + { /* walk across the columns */ + /* Variable to handle corner case when width is odd: numX is 1 while column x + 1 exists and 0 on the last column of an odd-width tile; it scales the second-column input offsets and the second-column output offsets and store byte counts below */ + int32_t numX = XT_MIN(1, outW - x - 1); + + /* Output Data pointer */ + int8_t *pOut = pOutData + (x * outDataPitch1 + y * outDataPitch2) * bytesPerPixel; + + /* Initialize accumulators with bias values */ + xb_vec2Nx24 daccSum1, daccSum2, daccSum3, daccSum4; + phvecBias = (xb_vecN_2x32v *) (pBiasData + outCh); + ACC_INIT_BIAS(phvecBias, remainingOutCh, daccSum1, daccSum2, daccSum3, daccSum4); + + /* Input Data and Coeff Data Pointers */ + int8_t *pData = ((int8_t *) pInData + x * strideX * inDataPitch1 + y * strideY * inDataPitch2); + int8_t *pCoeff = pCoeffData + outCh; + +#ifdef __XCC__ +#pragma loop_count min=1 +#endif + for (ky = 0; ky < kHeightU; ky++) /* Kernel Height */ + { + /* Pointers for Input Data Loads: pData1..pData4 address the inputs of the 2x2 block of output positions (x, y), (x + numX, y), (x, y + numY) and (x + numX, y + numY) */ + pData1 = (uint16_t *) (pData + ky * inDataPitch2); + pData2 = (uint16_t *) (pData + ky * inDataPitch2 + strideX * inDataPitch1 * numX); + pData3 = (uint16_t *) (pData + ky * inDataPitch2 + strideY * inDataPitch2 * numY); + pData4 = (uint16_t *) (pData + ky * inDataPitch2 + strideX * inDataPitch1 * numX + strideY * inDataPitch2 * numY); + + /* Pointer for Coefficient Load */ + pdvecCoeff = (xb_vec2Nx8 *) (pCoeff + ky * coeffPitch3); + +#ifdef __XCC__ +#pragma loop_count min=1 +#endif + for (k = 0; k < numIter; k += 2) /* (Input Channels * kWidth) loops combined; two input values are consumed per iteration, see the inChannels assumption in the header */ + { + /* Load 2 bytes of input data for each of the four output positions */ + IVP_LSRNX16U_XP(vecData1, pData1, 2); + IVP_LSRNX16U_XP(vecData2, pData2, 2); + IVP_LSRNX16U_XP(vecData3, pData3, 2); + IVP_LSRNX16U_XP(vecData4, pData4, 2); + + /* Aligned Vector Loads of coefficients */ + xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1); + xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, 
coeffPitch1); + + /* De-interleave a b a b a b... and move to a a a a... and b b b b... */ + IVP_DSELNX16I(vecTemp2, vecTemp1, vecData1, vecData1, IVP_DSELI_8B_DEINTERLEAVE_1); + dvecData1 = IVP_MOV2NX8U_FROMNX16(vecTemp1); dvecData2 = IVP_MOV2NX8U_FROMNX16(vecTemp2); + IVP_DSELNX16I(vecTemp2, vecTemp1, vecData2, vecData2, IVP_DSELI_8B_DEINTERLEAVE_1); + dvecData3 = IVP_MOV2NX8U_FROMNX16(vecTemp1); dvecData4 = IVP_MOV2NX8U_FROMNX16(vecTemp2); + IVP_DSELNX16I(vecTemp2, vecTemp1, vecData3, vecData3, IVP_DSELI_8B_DEINTERLEAVE_1); + dvecData5 = IVP_MOV2NX8U_FROMNX16(vecTemp1); dvecData6 = IVP_MOV2NX8U_FROMNX16(vecTemp2); + IVP_DSELNX16I(vecTemp2, vecTemp1, vecData4, vecData4, IVP_DSELI_8B_DEINTERLEAVE_1); + dvecData7 = IVP_MOV2NX8U_FROMNX16(vecTemp1); dvecData8 = IVP_MOV2NX8U_FROMNX16(vecTemp2); + + /* Multiply unsigned x signed and accumulate to 24-bits; one accumulator per output position of the 2x2 block */ + IVP_MULUSPA2NX8(daccSum1, dvecData1, dvecCoeff1, dvecData2, dvecCoeff2); + IVP_MULUSPA2NX8(daccSum2, dvecData3, dvecCoeff1, dvecData4, dvecCoeff2); + IVP_MULUSPA2NX8(daccSum3, dvecData5, dvecCoeff1, dvecData6, dvecCoeff2); + IVP_MULUSPA2NX8(daccSum4, dvecData7, dvecCoeff1, dvecData8, dvecCoeff2); + } /* End combined (Input Channels * kWidth) loop */ + } /* End Kernel Height */ + + /* Pack, Output Scale, Output Shift and clamping */ + xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L; + xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H; +#ifdef DILATED_VQ_CONV + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \ + 
outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); +#else + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); +#endif + /* Store the output dvecOut1 along the output depth; the H half gets a non-zero byte count only when typeFlag is set (S16 output) */ + pdvecOut = (xb_vec2Nx8 *) (pOut + outCh * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh); + IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Store the output dvecOut2 along the output depth; when numX is 0 the pointer offset and both byte counts evaluate to 0 */ + pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1) * bytesPerPixel * numX); + IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numX); + IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numX); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Store the output dvecOut3 along the output depth; when numY is 0 the pointer offset and both byte counts evaluate to 0 */ + pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch2) * bytesPerPixel * numY); + IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numY); + IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numY); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Store the output dvecOut4 along the output depth; when numX or numY is 0 the pointer offset and both byte counts evaluate to 0 */ + pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1 + outDataPitch2) * bytesPerPixel * numX * numY); + IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * \ + remainingOutCh * 
numX * numY); + IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numX * numY); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + } /* End image width */ + } /* End image height */ + } /* End Output Channels */ +} +#endif + +/****************************************************************************/ +/* Description : P6 optimized generic implementation for MxN MOD_DWH */ +/* 3D convolution. Based on pre-processor specifiers. Code */ +/* implementation is generated during preprocessing stage. */ +/* This method can be used to generate MxN MOD_DWH 3D */ +/* dilated convolution function and MxN MOD_DWH 3D VQ */ +/* dilated convolution function */ +/* Implementation also supports dilation > 1 for stride = 1 */ +/* and dilation = 1 for stride = 2, 4 */ +/* Inputs : Input Data Tile, Coeff Data Tile, Bias Array, */ +/* Output scale array, CNN convolution params structure */ +/* Outputs : None (this static helper returns void) */ +/* InOuts : Output Tile */ +/* Assumptions : InData is U8, CoeffData is S8 */ +/* biasArray is signed 32b, value not exceeding signed 24b */ +/* Output scale array is U16 */ +/* OutData is S8 / U8 / S16 */ +/* Kernel Size is MxNxDxNk. M and N sizes are less than or */ +/* equal to 15. */ +/* Input and Output are in DWH format */ +/* Coeff is in NDWH format */ +/* CoeffDim1Pitch is aligned to 2N (Ca2) */ +/****************************************************************************/ +/* Although this routine supports IVP_MULSUQA2N8XR8, it has been intentionally disabled because we are not using it for the core that supports IVP_MULSUQA2N8XR8. + We will be using convolvedVQ3D_S_MxN_U8S8IXCa2_MOD_DWH_contiguous_depth_x4 and convolvedVQ3D_S_MxN_U8S8IXCa2_MOD_DWH_contiguous_depth. 
+ These routines are faster than convolvedVQ3D_S_MxNdX_U8S8IXCa2_MOD_DWH */ +#ifndef IVP_MULSUQA2N8XR8 +#ifdef DILATED_VQ_CONV +static _XAI_INLINE_ void convolvedVQ3D_S_MxNdX_U8S8IXCa2_MOD_DWH( + const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param + ) +#else +static _XAI_INLINE_ void convolved3D_S_MxNdX_U8S8IXCa2_MOD_DWH( + const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param + ) +#endif +{ + /* Getting parameters from the tile structures */ + const int32_t outW = XAI_TILE3D_GET_DIM2(outTile); + const int32_t outH = XAI_TILE3D_GET_DIM3(outTile); + const int32_t numInCh = XAI_TILE3D_GET_DIM1(inTile); + const int32_t numOutCh = XAI_TILE3D_GET_DIM1(outTile); + const uint8_t dilationX = XAI_CNN_CONV_GET_DILATIONX(param); + const uint8_t dilationY = XAI_CNN_CONV_GET_DILATIONY(param); + + /* Kernel Size (NDWH); dilatedkWidth / dilatedkHeight are the kernel extents after dilation, dilation * (size - 1) + 1 */ + const int32_t kWidthU = XAI_TILE4D_GET_DIM3(coeffTile); + const int32_t kHeightU = XAI_TILE4D_GET_DIM4(coeffTile); + int32_t dilatedkWidth = dilationX * (kWidthU - 1) + 1; + int32_t dilatedkHeight = dilationY * (kHeightU - 1) + 1; + + /* CNN convolution parameters */ + const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param); + const uint8_t outShiftU = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param); + const uint8_t enableReLu = XAI_CNN_CONV_GET_FLAG_RELU(param); + const uint8_t leftEdgeFlag = XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param); + const uint8_t topEdgeFlag = XAI_CNN_CONV_GET_FLAG_TOPEDGE(param); + + /* Data Pointers of input, output, coefficient and bias data */ + uint8_t *pInData = (uint8_t *) XAI_TILE3D_GET_DATA_PTR(inTile); + int8_t *pOutData = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile); + int8_t *pCoeffData = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile); + int32_t *pBiasData = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray); +#ifdef 
DILATED_VQ_CONV + xb_vecNx16U* restrict pOutScaleData = (xb_vecNx16U *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray); +#else + const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param); +#endif + /* Pitches of Coefficient Data (NDWH) in dim1, dim2 and dim3 */ + const int32_t coeffPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTile); + const int32_t coeffPitch2 = XAI_TILE4D_GET_DIM2_PITCH(coeffTile); + const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile); + + /* Pitches of Input Data (DWH) in dim1 and dim2 */ + const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile); + const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile); + + /* Pitch of Output Data (DWH) in dim1 and dim2 */ + const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile); + const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile); + + int32_t leftEdge, topEdge; + if ((dilatedkWidth % 2) != 0) + { + leftEdge = dilatedkWidth / 2; + } + else + { + leftEdge = leftEdgeFlag ? (dilatedkWidth / 2) : ((dilatedkWidth / 2) - 1); + } + + if ((dilatedkHeight % 2) != 0) + { + topEdge = dilatedkHeight / 2; + } + else + { + topEdge = topEdgeFlag ? (dilatedkHeight / 2) : ((dilatedkHeight / 2) - 1); + } + + /* Move pointer to the start of the data (including edge). leftEdge / topEdge are half the dilated kernel size; for even dilated sizes the edge flags choose between size / 2 and size / 2 - 1 */ + pInData = &pInData[-((leftEdge) * inDataPitch1 + (topEdge) * inDataPitch2)]; + + /* Setting the limits for output data according to ReLu Flag and outTileType. Without ReLU the limits are the full range of the output tile type (S16, S8, otherwise the unsigned 8-bit range) */ + int32_t minLim, maxLim; + if (enableReLu) + { + minLim = XAI_CNN_CONV_GET_RELU_MIN(param); + maxLim = XAI_CNN_CONV_GET_RELU_MAX(param); + } + else + { + minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \ + SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0); + maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \ + : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX); + } + const int8_t typeFlag = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 
1 : 0; + const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile); + + /* Variable Declarations */ + int32_t inCh, outCh, x, y, k; + valign vaOutData = IVP_ZALIGN(); + + /* Vector data pointers */ + xb_vecN_2x32v* restrict phvecBias; + xb_vec2Nx8* restrict pdvecCoeff; + xb_vec2Nx8U* restrict pdvecData1; + xb_vec2Nx8U* restrict pdvecData2; + xb_vec2Nx8U* restrict pdvecData3; + xb_vec2Nx8U* restrict pdvecData4; + xb_vec2Nx8* restrict pdvecOut; + +#ifndef IVP_MULSUQA2N8XR8 + /* Vector data registers. NOTE: the whole routine is already wrapped in an outer #ifndef IVP_MULSUQA2N8XR8, so this inner guard is always true here and the #ifdef IVP_MULSUQA2N8XR8 branches further down are never compiled */ + xb_vec2Nx8U dvecData1, dvecData2, dvecData3, dvecData4; + xb_vec2Nx8U dvecData5, dvecData6, dvecData7, dvecData8; + xb_vec2Nx8U dvecData9, dvecData10, dvecData11, dvecData12; + xb_vec2Nx8U dvecData13, dvecData14, dvecData15, dvecData16; + xb_vecNx16 vecData1, vecData2; + xb_vecNx16 vecData3, vecData4; + xb_vecNx16 vecData5, vecData6; + xb_vecNx16 vecData7, vecData8; + xb_vecNx16 vecTemp1, vecTemp2; + + /* Custom select pattern for DSELs */ + int16_t sel1 = (XCHAL_IVPN_SIMD_WIDTH << 8); + xb_vec2Nx8 vecSel1 = IVP_MOV2NX8_FROMNX16(sel1); + int16_t sel2 = (((XCHAL_IVPN_SIMD_WIDTH + 1) << 8) | 1); + xb_vec2Nx8 vecSel2 = IVP_MOV2NX8_FROMNX16(sel2); +#endif + + /* Loops Start */ + for (outCh = 0; outCh < numOutCh; outCh += 2 * XCHAL_IVPN_SIMD_WIDTH) + { /* walk across the kernels */ + /* To handle corner case when number of output channels + * is not a multiple of 2 * XCHAL_IVPN_SIMD_WIDTH*/ + int32_t remainingOutCh = numOutCh - outCh; +#ifdef DILATED_VQ_CONV + xb_vecNx16U outScaleDataEven, outScaleDataOdd; + /*Load output scale values*/ + VQ_INIT_OUTSCALE(pOutScaleData, remainingOutCh, outScaleDataEven, outScaleDataOdd); +#endif + for (y = 0; y < outH; y += 2) /* Image Height */ + { /* walk down the rows */ + /* Variable to handle corner case when height is odd: numY is 1 while row y + 1 exists and 0 on the last row of an odd-height tile; it scales the second-row input offsets and the second-row output offsets and store byte counts below */ + int32_t numY = XT_MIN(1, outH - y - 1); + + for (x = 0; x < outW; x += 2) /* Image Width */ + { /* walk across the columns */ + /* Variable to handle corner case when width is odd: numX is 1 while column x + 1 exists and 0 on the last column of an odd-width tile; it scales the second-column input offsets and the second-column output offsets and store byte counts below */ + int32_t numX = 
XT_MIN(1, outW - x - 1); + + /* Output Data pointer */ + int8_t *pOut = pOutData + (x * outDataPitch1 + y * outDataPitch2) * bytesPerPixel; + + /* Initialize accumulators with bias values */ + xb_vec2Nx24 daccSum1, daccSum2, daccSum3, daccSum4; + phvecBias = (xb_vecN_2x32v *) (pBiasData + outCh); + ACC_INIT_BIAS(phvecBias, remainingOutCh, daccSum1, daccSum2, daccSum3, daccSum4); + + /* Input Data and Coeff Data Pointers. Note that x and y are not scaled by a stride factor in this routine */ + uint8_t *pData = ((uint8_t *) pInData + x * inDataPitch1 + y * inDataPitch2); + int8_t *pCoeff = pCoeffData + outCh; + + xb_vecN_2x32v hvecInAddrOff = 0; + xb_vecN_2x32v hvecCoeffAddrOff = 0; + xb_vecN_2x32v hvecLaneIdx = 0; + int32_t inAddrOff, coeffAddrOff; + + for (k = 0; k < kHeightU * kWidthU; k++) /* Kernel Height * Kernel Width */ + { + /* Condition checks performed to get the Input and Coefficient */ + /* Pointer Offsets after combining the Kernel Width and Height Loops */ + vboolN_2 vbN_2 = IVP_EQN_2X32(hvecLaneIdx, kWidthU); + /* hvecLaneIdx will be reset to zero after every kWidth */ + hvecLaneIdx = IVP_MOVN_2X32T(0, hvecLaneIdx, vbN_2); + /* InPitch added after every kWidth: step one dilated row down and rewind the kWidth dilated column steps taken so far */ + IVP_ADDN_2X32T(hvecInAddrOff, hvecInAddrOff, inDataPitch2 * dilationY - kWidthU * inDataPitch1 * dilationX, vbN_2); + /* CoeffPitch added after every kWidth: step to the next kernel row and rewind the kWidth column steps taken so far */ + IVP_ADDN_2X32T(hvecCoeffAddrOff, hvecCoeffAddrOff, coeffPitch3 - kWidthU * coeffPitch2, vbN_2); + /* Extracting Input and Coefficient address offsets */ + inAddrOff = IVP_EXTRN_2X32(hvecInAddrOff, 0); + coeffAddrOff = IVP_EXTRN_2X32(hvecCoeffAddrOff, 0); + hvecLaneIdx = IVP_ADDN_2X32(hvecLaneIdx, 1); + hvecCoeffAddrOff = IVP_ADDN_2X32(hvecCoeffAddrOff, coeffPitch2); + hvecInAddrOff = IVP_ADDN_2X32(hvecInAddrOff, inDataPitch1 * dilationX); + + /* Pointers for Input Data Loads: pdvecData1..pdvecData4 address the inputs of the 2x2 block of output positions (x, y), (x + numX, y), (x, y + numY) and (x + numX, y + numY) */ + pdvecData1 = (xb_vec2Nx8U *) (pData + inAddrOff); + pdvecData2 = (xb_vec2Nx8U *) (pData + inAddrOff + inDataPitch1 * numX); + pdvecData3 = (xb_vec2Nx8U *) (pData + inAddrOff + inDataPitch2 * numY); + pdvecData4 = 
(xb_vec2Nx8U *) (pData + inAddrOff + inDataPitch1 * numX + inDataPitch2 * numY); + + /* Pointer for Coefficient Load */ + pdvecCoeff = (xb_vec2Nx8 *) (pCoeff + coeffAddrOff); + + /* Priming input loads */ + valign vaIn1 = IVP_LA2NX8U_PP(pdvecData1); + valign vaIn2 = IVP_LA2NX8U_PP(pdvecData2); + valign vaIn3 = IVP_LA2NX8U_PP(pdvecData3); + valign vaIn4 = IVP_LA2NX8U_PP(pdvecData4); + + for (inCh = 0; inCh < numInCh - 3; inCh += 4) /* Input Channels, four per iteration */ + { + xb_vec2Nx8U dvecInData1, dvecInData2, dvecInData3, dvecInData4; + /* Aligning variable vector load of pixels */ + IVP_LAV2NX8U_XP(dvecInData1, vaIn1, pdvecData1, 4); + IVP_LAV2NX8U_XP(dvecInData2, vaIn2, pdvecData2, 4); + IVP_LAV2NX8U_XP(dvecInData3, vaIn3, pdvecData3, 4); + IVP_LAV2NX8U_XP(dvecInData4, vaIn4, pdvecData4, 4); +#ifdef IVP_MULSUQA2N8XR8 + /* Extracting first 4 bytes of vector into address register */ + /* Scalar integers to be used for QMUL */ + int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8U(dvecInData1)), 0); + int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8U(dvecInData2)), 0); + int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8U(dvecInData3)), 0); + int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8U(dvecInData4)), 0); +#else + /* Broadcast a0, a1, a2, a3.... | b0, b1, b2, b3.... using DSELs into a0, a1, a0, a1.... | b0, b1, b0, b1.... 
*/ + IVP_DSELNX16(vecData2, vecData1, IVP_MOVNX16_FROM2NX8U(dvecInData2), IVP_MOVNX16_FROM2NX8U(dvecInData1), vecSel1); + IVP_DSELNX16(vecData4, vecData3, IVP_MOVNX16_FROM2NX8U(dvecInData4), IVP_MOVNX16_FROM2NX8U(dvecInData3), vecSel1); + IVP_DSELNX16(vecData6, vecData5, IVP_MOVNX16_FROM2NX8U(dvecInData2), IVP_MOVNX16_FROM2NX8U(dvecInData1), vecSel2); + IVP_DSELNX16(vecData8, vecData7, IVP_MOVNX16_FROM2NX8U(dvecInData4), IVP_MOVNX16_FROM2NX8U(dvecInData3), vecSel2); + + /* Splitting 8 DSELI operations into 4 DSELIs and 8 SELIs for balancing loop schedule */ + /* Separate a0, a1, a0, a1 using SELIs into a0, a0, a0... */ + dvecData1 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData1), IVP_MOV2NX8U_FROMNX16(vecData1), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0); + dvecData2 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData1), IVP_MOV2NX8U_FROMNX16(vecData1), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_1); + dvecData3 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData2), IVP_MOV2NX8U_FROMNX16(vecData2), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0); + dvecData4 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData2), IVP_MOV2NX8U_FROMNX16(vecData2), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_1); + dvecData5 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData3), IVP_MOV2NX8U_FROMNX16(vecData3), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0); + dvecData6 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData3), IVP_MOV2NX8U_FROMNX16(vecData3), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_1); + dvecData7 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData4), IVP_MOV2NX8U_FROMNX16(vecData4), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0); + dvecData8 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData4), IVP_MOV2NX8U_FROMNX16(vecData4), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_1); + + /* De-interleave a b a b a b... and move to a a a a... and b b b b... 
*/ + IVP_DSELNX16I(vecTemp2, vecTemp1, vecData5, vecData5, IVP_DSELI_8B_DEINTERLEAVE_1); + dvecData9 = IVP_MOV2NX8U_FROMNX16(vecTemp1); dvecData10 = IVP_MOV2NX8U_FROMNX16(vecTemp2); + IVP_DSELNX16I(vecTemp2, vecTemp1, vecData6, vecData6, IVP_DSELI_8B_DEINTERLEAVE_1); + dvecData11 = IVP_MOV2NX8U_FROMNX16(vecTemp1); dvecData12 = IVP_MOV2NX8U_FROMNX16(vecTemp2); + IVP_DSELNX16I(vecTemp2, vecTemp1, vecData7, vecData7, IVP_DSELI_8B_DEINTERLEAVE_1); + dvecData13 = IVP_MOV2NX8U_FROMNX16(vecTemp1); dvecData14 = IVP_MOV2NX8U_FROMNX16(vecTemp2); + IVP_DSELNX16I(vecTemp2, vecTemp1, vecData8, vecData8, IVP_DSELI_8B_DEINTERLEAVE_1); + dvecData15 = IVP_MOV2NX8U_FROMNX16(vecTemp1); dvecData16 = IVP_MOV2NX8U_FROMNX16(vecTemp2); +#endif + + /* Aligned Vector Loads of coefficients, one vector per input channel of this group of four */ + xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1); + xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1); + xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1); + xb_vec2Nx8 dvecCoeff4; IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch1); + +#ifdef IVP_MULSUQA2N8XR8 + IVP_MULSUQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1); + IVP_MULSUQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2); + IVP_MULSUQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3); + IVP_MULSUQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4); +#else + /* Multiply unsigned x signed and accumulate to 24-bits; one accumulator per output position of the 2x2 block */ + IVP_MULUSPA2NX8(daccSum1, dvecData1, dvecCoeff1, dvecData2, dvecCoeff2); + IVP_MULUSPA2NX8(daccSum2, dvecData3, dvecCoeff1, dvecData4, dvecCoeff2); + IVP_MULUSPA2NX8(daccSum3, dvecData5, dvecCoeff1, dvecData6, dvecCoeff2); + IVP_MULUSPA2NX8(daccSum4, dvecData7, dvecCoeff1, dvecData8, dvecCoeff2); + IVP_MULUSPA2NX8(daccSum1, dvecData9, dvecCoeff3, dvecData10, dvecCoeff4); + IVP_MULUSPA2NX8(daccSum2, dvecData11, dvecCoeff3, dvecData12, 
dvecCoeff4); + IVP_MULUSPA2NX8(daccSum3, dvecData13, dvecCoeff3, dvecData14, dvecCoeff4); + IVP_MULUSPA2NX8(daccSum4, dvecData15, dvecCoeff3, dvecData16, dvecCoeff4); +#endif + } /* End Input Channels */ + /* Corner Case Handling if number of input channels not multiple of 4: remInCh is the 1 to 3 channels left over. NOTE(review): only vaIn1 is re-primed below while vaIn2..vaIn4 keep the state left by the loop above - confirm that this asymmetry is intentional */ + if (inCh < numInCh) + { + int32_t remInCh = numInCh - inCh; + vaIn1 = IVP_LA2NX8U_PP(pdvecData1); + xb_vec2Nx8U dvecInData1, dvecInData2, dvecInData3, dvecInData4; + /* Aligning variable vector load of pixels */ + IVP_LAV2NX8U_XP(dvecInData1, vaIn1, pdvecData1, remInCh); + IVP_LAV2NX8U_XP(dvecInData2, vaIn2, pdvecData2, remInCh); + IVP_LAV2NX8U_XP(dvecInData3, vaIn3, pdvecData3, remInCh); + IVP_LAV2NX8U_XP(dvecInData4, vaIn4, pdvecData4, remInCh); + + /* For conditional coefficient loads: enable2 / enable3 scale the coefficient pointer step to 0 when the next channel does not exist, so the following load does not move past the last remaining channel */ + int32_t enable2 = XT_SALT(1, remInCh); /* Will be 1 if remInCh > 1 */ + int32_t enable3 = XT_SALT(2, remInCh); /* Will be 1 if remInCh > 2 */ + +#ifdef IVP_MULSUQA2N8XR8 + /* Extracting first 4 bytes of vector into address register */ + /* Scalar integers to be used for QMUL */ + int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8U(dvecInData1)), 0); + int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8U(dvecInData2)), 0); + int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8U(dvecInData3)), 0); + int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8U(dvecInData4)), 0); +#else + /* Broadcast a0, a1, a2, a3.... | b0, b1, b2, b3.... using DSELs into a0, a1, a0, a1.... | b0, b1, b0, b1.... 
*/ + IVP_DSELNX16(vecData2, vecData1, IVP_MOVNX16_FROM2NX8U(dvecInData2), IVP_MOVNX16_FROM2NX8U(dvecInData1), vecSel1); + IVP_DSELNX16(vecData4, vecData3, IVP_MOVNX16_FROM2NX8U(dvecInData4), IVP_MOVNX16_FROM2NX8U(dvecInData3), vecSel1); + IVP_DSELNX16(vecData6, vecData5, IVP_MOVNX16_FROM2NX8U(dvecInData2), IVP_MOVNX16_FROM2NX8U(dvecInData1), vecSel2); + IVP_DSELNX16(vecData8, vecData7, IVP_MOVNX16_FROM2NX8U(dvecInData4), IVP_MOVNX16_FROM2NX8U(dvecInData3), vecSel2); + + /* Splitting 8 DSELI operations into 4 DSELIs and 8 SELIs for balancing loop schedule */ + /* Separate a0, a1, a0, a1 using SELIs into a0, a0, a0... */ + dvecData1 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData1), IVP_MOV2NX8U_FROMNX16(vecData1), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0); + dvecData2 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData1), IVP_MOV2NX8U_FROMNX16(vecData1), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_1); + dvecData3 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData2), IVP_MOV2NX8U_FROMNX16(vecData2), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0); + dvecData4 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData2), IVP_MOV2NX8U_FROMNX16(vecData2), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_1); + dvecData5 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData3), IVP_MOV2NX8U_FROMNX16(vecData3), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0); + dvecData6 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData3), IVP_MOV2NX8U_FROMNX16(vecData3), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_1); + dvecData7 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData4), IVP_MOV2NX8U_FROMNX16(vecData4), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0); + dvecData8 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData4), IVP_MOV2NX8U_FROMNX16(vecData4), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_1); + + /* De-interleave a b a b a b... and move to a a a a... and b b b b... 
*/ + IVP_DSELNX16I(vecTemp2, vecTemp1, vecData5, vecData5, IVP_DSELI_8B_DEINTERLEAVE_1); + dvecData9 = IVP_MOV2NX8U_FROMNX16(vecTemp1); dvecData10 = IVP_MOV2NX8U_FROMNX16(vecTemp2); + IVP_DSELNX16I(vecTemp2, vecTemp1, vecData6, vecData6, IVP_DSELI_8B_DEINTERLEAVE_1); + dvecData11 = IVP_MOV2NX8U_FROMNX16(vecTemp1); dvecData12 = IVP_MOV2NX8U_FROMNX16(vecTemp2); + IVP_DSELNX16I(vecTemp2, vecTemp1, vecData7, vecData7, IVP_DSELI_8B_DEINTERLEAVE_1); + dvecData13 = IVP_MOV2NX8U_FROMNX16(vecTemp1); dvecData14 = IVP_MOV2NX8U_FROMNX16(vecTemp2); + IVP_DSELNX16I(vecTemp2, vecTemp1, vecData8, vecData8, IVP_DSELI_8B_DEINTERLEAVE_1); + dvecData15 = IVP_MOV2NX8U_FROMNX16(vecTemp1); dvecData16 = IVP_MOV2NX8U_FROMNX16(vecTemp2); +#endif + /* Aligned Vector Loads of coefficients; only three vectors are needed because at most 3 channels remain */ + xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1 * enable2); + xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1 * enable3); + xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1); + +#ifdef IVP_MULSUQA2N8XR8 + IVP_MULSUQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1); + IVP_MULSUQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2); + IVP_MULSUQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3); + IVP_MULSUQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4); +#else + /* Multiply unsigned x signed and accumulate to 24-bits; the third channel uses the single-operand multiply-accumulate since there is no fourth channel */ + IVP_MULUSPA2NX8(daccSum1, dvecData1, dvecCoeff1, dvecData2, dvecCoeff2); + IVP_MULUSPA2NX8(daccSum2, dvecData3, dvecCoeff1, dvecData4, dvecCoeff2); + IVP_MULUSPA2NX8(daccSum3, dvecData5, dvecCoeff1, dvecData6, dvecCoeff2); + IVP_MULUSPA2NX8(daccSum4, dvecData7, dvecCoeff1, dvecData8, dvecCoeff2); + IVP_MULUSA2NX8(daccSum1, dvecData9, dvecCoeff3); + IVP_MULUSA2NX8(daccSum2, dvecData11, dvecCoeff3); + IVP_MULUSA2NX8(daccSum3, dvecData13, dvecCoeff3); + IVP_MULUSA2NX8(daccSum4, dvecData15, dvecCoeff3); +#endif + } /* End Corner case 
handling */ + } /* End Kernel Height * Width */ + + /* Pack, Output Scale, Output Shift and clamping */ + xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L; + xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H; +#ifdef DILATED_VQ_CONV + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); +#else + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); +#endif + /* Store the output dvecOut1 along the output depth; the H half gets a non-zero byte count only when typeFlag is set (S16 output) */ + pdvecOut = (xb_vec2Nx8 *) (pOut + outCh * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh); + IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Store the output dvecOut2 along the output depth; when numX is 0 the pointer offset and both byte counts evaluate to 0 */ + pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1) * bytesPerPixel * numX); + IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * 
numX); + IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numX); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Store the output dvecOut3 along the output depth; when numY is 0 the pointer offset and both byte counts evaluate to 0 */ + pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch2) * bytesPerPixel * numY); + IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numY); + IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numY); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Store the output dvecOut4 along the output depth; when numX or numY is 0 the pointer offset and both byte counts evaluate to 0 */ + pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1 + outDataPitch2) * bytesPerPixel * numX * numY); + IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * \ + remainingOutCh * numX * numY); + IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numX * numY); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + } /* End image width */ + } /* End image height */ + } /* End Output Channels */ +} +#endif + +/****************************************************************************/ +/* Description : P6 optimized generic implementation for MxN MOD_DWH */ +/* 3D convolution. Based on pre-processor specifiers. Code */ +/* implementation is generated during preprocessing stage. 
*/ +/* This method can be used to generate MxN MOD_DWH 3D */ +/* dilated convolution function and MxN MOD_DWH 3D VQ */ +/* dilated convolution function */ +/* Stride values = 1, 2 and 4 are supported */ +/* Implementation also supports dilation >= 1 for stride = 1 */ +/* and dilation = 1 for stride = 2, 4 */ +/* Inputs : Input Data Tile, Coeff Data Tile, Bias Array, */ +/* Output scale array, CNN convolution params structure */ +/* Outputs : XAI Error Code (XAI_ERR_TYPE) */ +/* InOuts : Output Tile */ +/* Assumptions : InData is U8, CoeffData is S8 */ +/* biasArray is signed 32b, value not exceeding signed 24b */ +/* Output scale array is U16 */ +/* OutData is S8 / U8 / S16 */ +/* Kernel Size is MxNxDxNk. M and N sizes are less than or */ +/* equal to 15. NOTE(review): the error checks in the function body accept kernel width/height and strides up to 64 - confirm which limits are intended */ +/* Input and Output are in DWH format */ +/* Coeff is in NDWH format */ +/* CoeffDim1Pitch is aligned to 2N (Ca2) */ +/* No edges along dimension 1 of inTile */ +/****************************************************************************/ + +#ifdef DILATED_VQ_CONV +XAI_ERR_TYPE xaiConvolvedVQ3D_S_MxN_U8S8IXCa2_MOD_DWH( + const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param + ) +#else +XAI_ERR_TYPE xaiConvolved3D_S_MxN_U8S8IXCa2_MOD_DWH( + const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param + ) +#endif +{ + /* Error Checks: argument validation; the explicit XAI_CHECK_ERROR calls bound kernel size, stride, dilation and shift values */ + XAI_ERROR_CHECKS() + { + XAI_CHECK_TILE3D_U8(inTile); + XAI_CHECK_CONV_OUTPUT_TILE3D(outTile); + XAI_CHECK_TILE4D_S8(coeffTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile); + XAI_CHECK_TILE4D_IN_DRAM_BOUNDARY(coeffTile); + XAI_CHECK_POINTER(param); + XAI_CHECK_ARRAY_S32(biasArray); + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTile); + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(coeffTile, outTile); +
XAI_CHECK_ERROR((XAI_TILE4D_GET_DIM3(coeffTile) <= 64) && (XAI_TILE4D_GET_DIM4(coeffTile) <= 64), XAI_ERR_KSIZE, \ + "\nKernel height = %d, width = %d\nKernel width and height should be less than or equal to 64", \ + XAI_TILE4D_GET_DIM4(coeffTile), XAI_TILE4D_GET_DIM3(coeffTile)); + XAI_CHECK_EDGES_MOD_DWH(inTile, coeffTile, param); + XAI_CHECK_ERROR(((XAI_CNN_CONV_GET_STRIDEX(param) > 0) && (XAI_CNN_CONV_GET_STRIDEY(param) > 0)) && \ + ((XAI_CNN_CONV_GET_STRIDEX(param) <= 64) && (XAI_CNN_CONV_GET_STRIDEY(param) <= 64)), XAI_ERR_BADARG, \ + "\nStrideX = %hhu, StrideY = %hhu\nStride along width and height should be greater than 0 and less than or equal to 64", \ + XAI_CNN_CONV_GET_STRIDEX(param), XAI_CNN_CONV_GET_STRIDEY(param)); + XAI_CHECK_ERROR((XAI_CNN_CONV_GET_DILATIONX(param) > 0 && XAI_CNN_CONV_GET_DILATIONY(param) > 0), \ + XAI_ERR_BADARG, "dilation parameter has to be >= 1"); + XAI_CHECK_ERROR((((XAI_TILE3D_GET_DIM1_PITCH(inTile) == XAI_TILE3D_GET_DIM1(inTile)) \ + && XAI_CNN_CONV_GET_DILATION(param) == 1) || XAI_CNN_CONV_GET_DILATION(param) > 1), + XAI_ERR_BADARG, "Edges along input channels is not supported if dilation = 1."); + XAI_CHECK_TILE4D_IALIGNMENT_2NX8(coeffTile); + XAI_CHECK_TILE3D_DATA_ORDER(inTile, XAI_DWH); + XAI_CHECK_TILE3D_DATA_ORDER(outTile, XAI_DWH); + XAI_CHECK_TILE4D_DATA_ORDER(coeffTile, XAI_NDWH); + XAI_CHECK_CONSISTENCY_MOD_DWH(inTile, coeffTile, biasArray, outTile, param); + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_ACCUM_SHIFT(param) < 24, \ + XAI_ERR_NORM, "\nThe accumulator shift = %hhu, value should be less than 24", \ + XAI_CNN_CONV_GET_ACCUM_SHIFT(param)); + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_OUTPUT_SHIFT(param) < 32, \ + XAI_ERR_NORM, "\nThe output shift = %hhu, value should be less than 32", \ + XAI_CNN_CONV_GET_OUTPUT_SHIFT(param)); + XAI_CHECK_CONV_RELU_LIMITS_IX(param, outTile); +#ifdef DILATED_VQ_CONV + XAI_CHECK_ARRAY_U16(outputScaleArray); + XAI_CHECK_ERROR(XAI_ARRAY_GET_WIDTH(outputScaleArray) >= XAI_TILE4D_GET_DIM1(coeffTile),
XAI_ERR_DATASIZE, \ + "\nWidth of Output Scale Array = %d, Number of Kernels = %d\nWidth of Output Scale Array should be greater than or equal to Number of Kernels", \ + XAI_ARRAY_GET_WIDTH(outputScaleArray), XAI_TILE4D_GET_DIM1(coeffTile)); +#endif + } +#ifndef DILATED_VQ_CONV + if (XAI_CNN_CONV_GET_OUTPUT_SCALE(param) == 0) + { /* Output scale of 0: skip the convolution and fill the output tile with a constant (0 clamped to the ReLU range when ReLU is enabled, otherwise 0) */ + int32_t fillValue; + int32_t reluFlag = XAI_CNN_CONV_GET_FLAG_RELU(param); + fillValue = reluFlag ? (CLAMP(0, XAI_CNN_CONV_GET_RELU_MIN(param), XAI_CNN_CONV_GET_RELU_MAX(param))) : 0; + return(xaiFillTile3D(outTile, fillValue, 0)); + } +#endif + +#ifdef IVP_MULSUQA2N8XR8 // only for Vision_130 + if (XAI_TILE3D_GET_DIM1(inTile) == XAI_TILE3D_GET_DIM1_PITCH(inTile) && \ + (XAI_CNN_CONV_GET_DILATIONX(param) == 1 && XAI_CNN_CONV_GET_DILATIONY(param) == 1)) + { /* Contiguous input depth (dim1 == dim1 pitch) and no dilation: dispatch to the contiguous-depth kernels; the x4 variant is used when numInCh * kernel width is a multiple of 4 */ + if ((XAI_TILE3D_GET_DIM1(inTile) * XAI_TILE4D_GET_DIM3(coeffTile)) % 4 == 0) + { +#ifdef DILATED_VQ_CONV + convolvedVQ3D_S_MxN_U8S8IXCa2_MOD_DWH_contiguous_depth_x4(inTile, coeffTile, biasArray, outputScaleArray, outTile, param); +#else + convolved3D_S_MxN_U8S8IXCa2_MOD_DWH_contiguous_depth_x4(inTile, coeffTile, biasArray, outTile, param); +#endif + } + else + { +#ifdef DILATED_VQ_CONV + convolvedVQ3D_S_MxN_U8S8IXCa2_MOD_DWH_contiguous_depth(inTile, coeffTile, biasArray, outputScaleArray, outTile, param); +#else + convolved3D_S_MxN_U8S8IXCa2_MOD_DWH_contiguous_depth(inTile, coeffTile, biasArray, outTile, param); +#endif + } + return(XAI_ERROR_STATUS()); + } +#else // Vision_P6 + if (XAI_CNN_CONV_GET_DILATIONX(param) > 1 && XAI_CNN_CONV_GET_DILATIONY(param) > 1) + { /* Dilation > 1 along both X and Y: dispatch to the dilated (MxNdX) kernel */ +#ifdef DILATED_VQ_CONV + convolvedVQ3D_S_MxNdX_U8S8IXCa2_MOD_DWH(inTile, \ + coeffTile, biasArray, outputScaleArray, outTile, param); +#else + convolved3D_S_MxNdX_U8S8IXCa2_MOD_DWH(inTile, \ + coeffTile, biasArray, outTile, param); +#endif + return(XAI_ERROR_STATUS()); + } + if ((XAI_CNN_CONV_GET_DILATIONX(param) == 1 && XAI_CNN_CONV_GET_DILATIONY(param) == 1) && \ + (XAI_TILE3D_GET_DIM1(inTile) % 2) == 0 \ + &&
((XAI_PTR_TO_ADDR(XAI_TILE3D_GET_DATA_PTR(inTile)) & (2 - 1)) == 0)) + { /* No dilation, even number of input channels and 2-byte aligned input pointer: dispatch to the depth2X kernel */ +#ifdef DILATED_VQ_CONV + convolvedVQ3D_S_MxN_U8S8IXCa2_depth2X_MOD_DWH(inTile, \ + coeffTile, biasArray, outputScaleArray, outTile, param); +#else + convolved3D_S_MxN_U8S8IXCa2_depth2X_MOD_DWH(inTile, \ + coeffTile, biasArray, outTile, param); +#endif + return(XAI_ERROR_STATUS()); + } +#endif + + /* Generic path (reached when none of the specialized kernels above returned): getting parameters from the tile structures */ + const int32_t outW = XAI_TILE3D_GET_DIM2(outTile); + const int32_t outH = XAI_TILE3D_GET_DIM3(outTile); + const int32_t numInCh = XAI_TILE3D_GET_DIM1(inTile); + const int32_t numOutCh = XAI_TILE3D_GET_DIM1(outTile); + const uint8_t dilationX = XAI_CNN_CONV_GET_DILATIONX(param); + const uint8_t dilationY = XAI_CNN_CONV_GET_DILATIONY(param); + + /* Kernel Size (NDWH); the dilated sizes are the kernel extents once the dilation gaps are included */ + const int32_t kWidthU = XAI_TILE4D_GET_DIM3(coeffTile); + const int32_t kHeightU = XAI_TILE4D_GET_DIM4(coeffTile); + int32_t dilatedkWidth = dilationX * (kWidthU - 1) + 1; + int32_t dilatedkHeight = dilationY * (kHeightU - 1) + 1; + + /* CNN convolution parameters */ + const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param); + const uint8_t outShiftU = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param); + const uint8_t enableReLu = XAI_CNN_CONV_GET_FLAG_RELU(param); + const uint8_t strideX = XAI_CNN_CONV_GET_STRIDEX(param); + const uint8_t strideY = XAI_CNN_CONV_GET_STRIDEY(param); + const uint8_t leftEdgeFlag = XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param); + const uint8_t topEdgeFlag = XAI_CNN_CONV_GET_FLAG_TOPEDGE(param); + + /* Data Pointers of input, output, coefficient and bias data */ + uint8_t *pInData = (uint8_t *) XAI_TILE3D_GET_DATA_PTR(inTile); + int8_t *pOutData = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile); + int8_t *pCoeffData = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile); + int32_t *pBiasData = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray); +#ifdef DILATED_VQ_CONV + xb_vecNx16U* restrict pOutScaleData = (xb_vecNx16U *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray); +#else + const
uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param); +#endif + /* Pitches of Coefficient Data (NDWH) in dim1, dim2 and dim3 */ + const int32_t coeffPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTile); + const int32_t coeffPitch2 = XAI_TILE4D_GET_DIM2_PITCH(coeffTile); + const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile); + + /* Pitches of Input Data (DWH) in dim1 and dim2 */ + const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile); + const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile); + + /* Pitch of Output Data (DWH) in dim1 and dim2 */ + const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile); + const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile); + + //int32_t numIter = kWidthU * numInCh; + + int32_t leftEdge, topEdge; /* Edge sizes: half the dilated kernel size when it is odd; when it is even the left/top edge flag selects size/2 or size/2 - 1 */ + if ((dilatedkWidth % 2) != 0) + { + leftEdge = dilatedkWidth / 2; + } + else + { + leftEdge = leftEdgeFlag ? (dilatedkWidth / 2) : ((dilatedkWidth / 2) - 1); + } + + if ((dilatedkHeight % 2) != 0) + { + topEdge = dilatedkHeight / 2; + } + else + { + topEdge = topEdgeFlag ? (dilatedkHeight / 2) : ((dilatedkHeight / 2) - 1); + } + + + /* Move pointer to the start of the data (including edge) */ + pInData = &pInData[-((leftEdge) * inDataPitch1 + (topEdge) * inDataPitch2)]; + + /* Setting the limits for output data according to ReLu Flag and outTileType */ + int32_t minLim, maxLim; + if (enableReLu) + { + minLim = XAI_CNN_CONV_GET_RELU_MIN(param); + maxLim = XAI_CNN_CONV_GET_RELU_MAX(param); + } + else + { + minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \ + SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0); + maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \ + : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX); + } + const int8_t typeFlag = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ?
1 : 0; /* 1 for S16 output; it scales the byte count of the high-half (dvecOutNH) stores below, which is 0 otherwise */ + const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile); + + /* Variable Declarations */ + int32_t inCh, outCh, x, y, k; + valign vaOutData = IVP_ZALIGN(); + + /* Vector data pointers */ + xb_vecN_2x32v* restrict phvecBias; + xb_vec2Nx8* restrict pdvecCoeff; + xb_vec2Nx8U* restrict pdvecData1; + xb_vec2Nx8U* restrict pdvecData2; + xb_vec2Nx8U* restrict pdvecData3; + xb_vec2Nx8U* restrict pdvecData4; + xb_vec2Nx8* restrict pdvecOut; + + /* Vector data registers */ + xb_vec2Nx8U dvecInData1, dvecInData2, dvecInData3, dvecInData4; + + valign vaIn1, vaIn2, vaIn3, vaIn4; + + /* Loops Start: each innermost (x, y) iteration produces a 2x2 block of output pixels (accumulators 1-4) for up to 2 * XCHAL_IVPN_SIMD_WIDTH output channels */ + for (outCh = 0; outCh < numOutCh; outCh += 2 * XCHAL_IVPN_SIMD_WIDTH) /* Output Channels */ + { /* walk across the kernels */ + /* To handle corner case when number of output channels + * is not a multiple of 2 * XCHAL_IVPN_SIMD_WIDTH*/ + int32_t remainingOutCh = (numOutCh - outCh); +#ifdef DILATED_VQ_CONV + xb_vecNx16U outScaleDataEven, outScaleDataOdd; + /*Load output scale values*/ + VQ_INIT_OUTSCALE(pOutScaleData, remainingOutCh, outScaleDataEven, outScaleDataOdd); +#endif + for (y = 0; y < outH; y += 2) /* Image Height */ + { /* walk down the rows */ + /* Variable to handle corner case when height is odd: numY is 0 on a trailing odd row, which zeroes the second-row pointer offsets and store lengths below */ + int32_t numY = XT_MIN(1, outH - y - 1); + + for (x = 0; x < outW; x += 2) /* Image Width */ + { /* walk across the columns */ + /* Variable to handle corner case when width is odd: numX is 0 on a trailing odd column, which zeroes the second-column pointer offsets and store lengths below */ + int32_t numX = XT_MIN(1, outW - x - 1); + + /* Output Data pointer */ + int8_t *pOut = pOutData + (x * outDataPitch1 + y * outDataPitch2) * bytesPerPixel; + + /* Initialize accumulators with bias values */ + xb_vec2Nx24 daccSum1, daccSum2, daccSum3, daccSum4; + phvecBias = (xb_vecN_2x32v *) (pBiasData + outCh); + ACC_INIT_BIAS(phvecBias, remainingOutCh, daccSum1, daccSum2, daccSum3, daccSum4); + + /* Input Data and Coeff Data Pointers */ + uint8_t *pData = ((uint8_t *) pInData + x * strideX * inDataPitch1 + y * strideY * inDataPitch2); + int8_t *pCoeff = pCoeffData +
outCh; + + xb_vecN_2x32v hvecInAddrOff = 0; + xb_vecN_2x32v hvecCoeffAddrOff = 0; + xb_vecN_2x32v hvecLaneIdx = 0; + int32_t inAddrOff, coeffAddrOff; + + for (k = 0; k < kHeightU * kWidthU; k++) /* Kernel Height * Kernel Width */ + { + /* Condition checks performed to get the Input and Coefficient */ + /* Pointer Offsets after combining the Kernel Width and Height Loops */ + vboolN_2 vbN_2 = IVP_EQN_2X32(hvecLaneIdx, kWidthU); + /* hvecLaneIdx will be reset to zero after every kWidth */ + hvecLaneIdx = IVP_MOVN_2X32T(0, hvecLaneIdx, vbN_2); + /* InPitch added after every kWidth */ + IVP_ADDN_2X32T(hvecInAddrOff, hvecInAddrOff, inDataPitch2 * dilationY - kWidthU * inDataPitch1 * dilationX, vbN_2); + /* CoeffPitch added after every kWidth */ + IVP_ADDN_2X32T(hvecCoeffAddrOff, hvecCoeffAddrOff, coeffPitch3 - kWidthU * coeffPitch2, vbN_2); + /* Extracting Input and Coefficient address offsets */ + inAddrOff = IVP_EXTRN_2X32(hvecInAddrOff, 0); + coeffAddrOff = IVP_EXTRN_2X32(hvecCoeffAddrOff, 0); + hvecLaneIdx = IVP_ADDN_2X32(hvecLaneIdx, 1); + hvecCoeffAddrOff = IVP_ADDN_2X32(hvecCoeffAddrOff, coeffPitch2); + hvecInAddrOff = IVP_ADDN_2X32(hvecInAddrOff, inDataPitch1 * dilationX); + + /* Pointers for Input Data Loads: one per output pixel of the 2x2 block; with numX / numY equal to 0 the extra column / row offset is 0, so the pointer coincides with the first column / row */ + pdvecData1 = (xb_vec2Nx8U *) (pData + inAddrOff); + pdvecData2 = (xb_vec2Nx8U *) (pData + inAddrOff + strideX * inDataPitch1 * numX); + pdvecData3 = (xb_vec2Nx8U *) (pData + inAddrOff + strideY * inDataPitch2 * numY); + pdvecData4 = (xb_vec2Nx8U *) (pData + inAddrOff + strideX * inDataPitch1 * numX + strideY * inDataPitch2 * numY); + + /* Pointer for Coefficient Load */ + pdvecCoeff = (xb_vec2Nx8 *) (pCoeff + coeffAddrOff); + + /* Priming input loads */ + vaIn1 = IVP_LA2NX8U_PP(pdvecData1); + vaIn2 = IVP_LA2NX8U_PP(pdvecData2); + vaIn3 = IVP_LA2NX8U_PP(pdvecData3); + vaIn4 = IVP_LA2NX8U_PP(pdvecData4); + + for (inCh = 0; inCh < numInCh - 3; inCh += 4) /* Input Channels */ + { + /* Aligning variable vector load of pixels */ + + + /* Load 4 bytes of
input data */ + IVP_LAV2NX8U_XP(dvecInData1, vaIn1, pdvecData1, 4); + IVP_LAV2NX8U_XP(dvecInData2, vaIn2, pdvecData2, 4); + IVP_LAV2NX8U_XP(dvecInData3, vaIn3, pdvecData3, 4); + IVP_LAV2NX8U_XP(dvecInData4, vaIn4, pdvecData4, 4); + +#ifdef IVP_MULSUQA2N8XR8 + /* Extracting first 4 bytes of vector into address register */ + /* Scalar integers to be used for QMUL */ + int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8U(dvecInData1)), 0); + int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8U(dvecInData2)), 0); + int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8U(dvecInData3)), 0); + int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8U(dvecInData4)), 0); +#else + xb_vec2Nx8U dvecData1, dvecData2, dvecData3, dvecData4; + xb_vec2Nx8U dvecData5, dvecData6, dvecData7, dvecData8; + xb_vec2Nx8U dvecData9, dvecData10, dvecData11, dvecData12; + xb_vec2Nx8U dvecData13, dvecData14, dvecData15, dvecData16; + xb_vecNx16 vecData1, vecData2; + xb_vecNx16 vecData3, vecData4; + xb_vecNx16 vecData5, vecData6; + xb_vecNx16 vecData7, vecData8; + xb_vecNx16 vecTemp1, vecTemp2; + + /* Custom select pattern for DSELs */ + int16_t sel1 = ((XCHAL_IVPN_SIMD_WIDTH << 8)); + xb_vec2Nx8 vecSel1 = IVP_MOV2NX8_FROMNX16(sel1); + int16_t sel2 = (((XCHAL_IVPN_SIMD_WIDTH + 1) << 8) | 1); + xb_vec2Nx8 vecSel2 = IVP_MOV2NX8_FROMNX16(sel2); + + /* Broadcast a0, a1, a2, a3.... | b0, b1, b2, b3.... using DSELs into a0, a1, a0, a1.... | b0, b1, b0, b1....
*/ + IVP_DSELNX16(vecData2, vecData1, IVP_MOVNX16_FROM2NX8U(dvecInData2), IVP_MOVNX16_FROM2NX8U(dvecInData1), vecSel1); + IVP_DSELNX16(vecData4, vecData3, IVP_MOVNX16_FROM2NX8U(dvecInData4), IVP_MOVNX16_FROM2NX8U(dvecInData3), vecSel1); + IVP_DSELNX16(vecData6, vecData5, IVP_MOVNX16_FROM2NX8U(dvecInData2), IVP_MOVNX16_FROM2NX8U(dvecInData1), vecSel2); + IVP_DSELNX16(vecData8, vecData7, IVP_MOVNX16_FROM2NX8U(dvecInData4), IVP_MOVNX16_FROM2NX8U(dvecInData3), vecSel2); + + /* Splitting 8 DSELI operations into 4 DSELIs and 8 SELIs for balancing loop schedule */ + /* Separate a0, a1, a0, a1 using SELIs into a0, a0, a0... */ + dvecData1 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData1), IVP_MOV2NX8U_FROMNX16(vecData1), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0); + dvecData2 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData1), IVP_MOV2NX8U_FROMNX16(vecData1), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_1); + dvecData3 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData2), IVP_MOV2NX8U_FROMNX16(vecData2), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0); + dvecData4 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData2), IVP_MOV2NX8U_FROMNX16(vecData2), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_1); + dvecData5 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData3), IVP_MOV2NX8U_FROMNX16(vecData3), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0); + dvecData6 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData3), IVP_MOV2NX8U_FROMNX16(vecData3), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_1); + dvecData7 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData4), IVP_MOV2NX8U_FROMNX16(vecData4), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0); + dvecData8 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData4), IVP_MOV2NX8U_FROMNX16(vecData4), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_1); + + /* De-interleave a b a b a b... and move to a a a a... and b b b b...
*/ + IVP_DSELNX16I(vecTemp2, vecTemp1, vecData5, vecData5, IVP_DSELI_8B_DEINTERLEAVE_1); + dvecData9 = IVP_MOV2NX8U_FROMNX16(vecTemp1); dvecData10 = IVP_MOV2NX8U_FROMNX16(vecTemp2); + IVP_DSELNX16I(vecTemp2, vecTemp1, vecData6, vecData6, IVP_DSELI_8B_DEINTERLEAVE_1); + dvecData11 = IVP_MOV2NX8U_FROMNX16(vecTemp1); dvecData12 = IVP_MOV2NX8U_FROMNX16(vecTemp2); + IVP_DSELNX16I(vecTemp2, vecTemp1, vecData7, vecData7, IVP_DSELI_8B_DEINTERLEAVE_1); + dvecData13 = IVP_MOV2NX8U_FROMNX16(vecTemp1); dvecData14 = IVP_MOV2NX8U_FROMNX16(vecTemp2); + IVP_DSELNX16I(vecTemp2, vecTemp1, vecData8, vecData8, IVP_DSELI_8B_DEINTERLEAVE_1); + dvecData15 = IVP_MOV2NX8U_FROMNX16(vecTemp1); dvecData16 = IVP_MOV2NX8U_FROMNX16(vecTemp2); +#endif + /* Aligned Vector Loads of coefficients */ + xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1); + xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1); + xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1); + xb_vec2Nx8 dvecCoeff4; IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch1); + +#ifdef IVP_MULSUQA2N8XR8 + IVP_MULSUQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1); + IVP_MULSUQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2); + IVP_MULSUQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3); + IVP_MULSUQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4); +#else + /* Multiply unsigned x signed and accumulate to 24-bits */ + IVP_MULUSPA2NX8(daccSum1, dvecData1, dvecCoeff1, dvecData2, dvecCoeff2); + IVP_MULUSPA2NX8(daccSum2, dvecData3, dvecCoeff1, dvecData4, dvecCoeff2); + IVP_MULUSPA2NX8(daccSum3, dvecData5, dvecCoeff1, dvecData6, dvecCoeff2); + IVP_MULUSPA2NX8(daccSum4, dvecData7, dvecCoeff1, dvecData8, dvecCoeff2); + IVP_MULUSPA2NX8(daccSum1, dvecData9, dvecCoeff3, dvecData10, dvecCoeff4); + IVP_MULUSPA2NX8(daccSum2, dvecData11, dvecCoeff3, dvecData12, dvecCoeff4);
+ IVP_MULUSPA2NX8(daccSum3, dvecData13, dvecCoeff3, dvecData14, dvecCoeff4); + IVP_MULUSPA2NX8(daccSum4, dvecData15, dvecCoeff3, dvecData16, dvecCoeff4); +#endif + } /* End Input Channels */ + + /* Corner Case Handling if number of input channels not multiple of 4 */ + if (inCh < numInCh) + { + int32_t remInCh = numInCh - inCh; + vaIn1 = IVP_LA2NX8U_PP(pdvecData1); /* NOTE(review): only vaIn1 is re-primed here; vaIn2..vaIn4 keep the state left by the priming / loads above - confirm this asymmetry is intended */ + + /* Load the remaining remInCh (1 to 3) bytes of input data */ + IVP_LAV2NX8U_XP(dvecInData1, vaIn1, pdvecData1, remInCh); + IVP_LAV2NX8U_XP(dvecInData2, vaIn2, pdvecData2, remInCh); + IVP_LAV2NX8U_XP(dvecInData3, vaIn3, pdvecData3, remInCh); + IVP_LAV2NX8U_XP(dvecInData4, vaIn4, pdvecData4, remInCh); + +#ifdef IVP_MULSUQA2N8XR8 + /* Extracting first 4 bytes of vector into address register */ + /* Scalar integers to be used for QMUL */ + int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8U(dvecInData1)), 0); + int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8U(dvecInData2)), 0); + int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8U(dvecInData3)), 0); + int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8U(dvecInData4)), 0); +#else + xb_vec2Nx8U dvecData1, dvecData2, dvecData3, dvecData4; + xb_vec2Nx8U dvecData5, dvecData6, dvecData7, dvecData8; + xb_vec2Nx8U dvecData9, dvecData10, dvecData11, dvecData12; + xb_vec2Nx8U dvecData13, dvecData14, dvecData15, dvecData16; + xb_vecNx16 vecData1, vecData2; + xb_vecNx16 vecData3, vecData4; + xb_vecNx16 vecData5, vecData6; + xb_vecNx16 vecData7, vecData8; + xb_vecNx16 vecTemp1, vecTemp2; + + /* Custom select pattern for DSELs */ + int16_t sel1 = ((XCHAL_IVPN_SIMD_WIDTH << 8)); + xb_vec2Nx8 vecSel1 = IVP_MOV2NX8_FROMNX16(sel1); + int16_t sel2 = (((XCHAL_IVPN_SIMD_WIDTH + 1) << 8) | 1); + xb_vec2Nx8 vecSel2 = IVP_MOV2NX8_FROMNX16(sel2); + + /* Broadcast a0, a1, a2, a3.... | b0, b1, b2, b3.... using DSELs into a0, a1, a0, a1.... | b0, b1, b0, b1....
*/ + IVP_DSELNX16(vecData2, vecData1, IVP_MOVNX16_FROM2NX8U(dvecInData2), IVP_MOVNX16_FROM2NX8U(dvecInData1), vecSel1); + IVP_DSELNX16(vecData4, vecData3, IVP_MOVNX16_FROM2NX8U(dvecInData4), IVP_MOVNX16_FROM2NX8U(dvecInData3), vecSel1); + IVP_DSELNX16(vecData6, vecData5, IVP_MOVNX16_FROM2NX8U(dvecInData2), IVP_MOVNX16_FROM2NX8U(dvecInData1), vecSel2); + IVP_DSELNX16(vecData8, vecData7, IVP_MOVNX16_FROM2NX8U(dvecInData4), IVP_MOVNX16_FROM2NX8U(dvecInData3), vecSel2); + + /* Splitting 8 DSELI operations into 4 DSELIs and 8 SELIs for balancing loop schedule */ + /* Separate a0, a1, a0, a1 using SELIs into a0, a0, a0... */ + dvecData1 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData1), IVP_MOV2NX8U_FROMNX16(vecData1), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0); + dvecData2 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData1), IVP_MOV2NX8U_FROMNX16(vecData1), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_1); + dvecData3 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData2), IVP_MOV2NX8U_FROMNX16(vecData2), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0); + dvecData4 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData2), IVP_MOV2NX8U_FROMNX16(vecData2), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_1); + dvecData5 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData3), IVP_MOV2NX8U_FROMNX16(vecData3), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0); + dvecData6 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData3), IVP_MOV2NX8U_FROMNX16(vecData3), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_1); + dvecData7 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData4), IVP_MOV2NX8U_FROMNX16(vecData4), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0); + dvecData8 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData4), IVP_MOV2NX8U_FROMNX16(vecData4), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_1); + + /* De-interleave a b a b a b... and move to a a a a... and b b b b...
*/ + IVP_DSELNX16I(vecTemp2, vecTemp1, vecData5, vecData5, IVP_DSELI_8B_DEINTERLEAVE_1); + dvecData9 = IVP_MOV2NX8U_FROMNX16(vecTemp1); dvecData10 = IVP_MOV2NX8U_FROMNX16(vecTemp2); + IVP_DSELNX16I(vecTemp2, vecTemp1, vecData6, vecData6, IVP_DSELI_8B_DEINTERLEAVE_1); + dvecData11 = IVP_MOV2NX8U_FROMNX16(vecTemp1); dvecData12 = IVP_MOV2NX8U_FROMNX16(vecTemp2); + IVP_DSELNX16I(vecTemp2, vecTemp1, vecData7, vecData7, IVP_DSELI_8B_DEINTERLEAVE_1); + dvecData13 = IVP_MOV2NX8U_FROMNX16(vecTemp1); dvecData14 = IVP_MOV2NX8U_FROMNX16(vecTemp2); + IVP_DSELNX16I(vecTemp2, vecTemp1, vecData8, vecData8, IVP_DSELI_8B_DEINTERLEAVE_1); + dvecData15 = IVP_MOV2NX8U_FROMNX16(vecTemp1); dvecData16 = IVP_MOV2NX8U_FROMNX16(vecTemp2); +#endif + + /* For conditional coefficient loads: enable2 / enable3 multiply the step passed to the first two coefficient loads, so that step is 0 when fewer channels remain */ + int32_t enable2 = XT_SALT(1, remInCh); /* Will be 1 if remInCh > 1 */ + int32_t enable3 = XT_SALT(2, remInCh); /* Will be 1 if remInCh > 2 */ + + /* Coefficient Loads */ + xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1 * enable2); + xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1 * enable3); + xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1); + +#ifdef IVP_MULSUQA2N8XR8 + IVP_MULSUQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1); + IVP_MULSUQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2); + IVP_MULSUQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3); + IVP_MULSUQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4); +#else + /* Multiply unsigned x signed and accumulate to 24-bits */ + IVP_MULUSPA2NX8(daccSum1, dvecData1, dvecCoeff1, dvecData2, dvecCoeff2); + IVP_MULUSPA2NX8(daccSum2, dvecData3, dvecCoeff1, dvecData4, dvecCoeff2); + IVP_MULUSPA2NX8(daccSum3, dvecData5, dvecCoeff1, dvecData6, dvecCoeff2); + IVP_MULUSPA2NX8(daccSum4, dvecData7, dvecCoeff1, dvecData8, dvecCoeff2); + IVP_MULUSPA2NX8(daccSum1, dvecData9, dvecCoeff3, dvecData10, 0); +
IVP_MULUSPA2NX8(daccSum2, dvecData11, dvecCoeff3, dvecData12, 0); + IVP_MULUSPA2NX8(daccSum3, dvecData13, dvecCoeff3, dvecData14, 0); + IVP_MULUSPA2NX8(daccSum4, dvecData15, dvecCoeff3, dvecData16, 0); +#endif + } /* End Corner case handling */ + } /* End Kernel Height * Width */ + + /* Pack, Output Scale, Output Shift and clamping */ + xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L; + xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H; +#ifdef DILATED_VQ_CONV + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); +#else + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); +#endif + /* Store the output dvecOut1 along the output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOut + outCh * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh); + IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + +
/* Store the output dvecOut2 along the output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1) * bytesPerPixel * numX); + IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numX); + IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numX); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Store the output dvecOut3 along the output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch2) * bytesPerPixel * numY); + IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numY); + IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numY); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Store the output dvecOut4 along the output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1 + outDataPitch2) * bytesPerPixel * numX * numY); + IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * \ + remainingOutCh * numX * numY); + IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numX * numY); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + } /* End image width */ + } /* End image height */ + } /* End Output Channels */ + return(XAI_ERROR_STATUS()); +} + +/****************************************************************************/ +/* Description : P6 optimized implementation for noUnrollH MxN MOD_DWH */ +/* 3D convolution. Based on pre-processor specifiers. Code */ +/* implementation is generated during preprocessing stage.
*/ +/* This method can be used to generate MxN MOD_DWH 3D */ +/* dilated convolution function and MxN MOD_DWH 3D VQ */ +/* dilated convolution function */ +/* Stride values = 1, 2 and 4 are supported */ +/* Implementation also supports dilation >= 1 for stride = 1 */ +/* and dilation = 1 for stride = 2, 4 */ +/* Inputs : Input Data Tile, Coeff Data Tile, Bias Array, */ +/* Output scale array, CNN convolution params structure */ +/* Outputs : None (the routine returns void) */ +/* InOuts : Output Tile */ +/* Assumptions : InData is U8, CoeffData is S8 */ +/* biasArray is signed 32b, value not exceeding signed 24b */ +/* Output scale array is U16 */ +/* OutData is S8 / U8 / S16 */ +/* Kernel Size is MxNxDxNk. M and N sizes are less than or */ +/* equal to 15. */ +/* Input and Output are in DWH format */ +/* Coeff is in NDWH format */ +/* CoeffDim1Pitch is aligned to 2N (Ca2) */ +/* No edges along dimension 1 of inTile */ +/****************************************************************************/ +/* Although this routine supports IVP_MULSUQA2N8XR8, it is intentionally compiled out on cores that support IVP_MULSUQA2N8XR8 because it is not used there. + Those cores use convolvedVQ3D_S_MxN_U8S8IXCa2_MOD_DWH_contiguous_depth_x4 and convolvedVQ3D_S_MxN_U8S8IXCa2_MOD_DWH_contiguous_depth instead.
+ These routines are faster than convolvedVQ3D_S_MxNdX_U8S8IXCa2_MOD_DWH */ /* Purpose of the routine below: MxN dilated convolution over a DWH input tile. Each pass of the innermost image loop produces 4 adjacent output columns for up to 2 * XCHAL_IVPN_SIMD_WIDTH output channels. It is compiled only when IVP_MULSUQA2N8XR8 is not defined. With DILATED_VQ_CONV defined, output scales are read from outputScaleArray; otherwise a single scale is read from param. No stride parameter is read in this routine: output (x, y) starts at input offset x * inDataPitch1 + y * inDataPitch2. */ +#ifndef IVP_MULSUQA2N8XR8 +#ifdef DILATED_VQ_CONV +static _XAI_INLINE_ void convolvedVQ3D_S_MxNdX_U8S8IXCa2_noUnrollH_MOD_DWH( + const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param + ) +#else +static _XAI_INLINE_ void convolved3D_S_MxNdX_U8S8IXCa2_noUnrollH_MOD_DWH( + const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param + ) +#endif +{ + /* Getting parameters from the tile structures */ + const int32_t outW = XAI_TILE3D_GET_DIM2(outTile); + const int32_t outH = XAI_TILE3D_GET_DIM3(outTile); + const int32_t numInCh = XAI_TILE3D_GET_DIM1(inTile); + const int32_t numOutCh = XAI_TILE3D_GET_DIM1(outTile); + const uint8_t dilationX = XAI_CNN_CONV_GET_DILATIONX(param); + const uint8_t dilationY = XAI_CNN_CONV_GET_DILATIONY(param); + + /* Kernel Size (NDWH) and its extent once dilation is applied: dilation * (k - 1) + 1 */ + const int32_t kWidthU = XAI_TILE4D_GET_DIM3(coeffTile); + const int32_t kHeightU = XAI_TILE4D_GET_DIM4(coeffTile); + int32_t dilatedkWidth = dilationX * (kWidthU - 1) + 1; + int32_t dilatedkHeight = dilationY * (kHeightU - 1) + 1; + + /* CNN convolution parameters */ + const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param); + const uint8_t outShiftU = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param); + const uint8_t enableReLu = XAI_CNN_CONV_GET_FLAG_RELU(param); + const uint8_t leftEdgeFlag = XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param); + const uint8_t topEdgeFlag = XAI_CNN_CONV_GET_FLAG_TOPEDGE(param); + + /* Data Pointers of input, output, coefficient and bias data */ + uint8_t *pInData = (uint8_t *) XAI_TILE3D_GET_DATA_PTR(inTile); + int8_t *pOutData = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile); + int8_t *pCoeffData = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile); + int32_t *pBiasData = (int32_t *)
XAI_ARRAY_GET_DATA_PTR(biasArray); +#ifdef DILATED_VQ_CONV + uint16_t *pScale = (uint16_t *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray); + xb_vecNx16U* restrict pOutScaleData; +#else + const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param); +#endif + /* Pitches of Coefficient Data (NDWH) in dim1, dim2 and dim3 */ + const int32_t coeffPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTile); + const int32_t coeffPitch2 = XAI_TILE4D_GET_DIM2_PITCH(coeffTile); + const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile); + + /* Pitches of Input Data (DWH) in dim1 and dim2 */ + const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile); + const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile); + + /* Pitch of Output Data (DWH) in dim1 and dim2 */ + const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile); + const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile); + /* Edge sizes in pixels: half of the dilated kernel extent when that extent is odd; when it is even the edge flag selects size / 2 or size / 2 - 1 */ + int32_t leftEdge, topEdge; + if ((dilatedkWidth % 2) != 0) + { + leftEdge = dilatedkWidth / 2; + } + else + { + leftEdge = leftEdgeFlag ? (dilatedkWidth / 2) : ((dilatedkWidth / 2) - 1); + } + + if ((dilatedkHeight % 2) != 0) + { + topEdge = dilatedkHeight / 2; + } + else + { + topEdge = topEdgeFlag ? (dilatedkHeight / 2) : ((dilatedkHeight / 2) - 1); + } + + /* Move pointer to the start of the data (including edge) */ + pInData = &pInData[-((leftEdge) * inDataPitch1 + (topEdge) * inDataPitch2)]; + + /* Setting the limits for output data according to ReLu Flag and outTileType */ + int32_t minLim, maxLim; + if (enableReLu) + { + minLim = XAI_CNN_CONV_GET_RELU_MIN(param); + maxLim = XAI_CNN_CONV_GET_RELU_MAX(param); + } + else + { + minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \ + SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0); + maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \ + : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX); + } + const int8_t typeFlag = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ?
1 : 0; + const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile); + + /* Variable Declarations */ + int32_t inCh, outCh, x, y, k; + valign vaOutData = IVP_ZALIGN(); + + /* Vector data pointers */ + xb_vecN_2x32v* restrict phvecBias; + xb_vec2Nx8* restrict pdvecCoeff; + xb_vec2Nx8U* restrict pdvecData1; + xb_vec2Nx8U* restrict pdvecData2; + xb_vec2Nx8U* restrict pdvecData3; + xb_vec2Nx8U* restrict pdvecData4; + xb_vec2Nx8* restrict pdvecOut; + +#ifndef IVP_MULSUQA2N8XR8 + /* Vector data registers */ + xb_vec2Nx8U dvecData1, dvecData2, dvecData3, dvecData4; + xb_vec2Nx8U dvecData5, dvecData6, dvecData7, dvecData8; + xb_vec2Nx8U dvecData9, dvecData10, dvecData11, dvecData12; + xb_vec2Nx8U dvecData13, dvecData14, dvecData15, dvecData16; + xb_vecNx16 vecData1, vecData2; + xb_vecNx16 vecData3, vecData4; + xb_vecNx16 vecData5, vecData6; + xb_vecNx16 vecData7, vecData8; + xb_vecNx16 vecTemp1, vecTemp2; + + /* Custom select pattern for DSELs */ + int16_t sel1 = (XCHAL_IVPN_SIMD_WIDTH << 8); + xb_vec2Nx8 vecSel1 = IVP_MOV2NX8_FROMNX16(sel1); + int16_t sel2 = (((XCHAL_IVPN_SIMD_WIDTH + 1) << 8) | 1); + xb_vec2Nx8 vecSel2 = IVP_MOV2NX8_FROMNX16(sel2); +#endif + + /* Loops Start */ + for (y = 0; y < outH; y++) /* Image Height */ + { + for (outCh = 0; outCh < numOutCh; outCh += 2 * XCHAL_IVPN_SIMD_WIDTH) + { /* walk across the kernels */ + /* To handle corner case when number of output channels + * is not a multiple of 2 * XCHAL_IVPN_SIMD_WIDTH*/ + int32_t remainingOutCh = numOutCh - outCh; +#ifdef DILATED_VQ_CONV + xb_vecNx16U outScaleDataEven, outScaleDataOdd; + /*Load output scale values*/ + pOutScaleData = (xb_vecNx16U *) (pScale + outCh); + VQ_INIT_OUTSCALE(pOutScaleData, remainingOutCh, outScaleDataEven, outScaleDataOdd); +#endif + for (x = 0; x < outW; x += 4) /* Image Width */ + { /* walk across the columns */ + /* Flags to handle the corner case where fewer than 4 columns remain (outW - x < 4). Each flag is used below as a 0/1 multiplier on the 2nd/3rd/4th column's input offset, output offset and store byte count. */ + int32_t enable2ndWidth = XT_SALT(1, outW - x); + int32_t enable3rdWidth = XT_SALT(2, outW - x); + int32_t
enable4thWidth = XT_SALT(3, outW - x); + + /* Output Data pointer */ + int8_t *pOut = pOutData + (x * outDataPitch1 + y * outDataPitch2) * bytesPerPixel; + + /* Initialize accumulators with bias values */ + xb_vec2Nx24 daccSum1, daccSum2, daccSum3, daccSum4; + phvecBias = (xb_vecN_2x32v *) (pBiasData + outCh); + ACC_INIT_BIAS(phvecBias, remainingOutCh, daccSum1, daccSum2, daccSum3, daccSum4); + + /* Input Data and Coeff Data Pointers */ + uint8_t *pData = ((uint8_t *) pInData + x * inDataPitch1 + y * inDataPitch2); + int8_t *pCoeff = pCoeffData + outCh; + + + + xb_vecN_2x32v hvecInAddrOff = 0; + xb_vecN_2x32v hvecCoeffAddrOff = 0; + xb_vecN_2x32v hvecLaneIdx = 0; + int32_t inAddrOff, coeffAddrOff; + + for (k = 0; k < kHeightU * kWidthU; k++) /* Kernel Height * Kernel Width */ + { + /* Condition checks performed to get the Input and Coefficient */ + /* Pointer Offsets after combining the Kernel Width and Height Loops */ + vboolN_2 vbN_2 = IVP_EQN_2X32(hvecLaneIdx, kWidthU); + /* hvecLaneIdx will be reset to zero after every kWidth */ + hvecLaneIdx = IVP_MOVN_2X32T(0, hvecLaneIdx, vbN_2); + /* InPitch added after every kWidth */ + IVP_ADDN_2X32T(hvecInAddrOff, hvecInAddrOff, inDataPitch2 * dilationY - kWidthU * inDataPitch1 * dilationX, vbN_2); + /* CoeffPitch added after every kWidth */ + IVP_ADDN_2X32T(hvecCoeffAddrOff, hvecCoeffAddrOff, coeffPitch3 - kWidthU * coeffPitch2, vbN_2); + /* Extracting Input and Coefficient address offsets */ + inAddrOff = IVP_EXTRN_2X32(hvecInAddrOff, 0); + coeffAddrOff = IVP_EXTRN_2X32(hvecCoeffAddrOff, 0); + hvecLaneIdx = IVP_ADDN_2X32(hvecLaneIdx, 1); + hvecCoeffAddrOff = IVP_ADDN_2X32(hvecCoeffAddrOff, coeffPitch2); + hvecInAddrOff = IVP_ADDN_2X32(hvecInAddrOff, inDataPitch1 * dilationX); + + /* Pointers for Input Data Loads */ + pdvecData1 = (xb_vec2Nx8U *) (pData + inAddrOff); + pdvecData2 = (xb_vec2Nx8U *) (pData + inAddrOff + inDataPitch1 * enable2ndWidth); + pdvecData3 = (xb_vec2Nx8U *) (pData + inAddrOff + inDataPitch1 * 2 *
enable3rdWidth); + pdvecData4 = (xb_vec2Nx8U *) (pData + inAddrOff + inDataPitch1 * 3 * enable4thWidth); + + /* Pointer for Coefficient Load */ + pdvecCoeff = (xb_vec2Nx8 *) (pCoeff + coeffAddrOff); + + /* Priming input loads */ + valign vaIn1 = IVP_LA2NX8U_PP(pdvecData1); + valign vaIn2 = IVP_LA2NX8U_PP(pdvecData2); + valign vaIn3 = IVP_LA2NX8U_PP(pdvecData3); + valign vaIn4 = IVP_LA2NX8U_PP(pdvecData4); + + for (inCh = 0; inCh < numInCh - 3; inCh += 4) /* Input Channels */ + { + xb_vec2Nx8U dvecInData1, dvecInData2, dvecInData3, dvecInData4; + /* Aligning variable vector load of pixels */ + IVP_LAV2NX8U_XP(dvecInData1, vaIn1, pdvecData1, 4); + IVP_LAV2NX8U_XP(dvecInData2, vaIn2, pdvecData2, 4); + IVP_LAV2NX8U_XP(dvecInData3, vaIn3, pdvecData3, 4); + IVP_LAV2NX8U_XP(dvecInData4, vaIn4, pdvecData4, 4); +#ifdef IVP_MULSUQA2N8XR8 + /* Extracting first 4 bytes of vector into address register */ + /* Scalar integers to be used for QMUL */ + int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8U(dvecInData1)), 0); + int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8U(dvecInData2)), 0); + int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8U(dvecInData3)), 0); + int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8U(dvecInData4)), 0); +#else + /* Broadcast a0, a1, a2, a3.... | b0, b1, b2, b3.... using DSELs into a0, a1, a0, a1.... | b0, b1, b0, b1....
*/ + IVP_DSELNX16(vecData2, vecData1, IVP_MOVNX16_FROM2NX8U(dvecInData2), IVP_MOVNX16_FROM2NX8U(dvecInData1), vecSel1); + IVP_DSELNX16(vecData4, vecData3, IVP_MOVNX16_FROM2NX8U(dvecInData4), IVP_MOVNX16_FROM2NX8U(dvecInData3), vecSel1); + IVP_DSELNX16(vecData6, vecData5, IVP_MOVNX16_FROM2NX8U(dvecInData2), IVP_MOVNX16_FROM2NX8U(dvecInData1), vecSel2); + IVP_DSELNX16(vecData8, vecData7, IVP_MOVNX16_FROM2NX8U(dvecInData4), IVP_MOVNX16_FROM2NX8U(dvecInData3), vecSel2); + + /* Splitting 8 DSELI operations into 4 DSELIs and 8 SELIs for balancing loop schedule */ + /* Separate a0, a1, a0, a1 using SELIs into a0, a0, a0... */ + dvecData1 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData1), IVP_MOV2NX8U_FROMNX16(vecData1), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0); + dvecData2 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData1), IVP_MOV2NX8U_FROMNX16(vecData1), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_1); + dvecData3 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData2), IVP_MOV2NX8U_FROMNX16(vecData2), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0); + dvecData4 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData2), IVP_MOV2NX8U_FROMNX16(vecData2), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_1); + dvecData5 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData3), IVP_MOV2NX8U_FROMNX16(vecData3), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0); + dvecData6 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData3), IVP_MOV2NX8U_FROMNX16(vecData3), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_1); + dvecData7 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData4), IVP_MOV2NX8U_FROMNX16(vecData4), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0); + dvecData8 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData4), IVP_MOV2NX8U_FROMNX16(vecData4), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_1); + + /* De-interleave a b a b a b... and move to a a a a... and b b b b...
*/ + IVP_DSELNX16I(vecTemp2, vecTemp1, vecData5, vecData5, IVP_DSELI_8B_DEINTERLEAVE_1); + dvecData9 = IVP_MOV2NX8U_FROMNX16(vecTemp1); dvecData10 = IVP_MOV2NX8U_FROMNX16(vecTemp2); + IVP_DSELNX16I(vecTemp2, vecTemp1, vecData6, vecData6, IVP_DSELI_8B_DEINTERLEAVE_1); + dvecData11 = IVP_MOV2NX8U_FROMNX16(vecTemp1); dvecData12 = IVP_MOV2NX8U_FROMNX16(vecTemp2); + IVP_DSELNX16I(vecTemp2, vecTemp1, vecData7, vecData7, IVP_DSELI_8B_DEINTERLEAVE_1); + dvecData13 = IVP_MOV2NX8U_FROMNX16(vecTemp1); dvecData14 = IVP_MOV2NX8U_FROMNX16(vecTemp2); + IVP_DSELNX16I(vecTemp2, vecTemp1, vecData8, vecData8, IVP_DSELI_8B_DEINTERLEAVE_1); + dvecData15 = IVP_MOV2NX8U_FROMNX16(vecTemp1); dvecData16 = IVP_MOV2NX8U_FROMNX16(vecTemp2); +#endif + + /* Aligned Vector Loads of coefficients */ + xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1); + xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1); + xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1); + xb_vec2Nx8 dvecCoeff4; IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch1); + +#ifdef IVP_MULSUQA2N8XR8 + IVP_MULSUQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1); + IVP_MULSUQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2); + IVP_MULSUQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3); + IVP_MULSUQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4); +#else + /* Multiply unsigned x signed and accumulate to 24-bits */ + IVP_MULUSPA2NX8(daccSum1, dvecData1, dvecCoeff1, dvecData2, dvecCoeff2); + IVP_MULUSPA2NX8(daccSum2, dvecData3, dvecCoeff1, dvecData4, dvecCoeff2); + IVP_MULUSPA2NX8(daccSum3, dvecData5, dvecCoeff1, dvecData6, dvecCoeff2); + IVP_MULUSPA2NX8(daccSum4, dvecData7, dvecCoeff1, dvecData8, dvecCoeff2); + IVP_MULUSPA2NX8(daccSum1, dvecData9, dvecCoeff3, dvecData10, dvecCoeff4); + IVP_MULUSPA2NX8(daccSum2, dvecData11, dvecCoeff3, dvecData12,
dvecCoeff4); + IVP_MULUSPA2NX8(daccSum3, dvecData13, dvecCoeff3, dvecData14, dvecCoeff4); + IVP_MULUSPA2NX8(daccSum4, dvecData15, dvecCoeff3, dvecData16, dvecCoeff4); +#endif + } /* End Input Channels */ + /* Corner Case Handling if number of input channels not multiple of 4 */ + if (inCh < numInCh) + { + int32_t remInCh = numInCh - inCh; + vaIn1 = IVP_LA2NX8U_PP(pdvecData1); /* NOTE(review): only vaIn1 is re-primed here; vaIn2, vaIn3 and vaIn4 are used below as left by the priming and main loop above - confirm this asymmetry is intended */ + xb_vec2Nx8U dvecInData1, dvecInData2, dvecInData3, dvecInData4; + /* Aligning variable vector load of pixels */ + IVP_LAV2NX8U_XP(dvecInData1, vaIn1, pdvecData1, remInCh); + IVP_LAV2NX8U_XP(dvecInData2, vaIn2, pdvecData2, remInCh); + IVP_LAV2NX8U_XP(dvecInData3, vaIn3, pdvecData3, remInCh); + IVP_LAV2NX8U_XP(dvecInData4, vaIn4, pdvecData4, remInCh); + + /* For conditional coefficient loads */ + int32_t enable2 = XT_SALT(1, remInCh); /* Will be 1 if remInCh > 1 */ + int32_t enable3 = XT_SALT(2, remInCh); /* Will be 1 if remInCh > 2 */ + +#ifdef IVP_MULSUQA2N8XR8 + /* Extracting first 4 bytes of vector into address register */ + /* Scalar integers to be used for QMUL */ + int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8U(dvecInData1)), 0); + int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8U(dvecInData2)), 0); + int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8U(dvecInData3)), 0); + int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8U(dvecInData4)), 0); +#else + /* Broadcast a0, a1, a2, a3.... | b0, b1, b2, b3.... using DSELs into a0, a1, a0, a1.... | b0, b1, b0, b1....
*/ + IVP_DSELNX16(vecData2, vecData1, IVP_MOVNX16_FROM2NX8U(dvecInData2), IVP_MOVNX16_FROM2NX8U(dvecInData1), vecSel1); + IVP_DSELNX16(vecData4, vecData3, IVP_MOVNX16_FROM2NX8U(dvecInData4), IVP_MOVNX16_FROM2NX8U(dvecInData3), vecSel1); + IVP_DSELNX16(vecData6, vecData5, IVP_MOVNX16_FROM2NX8U(dvecInData2), IVP_MOVNX16_FROM2NX8U(dvecInData1), vecSel2); + IVP_DSELNX16(vecData8, vecData7, IVP_MOVNX16_FROM2NX8U(dvecInData4), IVP_MOVNX16_FROM2NX8U(dvecInData3), vecSel2); + + /* Splitting 8 DSELI operations into 4 DSELIs and 8 SELIs for balancing loop schedule */ + /* Separate a0, a1, a0, a1 using SELIs into a0, a0, a0... */ + dvecData1 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData1), IVP_MOV2NX8U_FROMNX16(vecData1), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0); + dvecData2 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData1), IVP_MOV2NX8U_FROMNX16(vecData1), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_1); + dvecData3 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData2), IVP_MOV2NX8U_FROMNX16(vecData2), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0); + dvecData4 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData2), IVP_MOV2NX8U_FROMNX16(vecData2), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_1); + dvecData5 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData3), IVP_MOV2NX8U_FROMNX16(vecData3), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0); + dvecData6 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData3), IVP_MOV2NX8U_FROMNX16(vecData3), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_1); + dvecData7 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData4), IVP_MOV2NX8U_FROMNX16(vecData4), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0); + dvecData8 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData4), IVP_MOV2NX8U_FROMNX16(vecData4), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_1); + + /* De-interleave a b a b a b... and move to a a a a... and b b b b...
*/ + IVP_DSELNX16I(vecTemp2, vecTemp1, vecData5, vecData5, IVP_DSELI_8B_DEINTERLEAVE_1); + dvecData9 = IVP_MOV2NX8U_FROMNX16(vecTemp1); dvecData10 = IVP_MOV2NX8U_FROMNX16(vecTemp2); + IVP_DSELNX16I(vecTemp2, vecTemp1, vecData6, vecData6, IVP_DSELI_8B_DEINTERLEAVE_1); + dvecData11 = IVP_MOV2NX8U_FROMNX16(vecTemp1); dvecData12 = IVP_MOV2NX8U_FROMNX16(vecTemp2); + IVP_DSELNX16I(vecTemp2, vecTemp1, vecData7, vecData7, IVP_DSELI_8B_DEINTERLEAVE_1); + dvecData13 = IVP_MOV2NX8U_FROMNX16(vecTemp1); dvecData14 = IVP_MOV2NX8U_FROMNX16(vecTemp2); + IVP_DSELNX16I(vecTemp2, vecTemp1, vecData8, vecData8, IVP_DSELI_8B_DEINTERLEAVE_1); + dvecData15 = IVP_MOV2NX8U_FROMNX16(vecTemp1); dvecData16 = IVP_MOV2NX8U_FROMNX16(vecTemp2); +#endif + /* Aligned Vector Loads of coefficients. The post-increment after the 1st and 2nd load is coeffPitch1 * enable2 / enable3, so the pointer only moves on when a further remaining input channel exists; with an increment of 0 the next load reads the same address again. */ + xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1 * enable2); + xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1 * enable3); + xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1); + +#ifdef IVP_MULSUQA2N8XR8 + IVP_MULSUQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1); + IVP_MULSUQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2); + IVP_MULSUQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3); + IVP_MULSUQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4); +#else + /* Multiply unsigned x signed and accumulate to 24-bits */ + IVP_MULUSPA2NX8(daccSum1, dvecData1, dvecCoeff1, dvecData2, dvecCoeff2); + IVP_MULUSPA2NX8(daccSum2, dvecData3, dvecCoeff1, dvecData4, dvecCoeff2); + IVP_MULUSPA2NX8(daccSum3, dvecData5, dvecCoeff1, dvecData6, dvecCoeff2); + IVP_MULUSPA2NX8(daccSum4, dvecData7, dvecCoeff1, dvecData8, dvecCoeff2); + IVP_MULUSA2NX8(daccSum1, dvecData9, dvecCoeff3); + IVP_MULUSA2NX8(daccSum2, dvecData11, dvecCoeff3); + IVP_MULUSA2NX8(daccSum3, dvecData13, dvecCoeff3); + IVP_MULUSA2NX8(daccSum4, dvecData15, dvecCoeff3); +#endif + } /* End Corner case
handling */ + } /* End Kernel Height * Width */ + + /* Pack, Output Scale, Output Shift and clamping */ + xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L; + xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H; +#ifdef DILATED_VQ_CONV + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); +#else + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); +#endif + /* Store the output dvecOut1 along the output depth. The byte count of the H store is scaled by typeFlag, so it is 0 unless outTile is XAI_S16. */ + pdvecOut = (xb_vec2Nx8 *) (pOut + outCh * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh); + IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Store the output dvecOut2 along the output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1 * enable2ndWidth) * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel *
remainingOutCh * enable2ndWidth); + IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * enable2ndWidth); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Store the output dvecOut3 along the output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1 * 2 * enable3rdWidth) * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * enable3rdWidth); + IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * enable3rdWidth); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Store the output dvecOut4 along the output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1 * 3 * enable4thWidth) * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * \ + remainingOutCh * enable4thWidth); + IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * enable4thWidth); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + } /* End image width */ + } /* End Output Channels */ + } /* End image height */ +} +#endif + +#ifndef IVP_MULSUQA2N8XR8 +#ifdef DILATED_VQ_CONV +static _XAI_INLINE_ void convolvedVQ3D_S_MxN_U8S8IXCa2_noUnrollH_depth2X_MOD_DWH( + const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param + ) +#else +static _XAI_INLINE_ void convolved3D_S_MxN_U8S8IXCa2_noUnrollH_depth2X_MOD_DWH( + const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param + ) +#endif +{ + /* Getting parameters from the tile structures */ + const int32_t outW = XAI_TILE3D_GET_DIM2(outTile); + const int32_t outH = XAI_TILE3D_GET_DIM3(outTile); + const int32_t numInCh = XAI_TILE3D_GET_DIM1(inTile); + const int32_t numOutCh = XAI_TILE3D_GET_DIM1(outTile); +
+ /* Summary of this routine (depth2X variant): the kernel-width and input-channel loops are fused into one loop of numIter = kWidthU * numInCh steps that advances by 2, consuming two input bytes and two coefficient vectors per iteration. Each pass of the image-width loop produces 4 adjacent output columns for up to 2 * XCHAL_IVPN_SIMD_WIDTH output channels. strideX and strideY scale the input position; no dilation parameter is read. NOTE(review): the fused loop walks the input linearly across kernel columns, which presumably requires numIter to be even and the input depth to be contiguous (dim1 pitch equal to numInCh) - confirm against the caller. */ /* Kernel Size (NDWH) */ + const int32_t kWidthU = XAI_TILE4D_GET_DIM3(coeffTile); + const int32_t kHeightU = XAI_TILE4D_GET_DIM4(coeffTile); + + /* CNN convolution parameters */ + const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param); + const uint8_t outShiftU = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param); + const uint8_t enableReLu = XAI_CNN_CONV_GET_FLAG_RELU(param); + const uint8_t strideX = XAI_CNN_CONV_GET_STRIDEX(param); + const uint8_t strideY = XAI_CNN_CONV_GET_STRIDEY(param); + const uint8_t leftEdgeFlag = XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param); + const uint8_t topEdgeFlag = XAI_CNN_CONV_GET_FLAG_TOPEDGE(param); + + /* Data Pointers of input, output, coefficient and bias data */ + int8_t *pInData = (int8_t *) XAI_TILE3D_GET_DATA_PTR(inTile); + int8_t *pOutData = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile); + int8_t *pCoeffData = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile); + int32_t *pBiasData = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray); +#ifdef DILATED_VQ_CONV + uint16_t *pScale = (uint16_t *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray); + xb_vecNx16U* restrict pOutScaleData; +#else + const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param); +#endif + /* Pitches of Coefficient Data (NDWH) in dim1 and dim3 (the dim2 pitch is not read in this routine) */ + const int32_t coeffPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTile); + const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile); + + /* Pitches of Input Data (DWH) in dim1 and dim2 */ + const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile); + const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile); + + /* Pitch of Output Data (DWH) in dim1 and dim2 */ + const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile); + const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile); + + int32_t numIter = kWidthU * numInCh; + + int32_t leftEdge, topEdge; + if ((kWidthU % 2) != 0) + { + leftEdge = kWidthU / 2; + } + else + { + leftEdge = leftEdgeFlag ?
(kWidthU / 2) : ((kWidthU / 2) - 1); + } + + if ((kHeightU % 2) != 0) + { + topEdge = kHeightU / 2; + } + else + { + topEdge = topEdgeFlag ? (kHeightU / 2) : ((kHeightU / 2) - 1); + } + + /* Move pointer to the start of the data (including edge) */ + pInData = &pInData[-((leftEdge) * inDataPitch1 + (topEdge) * inDataPitch2)]; + + /* Setting the limits for output data according to ReLu Flag and outTileType */ + int32_t minLim, maxLim; + if (enableReLu) + { + minLim = XAI_CNN_CONV_GET_RELU_MIN(param); + maxLim = XAI_CNN_CONV_GET_RELU_MAX(param); + } + else + { + minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \ + SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0); + maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \ + : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX); + } + const int8_t typeFlag = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0; + const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile); + + /* Variable Declarations */ + int32_t outCh, x, y, ky, k; + valign vaOutData = IVP_ZALIGN(); + + xb_vecN_2x32v* restrict phvecBias; + xb_vec2Nx8* restrict pdvecCoeff; + uint16_t* restrict pData1; + uint16_t* restrict pData2; + uint16_t* restrict pData3; + uint16_t* restrict pData4; + xb_vec2Nx8* restrict pdvecOut; + + xb_vecNx16 vecData1, vecData2, vecData3, vecData4; + xb_vec2Nx8U dvecData1, dvecData2, dvecData3, dvecData4; + xb_vec2Nx8U dvecData5, dvecData6, dvecData7, dvecData8; + xb_vecNx16 vecTemp1, vecTemp2; + + /* Loops Start */ + for (y = 0; y < outH; y++) /* Image Height */ + { + for (outCh = 0; outCh < numOutCh; outCh += 2 * XCHAL_IVPN_SIMD_WIDTH) + { /* walk across the kernels */ + /* To handle corner case when number of output channels + * is not a multiple of 2 * XCHAL_IVPN_SIMD_WIDTH*/ + int32_t remainingOutCh = numOutCh - outCh; +#ifdef DILATED_VQ_CONV + xb_vecNx16U outScaleDataEven, outScaleDataOdd; + /*Load output scale values*/ + pOutScaleData = (xb_vecNx16U *) (pScale + outCh); +
VQ_INIT_OUTSCALE(pOutScaleData, remainingOutCh, outScaleDataEven, outScaleDataOdd); +#endif + for (x = 0; x < outW; x += 4) /* Image Width */ + { /* walk across the columns */ + /* Variables to handle corner cases: each flag is used below as a 0/1 multiplier on the 2nd/3rd/4th column's input offset, output offset and store byte count */ + int32_t enable2ndWidth = XT_SALT(1, outW - x); + int32_t enable3rdWidth = XT_SALT(2, outW - x); + int32_t enable4thWidth = XT_SALT(3, outW - x); + + /* Output Data pointer */ + int8_t *pOut = pOutData + (x * outDataPitch1 + y * outDataPitch2) * bytesPerPixel; + + /* Initialize accumulators with bias values */ + xb_vec2Nx24 daccSum1, daccSum2, daccSum3, daccSum4; + phvecBias = (xb_vecN_2x32v *) (pBiasData + outCh); + ACC_INIT_BIAS(phvecBias, remainingOutCh, daccSum1, daccSum2, daccSum3, daccSum4); + + /* Input Data and Coeff Data Pointers */ + int8_t *pData = ((int8_t *) pInData + x * strideX * inDataPitch1 + y * strideY * inDataPitch2); + int8_t *pCoeff = pCoeffData + outCh; + +#ifdef __XCC__ +#pragma loop_count min=1 +#endif + for (ky = 0; ky < kHeightU; ky++) /* Kernel Height */ + { + /* Pointers for Input Data Loads */ + pData1 = (uint16_t *) (pData + ky * inDataPitch2); + pData2 = (uint16_t *) (pData + ky * inDataPitch2 + strideX * inDataPitch1 * enable2ndWidth); + pData3 = (uint16_t *) (pData + ky * inDataPitch2 + strideX * inDataPitch1 * 2 * enable3rdWidth); + pData4 = (uint16_t *) (pData + ky * inDataPitch2 + strideX * inDataPitch1 * 3 * enable4thWidth); + + /* Pointer for Coefficient Load */ + pdvecCoeff = (xb_vec2Nx8 *) (pCoeff + ky * coeffPitch3); + +#ifdef __XCC__ +#pragma loop_count min=1 +#endif + for (k = 0; k < numIter; k += 2) /* (Input Channels * kWidth) loops combined */ + { + /* Load 2 bytes of input data */ + IVP_LSRNX16U_XP(vecData1, pData1, 2); + IVP_LSRNX16U_XP(vecData2, pData2, 2); + IVP_LSRNX16U_XP(vecData3, pData3, 2); + IVP_LSRNX16U_XP(vecData4, pData4, 2); + + /* Aligned Vector Loads of coefficients */ + xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1); + xb_vec2Nx8 dvecCoeff2;
IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1); + + /* De-interleave a b a b a b... and move to a a a a... and b b b b... */ + IVP_DSELNX16I(vecTemp2, vecTemp1, vecData1, vecData1, IVP_DSELI_8B_DEINTERLEAVE_1); + dvecData1 = IVP_MOV2NX8U_FROMNX16(vecTemp1); dvecData2 = IVP_MOV2NX8U_FROMNX16(vecTemp2); + IVP_DSELNX16I(vecTemp2, vecTemp1, vecData2, vecData2, IVP_DSELI_8B_DEINTERLEAVE_1); + dvecData3 = IVP_MOV2NX8U_FROMNX16(vecTemp1); dvecData4 = IVP_MOV2NX8U_FROMNX16(vecTemp2); + IVP_DSELNX16I(vecTemp2, vecTemp1, vecData3, vecData3, IVP_DSELI_8B_DEINTERLEAVE_1); + dvecData5 = IVP_MOV2NX8U_FROMNX16(vecTemp1); dvecData6 = IVP_MOV2NX8U_FROMNX16(vecTemp2); + IVP_DSELNX16I(vecTemp2, vecTemp1, vecData4, vecData4, IVP_DSELI_8B_DEINTERLEAVE_1); + dvecData7 = IVP_MOV2NX8U_FROMNX16(vecTemp1); dvecData8 = IVP_MOV2NX8U_FROMNX16(vecTemp2); + + /* Multiply unsigned x signed and accumulate to 24-bits */ + IVP_MULUSPA2NX8(daccSum1, dvecData1, dvecCoeff1, dvecData2, dvecCoeff2); + IVP_MULUSPA2NX8(daccSum2, dvecData3, dvecCoeff1, dvecData4, dvecCoeff2); + IVP_MULUSPA2NX8(daccSum3, dvecData5, dvecCoeff1, dvecData6, dvecCoeff2); + IVP_MULUSPA2NX8(daccSum4, dvecData7, dvecCoeff1, dvecData8, dvecCoeff2); + } /* End combined (Input Channels * kWidth) loop */ + } /* End Kernel Height */ + + /* Pack, Output Scale, Output Shift and clamping */ + xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L; + xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H; +#ifdef DILATED_VQ_CONV + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut4L, dvecOut4H,
daccSum4, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); +#else + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); +#endif + /* Store the output dvecOut1 along the output depth. The byte count of the H store is scaled by typeFlag, so it is 0 unless outTile is XAI_S16. */ + pdvecOut = (xb_vec2Nx8 *) (pOut + outCh * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh); + IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Store the output dvecOut2 along the output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1 * enable2ndWidth) * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * enable2ndWidth); + IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * enable2ndWidth); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Store the output dvecOut3 along the output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1 * 2 * enable3rdWidth) * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * enable3rdWidth); + IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * enable3rdWidth); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Store the output dvecOut4 along the output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1 * 3 * enable4thWidth) * bytesPerPixel); +
IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * \ + remainingOutCh * enable4thWidth); + IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * enable4thWidth); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + } /* End image width */ + } /* End Output Channels */ + } /* End image height */ +} +#endif + +/****************************************************************************/ +/* Description : further optimized function if dim1Size == dim1Pitch */ +/* of 3D convolution for handling */ +/* cases where kwidth * numInch is a multiple of 4 */ +/****************************************************************************/ +#ifdef IVP_MULSUQA2N8XR8 +#ifdef DILATED_VQ_CONV +static _XAI_INLINE_ void convolvedVQ3D_S_MxN_U8S8IXCa2_noUnrollH_MOD_DWH_contiguous_depth_x4( + const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param + ) +#else +static _XAI_INLINE_ void convolved3D_S_MxN_U8S8IXCa2_noUnrollH_MOD_DWH_contiguous_depth_x4( + const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param + ) +#endif +{ + /* Getting parameters from the tile structures */ + const int32_t outW = XAI_TILE3D_GET_DIM2(outTile); + const int32_t outH = XAI_TILE3D_GET_DIM3(outTile); + const int32_t numInCh = XAI_TILE3D_GET_DIM1(inTile); + const int32_t numOutCh = XAI_TILE3D_GET_DIM1(outTile); + + /* Kernel Size (NDWH) */ + const int32_t kWidthU = XAI_TILE4D_GET_DIM3(coeffTile); + const int32_t kHeightU = XAI_TILE4D_GET_DIM4(coeffTile); + + /* CNN convolution parameters */ + const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param); + const uint8_t outShiftU = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param); + const uint8_t enableReLu = XAI_CNN_CONV_GET_FLAG_RELU(param); + const uint8_t strideX = XAI_CNN_CONV_GET_STRIDEX(param); +
const uint8_t strideY = XAI_CNN_CONV_GET_STRIDEY(param); + const uint8_t leftEdgeFlag = XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param); + const uint8_t topEdgeFlag = XAI_CNN_CONV_GET_FLAG_TOPEDGE(param); + + /* Data Pointers of input, output, coefficient and bias data */ + uint8_t *pInData = (uint8_t *) XAI_TILE3D_GET_DATA_PTR(inTile); + int8_t *pOutData = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile); + int8_t *pCoeffData = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile); + int32_t *pBiasData = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray); +#ifdef DILATED_VQ_CONV + uint16_t *pScale = (uint16_t *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray); +#else + const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param); +#endif + /* Pitches of Coefficient Data (NDWH) in dim1, dim2 and dim3 */ + const int32_t coeffPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTile); + const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile); + + /* Pitches of Input Data (DWH) in dim1 and dim2 */ + const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile); + const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile); + + /* Pitch of Output Data (DWH) in dim1 and dim2 */ + const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile); + const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile); + + int32_t leftEdge, topEdge; + + if ((kWidthU % 2) != 0) + { + leftEdge = kWidthU / 2; + } + else + { + leftEdge = leftEdgeFlag ? (kWidthU / 2) : ((kWidthU / 2) - 1); + } + + if ((kHeightU % 2) != 0) + { + topEdge = kHeightU / 2; + } + else + { + topEdge = topEdgeFlag ? 
(kHeightU / 2) : ((kHeightU / 2) - 1); + } + + /* Move pointer to the start of the data (including edge) */ + pInData = &pInData[-((leftEdge) * inDataPitch1 + (topEdge) * inDataPitch2)]; + + + /* Setting the limits for output data according to ReLu Flag and outTileType */ + int32_t minLim, maxLim; + if (enableReLu) + { + minLim = XAI_CNN_CONV_GET_RELU_MIN(param); + maxLim = XAI_CNN_CONV_GET_RELU_MAX(param); + } + else + { + minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \ + SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0); + maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \ + : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX); + } + const int8_t typeFlag = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0; + const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile); + + /* Variable Declarations */ + int32_t outCh, x, y, ky, k; + valign vaOutData = IVP_ZALIGN(); + int32_t numIter = kWidthU * numInCh; + + xb_vecN_2x32v* restrict phvecBias; + xb_vec2Nx8* restrict pdvecCoeff; + xb_vec2Nx8U* restrict pdvecData1; + xb_vec2Nx8U* restrict pdvecData2; + xb_vec2Nx8U* restrict pdvecData3; + xb_vec2Nx8U* restrict pdvecData4; + xb_vec2Nx8* restrict pdvecOut; + + /* + * inCh and kWidth loops are combined. Assumed that the + * edges along Depth dimension of input data is zero and also + * edges along depth dimension of coefficient data is zero. 
+ */ + + /* Loops Start */ + for (y = 0; y < outH; y++) + { + for (outCh = 0; outCh < numOutCh; outCh += 2 * XCHAL_IVPN_SIMD_WIDTH) + { /* walk across the kernels */ + /* To handle corner case when number of output channels + * is not a multiple of 2 * XCHAL_IVPN_SIMD_WIDTH*/ + int32_t remainingOutCh = numOutCh - outCh; +#ifdef DILATED_VQ_CONV + xb_vecNx16U outScaleDataEven, outScaleDataOdd; + /*Load output scale values*/ + xb_vecNx16U* restrict pOutScaleData = (xb_vecNx16U *) (pScale + outCh); + VQ_INIT_OUTSCALE(pOutScaleData, remainingOutCh, outScaleDataEven, outScaleDataOdd); +#endif + + for (x = 0; x < (outW - 3); x += 4) /* Image Width */ + { /* walk across the columns */ + /* Output Data pointer */ + int8_t *pOut = pOutData + (x * outDataPitch1 + y * outDataPitch2) * bytesPerPixel; + + /* Initialize accumulators with bias values */ + xb_vec2Nx24 daccSum1, daccSum2, daccSum3, daccSum4; + phvecBias = (xb_vecN_2x32v *) (pBiasData + outCh); + ACC_INIT_BIAS(phvecBias, remainingOutCh, daccSum1, daccSum2, daccSum3, daccSum4); + + /* Input Data and Coeff Data Pointers */ + uint8_t *pData = ((uint8_t *) pInData + x * strideX * inDataPitch1 + y * strideY * inDataPitch2); + int8_t *pCoeff = pCoeffData + outCh; + +#ifdef __XCC__ +#pragma loop_count min=1 +#endif + for (ky = 0; ky < kHeightU; ky++) /* Kernel Height */ + { + /* Pointers for Input Data Loads */ + pdvecData1 = (xb_vec2Nx8U *) (pData + ky * inDataPitch2); + pdvecData2 = (xb_vec2Nx8U *) (pData + ky * inDataPitch2 + strideX * inDataPitch1); + pdvecData3 = (xb_vec2Nx8U *) (pData + ky * inDataPitch2 + 2 * strideX * inDataPitch1); + pdvecData4 = (xb_vec2Nx8U *) (pData + ky * inDataPitch2 + 3 * strideX * inDataPitch1); + + /* Pointer for Coefficient Load */ + pdvecCoeff = (xb_vec2Nx8 *) (pCoeff + ky * coeffPitch3); + + /* Primes for Aligning Load */ + valign vaData1 = IVP_LA2NX8U_PP(pdvecData1); + valign vaData2 = IVP_LA2NX8U_PP(pdvecData2); + valign vaData3 = IVP_LA2NX8U_PP(pdvecData3); + valign vaData4 = 
IVP_LA2NX8U_PP(pdvecData4); + +#ifdef __XCC__ +#pragma loop_count min=1 +#endif + for (k = 0; k < numIter; k += 4) /* (Input Channels * kWidth) loops combined */ + { + /* Aligning variable vector load of pixels */ + xb_vec2Nx8U dvecData1; IVP_LAV2NX8U_XP(dvecData1, vaData1, pdvecData1, 4); + xb_vec2Nx8U dvecData2; IVP_LAV2NX8U_XP(dvecData2, vaData2, pdvecData2, 4); + xb_vec2Nx8U dvecData3; IVP_LAV2NX8U_XP(dvecData3, vaData3, pdvecData3, 4); + xb_vec2Nx8U dvecData4; IVP_LAV2NX8U_XP(dvecData4, vaData4, pdvecData4, 4); + + /* Extracting first 4 bytes of vector into address register */ + /* Scalar integers to be used for QMUL */ + int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8U(dvecData1)), 0); + int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8U(dvecData2)), 0); + int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8U(dvecData3)), 0); + int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8U(dvecData4)), 0); + + /* Aligned Vector Loads of coefficients */ + xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1); + xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1); + xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1); + xb_vec2Nx8 dvecCoeff4; IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch1); + + IVP_MULSUQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1); + IVP_MULSUQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2); + IVP_MULSUQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3); + IVP_MULSUQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4); + } /* End Input Channels */ + } /* End Kernel Height * Width */ + + /* Pack, Output Scale, Output Shift and clamping */ + xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L; + xb_vec2Nx8 dvecOut1H, dvecOut2H, 
dvecOut3H, dvecOut4H; +#ifdef DILATED_VQ_CONV + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); +#else + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); +#endif + /* Store the output dvecOut1 along the output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOut + outCh * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh); + IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Store the output dvecOut2 along the output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1) * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh); + IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Store the output dvecOut3 along the output depth 
*/ + pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + 2 * outDataPitch1) * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh); + IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Store the output dvecOut4 along the output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + 3 * outDataPitch1) * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh); + IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + } /* End image width */ + if (x < outW) + { + int32_t enable2ndWidth = XT_SALT(1, outW - x); + int32_t enable3rdWidth = XT_SALT(2, outW - x); + /* Output Data pointer */ + int8_t *pOut = pOutData + (x * outDataPitch1 + y * outDataPitch2) * bytesPerPixel; + + /* Initialize accumulators with bias values */ + xb_vec2Nx24 daccSum1, daccSum2, daccSum3, daccSum4; + phvecBias = (xb_vecN_2x32v *) (pBiasData + outCh); + ACC_INIT_BIAS(phvecBias, remainingOutCh, daccSum1, daccSum2, daccSum3, daccSum4); + + /* Input Data and Coeff Data Pointers */ + uint8_t *pData = ((uint8_t *) pInData + x * strideX * inDataPitch1 + y * strideY * inDataPitch2); + int8_t *pCoeff = pCoeffData + outCh; + +#ifdef __XCC__ +#pragma loop_count min=1 +#endif + for (ky = 0; ky < kHeightU; ky++) /* Kernel Height */ + { + /* Pointers for Input Data Loads */ + pdvecData1 = (xb_vec2Nx8U *) (pData + ky * inDataPitch2); + pdvecData2 = (xb_vec2Nx8U *) (pData + ky * inDataPitch2 + strideX * inDataPitch1 * enable2ndWidth); + pdvecData3 = (xb_vec2Nx8U *) (pData + ky * inDataPitch2 + 2 * strideX * inDataPitch1 * enable3rdWidth); + + /* Pointer for Coefficient Load */ + pdvecCoeff = (xb_vec2Nx8 *) (pCoeff + ky * coeffPitch3); + + /* Primes for Aligning Load */ + valign vaData1 = IVP_LA2NX8U_PP(pdvecData1); + valign vaData2 = 
IVP_LA2NX8U_PP(pdvecData2); + valign vaData3 = IVP_LA2NX8U_PP(pdvecData3); + +#ifdef __XCC__ +#pragma loop_count min=1 +#endif + for (k = 0; k < numIter; k += 4) /* (Input Channels * kWidth) loops combined */ + { + /* Aligning variable vector load of pixels */ + xb_vec2Nx8U dvecData1; IVP_LAV2NX8U_XP(dvecData1, vaData1, pdvecData1, 4); + xb_vec2Nx8U dvecData2; IVP_LAV2NX8U_XP(dvecData2, vaData2, pdvecData2, 4); + xb_vec2Nx8U dvecData3; IVP_LAV2NX8U_XP(dvecData3, vaData3, pdvecData3, 4); + + /* Extracting first 4 bytes of vector into address register */ + /* Scalar integers to be used for QMUL */ + int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8U(dvecData1)), 0); + int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8U(dvecData2)), 0); + int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8U(dvecData3)), 0); + + /* Aligned Vector Loads of coefficients */ + xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1); + xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1); + xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1); + xb_vec2Nx8 dvecCoeff4; IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch1); + + IVP_MULSUQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1); + IVP_MULSUQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2); + IVP_MULSUQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3); + } /* End Input Channels */ + } /* End Kernel Height * Width */ + + /* Pack, Output Scale, Output Shift and clamping */ + xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L; + xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H; +#ifdef DILATED_VQ_CONV + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut2L, 
dvecOut2H, daccSum2, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); +#else + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); +#endif + /* Store the output dvecOut1 along the output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOut + outCh * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh); + IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Store the output dvecOut2 along the output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1) * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * enable2ndWidth); + IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * enable2ndWidth); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Store the output dvecOut3 along the output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + 2 * outDataPitch1) * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * enable3rdWidth); + IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * enable3rdWidth); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + } + } /* End Output Channels */ + } +} +#endif + +/****************************************************************************/ +/* Description : 
further optimized function of 3D convolution if dim1Size == dim1Pitch. */
+/*               kWidth * numInCh need not be a multiple of 4 here: after the 4-at-a-time loop, the last (numIter - k) bytes of each kernel row are handled in a separate corner-case block. Each pass of the width loop produces up to 4 output columns for one group of 2 * XCHAL_IVPN_SIMD_WIDTH output channels. */
+/****************************************************************************/
+#ifdef IVP_MULSUQA2N8XR8
+#ifdef DILATED_VQ_CONV
+static _XAI_INLINE_ void convolvedVQ3D_S_MxN_U8S8IXCa2_noUnrollH_MOD_DWH_contiguous_depth(
+  const xai_pTile3D inTile, /* input tile, read as uint8_t data with DWH pitches */
+  const xai_pTile4D coeffTile, /* coefficient tile, read as int8_t data (NDWH) */
+  const xai_pArray biasArray, /* read as int32_t values, indexed by output channel */
+  const xai_pArray outputScaleArray, /* read as uint16_t values, indexed by output channel */
+  xai_pTile3D outTile, /* output tile (DWH) that receives the results */
+  const xai_cnn_conv_params *param /* shifts, ReLU flag and limits, strides and edge flags */
+  )
+#else
+static _XAI_INLINE_ void convolved3D_S_MxN_U8S8IXCa2_noUnrollH_MOD_DWH_contiguous_depth(
+  const xai_pTile3D inTile, /* input tile, read as uint8_t data with DWH pitches */
+  const xai_pTile4D coeffTile, /* coefficient tile, read as int8_t data (NDWH) */
+  const xai_pArray biasArray, /* read as int32_t values, indexed by output channel */
+  xai_pTile3D outTile, /* output tile (DWH) that receives the results */
+  const xai_cnn_conv_params *param /* shifts, single output scale, ReLU flag and limits, strides and edge flags */
+  )
+#endif
+{
+  /* Getting parameters from the tile structures */
+  const int32_t outW = XAI_TILE3D_GET_DIM2(outTile);
+  const int32_t outH = XAI_TILE3D_GET_DIM3(outTile);
+  const int32_t numInCh = XAI_TILE3D_GET_DIM1(inTile);
+  const int32_t numOutCh = XAI_TILE3D_GET_DIM1(outTile);
+
+  /* Kernel Size (NDWH) */
+  const int32_t kWidthU = XAI_TILE4D_GET_DIM3(coeffTile);
+  const int32_t kHeightU = XAI_TILE4D_GET_DIM4(coeffTile);
+
+  /* CNN convolution parameters */
+  const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param);
+  const uint8_t outShiftU = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param);
+  const uint8_t enableReLu = XAI_CNN_CONV_GET_FLAG_RELU(param);
+  const uint8_t strideX = XAI_CNN_CONV_GET_STRIDEX(param);
+  const uint8_t strideY = XAI_CNN_CONV_GET_STRIDEY(param);
+  const uint8_t leftEdgeFlag = XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param);
+  const uint8_t topEdgeFlag = XAI_CNN_CONV_GET_FLAG_TOPEDGE(param);
+
+  /* Data Pointers of input, output, coefficient and bias data */
+  uint8_t *pInData = (uint8_t *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  int8_t *pOutData = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  int8_t *pCoeffData = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile);
+  int32_t *pBiasData = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray);
+#ifdef DILATED_VQ_CONV
+  uint16_t *pScale = (uint16_t *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray); /* one scale per output channel */
+#else
+  const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param); /* single scale shared by all output channels */
+#endif
+  /* Pitches of Coefficient Data (NDWH) in dim1 and dim3 (the dim2 pitch is not used here) */
+  const int32_t coeffPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTile);
+  const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile);
+
+  /* Pitches of Input Data (DWH) in dim1 and dim2 */
+  const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile);
+  const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile);
+  /* Pitch of Output Data (DWH) in dim1 and dim2 */
+  const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+
+  int32_t numIter = kWidthU * numInCh; /* length of the combined (kernel width * input channels) loop */
+
+  int32_t leftEdge, topEdge;
+  if ((kWidthU % 2) != 0) /* odd kernel width: the edge is half the width, rounded down */
+  {
+    leftEdge = kWidthU / 2;
+  }
+  else /* even kernel width: the left-edge flag selects kWidthU / 2 or one less */
+  {
+    leftEdge = leftEdgeFlag ? (kWidthU / 2) : ((kWidthU / 2) - 1);
+  }
+
+  if ((kHeightU % 2) != 0) /* same rule for the top edge, driven by the kernel height */
+  {
+    topEdge = kHeightU / 2;
+  }
+  else
+  {
+    topEdge = topEdgeFlag ? (kHeightU / 2) : ((kHeightU / 2) - 1);
+  }
+
+  /* Move pointer to the start of the data (including edge): back by leftEdge columns and topEdge rows */
+  pInData = &pInData[-((leftEdge) * inDataPitch1 + (topEdge) * inDataPitch2)];
+
+  /* Setting the limits for output data according to ReLu Flag and outTileType */
+  int32_t minLim, maxLim;
+  if (enableReLu) /* ReLU enabled: clamp limits come from the parameters */
+  {
+    minLim = XAI_CNN_CONV_GET_RELU_MIN(param);
+    maxLim = XAI_CNN_CONV_GET_RELU_MAX(param);
+  }
+  else /* otherwise use the range of the output type: S16, S8, or 0..UCHAR_MAX for anything else */
+  {
+    minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \
+             SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0);
+    maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \
+             : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX);
+  }
+  const int8_t typeFlag = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0; /* 1 only for an XAI_S16 output tile */
+  const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile);
+
+  /* Variable Declarations */
+  int32_t outCh, x, y, ky, k;
+  valign vaOutData = IVP_ZALIGN();
+
+  xb_vecN_2x32v* restrict phvecBias;
+  xb_vec2Nx8* restrict pdvecCoeff;
+  xb_vec2Nx8U* restrict pdvecData1;
+  xb_vec2Nx8U* restrict pdvecData2;
+  xb_vec2Nx8U* restrict pdvecData3;
+  xb_vec2Nx8U* restrict pdvecData4;
+  xb_vec2Nx8* restrict pdvecOut;
+
+  /*
+   * inCh and kWidth loops are combined. It is assumed that the
+   * edges along the depth dimension of the input data are zero and also
+   * that the edges along the depth dimension of the coefficient data are zero.
+   */
+
+  /* Loops Start */
+  for (y = 0; y < outH; y++)
+  {
+    for (outCh = 0; outCh < numOutCh; outCh += 2 * XCHAL_IVPN_SIMD_WIDTH)
+    { /* walk across the kernels */
+      /* To handle corner case when number of output channels
+       * is not a multiple of 2 * XCHAL_IVPN_SIMD_WIDTH */
+      int32_t remainingOutCh = numOutCh - outCh;
+#ifdef DILATED_VQ_CONV
+      xb_vecNx16U outScaleDataEven, outScaleDataOdd;
+      /* Load output scale values */
+      xb_vecNx16U* restrict pOutScaleData = (xb_vecNx16U *) (pScale + outCh);
+      VQ_INIT_OUTSCALE(pOutScaleData, remainingOutCh, outScaleDataEven, outScaleDataOdd);
+#endif
+      for (x = 0; x < outW - 3; x += 4) /* Image Width: 4 output columns per iteration */
+      { /* walk across the columns */
+        /* Output Data pointer */
+        int8_t *pOut = pOutData + (x * outDataPitch1 + y * outDataPitch2) * bytesPerPixel;
+
+        /* Initialize accumulators with bias values */
+        xb_vec2Nx24 daccSum1, daccSum2, daccSum3, daccSum4;
+        phvecBias = (xb_vecN_2x32v *) (pBiasData + outCh);
+        ACC_INIT_BIAS(phvecBias, remainingOutCh, daccSum1, daccSum2, daccSum3, daccSum4);
+
+        /* Input Data and Coeff Data Pointers */
+        uint8_t *pData = ((uint8_t *) pInData + x * strideX * inDataPitch1 + y * strideY * inDataPitch2);
+        int8_t *pCoeff = pCoeffData + outCh;
+
+#ifdef __XCC__
+#pragma loop_count min=1
+#endif
+        for (ky = 0; ky < kHeightU; ky++) /* Kernel Height */
+        {
+          /* Pointers for Input Data Loads: the 4 columns are strideX * inDataPitch1 bytes apart */
+          pdvecData1 = (xb_vec2Nx8U *) (pData + ky * inDataPitch2);
+          pdvecData2 = (xb_vec2Nx8U *) (pData + ky * inDataPitch2 + strideX * inDataPitch1);
+          pdvecData3 = (xb_vec2Nx8U *) (pData + ky * inDataPitch2 + 2 * strideX * inDataPitch1);
+          pdvecData4 = (xb_vec2Nx8U *) (pData + ky * inDataPitch2 + 3 * strideX * inDataPitch1);
+
+          /* Pointer for Coefficient Load */
+          pdvecCoeff = (xb_vec2Nx8 *) (pCoeff + ky * coeffPitch3);
+
+          /* Primes for Aligning Load */
+          valign vaData1 = IVP_LA2NX8U_PP(pdvecData1);
+          valign vaData2 = IVP_LA2NX8U_PP(pdvecData2);
+          valign vaData3 = IVP_LA2NX8U_PP(pdvecData3);
+          valign vaData4 = IVP_LA2NX8U_PP(pdvecData4);
+
+          for (k = 0; k < numIter - 3; k += 4) /* (Input Channels * kWidth) loops combined; full groups of 4 only */
+          {
+            /* Aligning variable vector load of pixels (4 bytes per column per iteration) */
+            xb_vec2Nx8U dvecData1; IVP_LAV2NX8U_XP(dvecData1, vaData1, pdvecData1, 4);
+            xb_vec2Nx8U dvecData2; IVP_LAV2NX8U_XP(dvecData2, vaData2, pdvecData2, 4);
+            xb_vec2Nx8U dvecData3; IVP_LAV2NX8U_XP(dvecData3, vaData3, pdvecData3, 4);
+            xb_vec2Nx8U dvecData4; IVP_LAV2NX8U_XP(dvecData4, vaData4, pdvecData4, 4);
+
+            /* Extracting first 4 bytes of vector into address register */
+            /* Scalar integers to be used for QMUL */
+            int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                 (IVP_MOVNX16_FROM2NX8U(dvecData1)), 0);
+            int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                 (IVP_MOVNX16_FROM2NX8U(dvecData2)), 0);
+            int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                 (IVP_MOVNX16_FROM2NX8U(dvecData3)), 0);
+            int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                 (IVP_MOVNX16_FROM2NX8U(dvecData4)), 0);
+
+            /* Aligned Vector Loads of coefficients (4 vectors, coeffPitch1 bytes apart, shared by all 4 columns) */
+            xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1);
+            xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1);
+            xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1);
+            xb_vec2Nx8 dvecCoeff4; IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch1);
+
+
+            IVP_MULSUQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1);
+            IVP_MULSUQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2);
+            IVP_MULSUQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3);
+            IVP_MULSUQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4);
+          } /* End Input Channels */
+
+          /* Corner case handling as numIter is not a multiple of 4: processes the last remInCh (numIter - k) bytes of the row. NOTE(review): this block is not guarded, so it also runs with remInCh == 0 when numIter is a multiple of 4; presumably such tiles are routed to the _x4 variant, confirm against the caller. */
+          {
+            int32_t remInCh = numIter - k;
+
+            /* Aligning variable vector load of pixels (only remInCh bytes are requested) */
+            xb_vec2Nx8U dvecData1; IVP_LAV2NX8U_XP(dvecData1, vaData1, pdvecData1, remInCh);
+            xb_vec2Nx8U dvecData2; IVP_LAV2NX8U_XP(dvecData2, vaData2, pdvecData2, remInCh);
+            xb_vec2Nx8U dvecData3; IVP_LAV2NX8U_XP(dvecData3, vaData3, pdvecData3, remInCh);
+            xb_vec2Nx8U dvecData4; IVP_LAV2NX8U_XP(dvecData4, vaData4, pdvecData4, remInCh);
+
+            /* Extracting first 4 bytes of vector into address register */
+            /* Scalar integers to be used for QMUL */
+            int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                 (IVP_MOVNX16_FROM2NX8U(dvecData1)), 0);
+            int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                 (IVP_MOVNX16_FROM2NX8U(dvecData2)), 0);
+            int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                 (IVP_MOVNX16_FROM2NX8U(dvecData3)), 0);
+            int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                 (IVP_MOVNX16_FROM2NX8U(dvecData4)), 0);
+            /* For conditional coefficient loads */
+            int32_t enable2 = XT_SALT(1, remInCh); /* Will be 1 if remInCh > 1 */
+            int32_t enable3 = XT_SALT(2, remInCh); /* Will be 1 if remInCh > 2 */
+
+            /* Aligned Vector Loads of coefficients: the pointer only advances past a vector when the next one is enabled, so a disabled load repeats the previous address */
+            xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1 * enable2);
+            xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1 * enable3);
+            xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1);
+
+
+            IVP_MULSUQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1); /* the 4th coefficient operand is the literal 0: at most 3 bytes remain */
+            IVP_MULSUQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2);
+            IVP_MULSUQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3);
+            IVP_MULSUQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4);
+          } /* End Input Channels */
+        } /* End Kernel Height * Width */
+
+        /* Pack, Output Scale, Output Shift and clamping */
+        xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L;
+        xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H;
+#ifdef DILATED_VQ_CONV
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+#else
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+#endif
+        /* Store the output dvecOut1 along the output depth */
+        pdvecOut = (xb_vec2Nx8 *) (pOut + outCh * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh);
+        IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH));
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Store the output dvecOut2 along the output depth */
+        pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1) * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh);
+        IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH));
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Store the output dvecOut3 along the output depth */
+        pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + 2 * outDataPitch1) * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh);
+        IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH));
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Store the output dvecOut4 along the output depth */
+        pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + 3 * outDataPitch1) * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * \
+                       remainingOutCh);
+        IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH));
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+      } /* End image width */
+      if (x < outW) /* 1 to 3 leftover columns when outW is not a multiple of 4 */
+      {
+        int32_t enable2ndWidth = XT_SALT(1, outW - x); /* 1 if more than 1 column remains, else 0 */
+        int32_t enable3rdWidth = XT_SALT(2, outW - x); /* 1 if more than 2 columns remain, else 0 */
+        /* Output Data pointer */
+        int8_t *pOut = pOutData + (x * outDataPitch1 + y * outDataPitch2) * bytesPerPixel;
+
+        /* Initialize accumulators with bias values (daccSum4 is initialized by the macro but is not accumulated or stored in this path) */
+        xb_vec2Nx24 daccSum1, daccSum2, daccSum3, daccSum4;
+        phvecBias = (xb_vecN_2x32v *) (pBiasData + outCh);
+        ACC_INIT_BIAS(phvecBias, remainingOutCh, daccSum1, daccSum2, daccSum3, daccSum4);
+
+        /* Input Data and Coeff Data Pointers */
+        uint8_t *pData = ((uint8_t *) pInData + x * strideX * inDataPitch1 + y * strideY * inDataPitch2);
+        int8_t *pCoeff = pCoeffData + outCh;
+
+#ifdef __XCC__
+#pragma loop_count min=1
+#endif
+        for (ky = 0; ky < kHeightU; ky++) /* Kernel Height */
+        {
+          /* Pointers for Input Data Loads: a disabled column gets a zero offset, so it re-reads column 1 and its store length below is scaled to 0 */
+          pdvecData1 = (xb_vec2Nx8U *) (pData + ky * inDataPitch2);
+          pdvecData2 = (xb_vec2Nx8U *) (pData + ky * inDataPitch2 + strideX * inDataPitch1 * enable2ndWidth);
+          pdvecData3 = (xb_vec2Nx8U *) (pData + ky * inDataPitch2 + 2 * strideX * inDataPitch1 * enable3rdWidth);
+
+          /* Pointer for Coefficient Load */
+          pdvecCoeff = (xb_vec2Nx8 *) (pCoeff + ky * coeffPitch3);
+
+          /* Primes for Aligning Load */
+          valign vaData1 = IVP_LA2NX8U_PP(pdvecData1);
+          valign vaData2 = IVP_LA2NX8U_PP(pdvecData2);
+          valign vaData3 = IVP_LA2NX8U_PP(pdvecData3);
+
+          for (k = 0; k < numIter - 3; k += 4) /* (Input Channels * kWidth) loops combined; full groups of 4 only */
+          {
+            /* Aligning variable vector load of pixels */
+            xb_vec2Nx8U dvecData1; IVP_LAV2NX8U_XP(dvecData1, vaData1, pdvecData1, 4);
+            xb_vec2Nx8U dvecData2; IVP_LAV2NX8U_XP(dvecData2, vaData2, pdvecData2, 4);
+            xb_vec2Nx8U dvecData3; IVP_LAV2NX8U_XP(dvecData3, vaData3, pdvecData3, 4);
+
+            /* Extracting first 4 bytes of vector into address register */
+            /* Scalar integers to be used for QMUL */
+            int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                 (IVP_MOVNX16_FROM2NX8U(dvecData1)), 0);
+            int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                 (IVP_MOVNX16_FROM2NX8U(dvecData2)), 0);
+            int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                 (IVP_MOVNX16_FROM2NX8U(dvecData3)), 0);
+
+            /* Aligned Vector Loads of coefficients */
+            xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1);
+            xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1);
+            xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1);
+            xb_vec2Nx8 dvecCoeff4; IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch1);
+
+
+            IVP_MULSUQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1);
+            IVP_MULSUQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2);
+            IVP_MULSUQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3);
+          } /* End Input Channels */
+
+          /* Corner case handling as numIter is not a multiple of 4: processes the last remInCh (numIter - k) bytes of the row. NOTE(review): unguarded, so it also runs with remInCh == 0; confirm against the caller. */
+          {
+            int32_t remInCh = numIter - k;
+
+            /* Aligning variable vector load of pixels (only remInCh bytes are requested) */
+            xb_vec2Nx8U dvecData1; IVP_LAV2NX8U_XP(dvecData1, vaData1, pdvecData1, remInCh);
+            xb_vec2Nx8U dvecData2; IVP_LAV2NX8U_XP(dvecData2, vaData2, pdvecData2, remInCh);
+            xb_vec2Nx8U dvecData3; IVP_LAV2NX8U_XP(dvecData3, vaData3, pdvecData3, remInCh);
+
+            /* Extracting first 4 bytes of vector into address register */
+            /* Scalar integers to be used for QMUL */
+            int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                 (IVP_MOVNX16_FROM2NX8U(dvecData1)), 0);
+            int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                 (IVP_MOVNX16_FROM2NX8U(dvecData2)), 0);
+            int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \
+                                                 (IVP_MOVNX16_FROM2NX8U(dvecData3)), 0);
+            /* For conditional coefficient loads */
+            int32_t enable2 = XT_SALT(1, remInCh); /* Will be 1 if remInCh > 1 */
+            int32_t enable3 = XT_SALT(2, remInCh); /* Will be 1 if remInCh > 2 */
+
+            /* Aligned Vector Loads of coefficients: the pointer only advances past a vector when the next one is enabled */
+            xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1 * enable2);
+            xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1 * enable3);
+            xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1);
+
+            IVP_MULSUQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1); /* the 4th coefficient operand is the literal 0: at most 3 bytes remain */
+            IVP_MULSUQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2);
+            IVP_MULSUQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3);
+          } /* End Input Channels */
+        } /* End Kernel Height * Width */
+
+        /* Pack, Output Scale, Output Shift and clamping */
+        xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L;
+        xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H;
+#ifdef DILATED_VQ_CONV
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \
+                                         outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag);
+#else
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+#endif
+        /* Store the output dvecOut1 along the output depth (the first leftover column always exists) */
+        pdvecOut = (xb_vec2Nx8 *) (pOut + outCh * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh);
+        IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH));
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Store the output dvecOut2 along the output depth (store length is 0 when the column is disabled) */
+        pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1) * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * enable2ndWidth);
+        IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * enable2ndWidth);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Store the output dvecOut3 along the output depth (store length is 0 when the column is disabled) */
+        pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + 2 * outDataPitch1) * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * enable3rdWidth);
+        IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * 2 * \
+                       (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * enable3rdWidth);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+      }
+    } /* End Output Channels */
+  }
+}
+#endif
+
+#ifdef DILATED_VQ_CONV
+XAI_ERR_TYPE 
xaiConvolvedVQ3D_S_MxN_U8S8IXCa2_noUnrollH_MOD_DWH( + const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param + ) +#else +XAI_ERR_TYPE xaiConvolved3D_S_MxN_U8S8IXCa2_noUnrollH_MOD_DWH( + const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param + ) +#endif +{ + /* Error Checks */ + XAI_ERROR_CHECKS() + { + XAI_CHECK_TILE3D_U8(inTile); + XAI_CHECK_CONV_OUTPUT_TILE3D(outTile); + XAI_CHECK_TILE4D_S8(coeffTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile); + XAI_CHECK_TILE4D_IN_DRAM_BOUNDARY(coeffTile); + XAI_CHECK_POINTER(param); + XAI_CHECK_ARRAY_S32(biasArray); + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTile); + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(coeffTile, outTile); + XAI_CHECK_ERROR((XAI_TILE4D_GET_DIM3(coeffTile) <= 64) && (XAI_TILE4D_GET_DIM4(coeffTile) <= 64), XAI_ERR_KSIZE, \ + "\nKernel height = %d, width = %d\nKernel width and height should be less than or equal to 64", \ + XAI_TILE4D_GET_DIM4(coeffTile), XAI_TILE4D_GET_DIM3(coeffTile)); + XAI_CHECK_EDGES_MOD_DWH(inTile, coeffTile, param); + XAI_CHECK_ERROR(((XAI_CNN_CONV_GET_STRIDEX(param) > 0) && (XAI_CNN_CONV_GET_STRIDEY(param) > 0)) && \ + ((XAI_CNN_CONV_GET_STRIDEX(param) <= 64) && (XAI_CNN_CONV_GET_STRIDEY(param) <= 64)), XAI_ERR_BADARG, \ + "\nStrideX = %hhu, StrideY = %hhu\nStride along width and height should be greater than 0 and less than or equal to 64", \ + XAI_CNN_CONV_GET_STRIDEX(param), XAI_CNN_CONV_GET_STRIDEY(param)); + XAI_CHECK_ERROR((XAI_CNN_CONV_GET_DILATIONX(param) > 0 && XAI_CNN_CONV_GET_DILATIONY(param) > 0), \ + XAI_ERR_BADARG, "dilation parameter has to be >= 1"); + XAI_CHECK_ERROR((((XAI_TILE3D_GET_DIM1_PITCH(inTile) == XAI_TILE3D_GET_DIM1(inTile)) \ + && XAI_CNN_CONV_GET_DILATION(param) == 1) || 
XAI_CNN_CONV_GET_DILATION(param) > 1), + XAI_ERR_BADARG, "Edges along input channels is not supported if dilation = 1."); + XAI_CHECK_TILE4D_IALIGNMENT_2NX8(coeffTile); + XAI_CHECK_TILE3D_DATA_ORDER(inTile, XAI_DWH); + XAI_CHECK_TILE3D_DATA_ORDER(outTile, XAI_DWH); + XAI_CHECK_TILE4D_DATA_ORDER(coeffTile, XAI_NDWH); + XAI_CHECK_CONSISTENCY_MOD_DWH(inTile, coeffTile, biasArray, outTile, param); + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_ACCUM_SHIFT(param) < 24, \ + XAI_ERR_NORM, "\nThe accumulator shift = %hhu, value should be less than 24", \ + XAI_CNN_CONV_GET_ACCUM_SHIFT(param)); + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_OUTPUT_SHIFT(param) < 32, \ + XAI_ERR_NORM, "\nThe output shift = %hhu, value should be less than 32", \ + XAI_CNN_CONV_GET_OUTPUT_SHIFT(param)); + XAI_CHECK_CONV_RELU_LIMITS_IX(param, outTile); +#ifdef DILATED_VQ_CONV + XAI_CHECK_ARRAY_U16(outputScaleArray); + XAI_CHECK_ERROR(XAI_ARRAY_GET_WIDTH(outputScaleArray) >= XAI_TILE4D_GET_DIM1(coeffTile), XAI_ERR_DATASIZE, \ + "\nWidth of Output Scale Array = %d, Number of Kernels = %d\nWidth of Output Scale Array should be greater than or equal to Number of Kernels", \ + XAI_ARRAY_GET_WIDTH(outputScaleArray), XAI_TILE4D_GET_DIM1(coeffTile)); +#endif + } +#ifndef DILATED_VQ_CONV + if (XAI_CNN_CONV_GET_OUTPUT_SCALE(param) == 0) + { + int32_t fillValue; + int32_t reluFlag = XAI_CNN_CONV_GET_FLAG_RELU(param); + fillValue = reluFlag ? 
(CLAMP(0, XAI_CNN_CONV_GET_RELU_MIN(param), XAI_CNN_CONV_GET_RELU_MAX(param))) : 0; + return(xaiFillTile3D(outTile, fillValue, 0)); + } +#endif + +#ifdef IVP_MULSUQA2N8XR8 // only for Vision_130 + if (XAI_TILE3D_GET_DIM1(inTile) == XAI_TILE3D_GET_DIM1_PITCH(inTile) && \ + (XAI_CNN_CONV_GET_DILATIONX(param) == 1) && (XAI_CNN_CONV_GET_DILATIONY(param) == 1)) + { + if ((XAI_TILE3D_GET_DIM1(inTile) * XAI_TILE4D_GET_DIM3(coeffTile)) % 4 == 0) + { +#ifdef DILATED_VQ_CONV + convolvedVQ3D_S_MxN_U8S8IXCa2_noUnrollH_MOD_DWH_contiguous_depth_x4(inTile, coeffTile, biasArray, outputScaleArray, outTile, param); +#else + convolved3D_S_MxN_U8S8IXCa2_noUnrollH_MOD_DWH_contiguous_depth_x4(inTile, coeffTile, biasArray, outTile, param); +#endif + } + else + { +#ifdef DILATED_VQ_CONV + convolvedVQ3D_S_MxN_U8S8IXCa2_noUnrollH_MOD_DWH_contiguous_depth(inTile, coeffTile, biasArray, outputScaleArray, outTile, param); +#else + convolved3D_S_MxN_U8S8IXCa2_noUnrollH_MOD_DWH_contiguous_depth(inTile, coeffTile, biasArray, outTile, param); +#endif + } + return(XAI_ERROR_STATUS()); + } +#else // Vision_P6 + if (XAI_CNN_CONV_GET_DILATIONX(param) > 1 && XAI_CNN_CONV_GET_DILATIONY(param) > 1) + { +#ifdef DILATED_VQ_CONV + convolvedVQ3D_S_MxNdX_U8S8IXCa2_noUnrollH_MOD_DWH(inTile, \ + coeffTile, biasArray, outputScaleArray, outTile, param); +#else + convolved3D_S_MxNdX_U8S8IXCa2_noUnrollH_MOD_DWH(inTile, \ + coeffTile, biasArray, outTile, param); +#endif + return(XAI_ERROR_STATUS()); + } + /* If number of input channels is a multiple of 2 & + the active data pointer is aligned to 2-bytes, + call a more optimal variant */ + if ((XAI_CNN_CONV_GET_DILATIONX(param) == 1 && XAI_CNN_CONV_GET_DILATIONY(param) == 1) && \ + (XAI_TILE3D_GET_DIM1(inTile) % 2) == 0 \ + && ((XAI_PTR_TO_ADDR(XAI_TILE3D_GET_DATA_PTR(inTile)) & (2 - 1)) == 0)) + { +#ifdef DILATED_VQ_CONV + convolvedVQ3D_S_MxN_U8S8IXCa2_noUnrollH_depth2X_MOD_DWH(inTile, \ + coeffTile, biasArray, outputScaleArray, outTile, param); +#else + 
convolved3D_S_MxN_U8S8IXCa2_noUnrollH_depth2X_MOD_DWH(inTile, \ + coeffTile, biasArray, outTile, param); +#endif + return(XAI_ERROR_STATUS()); + } +#endif + + /* Getting parameters from the tile structures */ + const int32_t outW = XAI_TILE3D_GET_DIM2(outTile); + const int32_t outH = XAI_TILE3D_GET_DIM3(outTile); + const int32_t numInCh = XAI_TILE3D_GET_DIM1(inTile); + const int32_t numOutCh = XAI_TILE3D_GET_DIM1(outTile); + const uint8_t dilationX = XAI_CNN_CONV_GET_DILATIONX(param); + const uint8_t dilationY = XAI_CNN_CONV_GET_DILATIONY(param); + + /* Kernel Size (NDWH) */ + const int32_t kWidthU = XAI_TILE4D_GET_DIM3(coeffTile); + const int32_t kHeightU = XAI_TILE4D_GET_DIM4(coeffTile); + int32_t dilatedKWidth = dilationX * (kWidthU - 1) + 1; + int32_t dilatedKHeight = dilationY * (kHeightU - 1) + 1; + + /* CNN convolution parameters */ + const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param); + const uint8_t outShiftU = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param); + const uint8_t enableReLu = XAI_CNN_CONV_GET_FLAG_RELU(param); + const uint8_t strideX = XAI_CNN_CONV_GET_STRIDEX(param); + const uint8_t strideY = XAI_CNN_CONV_GET_STRIDEY(param); + const uint8_t leftEdgeFlag = XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param); + const uint8_t topEdgeFlag = XAI_CNN_CONV_GET_FLAG_TOPEDGE(param); + + /* Data Pointers of input, output, coefficient and bias data */ + uint8_t *pInData = (uint8_t *) XAI_TILE3D_GET_DATA_PTR(inTile); + int8_t *pOutData = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile); + int8_t *pCoeffData = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile); + int32_t *pBiasData = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray); +#ifdef DILATED_VQ_CONV + uint16_t *pScale = (uint16_t *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray); + xb_vecNx16U* restrict pOutScaleData; +#else + const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param); +#endif + /* Pitches of Coefficient Data (NDWH) in dim1, dim2 and dim3 */ + const int32_t coeffPitch1 = 
XAI_TILE4D_GET_DIM1_PITCH(coeffTile); + const int32_t coeffPitch2 = XAI_TILE4D_GET_DIM2_PITCH(coeffTile); + const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile); + + /* Pitches of Input Data (DWH) in dim1 and dim2 */ + const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile); + const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile); + + /* Pitch of Output Data (DWH) in dim1 and dim2 */ + const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile); + const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile); + + int32_t leftEdge, topEdge; + if ((dilatedKWidth % 2) != 0) + { + leftEdge = dilatedKWidth / 2; + } + else + { + leftEdge = leftEdgeFlag ? (dilatedKWidth / 2) : ((dilatedKWidth / 2) - 1); + } + + if ((dilatedKHeight % 2) != 0) + { + topEdge = dilatedKHeight / 2; + } + else + { + topEdge = topEdgeFlag ? (dilatedKHeight / 2) : ((dilatedKHeight / 2) - 1); + } + + + /* Move pointer to the start of the data (including edge) */ + pInData = &pInData[-((leftEdge) * inDataPitch1 + (topEdge) * inDataPitch2)]; + + /* Setting the limits for output data according to ReLu Flag and outTileType */ + int32_t minLim, maxLim; + if (enableReLu) + { + minLim = XAI_CNN_CONV_GET_RELU_MIN(param); + maxLim = XAI_CNN_CONV_GET_RELU_MAX(param); + } + else + { + minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \ + SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0); + maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \ + : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX); + } + const int8_t typeFlag = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 
1 : 0; + const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile); + + /* Variable Declarations */ + int32_t inCh, outCh, x, y, k; + valign vaOutData = IVP_ZALIGN(); + + /* Vector data pointers */ + xb_vecN_2x32v* restrict phvecBias; + xb_vec2Nx8* restrict pdvecCoeff; + xb_vec2Nx8U* restrict pdvecData1; + xb_vec2Nx8U* restrict pdvecData2; + xb_vec2Nx8U* restrict pdvecData3; + xb_vec2Nx8U* restrict pdvecData4; + xb_vec2Nx8* restrict pdvecOut; + + /* Loops Start */ + for (y = 0; y < outH; y++) + { + for (outCh = 0; outCh < numOutCh; outCh += 2 * XCHAL_IVPN_SIMD_WIDTH) /* Output Channels */ + { /* walk across the kernels */ + /* To handle corner case when number of output channels + * is not a multiple of 2 * XCHAL_IVPN_SIMD_WIDTH*/ + int32_t remainingOutCh = (numOutCh - outCh); +#ifdef DILATED_VQ_CONV + xb_vecNx16U outScaleDataEven, outScaleDataOdd; + /*Load output scale values*/ + pOutScaleData = (xb_vecNx16U *) (pScale + outCh); + VQ_INIT_OUTSCALE(pOutScaleData, remainingOutCh, outScaleDataEven, outScaleDataOdd); +#endif + + for (x = 0; x < outW - 3; x += 4) /* Image Width */ + { + /* Output Data pointer */ + int8_t *pOut = pOutData + (x * outDataPitch1 + y * outDataPitch2) * bytesPerPixel; + + /* Initialize accumulators with bias values */ + xb_vec2Nx24 daccSum1, daccSum2, daccSum3, daccSum4; + phvecBias = (xb_vecN_2x32v *) (pBiasData + outCh); + ACC_INIT_BIAS(phvecBias, remainingOutCh, daccSum1, daccSum2, daccSum3, daccSum4); + + /* Input Data and Coeff Data Pointers */ + uint8_t *pData = ((uint8_t *) pInData + x * strideX * inDataPitch1 + y * strideY * inDataPitch2); + int8_t *pCoeff = pCoeffData + outCh; + + xb_vecN_2x32v hvecInAddrOff = 0; + xb_vecN_2x32v hvecCoeffAddrOff = 0; + xb_vecN_2x32v hvecLaneIdx = 0; + int32_t inAddrOff = 0, coeffAddrOff = 0; + + for (k = 0; k < kWidthU * kHeightU; k++) /* Kernel Height * Kernel Width */ + { + vboolN_2 vbN_2 = IVP_EQN_2X32(hvecLaneIdx, kWidthU); + /* hvecLaneIdx will be reset to zero after every kWidth */ + 
hvecLaneIdx = IVP_MOVN_2X32T(0, hvecLaneIdx, vbN_2); + /* InPitch added after every kWidth */ + IVP_ADDN_2X32T(hvecInAddrOff, hvecInAddrOff, inDataPitch2 * dilationY - kWidthU * inDataPitch1 * dilationX, vbN_2); + /* CoeffPitch added after every kWidth */ + IVP_ADDN_2X32T(hvecCoeffAddrOff, hvecCoeffAddrOff, coeffPitch3 - kWidthU * coeffPitch2, vbN_2); + /* Extracting Input and Coefficient address offsets */ + inAddrOff = IVP_EXTRN_2X32(hvecInAddrOff, 0); + coeffAddrOff = IVP_EXTRN_2X32(hvecCoeffAddrOff, 0); + hvecLaneIdx = IVP_ADDN_2X32(hvecLaneIdx, 1); + hvecCoeffAddrOff = IVP_ADDN_2X32(hvecCoeffAddrOff, coeffPitch2); + hvecInAddrOff = IVP_ADDN_2X32(hvecInAddrOff, inDataPitch1 * dilationX); + + /* Pointers for Input Data Loads */ + pdvecData1 = (xb_vec2Nx8U *) (pData + inAddrOff); + pdvecData2 = (xb_vec2Nx8U *) (pData + inAddrOff + strideX * inDataPitch1); + pdvecData3 = (xb_vec2Nx8U *) (pData + inAddrOff + strideX * inDataPitch1 * 2); + pdvecData4 = (xb_vec2Nx8U *) (pData + inAddrOff + strideX * inDataPitch1 * 3); + + /* Pointer for Coefficient Load */ + pdvecCoeff = (xb_vec2Nx8 *) (pCoeff + coeffAddrOff); + + /* Primes registers for Aligning Load */ + valign vaData1 = IVP_LA2NX8U_PP(pdvecData1); + valign vaData2 = IVP_LA2NX8U_PP(pdvecData2); + valign vaData3 = IVP_LA2NX8U_PP(pdvecData3); + valign vaData4 = IVP_LA2NX8U_PP(pdvecData4); + + for (inCh = 0; inCh < numInCh - 3; inCh += 4) /* Input Channels */ + { + /* Aligning variable vector load of pixels */ + xb_vec2Nx8U dvecInData1; IVP_LAV2NX8U_XP(dvecInData1, vaData1, pdvecData1, 4); + xb_vec2Nx8U dvecInData2; IVP_LAV2NX8U_XP(dvecInData2, vaData2, pdvecData2, 4); + xb_vec2Nx8U dvecInData3; IVP_LAV2NX8U_XP(dvecInData3, vaData3, pdvecData3, 4); + xb_vec2Nx8U dvecInData4; IVP_LAV2NX8U_XP(dvecInData4, vaData4, pdvecData4, 4); + +#ifdef IVP_MULSUQA2N8XR8 + /* Extracting first 4 bytes of vector into address register */ + /* Scalar integers to be used for QMUL */ + int32_t qmulScalar1 = 
IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8U(dvecInData1)), 0); + int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8U(dvecInData2)), 0); + int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8U(dvecInData3)), 0); + int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8U(dvecInData4)), 0); +#else + xb_vec2Nx8U dvecData1, dvecData2, dvecData3, dvecData4; + xb_vec2Nx8U dvecData5, dvecData6, dvecData7, dvecData8; + xb_vec2Nx8U dvecData9, dvecData10, dvecData11, dvecData12; + xb_vec2Nx8U dvecData13, dvecData14, dvecData15, dvecData16; + xb_vecNx16 vecData1, vecData2; + xb_vecNx16 vecData3, vecData4; + xb_vecNx16 vecData5, vecData6; + xb_vecNx16 vecData7, vecData8; + xb_vecNx16 vecTemp1, vecTemp2; + + /* Custom select pattern for DSELs */ + int16_t sel1 = ((XCHAL_IVPN_SIMD_WIDTH << 8)); + xb_vec2Nx8 vecSel1 = IVP_MOV2NX8_FROMNX16(sel1); + int16_t sel2 = (((XCHAL_IVPN_SIMD_WIDTH + 1) << 8) | 1); + xb_vec2Nx8 vecSel2 = IVP_MOV2NX8_FROMNX16(sel2); + + /* Broadcast a0, a1, a2, a3.... | b0, b1, b2, b3.... using DSELs into a0, a1, a0, a1.... | b0, b1, b0, b1.... */ + IVP_DSELNX16(vecData2, vecData1, IVP_MOVNX16_FROM2NX8U(dvecInData2), IVP_MOVNX16_FROM2NX8U(dvecInData1), vecSel1); + IVP_DSELNX16(vecData4, vecData3, IVP_MOVNX16_FROM2NX8U(dvecInData4), IVP_MOVNX16_FROM2NX8U(dvecInData3), vecSel1); + IVP_DSELNX16(vecData6, vecData5, IVP_MOVNX16_FROM2NX8U(dvecInData2), IVP_MOVNX16_FROM2NX8U(dvecInData1), vecSel2); + IVP_DSELNX16(vecData8, vecData7, IVP_MOVNX16_FROM2NX8U(dvecInData4), IVP_MOVNX16_FROM2NX8U(dvecInData3), vecSel2); + + /* Splitting 8 DSELI operations into 4 DSELIs and 8 SELIs for balancing loop schedule */ + /* Separate a0, a1, a0, a1 using SELIs into a0, a0, a0... 
*/ + dvecData1 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData1), IVP_MOV2NX8U_FROMNX16(vecData1), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0); + dvecData2 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData1), IVP_MOV2NX8U_FROMNX16(vecData1), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_1); + dvecData3 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData2), IVP_MOV2NX8U_FROMNX16(vecData2), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0); + dvecData4 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData2), IVP_MOV2NX8U_FROMNX16(vecData2), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_1); + dvecData5 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData3), IVP_MOV2NX8U_FROMNX16(vecData3), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0); + dvecData6 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData3), IVP_MOV2NX8U_FROMNX16(vecData3), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_1); + dvecData7 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData4), IVP_MOV2NX8U_FROMNX16(vecData4), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0); + dvecData8 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData4), IVP_MOV2NX8U_FROMNX16(vecData4), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_1); + + /* De-interleave a b a b a b... and move to a a a a... and b b b b... 
*/ + IVP_DSELNX16I(vecTemp2, vecTemp1, vecData5, vecData5, IVP_DSELI_8B_DEINTERLEAVE_1); + dvecData9 = IVP_MOV2NX8U_FROMNX16(vecTemp1); dvecData10 = IVP_MOV2NX8U_FROMNX16(vecTemp2); + IVP_DSELNX16I(vecTemp2, vecTemp1, vecData6, vecData6, IVP_DSELI_8B_DEINTERLEAVE_1); + dvecData11 = IVP_MOV2NX8U_FROMNX16(vecTemp1); dvecData12 = IVP_MOV2NX8U_FROMNX16(vecTemp2); + IVP_DSELNX16I(vecTemp2, vecTemp1, vecData7, vecData7, IVP_DSELI_8B_DEINTERLEAVE_1); + dvecData13 = IVP_MOV2NX8U_FROMNX16(vecTemp1); dvecData14 = IVP_MOV2NX8U_FROMNX16(vecTemp2); + IVP_DSELNX16I(vecTemp2, vecTemp1, vecData8, vecData8, IVP_DSELI_8B_DEINTERLEAVE_1); + dvecData15 = IVP_MOV2NX8U_FROMNX16(vecTemp1); dvecData16 = IVP_MOV2NX8U_FROMNX16(vecTemp2); +#endif + /* Aligned Vector Loads of coefficients */ + xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1); + xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1); + xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1); + xb_vec2Nx8 dvecCoeff4; IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch1); + +#ifdef IVP_MULSUQA2N8XR8 + IVP_MULSUQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1); + IVP_MULSUQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2); + IVP_MULSUQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3); + IVP_MULSUQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4); +#else + /* Multiply unsigned x signed and accumulate to 24-bits */ + IVP_MULUSPA2NX8(daccSum1, dvecData1, dvecCoeff1, dvecData2, dvecCoeff2); + IVP_MULUSPA2NX8(daccSum2, dvecData3, dvecCoeff1, dvecData4, dvecCoeff2); + IVP_MULUSPA2NX8(daccSum3, dvecData5, dvecCoeff1, dvecData6, dvecCoeff2); + IVP_MULUSPA2NX8(daccSum4, dvecData7, dvecCoeff1, dvecData8, dvecCoeff2); + IVP_MULUSPA2NX8(daccSum1, dvecData9, dvecCoeff3, dvecData10, dvecCoeff4); + IVP_MULUSPA2NX8(daccSum2, dvecData11, dvecCoeff3, dvecData12, dvecCoeff4); 
+ IVP_MULUSPA2NX8(daccSum3, dvecData13, dvecCoeff3, dvecData14, dvecCoeff4); + IVP_MULUSPA2NX8(daccSum4, dvecData15, dvecCoeff3, dvecData16, dvecCoeff4); +#endif + } /* End Input Channels */ + if (inCh < numInCh) + { + int32_t remInCh = numInCh - inCh; + vaData1 = IVP_LA2NX8U_PP(pdvecData1); + + /* Aligning variable vector load of pixels */ + xb_vec2Nx8U dvecInData1; IVP_LAV2NX8U_XP(dvecInData1, vaData1, pdvecData1, remInCh); + xb_vec2Nx8U dvecInData2; IVP_LAV2NX8U_XP(dvecInData2, vaData2, pdvecData2, remInCh); + xb_vec2Nx8U dvecInData3; IVP_LAV2NX8U_XP(dvecInData3, vaData3, pdvecData3, remInCh); + xb_vec2Nx8U dvecInData4; IVP_LAV2NX8U_XP(dvecInData4, vaData4, pdvecData4, remInCh); + +#ifdef IVP_MULSUQA2N8XR8 + /* Extracting first 4 bytes of vector into address register */ + /* Scalar integers to be used for QMUL */ + int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8U(dvecInData1)), 0); + int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8U(dvecInData2)), 0); + int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8U(dvecInData3)), 0); + int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8U(dvecInData4)), 0); +#else + xb_vec2Nx8U dvecData1, dvecData2, dvecData3, dvecData4; + xb_vec2Nx8U dvecData5, dvecData6, dvecData7, dvecData8; + xb_vec2Nx8U dvecData9, dvecData10, dvecData11, dvecData12; + xb_vec2Nx8U dvecData13, dvecData14, dvecData15, dvecData16; + xb_vecNx16 vecData1, vecData2; + xb_vecNx16 vecData3, vecData4; + xb_vecNx16 vecData5, vecData6; + xb_vecNx16 vecData7, vecData8; + xb_vecNx16 vecTemp1, vecTemp2; + + /* Custom select pattern for DSELs */ + int16_t sel1 = ((XCHAL_IVPN_SIMD_WIDTH << 8)); + xb_vec2Nx8 vecSel1 = IVP_MOV2NX8_FROMNX16(sel1); + int16_t sel2 = (((XCHAL_IVPN_SIMD_WIDTH + 1) << 8) | 1); + xb_vec2Nx8 vecSel2 = IVP_MOV2NX8_FROMNX16(sel2); + + /* Broadcast a0, a1, a2, a3.... | b0, b1, b2, b3.... 
using DSELs into a0, a1, a0, a1.... | b0, b1, b0, b1.... */ + IVP_DSELNX16(vecData2, vecData1, IVP_MOVNX16_FROM2NX8U(dvecInData2), IVP_MOVNX16_FROM2NX8U(dvecInData1), vecSel1); + IVP_DSELNX16(vecData4, vecData3, IVP_MOVNX16_FROM2NX8U(dvecInData4), IVP_MOVNX16_FROM2NX8U(dvecInData3), vecSel1); + IVP_DSELNX16(vecData6, vecData5, IVP_MOVNX16_FROM2NX8U(dvecInData2), IVP_MOVNX16_FROM2NX8U(dvecInData1), vecSel2); + IVP_DSELNX16(vecData8, vecData7, IVP_MOVNX16_FROM2NX8U(dvecInData4), IVP_MOVNX16_FROM2NX8U(dvecInData3), vecSel2); + + /* Splitting 8 DSELI operations into 4 DSELIs and 8 SELIs for balancing loop schedule */ + /* Separate a0, a1, a0, a1 using SELIs into a0, a0, a0... */ + dvecData1 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData1), IVP_MOV2NX8U_FROMNX16(vecData1), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0); + dvecData2 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData1), IVP_MOV2NX8U_FROMNX16(vecData1), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_1); + dvecData3 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData2), IVP_MOV2NX8U_FROMNX16(vecData2), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0); + dvecData4 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData2), IVP_MOV2NX8U_FROMNX16(vecData2), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_1); + dvecData5 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData3), IVP_MOV2NX8U_FROMNX16(vecData3), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0); + dvecData6 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData3), IVP_MOV2NX8U_FROMNX16(vecData3), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_1); + dvecData7 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData4), IVP_MOV2NX8U_FROMNX16(vecData4), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0); + dvecData8 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData4), IVP_MOV2NX8U_FROMNX16(vecData4), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_1); + + /* De-interleave a b a b a b... and move to a a a a... and b b b b... 
*/ + IVP_DSELNX16I(vecTemp2, vecTemp1, vecData5, vecData5, IVP_DSELI_8B_DEINTERLEAVE_1); + dvecData9 = IVP_MOV2NX8U_FROMNX16(vecTemp1); dvecData10 = IVP_MOV2NX8U_FROMNX16(vecTemp2); + IVP_DSELNX16I(vecTemp2, vecTemp1, vecData6, vecData6, IVP_DSELI_8B_DEINTERLEAVE_1); + dvecData11 = IVP_MOV2NX8U_FROMNX16(vecTemp1); dvecData12 = IVP_MOV2NX8U_FROMNX16(vecTemp2); + IVP_DSELNX16I(vecTemp2, vecTemp1, vecData7, vecData7, IVP_DSELI_8B_DEINTERLEAVE_1); + dvecData13 = IVP_MOV2NX8U_FROMNX16(vecTemp1); dvecData14 = IVP_MOV2NX8U_FROMNX16(vecTemp2); + IVP_DSELNX16I(vecTemp2, vecTemp1, vecData8, vecData8, IVP_DSELI_8B_DEINTERLEAVE_1); + dvecData15 = IVP_MOV2NX8U_FROMNX16(vecTemp1); dvecData16 = IVP_MOV2NX8U_FROMNX16(vecTemp2); +#endif + /* For conditional coefficient loads */ + int32_t enable2 = XT_SALT(1, remInCh); /* Will be 1 if remInCh > 1 */ + int32_t enable3 = XT_SALT(2, remInCh); /* Will be 1 if remInCh > 2 */ + + /* Coefficient Loads */ + xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1 * enable2); + xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1 * enable3); + xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1); + +#ifdef IVP_MULSUQA2N8XR8 + IVP_MULSUQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1); + IVP_MULSUQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2); + IVP_MULSUQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3); + IVP_MULSUQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4); +#else + /* Multiply unsigned x signed and accumulate to 24-bits */ + IVP_MULUSPA2NX8(daccSum1, dvecData1, dvecCoeff1, dvecData2, dvecCoeff2); + IVP_MULUSPA2NX8(daccSum2, dvecData3, dvecCoeff1, dvecData4, dvecCoeff2); + IVP_MULUSPA2NX8(daccSum3, dvecData5, dvecCoeff1, dvecData6, dvecCoeff2); + IVP_MULUSPA2NX8(daccSum4, dvecData7, dvecCoeff1, dvecData8, dvecCoeff2); + IVP_MULUSPA2NX8(daccSum1, dvecData9, dvecCoeff3, dvecData10, 0); + 
IVP_MULUSPA2NX8(daccSum2, dvecData11, dvecCoeff3, dvecData12, 0); + IVP_MULUSPA2NX8(daccSum3, dvecData13, dvecCoeff3, dvecData14, 0); + IVP_MULUSPA2NX8(daccSum4, dvecData15, dvecCoeff3, dvecData16, 0); +#endif + } + } + /* Pack, Output Scale, Output Shift and clamping */ + xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L; + xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H; +#ifdef DILATED_VQ_CONV + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); +#else + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); +#endif + /* Store the output dvecOut1 along the output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOut + outCh * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh); + IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Store the output dvecOut2 along the output depth */ + 
pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1) * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh); + IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Store the output dvecOut3 along the output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + 2 * outDataPitch1) * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh); + IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Store the output dvecOut4 along the output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + 3 * outDataPitch1) * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh); + IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + } + if (x < outW) + { + int32_t enable2ndWidth = XT_SALT(1, outW - x); + int32_t enable3rdWidth = XT_SALT(2, outW - x); + /* Output Data pointer */ + int8_t *pOut = pOutData + (x * outDataPitch1 + y * outDataPitch2) * bytesPerPixel; + + /* Initialize accumulators with bias values */ + xb_vec2Nx24 daccSum1, daccSum2, daccSum3, daccSum4; + phvecBias = (xb_vecN_2x32v *) (pBiasData + outCh); + ACC_INIT_BIAS(phvecBias, remainingOutCh, daccSum1, daccSum2, daccSum3, daccSum4); + + /* Input Data and Coeff Data Pointers */ + uint8_t *pData = ((uint8_t *) pInData + x * strideX * inDataPitch1 + y * strideY * inDataPitch2); + int8_t *pCoeff = pCoeffData + outCh; + + xb_vecN_2x32v hvecInAddrOff = 0; + xb_vecN_2x32v hvecCoeffAddrOff = 0; + xb_vecN_2x32v hvecLaneIdx = 0; + int32_t inAddrOff = 0, coeffAddrOff = 0; + + for (k = 0; k < kWidthU * kHeightU; k++) /* Kernel Height * Kernel Width */ + { + vboolN_2 vbN_2 = 
IVP_EQN_2X32(hvecLaneIdx, kWidthU); + /* hvecLaneIdx will be reset to zero after every kWidth */ + hvecLaneIdx = IVP_MOVN_2X32T(0, hvecLaneIdx, vbN_2); + /* InPitch added after every kWidth */ + IVP_ADDN_2X32T(hvecInAddrOff, hvecInAddrOff, inDataPitch2 * dilationY - kWidthU * inDataPitch1 * dilationX, vbN_2); + /* CoeffPitch added after every kWidth */ + IVP_ADDN_2X32T(hvecCoeffAddrOff, hvecCoeffAddrOff, coeffPitch3 - kWidthU * coeffPitch2, vbN_2); + /* Extracting Input and Coefficient address offsets */ + inAddrOff = IVP_EXTRN_2X32(hvecInAddrOff, 0); + coeffAddrOff = IVP_EXTRN_2X32(hvecCoeffAddrOff, 0); + hvecLaneIdx = IVP_ADDN_2X32(hvecLaneIdx, 1); + hvecCoeffAddrOff = IVP_ADDN_2X32(hvecCoeffAddrOff, coeffPitch2); + hvecInAddrOff = IVP_ADDN_2X32(hvecInAddrOff, inDataPitch1 * dilationX); + + /* Pointers for Input Data Loads */ + pdvecData1 = (xb_vec2Nx8U *) (pData + inAddrOff); + pdvecData2 = (xb_vec2Nx8U *) (pData + inAddrOff + strideX * inDataPitch1 * enable2ndWidth); + pdvecData3 = (xb_vec2Nx8U *) (pData + inAddrOff + strideX * inDataPitch1 * 2 * enable3rdWidth); + + /* Pointer for Coefficient Load */ + pdvecCoeff = (xb_vec2Nx8 *) (pCoeff + coeffAddrOff); + + /* Primes registers for Aligning Load */ + valign vaData1 = IVP_LA2NX8U_PP(pdvecData1); + valign vaData2 = IVP_LA2NX8U_PP(pdvecData2); + valign vaData3 = IVP_LA2NX8U_PP(pdvecData3); + + for (inCh = 0; inCh < numInCh - 3; inCh += 4) /* Input Channels */ + { + /* Aligning variable vector load of pixels */ + xb_vec2Nx8U dvecInData1; IVP_LAV2NX8U_XP(dvecInData1, vaData1, pdvecData1, 4); + xb_vec2Nx8U dvecInData2; IVP_LAV2NX8U_XP(dvecInData2, vaData2, pdvecData2, 4); + xb_vec2Nx8U dvecInData3; IVP_LAV2NX8U_XP(dvecInData3, vaData3, pdvecData3, 4); + +#ifdef IVP_MULSUQA2N8XR8 + /* Extracting first 4 bytes of vector into address register */ + /* Scalar integers to be used for QMUL */ + int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8U(dvecInData1)), 0); + int32_t qmulScalar2 = 
IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8U(dvecInData2)), 0); + int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8U(dvecInData3)), 0); + +#else + xb_vec2Nx8U dvecData1, dvecData2, dvecData3, dvecData4; + xb_vec2Nx8U dvecData5, dvecData6, dvecData7, dvecData8; + xb_vec2Nx8U dvecData9, dvecData10, dvecData11, dvecData12; + xb_vec2Nx8U dvecData13, dvecData14, dvecData15, dvecData16; + xb_vecNx16 vecData1, vecData2; + xb_vecNx16 vecData3, vecData4; + xb_vecNx16 vecData5, vecData6; + xb_vecNx16 vecData7, vecData8; + xb_vecNx16 vecTemp1, vecTemp2; + xb_vec2Nx8U dvecInData4 = 0; + + /* Custom select pattern for DSELs */ + int16_t sel1 = ((XCHAL_IVPN_SIMD_WIDTH << 8)); + xb_vec2Nx8 vecSel1 = IVP_MOV2NX8_FROMNX16(sel1); + int16_t sel2 = (((XCHAL_IVPN_SIMD_WIDTH + 1) << 8) | 1); + xb_vec2Nx8 vecSel2 = IVP_MOV2NX8_FROMNX16(sel2); + + /* Broadcast a0, a1, a2, a3.... | b0, b1, b2, b3.... using DSELs into a0, a1, a0, a1.... | b0, b1, b0, b1.... */ + IVP_DSELNX16(vecData2, vecData1, IVP_MOVNX16_FROM2NX8U(dvecInData2), IVP_MOVNX16_FROM2NX8U(dvecInData1), vecSel1); + IVP_DSELNX16(vecData4, vecData3, IVP_MOVNX16_FROM2NX8U(dvecInData4), IVP_MOVNX16_FROM2NX8U(dvecInData3), vecSel1); + IVP_DSELNX16(vecData6, vecData5, IVP_MOVNX16_FROM2NX8U(dvecInData2), IVP_MOVNX16_FROM2NX8U(dvecInData1), vecSel2); + IVP_DSELNX16(vecData8, vecData7, IVP_MOVNX16_FROM2NX8U(dvecInData4), IVP_MOVNX16_FROM2NX8U(dvecInData3), vecSel2); + + /* Splitting 8 DSELI operations into 4 DSELIs and 8 SELIs for balancing loop schedule */ + /* Separate a0, a1, a0, a1 using SELIs into a0, a0, a0... 
*/ + dvecData1 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData1), IVP_MOV2NX8U_FROMNX16(vecData1), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0); + dvecData2 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData1), IVP_MOV2NX8U_FROMNX16(vecData1), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_1); + dvecData3 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData2), IVP_MOV2NX8U_FROMNX16(vecData2), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0); + dvecData4 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData2), IVP_MOV2NX8U_FROMNX16(vecData2), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_1); + dvecData5 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData3), IVP_MOV2NX8U_FROMNX16(vecData3), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0); + dvecData6 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData3), IVP_MOV2NX8U_FROMNX16(vecData3), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_1); + dvecData7 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData4), IVP_MOV2NX8U_FROMNX16(vecData4), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0); + dvecData8 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData4), IVP_MOV2NX8U_FROMNX16(vecData4), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_1); + + /* De-interleave a b a b a b... and move to a a a a... and b b b b... 
*/ + IVP_DSELNX16I(vecTemp2, vecTemp1, vecData5, vecData5, IVP_DSELI_8B_DEINTERLEAVE_1); + dvecData9 = IVP_MOV2NX8U_FROMNX16(vecTemp1); dvecData10 = IVP_MOV2NX8U_FROMNX16(vecTemp2); + IVP_DSELNX16I(vecTemp2, vecTemp1, vecData6, vecData6, IVP_DSELI_8B_DEINTERLEAVE_1); + dvecData11 = IVP_MOV2NX8U_FROMNX16(vecTemp1); dvecData12 = IVP_MOV2NX8U_FROMNX16(vecTemp2); + IVP_DSELNX16I(vecTemp2, vecTemp1, vecData7, vecData7, IVP_DSELI_8B_DEINTERLEAVE_1); + dvecData13 = IVP_MOV2NX8U_FROMNX16(vecTemp1); dvecData14 = IVP_MOV2NX8U_FROMNX16(vecTemp2); + IVP_DSELNX16I(vecTemp2, vecTemp1, vecData8, vecData8, IVP_DSELI_8B_DEINTERLEAVE_1); + dvecData15 = IVP_MOV2NX8U_FROMNX16(vecTemp1); dvecData16 = IVP_MOV2NX8U_FROMNX16(vecTemp2); +#endif + + /* Aligned Vector Loads of coefficients */ + xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1); + xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1); + xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1); + xb_vec2Nx8 dvecCoeff4; IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch1); + +#ifdef IVP_MULSUQA2N8XR8 + IVP_MULSUQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1); + IVP_MULSUQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2); + IVP_MULSUQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3); +#else + /* Multiply unsigned x signed and accumulate to 24-bits */ + IVP_MULUSPA2NX8(daccSum1, dvecData1, dvecCoeff1, dvecData2, dvecCoeff2); + IVP_MULUSPA2NX8(daccSum2, dvecData3, dvecCoeff1, dvecData4, dvecCoeff2); + IVP_MULUSPA2NX8(daccSum3, dvecData5, dvecCoeff1, dvecData6, dvecCoeff2); + IVP_MULUSPA2NX8(daccSum4, dvecData7, dvecCoeff1, dvecData8, dvecCoeff2); + IVP_MULUSPA2NX8(daccSum1, dvecData9, dvecCoeff3, dvecData10, dvecCoeff4); + IVP_MULUSPA2NX8(daccSum2, dvecData11, dvecCoeff3, dvecData12, dvecCoeff4); + IVP_MULUSPA2NX8(daccSum3, dvecData13, dvecCoeff3, dvecData14, dvecCoeff4); + 
IVP_MULUSPA2NX8(daccSum4, dvecData15, dvecCoeff3, dvecData16, dvecCoeff4); +#endif + } /* End Input Channels */ + if (inCh < numInCh) + { + int32_t remInCh = numInCh - inCh; + vaData1 = IVP_LA2NX8U_PP(pdvecData1); + + /* Aligning variable vector load of pixels */ + xb_vec2Nx8U dvecInData1; IVP_LAV2NX8U_XP(dvecInData1, vaData1, pdvecData1, remInCh); + xb_vec2Nx8U dvecInData2; IVP_LAV2NX8U_XP(dvecInData2, vaData2, pdvecData2, remInCh); + xb_vec2Nx8U dvecInData3; IVP_LAV2NX8U_XP(dvecInData3, vaData3, pdvecData3, remInCh); + +#ifdef IVP_MULSUQA2N8XR8 + /* Extracting first 4 bytes of vector into address register */ + /* Scalar integers to be used for QMUL */ + int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8U(dvecInData1)), 0); + int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8U(dvecInData2)), 0); + int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8U(dvecInData3)), 0); +#else + xb_vec2Nx8U dvecData1, dvecData2, dvecData3, dvecData4; + xb_vec2Nx8U dvecData5, dvecData6, dvecData7, dvecData8; + xb_vec2Nx8U dvecData9, dvecData10, dvecData11, dvecData12; + xb_vec2Nx8U dvecData13, dvecData14, dvecData15, dvecData16; + xb_vecNx16 vecData1, vecData2; + xb_vecNx16 vecData3, vecData4; + xb_vecNx16 vecData5, vecData6; + xb_vecNx16 vecData7, vecData8; + xb_vecNx16 vecTemp1, vecTemp2; + xb_vec2Nx8U dvecInData4 = 0; + + /* Custom select pattern for DSELs */ + int16_t sel1 = ((XCHAL_IVPN_SIMD_WIDTH << 8)); + xb_vec2Nx8 vecSel1 = IVP_MOV2NX8_FROMNX16(sel1); + int16_t sel2 = (((XCHAL_IVPN_SIMD_WIDTH + 1) << 8) | 1); + xb_vec2Nx8 vecSel2 = IVP_MOV2NX8_FROMNX16(sel2); + + /* Broadcast a0, a1, a2, a3.... | b0, b1, b2, b3.... using DSELs into a0, a1, a0, a1.... | b0, b1, b0, b1.... 
*/ + IVP_DSELNX16(vecData2, vecData1, IVP_MOVNX16_FROM2NX8U(dvecInData2), IVP_MOVNX16_FROM2NX8U(dvecInData1), vecSel1); + IVP_DSELNX16(vecData4, vecData3, IVP_MOVNX16_FROM2NX8U(dvecInData4), IVP_MOVNX16_FROM2NX8U(dvecInData3), vecSel1); + IVP_DSELNX16(vecData6, vecData5, IVP_MOVNX16_FROM2NX8U(dvecInData2), IVP_MOVNX16_FROM2NX8U(dvecInData1), vecSel2); + IVP_DSELNX16(vecData8, vecData7, IVP_MOVNX16_FROM2NX8U(dvecInData4), IVP_MOVNX16_FROM2NX8U(dvecInData3), vecSel2); + + /* Splitting 8 DSELI operations into 4 DSELIs and 8 SELIs for balancing loop schedule */ + /* Separate a0, a1, a0, a1 using SELIs into a0, a0, a0... */ + dvecData1 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData1), IVP_MOV2NX8U_FROMNX16(vecData1), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0); + dvecData2 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData1), IVP_MOV2NX8U_FROMNX16(vecData1), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_1); + dvecData3 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData2), IVP_MOV2NX8U_FROMNX16(vecData2), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0); + dvecData4 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData2), IVP_MOV2NX8U_FROMNX16(vecData2), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_1); + dvecData5 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData3), IVP_MOV2NX8U_FROMNX16(vecData3), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0); + dvecData6 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData3), IVP_MOV2NX8U_FROMNX16(vecData3), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_1); + dvecData7 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData4), IVP_MOV2NX8U_FROMNX16(vecData4), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0); + dvecData8 = IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(vecData4), IVP_MOV2NX8U_FROMNX16(vecData4), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_1); + + /* De-interleave a b a b a b... and move to a a a a... and b b b b... 
*/ + IVP_DSELNX16I(vecTemp2, vecTemp1, vecData5, vecData5, IVP_DSELI_8B_DEINTERLEAVE_1); + dvecData9 = IVP_MOV2NX8U_FROMNX16(vecTemp1); dvecData10 = IVP_MOV2NX8U_FROMNX16(vecTemp2); + IVP_DSELNX16I(vecTemp2, vecTemp1, vecData6, vecData6, IVP_DSELI_8B_DEINTERLEAVE_1); + dvecData11 = IVP_MOV2NX8U_FROMNX16(vecTemp1); dvecData12 = IVP_MOV2NX8U_FROMNX16(vecTemp2); + IVP_DSELNX16I(vecTemp2, vecTemp1, vecData7, vecData7, IVP_DSELI_8B_DEINTERLEAVE_1); + dvecData13 = IVP_MOV2NX8U_FROMNX16(vecTemp1); dvecData14 = IVP_MOV2NX8U_FROMNX16(vecTemp2); + IVP_DSELNX16I(vecTemp2, vecTemp1, vecData8, vecData8, IVP_DSELI_8B_DEINTERLEAVE_1); + dvecData15 = IVP_MOV2NX8U_FROMNX16(vecTemp1); dvecData16 = IVP_MOV2NX8U_FROMNX16(vecTemp2); +#endif + + /* For conditional coefficient loads */ + int32_t enable2 = XT_SALT(1, remInCh); /* Will be 1 if remInCh > 1 */ + int32_t enable3 = XT_SALT(2, remInCh); /* Will be 1 if remInCh > 2 */ + + /* Coefficient Loads */ + xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1 * enable2); + xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1 * enable3); + xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1); + +#ifdef IVP_MULSUQA2N8XR8 + IVP_MULSUQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1); + IVP_MULSUQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2); + IVP_MULSUQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3); +#else + /* Multiply unsigned x signed and accumulate to 24-bits */ + IVP_MULUSPA2NX8(daccSum1, dvecData1, dvecCoeff1, dvecData2, dvecCoeff2); + IVP_MULUSPA2NX8(daccSum2, dvecData3, dvecCoeff1, dvecData4, dvecCoeff2); + IVP_MULUSPA2NX8(daccSum3, dvecData5, dvecCoeff1, dvecData6, dvecCoeff2); + IVP_MULUSPA2NX8(daccSum4, dvecData7, dvecCoeff1, dvecData8, dvecCoeff2); + IVP_MULUSPA2NX8(daccSum1, dvecData9, dvecCoeff3, dvecData10, 0); + IVP_MULUSPA2NX8(daccSum2, dvecData11, dvecCoeff3, dvecData12, 0); + 
IVP_MULUSPA2NX8(daccSum3, dvecData13, dvecCoeff3, dvecData14, 0); + IVP_MULUSPA2NX8(daccSum4, dvecData15, dvecCoeff3, dvecData16, 0); +#endif + } + } + /* Pack, Output Scale, Output Shift and clamping */ + xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L; + xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H; +#ifdef DILATED_VQ_CONV + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); +#else + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); +#endif + /* Store the output dvecOut1 along the output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOut + outCh * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh); + IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Store the output dvecOut2 along the output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1) * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * enable2ndWidth); + IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Store the output dvecOut3 along the output depth 
*/ + pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + 2 * outDataPitch1) * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * enable3rdWidth); + IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + } + } + } + return(XAI_ERROR_STATUS()); +} + +/******************************* end of VQ MOD variants ***************************************/ +/**********************************************************************************************/ +#endif /*#if ((XCHAL_VISION_TYPE >= 6))*/ diff --git a/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_MOD_S16.c b/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_MOD_S16.c new file mode 100644 index 00000000000..ce103bda723 --- /dev/null +++ b/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_MOD_S16.c @@ -0,0 +1,25 @@ +/* + * Copyright (c) 2018 by Cadence Design Systems, Inc. ALL RIGHTS RESERVED. + * These coded instructions, statements, and computer programs are the + * copyrighted works and confidential proprietary information of + * Cadence Design Systems Inc. They may be adapted and modified by bona fide + * purchasers for internal use, but neither the original nor any adapted + * or modified version may be disclosed or distributed to third parties + * in any manner, medium, or form, in whole or in part, without the prior + * written consent of Cadence Design Systems Inc. This software and its + * derivatives are to be executed solely on products incorporating a Cadence + * Design Systems processor. 
+ */ +#include "xai_cnn.h" +#include "xai_intrin.h" +#include "limits.h" + +#if ((XCHAL_VISION_TYPE >= 6)) + +#undef DILATED_VQ_CONV_S16 + +#include "cnn_dilated_conv_MOD_S16.h" + +/******************************* end of MOD variants ***************************************/ +/*******************************************************************************************/ +#endif /*#if ((XCHAL_VISION_TYPE >= 6))*/ diff --git a/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_MOD_S16.h b/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_MOD_S16.h new file mode 100644 index 00000000000..8d2c920eb12 --- /dev/null +++ b/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_MOD_S16.h @@ -0,0 +1,708 @@ +/* + * Copyright (c) 2018 by Cadence Design Systems, Inc. ALL RIGHTS RESERVED. + * These coded instructions, statements, and computer programs are the + * copyrighted works and confidential proprietary information of + * Cadence Design Systems Inc. They may be adapted and modified by bona fide + * purchasers for internal use, but neither the original nor any adapted + * or modified version may be disclosed or distributed to third parties + * in any manner, medium, or form, in whole or in part, without the prior + * written consent of Cadence Design Systems Inc. This software and its + * derivatives are to be executed solely on products incorporating a Cadence + * Design Systems processor. 
+ */ +#include "xai_cnn.h" +#include "xai_intrin.h" +#include "limits.h" + +#if ((XCHAL_VISION_TYPE >= 6)) + +/****************************************************************************************** +* MOD DWH variants +******************************************************************************************/ + +/****************************************************************************/ +/* Description : P6 optimized implementation for MxN MOD_DWH */ +/* 3D convolution for S16 for handling cases where */ +/* kWidth * numInCh is not a multiple of 4 */ +/* Code implementation is generated during preprocessing stage */ +/* This method can be used to generate MxN MOD_DWH 3D */ +/* dilated convolution function and MxN MOD_DWH 3D VQ */ +/* dilated convolution function */ +/* Stride values = 1, 2 and 4 are supported for dilation = 1 */ +/* Inputs : Input Data Tile, Coeff Data Tile, Bias Array, */ +/* Output scale array, CNN convolution params structure */ +/* Outputs : None (the function returns void) */ +/* InOuts : Output Tile */ +/* Assumptions : InData is S16, CoeffData is S16 */ +/* biasArray is signed 64b, value not exceeding signed 48b */ +/* Output scale array is U16 */ +/* OutData is U16/S16 */ +/* Kernel Size is MxNxDxNk. M and N sizes are less than or */ +/* equal to 15. */ +/* Input and Output are in DWH format */ +/* Coeff is in NDWH format */ +/* Edges along Depth dimension in inTile and coeffTile */ +/* are zero.
*/ +/****************************************************************************/ + +#ifdef DILATED_VQ_CONV_S16 +static _XAI_INLINE_ void convolvedVQ3D_S_MxN_S16S16I16_MOD_DWH_contiguous_depth(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param) +#else +static _XAI_INLINE_ void convolved3D_S_MxN_S16S16I16_MOD_DWH_contiguous_depth(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param) +#endif +{ /* One body serves both builds: with DILATED_VQ_CONV_S16 the output scale is read from outputScaleArray, otherwise a single scale is read from param. Each pass of the inner loops produces a 2x2 patch of output pixels for one group of up to XCHAL_IVPN_SIMD_WIDTH output channels, with the kernel-width and input-channel loops fused into one. */ + /* Getting parameters from the tile structures */ + const int32_t outW = XAI_TILE3D_GET_DIM2(outTile); + const int32_t outH = XAI_TILE3D_GET_DIM3(outTile); + const int32_t numInCh = XAI_TILE3D_GET_DIM1(inTile); + const int32_t numOutCh = XAI_TILE3D_GET_DIM1(outTile); + + /* Kernel Size (NDWH): width is dim3 and height is dim4 of the coefficient tile */ + const int32_t kWidthU = XAI_TILE4D_GET_DIM3(coeffTile); + const int32_t kHeightU = XAI_TILE4D_GET_DIM4(coeffTile); + + /* CNN convolution parameters */ + const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param); + const uint8_t outShiftU = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param); + const uint8_t enableReLu = XAI_CNN_CONV_GET_FLAG_RELU(param); + const uint8_t strideX = XAI_CNN_CONV_GET_STRIDEX(param); + const uint8_t strideY = XAI_CNN_CONV_GET_STRIDEY(param); + const uint8_t leftEdgeFlag = XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param); + const uint8_t topEdgeFlag = XAI_CNN_CONV_GET_FLAG_TOPEDGE(param); + + /* Data Pointers of input, output, coefficient and bias data */ + int16_t *pInData = (int16_t *) XAI_TILE3D_GET_DATA_PTR(inTile); + int16_t *pOutData = (int16_t *) XAI_TILE3D_GET_DATA_PTR(outTile); + int16_t *pCoeffData = (int16_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile); + int64_t *pBiasData64 = (int64_t *) XAI_ARRAY_GET_DATA_PTR(biasArray); + +#ifdef DILATED_VQ_CONV_S16 + xb_vecNx16U* restrict pOutScaleData = (xb_vecNx16U *)
XAI_ARRAY_GET_DATA_PTR(outputScaleArray); +#else + const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param); +#endif + + /* Pitches of Coefficient Data (NDWH) in dim1 and dim3 (the dim2 pitch is not read in this function) */ + const int32_t coeffPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTile); + const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile); + + /* Pitches of Input Data (DWH) in dim1 and dim2 */ + const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile); + const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile); + + /* Pitch of Output Data (DWH) in dim1 and dim2 */ + const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile); + const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile); + int32_t leftEdge, topEdge; /* edge sizes used below to step pInData back to the first edge element */ + int32_t minLim, maxLim; /* clamp range handed to the pack/scale/shift/clamp macros */ + + if ((kWidthU % 2) != 0) /* odd kernel width: edge is kWidthU / 2 */ + { + leftEdge = kWidthU / 2; + } + else /* even kernel width: leftEdgeFlag selects kWidthU / 2 or kWidthU / 2 - 1 */ + { + leftEdge = leftEdgeFlag ? (kWidthU / 2) : ((kWidthU / 2) - 1); + } + + if ((kHeightU % 2) != 0) /* odd kernel height: edge is kHeightU / 2 */ + { + topEdge = kHeightU / 2; + } + else /* even kernel height: topEdgeFlag selects kHeightU / 2 or kHeightU / 2 - 1 */ + { + topEdge = topEdgeFlag ? (kHeightU / 2) : ((kHeightU / 2) - 1); + } + + + /* Move pointer to the start of the data (including edge) */ + pInData = &pInData[-((leftEdge) * inDataPitch1 + (topEdge) * inDataPitch2)]; + + + /* Setting the limits for output data: the ReLU min/max from param when the ReLU flag is set, otherwise the full S16 or U16 range chosen by the output tile type */ + if (enableReLu) + { + minLim = XAI_CNN_CONV_GET_RELU_MIN(param); + maxLim = XAI_CNN_CONV_GET_RELU_MAX(param); + } + else + { + minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MIN : 0; + maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ?
SHRT_MAX : USHRT_MAX; + } + + /* Variable Declarations */ + int32_t outCh, x, y, ky, k; + int32_t numIter = kWidthU * numInCh; /* trip count of the fused (kWidth * input channel) loop for one kernel row */ + + xb_vec2Nx8 *restrict pdvecBias = (xb_vec2Nx8 *) (pBiasData64); + xb_vecN_2x32v* restrict phvecIn1; + xb_vecN_2x32v* restrict phvecIn2; + xb_vecN_2x32v* restrict phvecIn3; + xb_vecN_2x32v* restrict phvecIn4; + + xb_vecNx16* restrict pvecCoeff; + xb_vecNx16* restrict pvecOut; + + valign vaOutData = IVP_ZALIGN(), vaBias = IVP_LA2NX8_PP(pdvecBias); + + /* + * The inCh and kWidth loops are combined into one loop of numIter + * iterations. This assumes that the edges along the depth dimension + * of the input data are zero and that the edges along the depth + * dimension of the coefficient data are zero as well. + */ + + /* Loops Start */ + for (outCh = 0; outCh < numOutCh; outCh += XCHAL_IVPN_SIMD_WIDTH) + { /* walk across the kernels */ + /* To handle corner case when number of output channels + * is not a multiple of XCHAL_IVPN_SIMD_WIDTH: remainingOutCh is + * passed to the bias load, the scale load and the output stores */ + xb_vecNx48 accBias48; + int32_t remainingOutCh = numOutCh - outCh; + ACC_INIT_BIAS64_MOD_ONEACC(pdvecBias, vaBias, remainingOutCh, accBias48); +#ifdef DILATED_VQ_CONV_S16 + xb_vecNx16U vecScaleData; + /* Load output scale values for this group of output channels (2 * remainingOutCh is passed as the load length) */ + valign vaScale = IVP_LANX16U_PP(pOutScaleData); + IVP_LAVNX16U_XP(vecScaleData, vaScale, pOutScaleData, 2 * remainingOutCh); +#endif + + for (y = 0; y < outH; y += 2) /* Image Height, two output rows per pass */ + { /* walk down the rows */ + /* numY is 1 when output row y + 1 exists and 0 on the last row of an odd height */ + int32_t numY = XT_MIN(1, outH - y - 1); + for (x = 0; x < outW; x += 2) /* Image Width, two output columns per pass */ + { /* walk across the columns */ + /* numX is 1 when output column x + 1 exists and 0 on the last column of an odd width */ + int32_t numX = XT_MIN(1, outW - x - 1); + + /* Output Data pointer */ + int16_t *pOut = pOutData + (x * outDataPitch1 + y * outDataPitch2); + + /* Initialize accumulators with bias values; accSum1..accSum4 belong to outputs (x, y), (x + 1, y), (x, y + 1) and (x + 1, y + 1) */ + xb_vecNx48 accSum1, accSum2, accSum3, accSum4; + accSum4 = accSum3 = accSum2 = accSum1 = accBias48; + + /* Input Data and Coeff Data Pointers */ + int16_t *pData = pInData + x * strideX * inDataPitch1 + y *
strideY * inDataPitch2; + int16_t *pCoeff = pCoeffData + outCh; + + for (ky = 0; ky < kHeightU; ky++) /* Kernel Height */ + { + /* Pointers for Input Data Loads, one per output of the 2x2 patch; the numX / numY factors zero the offset of an output that does not exist, so its pointer falls back to the (x, y) input */ + phvecIn1 = (xb_vecN_2x32v *) (pData + ky * inDataPitch2); + phvecIn2 = (xb_vecN_2x32v *) (pData + ky * inDataPitch2 + strideX * inDataPitch1 * numX); + phvecIn3 = (xb_vecN_2x32v *) (pData + ky * inDataPitch2 + strideY * inDataPitch2 * numY); + phvecIn4 = (xb_vecN_2x32v *) (pData + ky * inDataPitch2 + (strideX * \ + inDataPitch1 + strideY * inDataPitch2) * numX * numY); + + /* Primes for Aligning Load */ + valign vaData1 = IVP_LAN_2X32_PP(phvecIn1); + valign vaData2 = IVP_LAN_2X32_PP(phvecIn2); + valign vaData3 = IVP_LAN_2X32_PP(phvecIn3); + valign vaData4 = IVP_LAN_2X32_PP(phvecIn4); + /* Pointer for Coefficient Load: row ky of the kernel for this group of output channels */ + pvecCoeff = (xb_vecNx16 *) (pCoeff + ky * coeffPitch3); + + for (k = 0; k < numIter - 3; k += 4) /* (Input Channels * kWidth) loops combined, four elements per pass */ + { + /* Aligning variable vector load of pixels (length 8 - presumably bytes, i.e. four int16 values per output, matching k += 4; TODO confirm against the ISA reference) */ + xb_vecN_2x32v hvecData1; IVP_LAVN_2X32_XP(hvecData1, vaData1, phvecIn1, 8); + xb_vecN_2x32v hvecData2; IVP_LAVN_2X32_XP(hvecData2, vaData2, phvecIn2, 8); + xb_vecN_2x32v hvecData3; IVP_LAVN_2X32_XP(hvecData3, vaData3, phvecIn3, 8); + xb_vecN_2x32v hvecData4; IVP_LAVN_2X32_XP(hvecData4, vaData4, phvecIn4, 8); + + /* Vector Loads of four coefficient vectors, stepping the pointer by 2 * coeffPitch1 after each one */ + xb_vecNx16 vecCoeff1; IVP_L2UNX16_XP(vecCoeff1, pvecCoeff, 2 * coeffPitch1); + xb_vecNx16 vecCoeff2; IVP_L2UNX16_XP(vecCoeff2, pvecCoeff, 2 * coeffPitch1); + xb_vecNx16 vecCoeff3; IVP_L2UNX16_XP(vecCoeff3, pvecCoeff, 2 * coeffPitch1); + xb_vecNx16 vecCoeff4; IVP_L2UNX16_XP(vecCoeff4, pvecCoeff, 2 * coeffPitch1); + + IVP_MULPAN16XR16(accSum1, vecCoeff2, vecCoeff1, IVP_EXTRN_2X32(hvecData1, 0)); /* 32-bit lane 0 of the input is paired with coefficient vectors 1 and 2 */ + IVP_MULPAN16XR16(accSum2, vecCoeff2, vecCoeff1, IVP_EXTRN_2X32(hvecData2, 0)); + IVP_MULPAN16XR16(accSum3, vecCoeff2, vecCoeff1, IVP_EXTRN_2X32(hvecData3, 0)); + IVP_MULPAN16XR16(accSum4, vecCoeff2, vecCoeff1, IVP_EXTRN_2X32(hvecData4, 0)); + +
IVP_MULPAN16XR16(accSum1, vecCoeff4, vecCoeff3, IVP_EXTRN_2X32(hvecData1, 1)); /* 32-bit lane 1 of the input is paired with coefficient vectors 3 and 4 */ + IVP_MULPAN16XR16(accSum2, vecCoeff4, vecCoeff3, IVP_EXTRN_2X32(hvecData2, 1)); + IVP_MULPAN16XR16(accSum3, vecCoeff4, vecCoeff3, IVP_EXTRN_2X32(hvecData3, 1)); + IVP_MULPAN16XR16(accSum4, vecCoeff4, vecCoeff3, IVP_EXTRN_2X32(hvecData4, 1)); + } /* End combined (Input Channels * kWidth) loop */ + if (k < numIter) /* 1 to 3 leftover elements of the combined loop */ + { + int32_t remInCh = numIter - k; /* leftover count of the combined loop, not of input channels alone */ + /* For conditional coefficient loads: enable2 / enable3 scale the pointer step so the coefficient pointer only advances while another element remains */ + int32_t enable2 = XT_SALT(1, remInCh); /* Will be 1 if remInCh > 1 */ + int32_t enable3 = XT_SALT(2, remInCh); /* Will be 1 if remInCh > 2 */ + + /* Aligning variable vector load of pixels, limited to the leftover elements (length 2 * remInCh) */ + xb_vecN_2x32v hvecData1; IVP_LAVN_2X32_XP(hvecData1, vaData1, phvecIn1, 2 * remInCh); + xb_vecN_2x32v hvecData2; IVP_LAVN_2X32_XP(hvecData2, vaData2, phvecIn2, 2 * remInCh); + xb_vecN_2x32v hvecData3; IVP_LAVN_2X32_XP(hvecData3, vaData3, phvecIn3, 2 * remInCh); + xb_vecN_2x32v hvecData4; IVP_LAVN_2X32_XP(hvecData4, vaData4, phvecIn4, 2 * remInCh); + + /* Vector Loads of up to three coefficient vectors; when the step is 0 the next load re-reads the same vector (presumably harmless because the matching input element was not loaded - TODO confirm the variable load zero-fills) */ + xb_vecNx16 vecCoeff1; IVP_L2UNX16_XP(vecCoeff1, pvecCoeff, 2 * coeffPitch1 * enable2); + xb_vecNx16 vecCoeff2; IVP_L2UNX16_XP(vecCoeff2, pvecCoeff, 2 * coeffPitch1 * enable3); + xb_vecNx16 vecCoeff3; IVP_L2UNX16_XP(vecCoeff3, pvecCoeff, 2 * coeffPitch1); + + IVP_MULPAN16XR16(accSum1, vecCoeff2, vecCoeff1, IVP_EXTRN_2X32(hvecData1, 0)); + IVP_MULPAN16XR16(accSum2, vecCoeff2, vecCoeff1, IVP_EXTRN_2X32(hvecData2, 0)); + IVP_MULPAN16XR16(accSum3, vecCoeff2, vecCoeff1, IVP_EXTRN_2X32(hvecData3, 0)); + IVP_MULPAN16XR16(accSum4, vecCoeff2, vecCoeff1, IVP_EXTRN_2X32(hvecData4, 0)); /* below: the fourth coefficient operand is the constant 0 because at most three elements remain */ + + IVP_MULPAN16XR16(accSum1, 0, vecCoeff3, IVP_EXTRN_2X32(hvecData1, 1)); + IVP_MULPAN16XR16(accSum2, 0, vecCoeff3, IVP_EXTRN_2X32(hvecData2, 1)); + IVP_MULPAN16XR16(accSum3, 0, vecCoeff3, IVP_EXTRN_2X32(hvecData3, 1)); + IVP_MULPAN16XR16(accSum4, 0, vecCoeff3, IVP_EXTRN_2X32(hvecData4, 1)); + } + } /* End Kernel Height * Width */ + + /* Pack, Output Scale, Output Shift
and clamping */ + xb_vecNx16 vecOut1, vecOut2, vecOut3, vecOut4; +#ifdef DILATED_VQ_CONV_S16 + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ_S16(vecOut1, accSum1, packShiftAccU, \ + vecScaleData, outShiftU, minLim, maxLim); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ_S16(vecOut2, accSum2, packShiftAccU, \ + vecScaleData, outShiftU, minLim, maxLim); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ_S16(vecOut3, accSum3, packShiftAccU, \ + vecScaleData, outShiftU, minLim, maxLim); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ_S16(vecOut4, accSum4, packShiftAccU, \ + vecScaleData, outShiftU, minLim, maxLim); +#else + PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut1, accSum1, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim); + PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut2, accSum2, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim); + PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut3, accSum3, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim); + PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut4, accSum4, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim); +#endif + /* Store vecOut1, output (x, y), along the output depth */ + pvecOut = (xb_vecNx16 *) (pOut + outCh); + IVP_SAVNX16_XP(vecOut1, vaOutData, pvecOut, 2 * remainingOutCh); + IVP_SAPOSNX16_FP(vaOutData, pvecOut); + + /* Store vecOut2, output (x + 1, y), along the output depth; with numX == 0 the pointer is pOut and the store length is 0 */ + pvecOut = (xb_vecNx16 *) (pOut + (outCh + outDataPitch1) * numX); + IVP_SAVNX16_XP(vecOut2, vaOutData, pvecOut, 2 * remainingOutCh * numX); + IVP_SAPOSNX16_FP(vaOutData, pvecOut); + + /* Store vecOut3, output (x, y + 1), along the output depth; with numY == 0 the pointer is pOut and the store length is 0 */ + pvecOut = (xb_vecNx16 *) (pOut + (outCh + outDataPitch2) * numY); + IVP_SAVNX16_XP(vecOut3, vaOutData, pvecOut, 2 * remainingOutCh * numY); + IVP_SAPOSNX16_FP(vaOutData, pvecOut); + + /* Store vecOut4, output (x + 1, y + 1), along the output depth; the store length is 0 unless both numX and numY are 1 */ + pvecOut = (xb_vecNx16 *) (pOut + (outCh + outDataPitch1 * numX + outDataPitch2 * numY)); + IVP_SAVNX16_XP(vecOut4, vaOutData, pvecOut, 2 * remainingOutCh * numX * numY); + IVP_SAPOSNX16_FP(vaOutData, pvecOut); + } /* End
image width */ + } /* End image height */ + } /* End Output Channels */ +} + +/****************************************************************************/ +/* Description : P6 optimized generic implementation for MxN MOD_DWH */ +/* 3D convolution. Based on pre-processor specifiers. Code */ +/* implementation is generated during preprocessing stage. */ +/* This method can be used to generate MxN MOD_DWH 3D */ +/* dilated convolution function and MxN MOD_DWH 3D VQ */ +/* dilated convolution function */ +/* Stride values = 1, 2 and 4 are supported */ +/* Implementation also supports dilation >= 1 for stride = 1 */ +/* and dilation = 1 for stride = 2, 4 */ +/* Inputs : Input Data Tile, Coeff Data Tile, Bias Array, */ +/* Output scale array, CNN convolution params structure */ +/* Outputs : XI Error Code */ +/* InOuts : Output Tile */ +/* Assumptions : InData is S16, CoeffData is S16 */ +/* biasArray is signed 64b, value not exceeding signed 48b */ +/* Output scale array is S16 */ +/* OutData is U16/S16 */ +/* Kernel Size is MxNxDxNk. M and N sizes are less than or */ +/* equal to 15.
*/ +/* Input and Output are in DWH format */ +/* Coeff is in NDWH format */ +/* No edges along dimension 1 of inTile */ +/****************************************************************************/ + +#ifdef DILATED_VQ_CONV_S16 +XAI_ERR_TYPE xaiConvolvedVQ3D_S_MxN_S16S16I16_MOD_DWH(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param) +#else +XAI_ERR_TYPE xaiConvolved3D_S_MxN_S16S16I16_MOD_DWH(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param) +#endif +{ + /* Error Checks */ + XAI_ERROR_CHECKS() + { + XAI_CHECK_TILE3D_S16(inTile); + XAI_CHECK_TILE3D_I16(outTile); + XAI_CHECK_TILE4D_S16(coeffTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile); + XAI_CHECK_TILE4D_IN_DRAM_BOUNDARY(coeffTile); + XAI_CHECK_POINTER(param); + XAI_CHECK_ARRAY_S64(biasArray); + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTile); + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(coeffTile, outTile); + XAI_CHECK_ERROR((XAI_TILE4D_GET_DIM3(coeffTile) <= 64) && (XAI_TILE4D_GET_DIM4(coeffTile) <= 64), XAI_ERR_KSIZE, \ + "\nKernel Width = %d, Kernel Height = %d\nKernel Width and Height should be less than or equal to 64", \ + XAI_TILE4D_GET_DIM3(coeffTile), XAI_TILE4D_GET_DIM4(coeffTile)); + XAI_CHECK_EDGES_MOD_DWH(inTile, coeffTile, param); + XAI_CHECK_ERROR(((XAI_CNN_CONV_GET_STRIDEX(param) > 0) && (XAI_CNN_CONV_GET_STRIDEY(param) > 0)) && \ + ((XAI_CNN_CONV_GET_STRIDEX(param) <= 64) && (XAI_CNN_CONV_GET_STRIDEY(param) <= 64)), XAI_ERR_BADARG, \ + "\nStrideX = %hhu, StrideY = %hhu\nStride along width and height should be greater than 0 and less than or equal to 64", \ + XAI_CNN_CONV_GET_STRIDEX(param), XAI_CNN_CONV_GET_STRIDEY(param)); + XAI_CHECK_ERROR((XAI_CNN_CONV_GET_DILATION(param) == 1) || \ + ((XAI_CNN_CONV_GET_DILATION(param) >= 1) && 
\ + (XAI_CNN_CONV_GET_STRIDEX(param) == 1) && (XAI_CNN_CONV_GET_STRIDEY(param) == 1)), XAI_ERR_BADARG, \ + "\nDilation = %hhu\nDilation should be 1. It can be greater than 1 only when stride is equal to 1", \ + XAI_CNN_CONV_GET_DILATION(param)); + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_DILATIONX(param) == XAI_CNN_CONV_GET_DILATIONY(param), \ + XAI_ERR_BADARG, "\nDilation along width = %hhu and Dilation along height = %hhu\n \ + Dilation along width should be equal to dilation along height.", + XAI_CNN_CONV_GET_DILATIONX(param), XAI_CNN_CONV_GET_DILATIONY(param)); + XAI_CHECK_TILE3D_DATA_ORDER(inTile, XAI_DWH); + XAI_CHECK_TILE3D_DATA_ORDER(outTile, XAI_DWH); + XAI_CHECK_TILE4D_DATA_ORDER(coeffTile, XAI_NDWH); + XAI_CHECK_CONSISTENCY_MOD_DWH(inTile, coeffTile, biasArray, outTile, param); + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_ACCUM_SHIFT(param) < 32, \ + XAI_ERR_NORM, "\nAccumulator shift value = %hhu, value should be less than 32", \ + XAI_CNN_CONV_GET_ACCUM_SHIFT(param)); + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_OUTPUT_SHIFT(param) < 32, \ + XAI_ERR_NORM, "\nOutput shift = %hhu, value should be less than 32", \ + XAI_CNN_CONV_GET_OUTPUT_SHIFT(param)); + XAI_CHECK_CONV_RELU_LIMITS_IX(param, outTile); + +#ifdef DILATED_VQ_CONV_S16 + XAI_CHECK_ARRAY_U16(outputScaleArray); + XAI_CHECK_ERROR(XAI_ARRAY_GET_WIDTH(outputScaleArray) >= XAI_TILE4D_GET_DIM1(coeffTile), XAI_ERR_DATASIZE, \ + "\nWidth of Output Scale Array = %d, Number of Kernels = %d\nWidth of Output Scale Array should be greater than or equal to Number of Kernels", \ + XAI_ARRAY_GET_WIDTH(outputScaleArray), XAI_TILE4D_GET_DIM1(coeffTile)); +#endif + } + +#ifndef DILATED_VQ_CONV_S16 + if (XAI_CNN_CONV_GET_OUTPUT_SCALE(param) == 0) + { + int32_t fillValue; + int32_t reluFlag = XAI_CNN_CONV_GET_FLAG_RELU(param); + fillValue = reluFlag ? 
(CLAMP(0, XAI_CNN_CONV_GET_RELU_MIN(param), XAI_CNN_CONV_GET_RELU_MAX(param))) : 0; + return(xaiFillTile3D(outTile, fillValue, 0)); + } +#endif + + /* Calling further optimized function if dim1Size == dim1Pitch */ + if (XAI_TILE3D_GET_DIM1(inTile) == XAI_TILE3D_GET_DIM1_PITCH(inTile) && XAI_CNN_CONV_GET_DILATION(param) == 1) + { +#ifdef DILATED_VQ_CONV_S16 + convolvedVQ3D_S_MxN_S16S16I16_MOD_DWH_contiguous_depth(inTile, \ + coeffTile, biasArray, outputScaleArray, outTile, param); +#else + convolved3D_S_MxN_S16S16I16_MOD_DWH_contiguous_depth(inTile, \ + coeffTile, biasArray, outTile, param); +#endif + return(XAI_ERROR_STATUS()); + } + + /* Getting parameters from the tile structures */ + const int32_t outW = XAI_TILE3D_GET_DIM2(outTile); + const int32_t outH = XAI_TILE3D_GET_DIM3(outTile); + const int32_t numInCh = XAI_TILE3D_GET_DIM1(inTile); + const int32_t numOutCh = XAI_TILE3D_GET_DIM1(outTile); + + /* Kernel Size (NDWH) */ + const int32_t kWidthU = XAI_TILE4D_GET_DIM3(coeffTile); + const int32_t kHeightU = XAI_TILE4D_GET_DIM4(coeffTile); + const int32_t dilationU = XAI_CNN_CONV_GET_DILATION(param); + + /* CNN convolution parameters */ + const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param); + const uint8_t outShiftU = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param); + const uint8_t enableReLu = XAI_CNN_CONV_GET_FLAG_RELU(param); + const uint8_t strideX = XAI_CNN_CONV_GET_STRIDEX(param); + const uint8_t strideY = XAI_CNN_CONV_GET_STRIDEY(param); + const uint8_t leftEdgeFlag = XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param); + const uint8_t topEdgeFlag = XAI_CNN_CONV_GET_FLAG_TOPEDGE(param); + + /* Data Pointers of input, output, coefficient and bias data */ + int16_t *pInData = (int16_t *) XAI_TILE3D_GET_DATA_PTR(inTile); + int16_t *pOutData = (int16_t *) XAI_TILE3D_GET_DATA_PTR(outTile); + int16_t *pCoeffData = (int16_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile); + int64_t *pBiasData64 = (int64_t *) XAI_ARRAY_GET_DATA_PTR(biasArray); +#ifdef DILATED_VQ_CONV_S16 + 
xb_vecNx16U* restrict pOutScaleData = (xb_vecNx16U *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray); +#else + const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param); +#endif + + /* Pitches of Coefficient Data (NDWH) in dim1, dim2 and dim3 */ + const int32_t coeffPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTile); + const int32_t coeffPitch2 = XAI_TILE4D_GET_DIM2_PITCH(coeffTile); + const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile); + + /* Pitches of Input Data (DWH) in dim1 and dim2 */ + const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile); + const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile); + + /* Pitch of Output Data (DWH) in dim1 and dim2 */ + const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile); + const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile); + + int32_t dilatedKWidthU = dilationU * (kWidthU - 1) + 1; + int32_t dilatedKHeightU = dilationU * (kHeightU - 1) + 1; + int32_t leftEdge, topEdge; + int32_t minLim, maxLim; + + if ((dilatedKWidthU % 2) != 0) + { + leftEdge = dilatedKWidthU / 2; + } + else + { + leftEdge = leftEdgeFlag ? (dilatedKWidthU / 2) : ((dilatedKWidthU / 2) - 1); + } + + if ((dilatedKHeightU % 2) != 0) + { + topEdge = dilatedKHeightU / 2; + } + else + { + topEdge = topEdgeFlag ? (dilatedKHeightU / 2) : ((dilatedKHeightU / 2) - 1); + } + + /* Move pointer to the start of the data (including edge) */ + pInData = &pInData[-((leftEdge) * inDataPitch1 + (topEdge) * inDataPitch2)]; + + /* Setting the limits for output data according to ReLu Flag and outTileType */ + if (enableReLu) + { + minLim = XAI_CNN_CONV_GET_RELU_MIN(param); + maxLim = XAI_CNN_CONV_GET_RELU_MAX(param); + } + else + { + minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MIN : 0; + maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? 
SHRT_MAX : USHRT_MAX; + } + + /* Variable Declarations */ + int32_t outCh, x, y, k, inCh; + valign vaOutData = IVP_ZALIGN(); + + /* Vector data pointers */ + xb_vec2Nx8 *restrict pdvecBias = (xb_vec2Nx8 *) (pBiasData64); + xb_vecN_2x32v* restrict phvecIn1; + xb_vecN_2x32v* restrict phvecIn2; + xb_vecN_2x32v* restrict phvecIn3; + xb_vecN_2x32v* restrict phvecIn4; + xb_vecNx16* restrict pvecCoeff; + xb_vecNx16* restrict pvecOut; + + valign vaBias = IVP_LA2NX8_PP(pdvecBias); + /* + * inCh and kWidth loops are combined. Assumed that the + * edges along Depth dimension of input data is zero and also + * edges along depth dimension of coefficient data is zero. + */ + + /* Loops Start */ + for (outCh = 0; outCh < numOutCh; outCh += XCHAL_IVPN_SIMD_WIDTH) + { /* walk across the kernels */ + /* To handle corner case when number of output channels + * is not a multiple of XCHAL_IVPN_SIMD_WIDTH*/ + xb_vecNx48 accBias48; + int32_t remainingOutCh = numOutCh - outCh; + ACC_INIT_BIAS64_MOD_ONEACC(pdvecBias, vaBias, remainingOutCh, accBias48); +#ifdef DILATED_VQ_CONV_S16 + xb_vecNx16U vecScaleData; + /*Load output scale values*/ + valign vaScale = IVP_LANX16U_PP(pOutScaleData); + IVP_LAVNX16U_XP(vecScaleData, vaScale, pOutScaleData, 2 * remainingOutCh); +#endif + + for (y = 0; y < outH; y += 2) /* Image Height */ + { /* walk down the rows */ + /* Variable to handle corner case when height is odd */ + int32_t numY = XT_MIN(1, outH - y - 1); + + for (x = 0; x < outW; x += 2) /* Image Width */ + { /* walk across the columns */ + /* Variable to handle corner case when width is odd */ + int32_t numX = XT_MIN(1, outW - x - 1); + + /* Output Data pointer */ + int16_t *pOut = pOutData + (x * outDataPitch1 + y * outDataPitch2); + + /* Initialize accumulators with bias values */ + xb_vecNx48 accSum1, accSum2, accSum3, accSum4; + accSum4 = accSum3 = accSum2 = accSum1 = accBias48; + + /* Input Data and Coeff Data Pointers */ + int16_t *pData = pInData + x * strideX * inDataPitch1 + y * 
strideY * inDataPitch2; + int16_t *pCoeff = pCoeffData + outCh; + + xb_vecN_2x32v hvecInAddrOff = 0; + xb_vecN_2x32v hvecCoeffAddrOff = 0; + xb_vecN_2x32v hvecLaneIdx = 0; + int32_t inAddrOff, coeffAddrOff; + + for (k = 0; k < kHeightU * kWidthU; k++) /* Kernel Height * Kernel Width */ + { + /* Condition checks performed to get the Input and Coefficient */ + /* Pointer Offsets after combining the Kernel Width and Height Loops */ + vboolN_2 vbN_2 = IVP_EQN_2X32(hvecLaneIdx, kWidthU); + /* hvecLaneIdx will be reset to zero after every kWidth */ + hvecLaneIdx = IVP_MOVN_2X32T(0, hvecLaneIdx, vbN_2); + /* InPitch added after every kWidth */ + IVP_ADDN_2X32T(hvecInAddrOff, hvecInAddrOff, inDataPitch2 * dilationU - kWidthU * inDataPitch1 * dilationU, vbN_2); + /* CoeffPitch added after every kWidth */ + IVP_ADDN_2X32T(hvecCoeffAddrOff, hvecCoeffAddrOff, coeffPitch3 - kWidthU * coeffPitch2, vbN_2); + /* Extracting Input and Coefficient address offsets */ + inAddrOff = IVP_EXTRN_2X32(hvecInAddrOff, 0); + coeffAddrOff = IVP_EXTRN_2X32(hvecCoeffAddrOff, 0); + hvecLaneIdx = IVP_ADDN_2X32(hvecLaneIdx, 1); + hvecCoeffAddrOff = IVP_ADDN_2X32(hvecCoeffAddrOff, coeffPitch2); + hvecInAddrOff = IVP_ADDN_2X32(hvecInAddrOff, inDataPitch1 * dilationU); + + /* Pointers for Input Data Loads */ + phvecIn1 = (xb_vecN_2x32v *) (pData + inAddrOff); + phvecIn2 = (xb_vecN_2x32v *) (pData + inAddrOff + strideX * inDataPitch1 * numX); + phvecIn3 = (xb_vecN_2x32v *) (pData + inAddrOff + strideY * inDataPitch2 * numY); + phvecIn4 = (xb_vecN_2x32v *) (pData + inAddrOff + (strideX * \ + inDataPitch1 + strideY * inDataPitch2) * numX * numY); + + /* Primes for Aligning Load */ + valign vaData1 = IVP_LAN_2X32_PP(phvecIn1); + valign vaData2 = IVP_LAN_2X32_PP(phvecIn2); + valign vaData3 = IVP_LAN_2X32_PP(phvecIn3); + valign vaData4 = IVP_LAN_2X32_PP(phvecIn4); + + /* Pointer for Coefficient Load */ + pvecCoeff = (xb_vecNx16 *) (pCoeff + coeffAddrOff); + + for (inCh = 0; inCh < numInCh - 3; inCh += 4) /* 
Input Channels */ + { + /* Aligning variable vector load of pixels */ + xb_vecN_2x32v hvecData1; IVP_LAVN_2X32_XP(hvecData1, vaData1, phvecIn1, 8); + xb_vecN_2x32v hvecData2; IVP_LAVN_2X32_XP(hvecData2, vaData2, phvecIn2, 8); + xb_vecN_2x32v hvecData3; IVP_LAVN_2X32_XP(hvecData3, vaData3, phvecIn3, 8); + xb_vecN_2x32v hvecData4; IVP_LAVN_2X32_XP(hvecData4, vaData4, phvecIn4, 8); + + /* Aligned Vector Loads of coefficients */ + xb_vecNx16 vecCoeff1; IVP_L2UNX16_XP(vecCoeff1, pvecCoeff, 2 * coeffPitch1); + xb_vecNx16 vecCoeff2; IVP_L2UNX16_XP(vecCoeff2, pvecCoeff, 2 * coeffPitch1); + xb_vecNx16 vecCoeff3; IVP_L2UNX16_XP(vecCoeff3, pvecCoeff, 2 * coeffPitch1); + xb_vecNx16 vecCoeff4; IVP_L2UNX16_XP(vecCoeff4, pvecCoeff, 2 * coeffPitch1); + + IVP_MULPAN16XR16(accSum1, vecCoeff2, vecCoeff1, IVP_EXTRN_2X32(hvecData1, 0)); + IVP_MULPAN16XR16(accSum2, vecCoeff2, vecCoeff1, IVP_EXTRN_2X32(hvecData2, 0)); + IVP_MULPAN16XR16(accSum3, vecCoeff2, vecCoeff1, IVP_EXTRN_2X32(hvecData3, 0)); + IVP_MULPAN16XR16(accSum4, vecCoeff2, vecCoeff1, IVP_EXTRN_2X32(hvecData4, 0)); + + IVP_MULPAN16XR16(accSum1, vecCoeff4, vecCoeff3, IVP_EXTRN_2X32(hvecData1, 1)); + IVP_MULPAN16XR16(accSum2, vecCoeff4, vecCoeff3, IVP_EXTRN_2X32(hvecData2, 1)); + IVP_MULPAN16XR16(accSum3, vecCoeff4, vecCoeff3, IVP_EXTRN_2X32(hvecData3, 1)); + IVP_MULPAN16XR16(accSum4, vecCoeff4, vecCoeff3, IVP_EXTRN_2X32(hvecData4, 1)); + } /* End Input Channels */ + if (inCh < numInCh) + { + int32_t remInCh = numInCh - inCh; + /* For conditional coefficient loads */ + int32_t enable2 = XT_SALT(1, remInCh); /* Will be 1 if remInCh > 1 */ + int32_t enable3 = XT_SALT(2, remInCh); /* Will be 1 if remInCh > 2 */ + + /* Aligning variable vector load of pixels */ + xb_vecN_2x32v hvecData1; IVP_LAVN_2X32_XP(hvecData1, vaData1, phvecIn1, 2 * remInCh); + xb_vecN_2x32v hvecData2; IVP_LAVN_2X32_XP(hvecData2, vaData2, phvecIn2, 2 * remInCh); + xb_vecN_2x32v hvecData3; IVP_LAVN_2X32_XP(hvecData3, vaData3, phvecIn3, 2 * remInCh); + 
xb_vecN_2x32v hvecData4; IVP_LAVN_2X32_XP(hvecData4, vaData4, phvecIn4, 2 * remInCh); + + /* Aligned Vector Loads of coefficients */ + xb_vecNx16 vecCoeff1; IVP_L2UNX16_XP(vecCoeff1, pvecCoeff, 2 * coeffPitch1 * enable2); + xb_vecNx16 vecCoeff2; IVP_L2UNX16_XP(vecCoeff2, pvecCoeff, 2 * coeffPitch1 * enable3); + xb_vecNx16 vecCoeff3; IVP_L2UNX16_XP(vecCoeff3, pvecCoeff, 2 * coeffPitch1); + + IVP_MULPAN16XR16(accSum1, vecCoeff2, vecCoeff1, IVP_EXTRN_2X32(hvecData1, 0)); + IVP_MULPAN16XR16(accSum2, vecCoeff2, vecCoeff1, IVP_EXTRN_2X32(hvecData2, 0)); + IVP_MULPAN16XR16(accSum3, vecCoeff2, vecCoeff1, IVP_EXTRN_2X32(hvecData3, 0)); + IVP_MULPAN16XR16(accSum4, vecCoeff2, vecCoeff1, IVP_EXTRN_2X32(hvecData4, 0)); + + IVP_MULPAN16XR16(accSum1, 0, vecCoeff3, IVP_EXTRN_2X32(hvecData1, 1)); + IVP_MULPAN16XR16(accSum2, 0, vecCoeff3, IVP_EXTRN_2X32(hvecData2, 1)); + IVP_MULPAN16XR16(accSum3, 0, vecCoeff3, IVP_EXTRN_2X32(hvecData3, 1)); + IVP_MULPAN16XR16(accSum4, 0, vecCoeff3, IVP_EXTRN_2X32(hvecData4, 1)); + } + } /* End Kernel Height * Width */ + + /* Pack, Output Scale, Output Shift and clamping */ + xb_vecNx16 vecOut1, vecOut2, vecOut3, vecOut4; +#ifdef DILATED_VQ_CONV_S16 + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ_S16(vecOut1, accSum1, packShiftAccU, \ + vecScaleData, outShiftU, minLim, maxLim); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ_S16(vecOut2, accSum2, packShiftAccU, \ + vecScaleData, outShiftU, minLim, maxLim); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ_S16(vecOut3, accSum3, packShiftAccU, \ + vecScaleData, outShiftU, minLim, maxLim); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ_S16(vecOut4, accSum4, packShiftAccU, \ + vecScaleData, outShiftU, minLim, maxLim); +#else + PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut1, accSum1, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim); + PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut2, accSum2, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim); + PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut3, accSum3, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim); + 
PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut4, accSum4, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim); +#endif + /* Store the output dvecOut1 along the output depth */ + pvecOut = (xb_vecNx16 *) (pOut + outCh); + IVP_SAVNX16_XP(vecOut1, vaOutData, pvecOut, 2 * remainingOutCh); + IVP_SAPOSNX16_FP(vaOutData, pvecOut); + + /* Store the output dvecOut2 along the output depth */ + pvecOut = (xb_vecNx16 *) (pOut + (outCh + outDataPitch1) * numX); + IVP_SAVNX16_XP(vecOut2, vaOutData, pvecOut, 2 * remainingOutCh * numX); + IVP_SAPOSNX16_FP(vaOutData, pvecOut); + + /* Store the output dvecOut3 along the output depth */ + pvecOut = (xb_vecNx16 *) (pOut + (outCh + outDataPitch2) * numY); + IVP_SAVNX16_XP(vecOut3, vaOutData, pvecOut, 2 * remainingOutCh * numY); + IVP_SAPOSNX16_FP(vaOutData, pvecOut); + + /* Store the output dvecOut4 along the output depth */ + pvecOut = (xb_vecNx16 *) (pOut + (outCh + outDataPitch1 * numX + outDataPitch2 * numY)); + IVP_SAVNX16_XP(vecOut4, vaOutData, pvecOut, 2 * remainingOutCh * numX * numY); + IVP_SAPOSNX16_FP(vaOutData, pvecOut); + } /* End image width */ + } /* End image height */ + } /* End Output Channels */ + return(XAI_ERROR_STATUS()); +} + +/******************************* end of VQ MOD variants ***************************************/ +/**********************************************************************************************/ +#endif /*#if ((XCHAL_VISION_TYPE >= 6))*/ + diff --git a/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_MOW.c b/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_MOW.c new file mode 100644 index 00000000000..72b1f7f9655 --- /dev/null +++ b/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_MOW.c @@ -0,0 +1,30 @@ +/* + * Copyright (c) 2018 by Cadence Design Systems, Inc. ALL RIGHTS RESERVED. 
+ * These coded instructions, statements, and computer programs are the + * copyrighted works and confidential proprietary information of + * Cadence Design Systems Inc. They may be adapted and modified by bona fide + * purchasers for internal use, but neither the original nor any adapted + * or modified version may be disclosed or distributed to third parties + * in any manner, medium, or form, in whole or in part, without the prior + * written consent of Cadence Design Systems Inc. This software and its + * derivatives are to be executed solely on products incorporating a Cadence + * Design Systems processor. + */ + +#include "xai_cnn.h" +#include "xai_intrin.h" + +#if ((XCHAL_VISION_TYPE >= 6)) + +#define DILATED_VQ_CONV VQ_FALSE + +#define INPUT_DATA_TYPE UNSIGNED8BIT +#include "cnn_dilated_conv_MOW.h" +#undef INPUT_DATA_TYPE + +#define INPUT_DATA_TYPE SIGNED8BIT +#include "cnn_dilated_conv_MOW.h" +#endif //if ((XCHAL_VISION_TYPE >= 6)) + + + diff --git a/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_MOW.h b/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_MOW.h new file mode 100644 index 00000000000..9ab6fa6015a --- /dev/null +++ b/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_MOW.h @@ -0,0 +1,27240 @@ +/* + * Copyright (c) 2018 by Cadence Design Systems, Inc. ALL RIGHTS RESERVED. + * These coded instructions, statements, and computer programs are the + * copyrighted works and confidential proprietary information of + * Cadence Design Systems Inc. They may be adapted and modified by bona fide + * purchasers for internal use, but neither the original nor any adapted + * or modified version may be disclosed or distributed to third parties + * in any manner, medium, or form, in whole or in part, without the prior + * written consent of Cadence Design Systems Inc. This software and its + * derivatives are to be executed solely on products incorporating a Cadence + * Design Systems processor. 
+ */ + +#if ((XCHAL_VISION_TYPE >= 6)) + +#define VQ_TRUE 1 +#define VQ_FALSE 0 + +#undef MAKE_NAME_VQ +#undef MAKE_ARGUMENTS +#undef MAKE_PARAMS + +#if DILATED_VQ_CONV == VQ_TRUE + +#define MAKE_NAME_VQ(a, b) a ## VQ ## b +#define MAKE_ARGUMENTS(a, b, c, d, e) (const xai_pTile3D a, const xai_pTile4D b, const xai_pArray c, const xai_pArray outputScaleArray, xai_pTile3D d, const xai_cnn_conv_params * e) +#define MAKE_PARAMS(a, b, c, d, e) (a, b, c, outputScaleArray, d, e) + +#elif DILATED_VQ_CONV == VQ_FALSE + +#define MAKE_NAME_VQ(a, b) a ## b +#define MAKE_ARGUMENTS(a, b, c, d, e) (const xai_pTile3D a, const xai_pTile4D b, const xai_pArray c, xai_pTile3D d, const xai_cnn_conv_params * e) +#define MAKE_PARAMS(a, b, c, d, e) (a, b, c, d, e) +#endif + +#define MAKE_NAME_IMPL(name, MORPH_FNAME_SPECIFIER_IDT, suffix) name ## _ ## MORPH_FNAME_SPECIFIER_IDT ## suffix + +#if INPUT_DATA_TYPE == UNSIGNED8BIT + +#define MAKE_NAME(name, suffix) MAKE_NAME_IMPL(name, U8, suffix) +#define MORPH_IDT_CHECK XAI_CHECK_TILE3D_U8 +#define MORPH_IDT_SCALAR uint8_t +#define MORPH_IDT_2Nx8 xb_vec2Nx8U +#define MORPH_OP_PRIME_2Nx8 IVP_LA2NX8U_PP +#define MORPH_OP_ALIGN_LOAD_2Nx8 IVP_LV2NX8U_XP +#define MORPH_OP_LOAD_2Nx8 IVP_LA2NX8U_XP +#define MORPH_OP_L2_2Nx8 IVP_L2U2NX8U_XP +#define MORPH_OP_LOAD_2Nx8_IP IVP_LA2NX8U_IP +#define MORPH_OP_LOAD_2Nx8_VARIABLE IVP_LAV2NX8U_XP +#define MORPH_OP_MULA IVP_MULUSA2N8XR16 +#define MORPH_OP_MUL4TA IVP_MULUS4TA2N8XR8 +#define MORPH_OP_MULQA IVP_MULUSQA2N8XR8 +#define MORPH_OP_MULPA IVP_MULUSPA2N8XR16 +#define MORPH_OP_GATHER IVP_GATHERANX8U +#define MORPH_OP_GATHER_2Nx8_LOW IVP_GATHERD2NX8U_L +#define MORPH_OP_GATHER_2Nx8_HIGH IVP_GATHERD2NX8U_H +#define MORPH_OP_DSELI IVP_DSEL2NX8UI +#define MORPH_OP_SEL IVP_SEL2NX8U + +#elif INPUT_DATA_TYPE == SIGNED8BIT + +#undef MAKE_NAME +#undef MORPH_IDT_CHECK +#undef MORPH_IDT_SCALAR +#undef MORPH_IDT_2Nx8 +#undef MORPH_OP_PRIME_2Nx8 +#undef MORPH_OP_ALIGN_LOAD_2Nx8 +#undef MORPH_OP_LOAD_2Nx8_IP +#undef 
MORPH_OP_LOAD_2Nx8_VARIABLE +#undef MORPH_OP_LOAD_2Nx8 +#undef MORPH_OP_L2_2Nx8 +#undef MORPH_OP_MULA +#undef MORPH_OP_MUL4TA +#undef MORPH_OP_MULQA +#undef MORPH_OP_MULPA +#undef MORPH_OP_GATHER +#undef MORPH_OP_GATHER_2Nx8_LOW +#undef MORPH_OP_GATHER_2Nx8_HIGH +#undef MORPH_OP_DSELI +#undef MORPH_OP_SEL + +#define MAKE_NAME(name, suffix) MAKE_NAME_IMPL(name, S8, suffix) +#define MORPH_IDT_CHECK XAI_CHECK_TILE3D_S8 +#define MORPH_IDT_SCALAR int8_t +#define MORPH_IDT_2Nx8 xb_vec2Nx8 +#define MORPH_OP_PRIME_2Nx8 IVP_LA2NX8_PP +#define MORPH_OP_ALIGN_LOAD_2Nx8 IVP_LV2NX8_XP +#define MORPH_OP_LOAD_2Nx8 IVP_LA2NX8_XP +#define MORPH_OP_L2_2Nx8 IVP_L2U2NX8_XP +#define MORPH_OP_LOAD_2Nx8_IP IVP_LA2NX8_IP +#define MORPH_OP_LOAD_2Nx8_VARIABLE IVP_LAV2NX8_XP +#define MORPH_OP_MULA IVP_MULA2N8XR16 +#define MORPH_OP_MUL4TA IVP_MUL4TA2N8XR8 +#define MORPH_OP_MULQA IVP_MULQA2N8XR8 +#define MORPH_OP_MULPA IVP_MULPA2N8XR16 +#define MORPH_OP_GATHER IVP_GATHERANX8S +#define MORPH_OP_GATHER_2Nx8_LOW IVP_GATHERD2NX8_L +#define MORPH_OP_GATHER_2Nx8_HIGH IVP_GATHERD2NX8_H +#define MORPH_OP_DSELI IVP_DSEL2NX8I +#define MORPH_OP_SEL IVP_SEL2NX8 +#endif + +/****************************************************************************************** +* xaiConvolved(VQ)3D_S_1x1j1d1I8S8IX_MOW_WHD +* ***************************************************************************************/ + +/******************************************************************************/ +/* Description : P6 optimized generic implementation for 1x1 3D convolution. */ +/* Based on MORPH pre-processor specifiers, code implementation */ +/* is generated during preprocessing stage. 
This method can be */ +/* used to generate 1x1 3D MOW_WHD dilated convolution function */ +/* and 1x1 3D VQ MOW_WHD dilated convolution function for U8 */ +/* bit and S8 bit input data with input stride equal to 1 */ +/* Inputs : Input Data Tile, Coeff Data Tile, Bias Array, */ +/* Output scale array, CNN convolution params structure */ +/* Outputs : XI Error Code */ +/* InOuts : Output Tile */ +/* Assumptions : CoeffData is S8 */ +/* biasArray is signed 32b, value not exceeding signed 24b */ +/* Output scale array is U16 */ +/* OutData is S8 / U8 / S16 */ +/* Kernel Size is 1x1xDxN */ +/* Input and Output are in WHD format */ +/* Coeff is in WHDN format */ +/******************************************************************************/ +/********************************************************************************* + convolved3D_S_1x1j1d1_S8S8IX_MOW_WHD_NOEDGE + convolved3D_S_1x1j1d1_U8S8IX_MOW_WHD_NOEDGE + convolvedVQ3D_S_1x1j1d1_S8S8IX_MOW_WHD_NOEDGE + convolvedVQ3D_S_1x1j1d1_U8S8IX_MOW_WHD_NOEDGE + * MOW no edge variant * + * If DataPitch1 = width for input and output tile * + **********************************************************************************/ + +static _XAI_INLINE_ void MAKE_NAME(MAKE_NAME_VQ(convolved, 3D_S_1x1j1d1), S8IX_MOW_WHD_NOEDGE) \ + MAKE_ARGUMENTS(inTile, coeffTile, biasArray, outTile, param) +{ + /* Getting parameters from the tile structures */ + const int32_t outW = XAI_TILE3D_GET_DIM1(outTile); + const int32_t outH = XAI_TILE3D_GET_DIM2(outTile); + const int32_t numInCh = XAI_TILE3D_GET_DIM3(inTile); + const int32_t numOutCh = XAI_TILE3D_GET_DIM3(outTile); + const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile); + const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile); + const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile); + + /* CNN convolution parameters */ + const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param); + const uint8_t outShiftU = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param); + 
uint8_t enableReLu = XAI_CNN_CONV_GET_FLAG_RELU(param); + + /* Data Pointers of input, output, coefficient and bias data */ + MORPH_IDT_SCALAR* pInData = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(inTile); + int8_t* pOutData = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile); + int32_t* pBiasData = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray); + int8_t* pCoeffData = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile); +#if DILATED_VQ_CONV == VQ_TRUE + uint16_t* restrict pOutScaleData = (uint16_t *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray); +#elif DILATED_VQ_CONV == VQ_FALSE + const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param); +#endif + const int32_t vectorizationWidth = 2 * XCHAL_IVPN_SIMD_WIDTH; + + /* Setting the limits for output data according to ReLu Flag and outTileType */ + int32_t minLim, maxLim; + if (enableReLu) + { + minLim = XAI_CNN_CONV_GET_RELU_MIN(param); + maxLim = XAI_CNN_CONV_GET_RELU_MAX(param); + } + else + { + minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \ + SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0); + maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \ + : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX); + } + const int8_t typeFlag = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0; + const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile); + + int32_t inCh, outCh, xy; + MORPH_IDT_2Nx8 *restrict pdvecIn1; + MORPH_IDT_2Nx8 *restrict pdvecIn2; + MORPH_IDT_2Nx8 *restrict pdvecIn3; + MORPH_IDT_2Nx8 *restrict pdvecIn4; + xb_vec2Nx8 * restrict pdvecOut; + xb_vec2Nx8 * restrict pdvecCoeff1; + xb_vec2Nx8 * restrict pdvecCoeff2; + xb_vec2Nx8 * restrict pdvecCoeff3; + xb_vec2Nx8 * restrict pdvecCoeff4; + + /* There are no edges input and output width. Output width and + * height loops are combined. Input data is loaded continuously + * from the input WH plane and output is stored continuously in + * output WH plane. 
+ * The overall design approach is split into 2 sections, one + * with aligned input data and the other with unaligned input data. + */ + if (XAI_TILE3D_IS_ALIGNED_2NX8(inTile)) + { + for (xy = 0; xy < outW * outH; xy += vectorizationWidth) /* Loop across Output width */ + { + /* initialize output data pointer */ + int8_t *pOutput = &pOutData[xy * bytesPerPixel]; + + /* initialize input data pointer */ + MORPH_IDT_SCALAR *pInput = &pInData[xy]; + + /* initialize coeff and Bias data pointer */ + int32_t *pBias = &pBiasData[0]; + + for (outCh = 0; outCh < numOutCh; outCh += 4) /* Loop across Output depth */ + { + /* In order to handle odd depths*/ + int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1); + int32_t enable3rdCh = XT_SALT(outCh, numOutCh - 2); + int32_t enable4thCh = XT_SALT(outCh, numOutCh - 3); + + /* Load the bias values corresponding to four output channels */ + xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4 * enable2ndCh); + xb_vecN_2x32v hvecBias2; IVP_LSRN_2X32_XP(hvecBias2, pBias, 4 * enable3rdCh); + xb_vecN_2x32v hvecBias3; IVP_LSRN_2X32_XP(hvecBias3, pBias, 4 * enable4thCh); + xb_vecN_2x32v hvecBias4; IVP_LSRN_2X32_XP(hvecBias4, pBias, 4); + + xb_vec2Nx24 dacc1, dacc2, dacc3, dacc4; + dacc1 = IVP_CVT24UNX32L(hvecBias1, hvecBias1); + IVP_CVT24UNX32H(dacc1, hvecBias1, hvecBias1); + dacc2 = IVP_CVT24UNX32L(hvecBias2, hvecBias2); + IVP_CVT24UNX32H(dacc2, hvecBias2, hvecBias2); + dacc3 = IVP_CVT24UNX32L(hvecBias3, hvecBias3); + IVP_CVT24UNX32H(dacc3, hvecBias3, hvecBias3); + dacc4 = IVP_CVT24UNX32L(hvecBias4, hvecBias4); + IVP_CVT24UNX32H(dacc4, hvecBias4, hvecBias4); + + /* Coefficient and input pointers */ + int8_t *pCoeff = &pCoeffData[outCh * coeffPitch3]; + pdvecCoeff1 = (xb_vec2Nx8 *) pCoeff; + pdvecCoeff2 = (xb_vec2Nx8 *) (pCoeff + coeffPitch3 * enable2ndCh); + pdvecCoeff3 = (xb_vec2Nx8 *) (pCoeff + 2 * coeffPitch3 * enable3rdCh); + pdvecCoeff4 = (xb_vec2Nx8 *) (pCoeff + 3 * coeffPitch3 * enable4thCh); + pdvecIn1 = (MORPH_IDT_2Nx8 
*) (pInput); + pdvecIn2 = (MORPH_IDT_2Nx8 *) (pInput + inDataPitch2); + pdvecIn3 = (MORPH_IDT_2Nx8 *) (pInput + 2 * inDataPitch2); + pdvecIn4 = (MORPH_IDT_2Nx8 *) (pInput + 3 * inDataPitch2); + + /* Priming Loads for Coefficients */ + valign vaCoeff1 = IVP_LA2NX8_PP(pdvecCoeff1); + valign vaCoeff2 = IVP_LA2NX8_PP(pdvecCoeff2); + valign vaCoeff3 = IVP_LA2NX8_PP(pdvecCoeff3); + valign vaCoeff4 = IVP_LA2NX8_PP(pdvecCoeff4); + + for (inCh = 0; inCh < numInCh - 3; inCh += 4) /* Loop across input depth */ + { + /* input vectors are read from 4 input depths at at time + * Scalar 32 bit coeff are extracted from the coeff vectors */ + MORPH_IDT_2Nx8 dvecData1, dvecData2, dvecData3, dvecData4; + + /* Read vector input data from 1st depth */ + MORPH_OP_ALIGN_LOAD_2Nx8(dvecData1, pdvecIn1, 4 * inDataPitch2); + + /* Read vector input data from 2nd depth */ + MORPH_OP_ALIGN_LOAD_2Nx8(dvecData2, pdvecIn2, 4 * inDataPitch2); + + /* Read vector input data from 3rd depth */ + MORPH_OP_ALIGN_LOAD_2Nx8(dvecData3, pdvecIn3, 4 * inDataPitch2); + + /* Read vector input data from 4th depth */ + MORPH_OP_ALIGN_LOAD_2Nx8(dvecData4, pdvecIn4, 4 * inDataPitch2); + + xb_vec2Nx8 dvecCoeff1, dvecCoeff2, dvecCoeff3, dvecCoeff4; + IVP_LAV2NX8_XP(dvecCoeff1, vaCoeff1, pdvecCoeff1, 4); + IVP_LAV2NX8_XP(dvecCoeff2, vaCoeff2, pdvecCoeff2, 4); + IVP_LAV2NX8_XP(dvecCoeff3, vaCoeff3, pdvecCoeff3, 4); + IVP_LAV2NX8_XP(dvecCoeff4, vaCoeff4, pdvecCoeff4, 4); + + int32_t coeff1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecCoeff1)), 0); + int32_t coeff2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecCoeff2)), 0); + int32_t coeff3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecCoeff3)), 0); + int32_t coeff4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecCoeff4)), 0); + + MORPH_OP_MULQA(dacc1, dvecData4, dvecData3, dvecData2, dvecData1, coeff1); + MORPH_OP_MULQA(dacc2, dvecData4, dvecData3, dvecData2, dvecData1, 
coeff2); + MORPH_OP_MULQA(dacc3, dvecData4, dvecData3, dvecData2, dvecData1, coeff3); + MORPH_OP_MULQA(dacc4, dvecData4, dvecData3, dvecData2, dvecData1, coeff4); + } /* end of for (inCh = 0; inCh < numInCh - 3; inCh += 4)*/ + + /* Corner case handling if number of inCh is not a multiple of 4 */ + if (inCh < numInCh) + { + int32_t remInCh = numInCh - inCh; + + /* input vectors are read from 4 input depths at at time + * Scalar 32 bit coeff are extracted from the coeff vectors */ + MORPH_IDT_2Nx8 dvecData1, dvecData2, dvecData3; + + /* Read vector input data from 1st depth */ + MORPH_OP_ALIGN_LOAD_2Nx8(dvecData1, pdvecIn1, inDataPitch2 * XT_SALT(1, remInCh)); + + /* Read vector input data from 2nd depth */ + MORPH_OP_ALIGN_LOAD_2Nx8(dvecData2, pdvecIn1, inDataPitch2 * XT_SALT(2, remInCh)); + + /* Read vector input data from 3rd depth */ + MORPH_OP_ALIGN_LOAD_2Nx8(dvecData3, pdvecIn1, 0); + + xb_vec2Nx8 dvecCoeff1, dvecCoeff2, dvecCoeff3, dvecCoeff4; + IVP_LAV2NX8_XP(dvecCoeff1, vaCoeff1, pdvecCoeff1, remInCh); + IVP_LAV2NX8_XP(dvecCoeff2, vaCoeff2, pdvecCoeff2, remInCh); + IVP_LAV2NX8_XP(dvecCoeff3, vaCoeff3, pdvecCoeff3, remInCh); + IVP_LAV2NX8_XP(dvecCoeff4, vaCoeff4, pdvecCoeff4, remInCh); + + int32_t coeff1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecCoeff1)), 0); + int32_t coeff2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecCoeff2)), 0); + int32_t coeff3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecCoeff3)), 0); + int32_t coeff4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecCoeff4)), 0); + + MORPH_OP_MULQA(dacc1, 0, dvecData3, dvecData2, dvecData1, coeff1); + MORPH_OP_MULQA(dacc2, 0, dvecData3, dvecData2, dvecData1, coeff2); + MORPH_OP_MULQA(dacc3, 0, dvecData3, dvecData2, dvecData1, coeff3); + MORPH_OP_MULQA(dacc4, 0, dvecData3, dvecData2, dvecData1, coeff4); + } /* end of corner case handling*/ + + /* Pack, Output Scale, Output Shift and clamping */ + 
xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L; + xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H; +#if DILATED_VQ_CONV == VQ_TRUE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \ + pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \ + pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc3, packShiftAccU, \ + pOutScaleData[outCh + 2 * enable3rdCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc4, packShiftAccU, \ + pOutScaleData[outCh + 3 * enable4thCh], outShiftU, minLim, maxLim, typeFlag); +#elif DILATED_VQ_CONV == VQ_FALSE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc3, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc4, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); +#endif + /* variable store count */ + int32_t varLen = outW * outH - xy; + + /* store output to 1st output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOutput); + valign vaOutData; vaOutData = IVP_ZALIGN(); + IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * varLen); + IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * \ + 2 * (varLen - XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* store output to 2nd output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch2 * enable2ndCh * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * varLen * enable2ndCh); + IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * \ 
+ 2 * (varLen - XCHAL_IVPN_SIMD_WIDTH) * enable2ndCh); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* store output to 3rd output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + 2 * outDataPitch2 * enable3rdCh * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * varLen * enable3rdCh); + IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * \ + 2 * (varLen - XCHAL_IVPN_SIMD_WIDTH) * enable3rdCh); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* store output to 4th output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + 3 * outDataPitch2 * enable4thCh * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * varLen * enable4thCh); + IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * \ + 2 * (varLen - XCHAL_IVPN_SIMD_WIDTH) * enable4thCh); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + pOutput += 4 * outDataPitch2 * bytesPerPixel; + } /* end of for (outCh = 0; outCh < numOutCh; outCh += 4)*/ + } /* end of for (xy = 0; xy < outW*outH; xy += vectorizationWidth)*/ + } + else + { +#ifdef __XCC__ + XT_MEMW(); /* Adding Memory Wait as Gather and Normal Load/Stores are not synchronized */ +#endif + + /* generate the sequence 0,1,2,3,0 + coeffPitch3, 1 + coeffPitch3, + * 2 + coeffPitch3, 3 + coeffPitch3, 0 + 2 * coeffPitch3, 1 + 2 * coeffPitch3, + * 2 + 2 * coeffPitch3, 3 + 2 * coeffPitch3, ..... + * + * for e.g, if coeffPitch3 is 32: + * 0,1,2,3,32,33,34,35,64,65,66,67,96,97,98,99,.. + * + * This sequence is used to gather coeff from 4 diff output channels, 4 each from + * every channel corresponding to 4 i/p channels, as innermost loop(inCh) is unrolled by + * 4 to make use of quad multiplier. 
+ */ + xb_vecNx16U vecIdx1 = IVP_SEQNX16(); + vecIdx1 = IVP_PACKVRNRNX48(IVP_MULNX16(vecIdx1, coeffPitch3), 0); + vecIdx1 = IVP_SELNX16I(IVP_ADDNX16(vecIdx1, 1), vecIdx1, IVP_SELI_16B_INTERLEAVE_1_LO); + vecIdx1 = IVP_SELNX16I(IVP_ADDNX16(vecIdx1, 2), vecIdx1, IVP_SELI_32B_INTERLEAVE_1_LO); + xb_gsr gs0; + + for (xy = 0; xy < outW * outH; xy += vectorizationWidth) /* Loop across Output width * Outputheight */ + { + xb_vecNx16U vecIdx2; + /* variable store count */ + int32_t varLen = outW * outH - xy; + /* initialize output data pointer */ + int8_t *pOutput = &pOutData[xy * bytesPerPixel]; + + /* initialize input data pointer */ + MORPH_IDT_SCALAR *pInput = &pInData[xy]; + + /* initialize Bias data pointer */ + int32_t *pBias = &pBiasData[0]; + int8_t *pCoeff = &pCoeffData[0]; + + for (outCh = 0; outCh < numOutCh; outCh += 4) /* Loop across Output depth */ + { + /* In order to handle odd depths*/ + int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1); + int32_t enable3rdCh = XT_SALT(outCh, numOutCh - 2); + int32_t enable4thCh = XT_SALT(outCh, numOutCh - 3); + + /* load and replicate bias data for each output channel */ + xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4 * enable2ndCh); + xb_vecN_2x32v hvecBias2; IVP_LSRN_2X32_XP(hvecBias2, pBias, 4 * enable3rdCh); + xb_vecN_2x32v hvecBias3; IVP_LSRN_2X32_XP(hvecBias3, pBias, 4 * enable4thCh); + xb_vecN_2x32v hvecBias4; IVP_LSRN_2X32_XP(hvecBias4, pBias, 4); + + xb_vec2Nx24 dacc1, dacc2, dacc3, dacc4; + dacc1 = IVP_CVT24UNX32L(hvecBias1, hvecBias1); + IVP_CVT24UNX32H(dacc1, hvecBias1, hvecBias1); + dacc2 = IVP_CVT24UNX32L(hvecBias2, hvecBias2); + IVP_CVT24UNX32H(dacc2, hvecBias2, hvecBias2); + dacc3 = IVP_CVT24UNX32L(hvecBias3, hvecBias3); + IVP_CVT24UNX32H(dacc3, hvecBias3, hvecBias3); + dacc4 = IVP_CVT24UNX32L(hvecBias4, hvecBias4); + IVP_CVT24UNX32H(dacc4, hvecBias4, hvecBias4); + + /* boolean mask to gather coeffs, if all the four o/p channels + * are present 16 coeff are loaded. 
+ */ + vboolN mask = IVP_LTNX16(IVP_SEQNX16(), (xb_vecNx16) XT_MIN(((numOutCh - outCh) * 4), 16)); + /* Assign valid address for predicated false lines */ + vecIdx2 = IVP_MOVNX16UT(vecIdx1, 0, mask); + pdvecIn1 = (MORPH_IDT_2Nx8 *) (pInput); + pdvecIn2 = (MORPH_IDT_2Nx8 *) (pInput + 2 * inDataPitch2); + + for (inCh = 0; inCh < numInCh - 3; inCh += 4) /* Loop across input depth */ + { + /* input vectors are read from 4 input depths at at time + * Scalar 32 bit coeff are extracted from the coeff vectors */ + + MORPH_IDT_2Nx8 dvecData1, dvecData2, dvecData3, dvecData4; + + /* Read vector input data from 1st depth */ + valign vaInData; vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1); + MORPH_OP_LOAD_2Nx8(dvecData1, vaInData, pdvecIn1, inDataPitch2); + + /* Read vector input data from 2nd depth */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1); + MORPH_OP_LOAD_2Nx8(dvecData2, vaInData, pdvecIn1, 3 * inDataPitch2); + + /* Read vector input data from 3rd depth */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn2); + MORPH_OP_LOAD_2Nx8(dvecData3, vaInData, pdvecIn2, inDataPitch2); + + /* Read vector input data from 4th depth */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn2); + MORPH_OP_LOAD_2Nx8(dvecData4, vaInData, pdvecIn2, 3 * inDataPitch2); + + /* gather the coeffs */ + gs0 = IVP_GATHERANX8S(pCoeff + inCh, vecIdx2); + xb_vec2Nx8 dvecCoeffData = IVP_GATHERD2NX8_L(gs0); + + /* extract scalar coeff from coeff vectors */ + int32_t coeff1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData)), 0); /* 1st o/p depth coeff */ + int32_t coeff2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData)), 1); /* 2nd o/p depth coeff */ + int32_t coeff3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData)), 2); /* 3rd o/p depth coeff */ + int32_t coeff4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData)), 3); /* 4th o/p depth coeff */ + + MORPH_OP_MULQA(dacc1, dvecData4, dvecData3, dvecData2, 
dvecData1, coeff1); + MORPH_OP_MULQA(dacc2, dvecData4, dvecData3, dvecData2, dvecData1, coeff2); + MORPH_OP_MULQA(dacc3, dvecData4, dvecData3, dvecData2, dvecData1, coeff3); + MORPH_OP_MULQA(dacc4, dvecData4, dvecData3, dvecData2, dvecData1, coeff4); + } /* end of for (inCh = 0; inCh < numInCh; inCh += 4)*/ + if (inCh < numInCh) + { + MORPH_IDT_2Nx8 dvecData1, dvecData2, dvecData3; + + pdvecIn2 = (MORPH_IDT_2Nx8 *) (((int8_t *) pdvecIn1) + 2 * inDataPitch2 * XT_SALT(inCh, numInCh - 2)); + /* Read vector input data from 1st depth */ + valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1); + MORPH_OP_LOAD_2Nx8(dvecData1, vaInData, pdvecIn1, inDataPitch2 * XT_SALT(inCh, numInCh - 1)); + + /* Read vector input data from 2nd depth */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1); + IVP_LAV2NX8_XP(dvecData2, vaInData, pdvecIn1, varLen * XT_SALT(inCh, numInCh - 1)); + + /* Read vector input data from 3rd depth */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn2); + IVP_LAV2NX8_XP(dvecData3, vaInData, pdvecIn2, varLen * XT_SALT(inCh, numInCh - 2)); + + /* Boolean mask for gather to handle cases where inCh<4 */ + vboolN mask1 = IVP_LTNX16(IVP_ANDNX16(IVP_SEQNX16(), 3), (numInCh - inCh)); + /* Assign valid address for predicated false lines */ + vecIdx2 = IVP_MOVNX16UT(vecIdx2, 0, mask1); + /* Gather coeffs */ + gs0 = IVP_GATHERANX8S(pCoeff + inCh, vecIdx2); + xb_vec2Nx8 dvecCoeffData = IVP_GATHERD2NX8_L(gs0); + + /* extract scalar coeff from coeff vectors */ + int32_t coeff1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData)), 0); /* 1st o/p depth coeff */ + int32_t coeff2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData)), 1); /* 2nd o/p depth coeff */ + int32_t coeff3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData)), 2); /* 3rd o/p depth coeff */ + int32_t coeff4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData)), 3); /* 4th o/p depth coeff */ + + 
MORPH_OP_MULQA(dacc1, 0, dvecData3, dvecData2, dvecData1, coeff1); + MORPH_OP_MULQA(dacc2, 0, dvecData3, dvecData2, dvecData1, coeff2); + MORPH_OP_MULQA(dacc3, 0, dvecData3, dvecData2, dvecData1, coeff3); + MORPH_OP_MULQA(dacc4, 0, dvecData3, dvecData2, dvecData1, coeff4); + } /* end of if (inCh < numInCh)*/ + + /* Pack, Output Scale, Output Shift and clamping */ + xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L; + xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H; +#if DILATED_VQ_CONV == VQ_TRUE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \ + pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \ + pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc3, packShiftAccU, \ + pOutScaleData[outCh + 2 * enable3rdCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc4, packShiftAccU, \ + pOutScaleData[outCh + 3 * enable4thCh], outShiftU, minLim, maxLim, typeFlag); +#elif DILATED_VQ_CONV == VQ_FALSE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc3, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc4, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); +#endif + /* store output to 1st output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOutput); + valign vaOutData; vaOutData = IVP_ZALIGN(); + IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * varLen); + IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * \ + 2 * (varLen - XCHAL_IVPN_SIMD_WIDTH)); + 
IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* store output to 2nd output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch2 * enable2ndCh * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * varLen * enable2ndCh); + IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * \ + 2 * (varLen - XCHAL_IVPN_SIMD_WIDTH) * enable2ndCh); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* store output to 3rd output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + 2 * outDataPitch2 * enable3rdCh * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * varLen * enable3rdCh); + IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * \ + 2 * (varLen - XCHAL_IVPN_SIMD_WIDTH) * enable3rdCh); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* store output to 4th output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + 3 * outDataPitch2 * enable4thCh * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * varLen * enable4thCh); + IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * \ + 2 * (varLen - XCHAL_IVPN_SIMD_WIDTH) * enable4thCh); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + pOutput += 4 * outDataPitch2 * bytesPerPixel; + pCoeff += 4 * coeffPitch3; + } /* end of for (outCh = 0; outCh < numOutCh; outCh += 4)*/ + } /* end of for (xy = 0; xy < outW*outH; xy += vectorizationWidth)*/ + } +} + +/****************************************************************************************** +* MOW fold 16 Stride 1 variant * +* If inDataPitch1 is less than or equal to * +* 16 this function is called. 
* +******************************************************************************************/ + +static _XAI_INLINE_ void MAKE_NAME(MAKE_NAME_VQ(convolved, 3D_S_1x1j1d1), S8IX_MOW_WHD_FOLD16) \ + MAKE_ARGUMENTS(inTile, coeffTile, biasArray, outTile, param) +{ + /* Getting parameters from the tile structures */ + const int32_t outW = XAI_TILE3D_GET_DIM1(outTile); + const int32_t outH = XAI_TILE3D_GET_DIM2(outTile); + const int32_t numInCh = XAI_TILE3D_GET_DIM3(inTile); + const int32_t numOutCh = XAI_TILE3D_GET_DIM3(outTile); + const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile); + const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile); + const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile); + const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile); + const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile); + + /* CNN convolution parameters */ + const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param); + const uint8_t outShiftU = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param); + uint8_t enableReLu = XAI_CNN_CONV_GET_FLAG_RELU(param); + + /* Data Pointers of input, output, coefficient and bias data */ + MORPH_IDT_SCALAR* pInData = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(inTile); + int8_t* pOutData = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile); + int32_t* pBiasData = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray); + int8_t* pCoeffData = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile); +#if DILATED_VQ_CONV == VQ_TRUE + uint16_t* restrict pOutScaleData = (uint16_t *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray); +#elif DILATED_VQ_CONV == VQ_FALSE + const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param); +#endif + /* Setting the limits for output data according to ReLu Flag and outTileType */ + int32_t minLim, maxLim; + if (enableReLu) + { + minLim = XAI_CNN_CONV_GET_RELU_MIN(param); + maxLim = XAI_CNN_CONV_GET_RELU_MAX(param); + } + else + { + minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? 
\ + SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0); + maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \ + : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX); + } + const int8_t typeFlag = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0; + const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile); + + int32_t inCh, outCh, y; + MORPH_IDT_2Nx8 *restrict pdvecIn1; + MORPH_IDT_2Nx8 *restrict pdvecIn2; + + xb_vec2Nx8 * restrict pdvecOut; + + xb_vecN_2x32v * restrict phvecCoeff1; + + + /* there are 2 implementations, one for + * input channels less than or equal to 64, and other for input channels + * greater than 64. + * Adding one more loop to support more than 64 input channels is causing + * significant overhead and degrades the the performance. + */ + + if ((numInCh <= 2 * XCHAL_IVPN_SIMD_WIDTH)) + { + for (y = 0; y < outH; y += 4) /* Loop across Output height */ + { + /* initialize output data pointer */ + int8_t *pOutput = &pOutData[(y * outDataPitch1) * bytesPerPixel]; + + /* initialize input data pointer */ + MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * y]; + + /* initialize coeff and Bias data pointer */ + int8_t *pCoeff = &pCoeffData[0]; + int32_t *pBias = &pBiasData[0]; + + for (outCh = 0; outCh < numOutCh; outCh += 4) /* Loop across Output depth */ + { + /* In order to handle odd depths*/ + int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1); + int32_t enable3rdCh = XT_SALT(outCh, numOutCh - 2); + int32_t enable4thCh = XT_SALT(outCh, numOutCh - 3); + + /* Load the bias values corresponding to two output channels */ + xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4 * enable2ndCh); + xb_vecN_2x32v hvecBias2; IVP_LSRN_2X32_XP(hvecBias2, pBias, 4 * enable3rdCh); + xb_vecN_2x32v hvecBias3; IVP_LSRN_2X32_XP(hvecBias3, pBias, 4 * enable4thCh); + xb_vecN_2x32v hvecBias4; IVP_LSRN_2X32_XP(hvecBias4, pBias, 4); + + xb_vec2Nx24 dacc1, dacc2, dacc3, dacc4; + dacc1 = IVP_CVT24UNX32L(hvecBias1, 
hvecBias1); + IVP_CVT24UNX32H(dacc1, hvecBias1, hvecBias1); + dacc2 = IVP_CVT24UNX32L(hvecBias2, hvecBias2); + IVP_CVT24UNX32H(dacc2, hvecBias2, hvecBias2); + dacc3 = IVP_CVT24UNX32L(hvecBias3, hvecBias3); + IVP_CVT24UNX32H(dacc3, hvecBias3, hvecBias3); + dacc4 = IVP_CVT24UNX32L(hvecBias4, hvecBias4); + IVP_CVT24UNX32H(dacc4, hvecBias4, hvecBias4); + + + /* variables for coeff loads */ + xb_vecN_2x32v hvecCoeffData1, hvecCoeffData2, hvecCoeffData3, hvecCoeffData4; + + /* read coeff vectors , for 4 consecutive output depths */ + /* coeff vector for 1st output channel */ + phvecCoeff1 = (xb_vecN_2x32v *) (pCoeff); + valign vaCoeffData; vaCoeffData = IVP_LAN_2X32_PP(phvecCoeff1); + IVP_LAVN_2X32_XP(hvecCoeffData1, vaCoeffData, phvecCoeff1, coeffPitch3); + + /* coeff vector for 2nd output channel */ + vaCoeffData = IVP_LAN_2X32_PP(phvecCoeff1); + IVP_LAVN_2X32_XP(hvecCoeffData2, vaCoeffData, phvecCoeff1, coeffPitch3 * enable2ndCh); + + /* coeff vector for 3rd output channel */ + vaCoeffData = IVP_LAN_2X32_PP(phvecCoeff1); + IVP_LAVN_2X32_XP(hvecCoeffData3, vaCoeffData, phvecCoeff1, coeffPitch3 * enable3rdCh); + + /* coeff vector for 4th output channel */ + vaCoeffData = IVP_LAN_2X32_PP(phvecCoeff1); + IVP_LAVN_2X32_XP(hvecCoeffData4, vaCoeffData, phvecCoeff1, coeffPitch3 * enable4thCh); + + pdvecIn1 = (MORPH_IDT_2Nx8 *) (pInput); + pdvecIn2 = (MORPH_IDT_2Nx8 *) (pInput + 2 * inDataPitch2); + + for (inCh = 0; inCh < numInCh - 3; inCh += 4) /* Loop across input depth */ + { + /* input vectors are read from 4 input depths at at time + * Scalar 32 bit coeff are extracted from the coeff vectors */ + + MORPH_IDT_2Nx8 dvecData1, dvecData2, dvecData3, dvecData4; + + /* Read vector input data from 1st depth */ + valign vaInData; vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1); + MORPH_OP_LOAD_2Nx8(dvecData1, vaInData, pdvecIn1, inDataPitch2); + + /* Read vector input data from 2nd depth */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1); + MORPH_OP_LOAD_2Nx8(dvecData2, vaInData, pdvecIn1, 
3 * inDataPitch2); + + /* Read vector input data from 3rd depth */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn2); + MORPH_OP_LOAD_2Nx8(dvecData3, vaInData, pdvecIn2, inDataPitch2); + + /* Read vector input data from 4th depth */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn2); + MORPH_OP_LOAD_2Nx8(dvecData4, vaInData, pdvecIn2, 3 * inDataPitch2); + + /* extract scalar coeff from coeff vectors */ + int32_t coeff1 = IVP_EXTRVRN_2X32(hvecCoeffData1, inCh); /* 1st o/p depth coeff */ + int32_t coeff2 = IVP_EXTRVRN_2X32(hvecCoeffData2, inCh); /* 2nd o/p depth coeff */ + int32_t coeff3 = IVP_EXTRVRN_2X32(hvecCoeffData3, inCh); /* 3rd o/p depth coeff */ + int32_t coeff4 = IVP_EXTRVRN_2X32(hvecCoeffData4, inCh); /* 4th o/p depth coeff */ + + MORPH_OP_MULQA(dacc1, dvecData4, dvecData3, dvecData2, dvecData1, coeff1); + MORPH_OP_MULQA(dacc2, dvecData4, dvecData3, dvecData2, dvecData1, coeff2); + MORPH_OP_MULQA(dacc3, dvecData4, dvecData3, dvecData2, dvecData1, coeff3); + MORPH_OP_MULQA(dacc4, dvecData4, dvecData3, dvecData2, dvecData1, coeff4); + } /* end of for (inCh = 0; inCh < numInCh; inCh += 4)*/ + /* Corner case handling if number of inCh is not a multiple of 4 */ + if (inCh < numInCh) + { + MORPH_IDT_2Nx8 dvecData1, dvecData2, dvecData3; + + /* Read vector input data from 1st depth */ + valign vaInData; vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1); + MORPH_OP_LOAD_2Nx8(dvecData1, vaInData, pdvecIn1, inDataPitch2 * XT_SALT(inCh, numInCh - 1)); + + /* Read vector input data from 2nd depth */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1); + MORPH_OP_LOAD_2Nx8(dvecData2, vaInData, pdvecIn1, inDataPitch2 * XT_SALT(inCh, numInCh - 2)); + + /* Read vector input data from 3rd depth */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1); + MORPH_OP_LOAD_2Nx8(dvecData3, vaInData, pdvecIn1, inDataPitch2); + + + /* extract scalar coeff from coeff vectors */ + int32_t coeff1 = IVP_EXTRVRN_2X32(hvecCoeffData1, inCh); /* 1st o/p depth coeff */ + int32_t coeff2 = IVP_EXTRVRN_2X32(hvecCoeffData2, inCh); /* 2nd 
o/p depth coeff */ + int32_t coeff3 = IVP_EXTRVRN_2X32(hvecCoeffData3, inCh); /* 3rd o/p depth coeff */ + int32_t coeff4 = IVP_EXTRVRN_2X32(hvecCoeffData4, inCh); /* 4th o/p depth coeff */ + + MORPH_OP_MULQA(dacc1, 0, dvecData3, dvecData2, dvecData1, coeff1); + MORPH_OP_MULQA(dacc2, 0, dvecData3, dvecData2, dvecData1, coeff2); + MORPH_OP_MULQA(dacc3, 0, dvecData3, dvecData2, dvecData1, coeff3); + MORPH_OP_MULQA(dacc4, 0, dvecData3, dvecData2, dvecData1, coeff4); + } + + /* Pack, Output Scale, Output Shift and clamping */ + xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L; + xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H; +#if DILATED_VQ_CONV == VQ_TRUE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \ + pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \ + pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc3, packShiftAccU, \ + pOutScaleData[outCh + 2 * enable3rdCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc4, packShiftAccU, \ + pOutScaleData[outCh + 3 * enable4thCh], outShiftU, minLim, maxLim, typeFlag); +#elif DILATED_VQ_CONV == VQ_FALSE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc3, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc4, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); +#endif + /* In order to handle odd depths*/ + int32_t enable2ndRow = XT_SALT(y, outH - 1); + int32_t enable3rdRow = XT_SALT(y, outH - 2); + int32_t enable4thRow = 
XT_SALT(y, outH - 3); + + /* Storing the first output depth, first row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput); + valign vaOutData = IVP_ZALIGN(); + IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * outW); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the first output depth, second row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch1 * enable2ndRow * bytesPerPixel); + IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut1H, dvecOut1L, IVP_ADD2NX8(IVP_SEQ2NX8(), inDataPitch1 * bytesPerPixel)), \ + vaOutData, pdvecOut, bytesPerPixel * enable2ndRow * outW); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the first output depth, 3rd row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + 2 * outDataPitch1 * enable3rdRow * bytesPerPixel); + IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut1H, dvecOut1L, IVP_ADD2NX8(IVP_SEQ2NX8(), 2 * inDataPitch1 * bytesPerPixel)), \ + vaOutData, pdvecOut, bytesPerPixel * enable3rdRow * outW); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the first output depth, 4th row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + 3 * outDataPitch1 * enable4thRow * bytesPerPixel); + IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut1H, dvecOut1L, IVP_ADD2NX8(IVP_SEQ2NX8(), 3 * inDataPitch1 * bytesPerPixel)), \ + vaOutData, pdvecOut, bytesPerPixel * enable4thRow * outW); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the second output depth, first row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch2 * enable2ndCh * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * enable2ndCh * outW); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the second output depth, second row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + (outDataPitch2 * enable2ndCh + \ + outDataPitch1 * enable2ndRow) * bytesPerPixel); + IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut2H, dvecOut2L, IVP_ADD2NX8(IVP_SEQ2NX8(), inDataPitch1 * bytesPerPixel)), + vaOutData, pdvecOut, bytesPerPixel * enable2ndCh * \ + enable2ndRow * outW); + IVP_SAPOS2NX8_FP(vaOutData, 
pdvecOut); + + /* Storing the second output depth, 3rd row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + (outDataPitch2 * enable2ndCh + \ + 2 * outDataPitch1 * enable3rdRow) * bytesPerPixel); + IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut2H, dvecOut2L, IVP_ADD2NX8(IVP_SEQ2NX8(), 2 * inDataPitch1 * bytesPerPixel)), + vaOutData, pdvecOut, bytesPerPixel * enable2ndCh * \ + enable3rdRow * outW); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the second output depth, 4th row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + (outDataPitch2 * enable2ndCh + \ + 3 * outDataPitch1 * enable4thRow) * bytesPerPixel); + IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut2H, dvecOut2L, IVP_ADD2NX8(IVP_SEQ2NX8(), 3 * inDataPitch1 * bytesPerPixel)), + vaOutData, pdvecOut, bytesPerPixel * enable2ndCh * \ + enable4thRow * outW); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the 3rd output depth, first row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + 2 * outDataPitch2 * enable3rdCh * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * enable3rdCh * outW); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the 3rd output depth, second row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + (2 * outDataPitch2 * enable3rdCh + \ + outDataPitch1 * enable2ndRow) * bytesPerPixel); + IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut3H, dvecOut3L, IVP_ADD2NX8(IVP_SEQ2NX8(), inDataPitch1 * bytesPerPixel)), + vaOutData, pdvecOut, bytesPerPixel * enable3rdCh * \ + enable2ndRow * outW); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the 3rd output depth, 3rd row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + (2 * outDataPitch2 * enable3rdCh + \ + 2 * outDataPitch1 * enable3rdRow) * bytesPerPixel); + IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut3H, dvecOut3L, IVP_ADD2NX8(IVP_SEQ2NX8(), 2 * inDataPitch1 * bytesPerPixel)), + vaOutData, pdvecOut, bytesPerPixel * enable3rdCh * \ + enable3rdRow * outW); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the 3rd output depth, 4th row */ + pdvecOut = (xb_vec2Nx8 *) 
(pOutput + (2 * outDataPitch2 * enable3rdCh + \ + 3 * outDataPitch1 * enable4thRow) * bytesPerPixel); + IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut3H, dvecOut3L, IVP_ADD2NX8(IVP_SEQ2NX8(), 3 * inDataPitch1 * bytesPerPixel)), + vaOutData, pdvecOut, bytesPerPixel * enable3rdCh * \ + enable4thRow * outW); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the 4th output depth, first row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + 3 * outDataPitch2 * enable4thCh * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * enable4thCh * outW); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the 4th output depth, second row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + (3 * outDataPitch2 * enable4thCh + \ + outDataPitch1 * enable2ndRow) * bytesPerPixel); + IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut4H, dvecOut4L, IVP_ADD2NX8(IVP_SEQ2NX8(), inDataPitch1 * bytesPerPixel)), + vaOutData, pdvecOut, bytesPerPixel * enable4thCh * \ + enable2ndRow * outW); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the 4th output depth, 3rd row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + (3 * outDataPitch2 * enable4thCh + \ + 2 * outDataPitch1 * enable3rdRow) * bytesPerPixel); + IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut4H, dvecOut4L, IVP_ADD2NX8(IVP_SEQ2NX8(), 2 * inDataPitch1 * bytesPerPixel)), + vaOutData, pdvecOut, bytesPerPixel * enable4thCh * \ + enable3rdRow * outW); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the 4th output depth, 4th row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + (3 * outDataPitch2 * enable4thCh + \ + 3 * outDataPitch1 * enable4thRow) * bytesPerPixel); + IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut4H, dvecOut4L, IVP_ADD2NX8(IVP_SEQ2NX8(), 3 * inDataPitch1 * bytesPerPixel)), + vaOutData, pdvecOut, bytesPerPixel * enable4thCh * \ + enable4thRow * outW); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + pOutput += 4 * outDataPitch2 * bytesPerPixel; + pCoeff += 4 * coeffPitch3; + } /* end of for (outCh = 0; outCh < numOutCh; outCh += 4)*/ + } /* end of for (y = 0; 
y < outH; y++)*/ + } /* end of if ((numInCh <= 2*XCHAL_IVPN_SIMD_WIDTH))*/ + else + { +#ifdef __XCC__ + XT_MEMW(); /* Adding Memory Wait as Gather and Normal Load/Stores are not synchronized */ +#endif + + /* generate the sequence 0,1,2,3,0 + coeffPitch3, 1 + coeffPitch3, + * 2 + coeffPitch3, 3 + coeffPitch3, 0 + 2 * coeffPitch3, 1 + 2 * coeffPitch3, + * 2 + 2 * coeffPitch3, 3 + 2 * coeffPitch3, ..... + * + * for e.g, if coeffPitch3 is 32: + * 0,1,2,3,32,33,34,35,64,65,66,67,96,96,98,99,.. + * + * This sequence is used to gather coeff from 4 diff output channels, 4 each from + * every channel corresponding to 4 i/p channels, as innermost loop(inCh) is unrolled by + * 4 to make use of quad multipler. + */ + xb_vecNx16U vecIdx1 = IVP_SEQNX16(); + vecIdx1 = IVP_PACKVRNRNX48(IVP_MULNX16(vecIdx1, coeffPitch3), 0); + vecIdx1 = IVP_SELNX16I(IVP_ADDNX16(vecIdx1, 1), vecIdx1, IVP_SELI_16B_INTERLEAVE_1_LO); + vecIdx1 = IVP_SELNX16I(IVP_ADDNX16(vecIdx1, 2), vecIdx1, IVP_SELI_32B_INTERLEAVE_1_LO); + xb_gsr gs0; + + xb_vecNx16U vecIdx2; + for (y = 0; y < outH; y += 4) /* Loop across Output height */ + { + /* initialize output data pointer */ + int8_t *pOutput = &pOutData[(y * outDataPitch1) * bytesPerPixel]; + + /* initialize input data pointer */ + MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * y]; + + /* initialize Bias data pointer */ + + int32_t *pBias = &pBiasData[0]; + int8_t *pCoeff = &pCoeffData[0]; + + for (outCh = 0; outCh < numOutCh; outCh += 4) /* Loop across Output depth */ + { + /* In order to handle odd depths*/ + int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1); + int32_t enable3rdCh = XT_SALT(outCh, numOutCh - 2); + int32_t enable4thCh = XT_SALT(outCh, numOutCh - 3); + + /* Load the bias values corresponding to two output channels */ + xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4 * enable2ndCh); + xb_vecN_2x32v hvecBias2; IVP_LSRN_2X32_XP(hvecBias2, pBias, 4 * enable3rdCh); + xb_vecN_2x32v hvecBias3; IVP_LSRN_2X32_XP(hvecBias3, pBias, 4 * 
enable4thCh); + xb_vecN_2x32v hvecBias4; IVP_LSRN_2X32_XP(hvecBias4, pBias, 4); + + xb_vec2Nx24 dacc1, dacc2, dacc3, dacc4; + dacc1 = IVP_CVT24UNX32L(hvecBias1, hvecBias1); + IVP_CVT24UNX32H(dacc1, hvecBias1, hvecBias1); + dacc2 = IVP_CVT24UNX32L(hvecBias2, hvecBias2); + IVP_CVT24UNX32H(dacc2, hvecBias2, hvecBias2); + dacc3 = IVP_CVT24UNX32L(hvecBias3, hvecBias3); + IVP_CVT24UNX32H(dacc3, hvecBias3, hvecBias3); + dacc4 = IVP_CVT24UNX32L(hvecBias4, hvecBias4); + IVP_CVT24UNX32H(dacc4, hvecBias4, hvecBias4); + + /* boolean mask to gather coeffs, if all the four o/p channels + * are present 16 coeff are loaded. + */ + vboolN mask = IVP_LTNX16(IVP_SEQNX16(), (xb_vecNx16) XT_MIN(((numOutCh - outCh) * 4), 16)); + /* Assign valid address for predicated false lines */ + vecIdx2 = IVP_MOVNX16UT(vecIdx1, 0, mask); + pdvecIn1 = (MORPH_IDT_2Nx8 *) (pInput); + pdvecIn2 = (MORPH_IDT_2Nx8 *) (pInput + 2 * inDataPitch2); + + for (inCh = 0; inCh < numInCh - 3; inCh += 4) /* Loop across input depth */ + { + /* input vectors are read from 4 input depths at at time + * Scalar 32 bit coeff are extracted from the coeff vectors */ + + MORPH_IDT_2Nx8 dvecData1, dvecData2, dvecData3, dvecData4; + + /* Read vector input data from 1st depth */ + valign vaInData; vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1); + MORPH_OP_LOAD_2Nx8(dvecData1, vaInData, pdvecIn1, inDataPitch2); + + /* Read vector input data from 2nd depth */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1); + MORPH_OP_LOAD_2Nx8(dvecData2, vaInData, pdvecIn1, 3 * inDataPitch2); + + /* Read vector input data from 3rd depth */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn2); + MORPH_OP_LOAD_2Nx8(dvecData3, vaInData, pdvecIn2, inDataPitch2); + + /* Read vector input data from 4th depth */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn2); + MORPH_OP_LOAD_2Nx8(dvecData4, vaInData, pdvecIn2, 3 * inDataPitch2); + + /* gather the coeffs */ + gs0 = IVP_GATHERANX8S(pCoeff + inCh, vecIdx2); + xb_vec2Nx8 dvecCoeffData = IVP_GATHERD2NX8_L(gs0); + + /* extract 
scalar coeff from coeff vectors */ + int32_t coeff1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData)), 0); /* 1st o/p depth coeff */ + int32_t coeff2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData)), 1); /* 2nd o/p depth coeff */ + int32_t coeff3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData)), 2); /* 3rd o/p depth coeff */ + int32_t coeff4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData)), 3); /* 4th o/p depth coeff */ + + MORPH_OP_MULQA(dacc1, dvecData4, dvecData3, dvecData2, dvecData1, coeff1); + MORPH_OP_MULQA(dacc2, dvecData4, dvecData3, dvecData2, dvecData1, coeff2); + MORPH_OP_MULQA(dacc3, dvecData4, dvecData3, dvecData2, dvecData1, coeff3); + MORPH_OP_MULQA(dacc4, dvecData4, dvecData3, dvecData2, dvecData1, coeff4); + } /* end of for (inCh = 0; inCh < numInCh; inCh += 4)*/ + if (inCh < numInCh) + { + MORPH_IDT_2Nx8 dvecData1, dvecData2, dvecData3; + + pdvecIn2 = (MORPH_IDT_2Nx8 *) (((int8_t *) pdvecIn1) + 2 * inDataPitch2 * XT_SALT(inCh, numInCh - 2)); + /* Read vector input data from 1st depth */ + valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1); + MORPH_OP_LOAD_2Nx8(dvecData1, vaInData, pdvecIn1, inDataPitch2 * XT_SALT(inCh, numInCh - 1)); + + /* Read vector input data from 2nd depth */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1); + IVP_LAV2NX8_XP(dvecData2, vaInData, pdvecIn1, 2 * XCHAL_IVPN_SIMD_WIDTH * XT_SALT(inCh, numInCh - 1)); + + /* Read vector input data from 3rd depth */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn2); + IVP_LAV2NX8_XP(dvecData3, vaInData, pdvecIn2, 2 * XCHAL_IVPN_SIMD_WIDTH * XT_SALT(inCh, numInCh - 2)); + + /* Boolean mask for gather to handle cases where inCh<4 */ + vboolN mask1 = IVP_LTNX16(IVP_ANDNX16(IVP_SEQNX16(), 3), (numInCh - inCh)); + /* Assign valid address for predicated false lines */ + vecIdx2 = IVP_MOVNX16UT(vecIdx2, 0, mask1); + /* Gather coeffs */ + gs0 = IVP_GATHERANX8S(pCoeff + inCh, 
vecIdx2); + xb_vec2Nx8 dvecCoeffData = IVP_GATHERD2NX8_L(gs0); + + /* extract scalar coeff from coeff vectors */ + int32_t coeff1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData)), 0); /* 1st o/p depth coeff */ + int32_t coeff2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData)), 1); /* 2nd o/p depth coeff */ + int32_t coeff3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData)), 2); /* 3rd o/p depth coeff */ + int32_t coeff4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData)), 3); /* 4th o/p depth coeff */ + + MORPH_OP_MULQA(dacc1, 0, dvecData3, dvecData2, dvecData1, coeff1); + MORPH_OP_MULQA(dacc2, 0, dvecData3, dvecData2, dvecData1, coeff2); + MORPH_OP_MULQA(dacc3, 0, dvecData3, dvecData2, dvecData1, coeff3); + MORPH_OP_MULQA(dacc4, 0, dvecData3, dvecData2, dvecData1, coeff4); + } /* end of if (inCh < numInCh)*/ + + /* Pack, Output Scale, Output Shift and clamping */ + xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L; + xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H; +#if DILATED_VQ_CONV == VQ_TRUE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \ + pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \ + pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc3, packShiftAccU, \ + pOutScaleData[outCh + 2 * enable3rdCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc4, packShiftAccU, \ + pOutScaleData[outCh + 3 * enable4thCh], outShiftU, minLim, maxLim, typeFlag); +#elif DILATED_VQ_CONV == VQ_FALSE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \ 
+ outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc3, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc4, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); +#endif + /* In order to handle odd depths*/ + int32_t enable2ndRow = XT_SALT(y, outH - 1); + int32_t enable3rdRow = XT_SALT(y, outH - 2); + int32_t enable4thRow = XT_SALT(y, outH - 3); + + /* Storing the first output depth, first row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput); + valign vaOutData = IVP_ZALIGN(); + IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * outW); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the first output depth, second row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch1 * enable2ndRow * bytesPerPixel); + IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut1H, dvecOut1L, IVP_ADD2NX8(IVP_SEQ2NX8(), inDataPitch1 * bytesPerPixel)), \ + vaOutData, pdvecOut, bytesPerPixel * enable2ndRow * outW); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the first output depth, 3rd row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + 2 * outDataPitch1 * enable3rdRow * bytesPerPixel); + IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut1H, dvecOut1L, IVP_ADD2NX8(IVP_SEQ2NX8(), 2 * inDataPitch1 * bytesPerPixel)), \ + vaOutData, pdvecOut, bytesPerPixel * enable3rdRow * outW); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the first output depth, 4th row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + 3 * outDataPitch1 * enable4thRow * bytesPerPixel); + IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut1H, dvecOut1L, IVP_ADD2NX8(IVP_SEQ2NX8(), 3 * inDataPitch1 * bytesPerPixel)), \ + vaOutData, pdvecOut, bytesPerPixel * enable4thRow * outW); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the second output depth, first row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch2 * enable2ndCh * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, 
bytesPerPixel * enable2ndCh * outW); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the second output depth, second row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + (outDataPitch2 * enable2ndCh + \ + outDataPitch1 * enable2ndRow) * bytesPerPixel); + IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut2H, dvecOut2L, IVP_ADD2NX8(IVP_SEQ2NX8(), inDataPitch1 * bytesPerPixel)), + vaOutData, pdvecOut, bytesPerPixel * enable2ndCh * \ + enable2ndRow * outW); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the second output depth, 3rd row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + (outDataPitch2 * enable2ndCh + \ + 2 * outDataPitch1 * enable3rdRow) * bytesPerPixel); + IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut2H, dvecOut2L, IVP_ADD2NX8(IVP_SEQ2NX8(), 2 * inDataPitch1 * bytesPerPixel)), + vaOutData, pdvecOut, bytesPerPixel * enable2ndCh * \ + enable3rdRow * outW); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the second output depth, 4th row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + (outDataPitch2 * enable2ndCh + \ + 3 * outDataPitch1 * enable4thRow) * bytesPerPixel); + IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut2H, dvecOut2L, IVP_ADD2NX8(IVP_SEQ2NX8(), 3 * inDataPitch1 * bytesPerPixel)), + vaOutData, pdvecOut, bytesPerPixel * enable2ndCh * \ + enable4thRow * outW); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the 3rd output depth, first row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + 2 * outDataPitch2 * enable3rdCh * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * enable3rdCh * outW); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the 3rd output depth, second row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + (2 * outDataPitch2 * enable3rdCh + \ + outDataPitch1 * enable2ndRow) * bytesPerPixel); + IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut3H, dvecOut3L, IVP_ADD2NX8(IVP_SEQ2NX8(), inDataPitch1 * bytesPerPixel)), + vaOutData, pdvecOut, bytesPerPixel * enable3rdCh * \ + enable2ndRow * outW); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the 
3rd output depth, 3rd row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + (2 * outDataPitch2 * enable3rdCh + \ + 2 * outDataPitch1 * enable3rdRow) * bytesPerPixel); + IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut3H, dvecOut3L, IVP_ADD2NX8(IVP_SEQ2NX8(), 2 * inDataPitch1 * bytesPerPixel)), + vaOutData, pdvecOut, bytesPerPixel * enable3rdCh * \ + enable3rdRow * outW); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the 3rd output depth, 4th row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + (2 * outDataPitch2 * enable3rdCh + \ + 3 * outDataPitch1 * enable4thRow) * bytesPerPixel); + IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut3H, dvecOut3L, IVP_ADD2NX8(IVP_SEQ2NX8(), 3 * inDataPitch1 * bytesPerPixel)), + vaOutData, pdvecOut, bytesPerPixel * enable3rdCh * \ + enable4thRow * outW); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the 4th output depth, first row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + 3 * outDataPitch2 * enable4thCh * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * enable4thCh * outW); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the 4th output depth, second row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + (3 * outDataPitch2 * enable4thCh + \ + outDataPitch1 * enable2ndRow) * bytesPerPixel); + IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut4H, dvecOut4L, IVP_ADD2NX8(IVP_SEQ2NX8(), inDataPitch1 * bytesPerPixel)), + vaOutData, pdvecOut, bytesPerPixel * enable4thCh * \ + enable2ndRow * outW); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the 4th output depth, 3rd row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + (3 * outDataPitch2 * enable4thCh + \ + 2 * outDataPitch1 * enable3rdRow) * bytesPerPixel); + IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut4H, dvecOut4L, IVP_ADD2NX8(IVP_SEQ2NX8(), 2 * inDataPitch1 * bytesPerPixel)), + vaOutData, pdvecOut, bytesPerPixel * enable4thCh * \ + enable3rdRow * outW); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the 4th output depth, 4th row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + (3 * outDataPitch2 * 
enable4thCh + \ + 3 * outDataPitch1 * enable4thRow) * bytesPerPixel); + IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut4H, dvecOut4L, IVP_ADD2NX8(IVP_SEQ2NX8(), 3 * inDataPitch1 * bytesPerPixel)), + vaOutData, pdvecOut, bytesPerPixel * enable4thCh * \ + enable4thRow * outW); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + pOutput += 4 * outDataPitch2 * bytesPerPixel; + pCoeff += 4 * coeffPitch3; + } /* end of for (outCh = 0; outCh < numOutCh; outCh += 4)*/ + } /* end of for (y = 0; y < outH; y++)*/ + } /* end of else part of if ((numInCh <= 2*XCHAL_IVPN_SIMD_WIDTH))*/ +} + + +/****************************************************************************************** +* MOW fold 32 Stride 1 variant * +* Starts each accumulator from the bias, accumulates the 1x1 convolution over the * +* input depth, then packs, scales, shifts and clamps the result. Each pass of the * +* inner loops produces two output rows for up to four output channels. * +* The original note says this function is called if inDataPitch1 is less than or * +* equal to 16. * +* NOTE(review): the name says FOLD32 and only two rows are taken from each result * +* vector, so the intended threshold is presumably 32 rather than 16 - confirm * +* against the caller. * +******************************************************************************************/ + +static _XAI_INLINE_ void MAKE_NAME(MAKE_NAME_VQ(convolved, 3D_S_1x1j1d1), S8IX_MOW_WHD_FOLD32) MAKE_ARGUMENTS(inTile, coeffTile, biasArray, outTile, param) +{ + /* Getting parameters from the tile structures */ + const int32_t outW = XAI_TILE3D_GET_DIM1(outTile); + const int32_t outH = XAI_TILE3D_GET_DIM2(outTile); + const int32_t numInCh = XAI_TILE3D_GET_DIM3(inTile); + const int32_t numOutCh = XAI_TILE3D_GET_DIM3(outTile); + const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile); + const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile); + const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile); + const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile); + const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile); + + /* CNN convolution parameters */ + const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param); + const uint8_t outShiftU = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param); + uint8_t enableReLu = XAI_CNN_CONV_GET_FLAG_RELU(param); + + /* Data Pointers of input, output, coefficient and bias data */ + MORPH_IDT_SCALAR* pInData = (MORPH_IDT_SCALAR *)
XAI_TILE3D_GET_DATA_PTR(inTile); + int8_t* pOutData = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile); + int32_t* pBiasData = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray); + int8_t* pCoeffData = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile); +#if DILATED_VQ_CONV == VQ_TRUE + uint16_t* restrict pOutScaleData = (uint16_t *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray); +#elif DILATED_VQ_CONV == VQ_FALSE + const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param); +#endif + /* Setting the limits for output data according to ReLu Flag and outTileType: + * with ReLU enabled the limits are read from param; otherwise the range of the + * output type is used (S16 or S8 limits, else 0..UCHAR_MAX). */ + int32_t minLim, maxLim; + if (enableReLu) + { + minLim = XAI_CNN_CONV_GET_RELU_MIN(param); + maxLim = XAI_CNN_CONV_GET_RELU_MAX(param); + } + else + { + minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \ + SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0); + maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \ + : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX); + } + const int8_t typeFlag = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0; + const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile); + + int32_t inCh, outCh, y; + MORPH_IDT_2Nx8 *restrict pdvecIn1; + MORPH_IDT_2Nx8 *restrict pdvecIn2; + + xb_vec2Nx8 * restrict pdvecOut; + + xb_vecN_2x32v * restrict phvecCoeff1; + + + /* there are 2 implementations, one for + * input channels less than or equal to 64, and other for input channels + * greater than 64 (the code compares against 2 * XCHAL_IVPN_SIMD_WIDTH). + * Adding one more loop to support more than 64 input channels is causing + * significant overhead and degrades the performance.
+ */ + + if ((numInCh <= 2 * XCHAL_IVPN_SIMD_WIDTH)) + { + for (y = 0; y < outH; y += 2) /* Loop across Output height, two rows per iteration */ + { + int32_t enable2ndRow = XT_SALT(y, outH - 1); + + /* initialize output data pointer */ + int8_t *pOutput = &pOutData[(y * outDataPitch1) * bytesPerPixel]; + + /* initialize input data pointer */ + MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * y]; + + /* initialize coeff and Bias data pointer */ + int8_t *pCoeff = &pCoeffData[0]; + int32_t *pBias = &pBiasData[0]; + + for (outCh = 0; outCh < numOutCh; outCh += 4) /* Loop across Output depth, four channels per iteration */ + { + /* In order to handle odd depths: each flag is expected to be 1 when that channel of the group of four exists and 0 otherwise (XT_SALT presumably yields 1 when its first argument is less than the second) */ + int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1); + int32_t enable3rdCh = XT_SALT(outCh, numOutCh - 2); + int32_t enable4thCh = XT_SALT(outCh, numOutCh - 3); + + /* Load the bias values corresponding to four output channels; the pointer step is scaled by the enable flags, so a missing channel presumably re-reads the last valid bias */ + xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4 * enable2ndCh); + xb_vecN_2x32v hvecBias2; IVP_LSRN_2X32_XP(hvecBias2, pBias, 4 * enable3rdCh); + xb_vecN_2x32v hvecBias3; IVP_LSRN_2X32_XP(hvecBias3, pBias, 4 * enable4thCh); + xb_vecN_2x32v hvecBias4; IVP_LSRN_2X32_XP(hvecBias4, pBias, 4); + + + xb_vec2Nx24 dacc1, dacc2, dacc3, dacc4; + dacc1 = IVP_CVT24UNX32L(hvecBias1, hvecBias1); + IVP_CVT24UNX32H(dacc1, hvecBias1, hvecBias1); + dacc2 = IVP_CVT24UNX32L(hvecBias2, hvecBias2); + IVP_CVT24UNX32H(dacc2, hvecBias2, hvecBias2); + dacc3 = IVP_CVT24UNX32L(hvecBias3, hvecBias3); + IVP_CVT24UNX32H(dacc3, hvecBias3, hvecBias3); + dacc4 = IVP_CVT24UNX32L(hvecBias4, hvecBias4); + IVP_CVT24UNX32H(dacc4, hvecBias4, hvecBias4); + + + /* variables for coeff loads */ + xb_vecN_2x32v hvecCoeffData1, hvecCoeffData2, hvecCoeffData3, hvecCoeffData4; + + /* read coeff vectors , for 4 consecutive output depths */ + /* coeff vector for 1st output channel */ + phvecCoeff1 = (xb_vecN_2x32v *) (pCoeff); + valign vaCoeffData; vaCoeffData = IVP_LAN_2X32_PP(phvecCoeff1); + IVP_LAVN_2X32_XP(hvecCoeffData1, vaCoeffData, phvecCoeff1, coeffPitch3); +
+ /* coeff vector for 2nd output channel */ + vaCoeffData = IVP_LAN_2X32_PP(phvecCoeff1); + IVP_LAVN_2X32_XP(hvecCoeffData2, vaCoeffData, phvecCoeff1, coeffPitch3 * enable2ndCh); + + /* coeff vector for 3rd output channel */ + vaCoeffData = IVP_LAN_2X32_PP(phvecCoeff1); + IVP_LAVN_2X32_XP(hvecCoeffData3, vaCoeffData, phvecCoeff1, coeffPitch3 * enable3rdCh); + + /* coeff vector for 4th output channel */ + vaCoeffData = IVP_LAN_2X32_PP(phvecCoeff1); + IVP_LAVN_2X32_XP(hvecCoeffData4, vaCoeffData, phvecCoeff1, coeffPitch3 * enable4thCh); + + pdvecIn1 = (MORPH_IDT_2Nx8 *) (pInput); + pdvecIn2 = (MORPH_IDT_2Nx8 *) (pInput + 2 * inDataPitch2); + + for (inCh = 0; inCh < numInCh - 3; inCh += 4) /* Loop across input depth */ + { + /* input vectors are read from 4 input depths at a time + * Scalar 32 bit coeff are extracted from the coeff vectors. + * pdvecIn1 serves the 1st and 2nd depth and pdvecIn2 the 3rd and 4th; assuming the + * last load argument is the pointer step, each pointer moves 1 + 3 planes per pass. */ + + MORPH_IDT_2Nx8 dvecData1, dvecData2, dvecData3, dvecData4; + + /* Read vector input data from 1st depth */ + valign vaInData; vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1); + MORPH_OP_LOAD_2Nx8(dvecData1, vaInData, pdvecIn1, inDataPitch2); + + /* Read vector input data from 2nd depth */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1); + MORPH_OP_LOAD_2Nx8(dvecData2, vaInData, pdvecIn1, 3 * inDataPitch2); + + /* Read vector input data from 3rd depth */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn2); + MORPH_OP_LOAD_2Nx8(dvecData3, vaInData, pdvecIn2, inDataPitch2); + + /* Read vector input data from 4th depth */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn2); + MORPH_OP_LOAD_2Nx8(dvecData4, vaInData, pdvecIn2, 3 * inDataPitch2); + + /* extract scalar coeff from coeff vectors */ + int32_t coeff1 = IVP_EXTRVRN_2X32(hvecCoeffData1, inCh); /* 1st o/p depth coeff */ + int32_t coeff2 = IVP_EXTRVRN_2X32(hvecCoeffData2, inCh); /* 2nd o/p depth coeff */ + int32_t coeff3 = IVP_EXTRVRN_2X32(hvecCoeffData3, inCh); /* 3rd o/p depth coeff */ + int32_t coeff4 = IVP_EXTRVRN_2X32(hvecCoeffData4, inCh); /* 4th o/p depth coeff */ + + MORPH_OP_MULQA(dacc1,
dvecData4, dvecData3, dvecData2, dvecData1, coeff1); + MORPH_OP_MULQA(dacc2, dvecData4, dvecData3, dvecData2, dvecData1, coeff2); + MORPH_OP_MULQA(dacc3, dvecData4, dvecData3, dvecData2, dvecData1, coeff3); + MORPH_OP_MULQA(dacc4, dvecData4, dvecData3, dvecData2, dvecData1, coeff4); + } /* end of for (inCh = 0; inCh < numInCh - 3; inCh += 4)*/ + /* Corner case handling if number of inCh is not a multiple of 4: 1 to 3 depths remain, the load steps are gated by XT_SALT so the pointer is not stepped past the last depth, and the fourth multiplier operand is passed as 0 */ + if (inCh < numInCh) + { + MORPH_IDT_2Nx8 dvecData1, dvecData2, dvecData3; + + /* Read vector input data from 1st depth */ + valign vaInData; vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1); + MORPH_OP_LOAD_2Nx8(dvecData1, vaInData, pdvecIn1, inDataPitch2 * XT_SALT(inCh, numInCh - 1)); + + /* Read vector input data from 2nd depth */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1); + MORPH_OP_LOAD_2Nx8(dvecData2, vaInData, pdvecIn1, inDataPitch2 * XT_SALT(inCh, numInCh - 2)); + + /* Read vector input data from 3rd depth */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1); + MORPH_OP_LOAD_2Nx8(dvecData3, vaInData, pdvecIn1, 0); + + + /* extract scalar coeff from coeff vectors */ + int32_t coeff1 = IVP_EXTRVRN_2X32(hvecCoeffData1, inCh); /* 1st o/p depth coeff */ + int32_t coeff2 = IVP_EXTRVRN_2X32(hvecCoeffData2, inCh); /* 2nd o/p depth coeff */ + int32_t coeff3 = IVP_EXTRVRN_2X32(hvecCoeffData3, inCh); /* 3rd o/p depth coeff */ + int32_t coeff4 = IVP_EXTRVRN_2X32(hvecCoeffData4, inCh); /* 4th o/p depth coeff */ + + MORPH_OP_MULQA(dacc1, 0, dvecData3, dvecData2, dvecData1, coeff1); + MORPH_OP_MULQA(dacc2, 0, dvecData3, dvecData2, dvecData1, coeff2); + MORPH_OP_MULQA(dacc3, 0, dvecData3, dvecData2, dvecData1, coeff3); + MORPH_OP_MULQA(dacc4, 0, dvecData3, dvecData2, dvecData1, coeff4); + } + + + /* Pack, Output Scale, Output Shift and clamping */ + xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L; + xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H; +#if DILATED_VQ_CONV == VQ_TRUE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \ +
pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \ + pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc3, packShiftAccU, \ + pOutScaleData[outCh + 2 * enable3rdCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc4, packShiftAccU, \ + pOutScaleData[outCh + 3 * enable4thCh], outShiftU, minLim, maxLim, typeFlag); +#elif DILATED_VQ_CONV == VQ_FALSE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc3, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc4, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); +#endif + /* Storing the first output depth, first row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput); + valign vaOutData = IVP_ZALIGN(); + IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * outW); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the first output depth, second row: the row is presumably picked out of the H/L result pair starting at lane inDataPitch1 * bytesPerPixel; when enable2ndRow is 0 both the address offset and the byte count become 0 */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch1 * enable2ndRow * bytesPerPixel); + IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut1H, dvecOut1L, IVP_ADD2NX8(IVP_SEQ2NX8(), inDataPitch1 * bytesPerPixel)), \ + vaOutData, pdvecOut, bytesPerPixel * enable2ndRow * outW); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + + /* Storing the second output depth, first row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch2 * enable2ndCh * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * enable2ndCh * outW); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the second output depth, second row */ +
pdvecOut = (xb_vec2Nx8 *) (pOutput + (outDataPitch2 * enable2ndCh + \ + outDataPitch1 * enable2ndRow) * bytesPerPixel); + IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut2H, dvecOut2L, IVP_ADD2NX8(IVP_SEQ2NX8(), inDataPitch1 * bytesPerPixel)), + vaOutData, pdvecOut, bytesPerPixel * enable2ndCh * \ + enable2ndRow * outW); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the 3rd output depth, first row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + 2 * outDataPitch2 * enable3rdCh * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * enable3rdCh * outW); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the 3rd output depth, second row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + (2 * outDataPitch2 * enable3rdCh + \ + outDataPitch1 * enable2ndRow) * bytesPerPixel); + IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut3H, dvecOut3L, IVP_ADD2NX8(IVP_SEQ2NX8(), inDataPitch1 * bytesPerPixel)), + vaOutData, pdvecOut, bytesPerPixel * enable3rdCh * \ + enable2ndRow * outW); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the 4th output depth, first row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + 3 * outDataPitch2 * enable4thCh * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * enable4thCh * outW); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the 4th output depth, second row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + (3 * outDataPitch2 * enable4thCh + \ + outDataPitch1 * enable2ndRow) * bytesPerPixel); + IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut4H, dvecOut4L, IVP_ADD2NX8(IVP_SEQ2NX8(), inDataPitch1 * bytesPerPixel)), + vaOutData, pdvecOut, bytesPerPixel * enable4thCh * \ + enable2ndRow * outW); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + pOutput += 4 * outDataPitch2 * bytesPerPixel; + pCoeff += 4 * coeffPitch3; + } /* end of for (outCh = 0; outCh < numOutCh; outCh += 4)*/ + } /* end of for (y = 0; y < outH; y += 2)*/ + } /* end of if ((numInCh <= 2*XCHAL_IVPN_SIMD_WIDTH))*/ + else + { +#ifdef __XCC__ + XT_MEMW(); /* Adding Memory Wait
as Gather and Normal Load/Stores are not synchronized */ +#endif + + /* generate the sequence 0,1,2,3,0 + coeffPitch3, 1 + coeffPitch3, + * 2 + coeffPitch3, 3 + coeffPitch3, 0 + 2 * coeffPitch3, 1 + 2 * coeffPitch3, + * 2 + 2 * coeffPitch3, 3 + 2 * coeffPitch3, ..... + * + * for e.g, if coeffPitch3 is 32: + * 0,1,2,3,32,33,34,35,64,65,66,67,96,97,98,99,.. + * + * This sequence is used to gather coeff from 4 diff output channels, 4 each from + * every channel corresponding to 4 i/p channels, as innermost loop(inCh) is unrolled by + * 4 to make use of quad multiplier. + */ + xb_vecNx16U vecIdx1 = IVP_SEQNX16(); + vecIdx1 = IVP_PACKVRNRNX48(IVP_MULNX16(vecIdx1, coeffPitch3), 0); + vecIdx1 = IVP_SELNX16I(IVP_ADDNX16(vecIdx1, 1), vecIdx1, IVP_SELI_16B_INTERLEAVE_1_LO); + vecIdx1 = IVP_SELNX16I(IVP_ADDNX16(vecIdx1, 2), vecIdx1, IVP_SELI_32B_INTERLEAVE_1_LO); + xb_gsr gs0; + + for (y = 0; y < outH; y += 2) /* Loop across Output height, two rows per iteration */ + { + xb_vecNx16U vecIdx2; + /* In order to handle odd rows*/ + int32_t enable2ndRow = XT_SALT(y, outH - 1); + /* initialize output data pointer */ + int8_t *pOutput = &pOutData[(y * outDataPitch1) * bytesPerPixel]; + + /* initialize input data pointer */ + MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * y]; + + /* initialize Bias and coeff data pointers */ + + int32_t *pBias = &pBiasData[0]; + int8_t *pCoeff = &pCoeffData[0]; + + for (outCh = 0; outCh < numOutCh; outCh += 4) /* Loop across Output depth, four channels per iteration */ + { + /* In order to handle odd depths*/ + int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1); + int32_t enable3rdCh = XT_SALT(outCh, numOutCh - 2); + int32_t enable4thCh = XT_SALT(outCh, numOutCh - 3); + + /* load and replicate bias data for each output channel */ + xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4 * enable2ndCh); + xb_vecN_2x32v hvecBias2; IVP_LSRN_2X32_XP(hvecBias2, pBias, 4 * enable3rdCh); + xb_vecN_2x32v hvecBias3; IVP_LSRN_2X32_XP(hvecBias3, pBias, 4 * enable4thCh); + xb_vecN_2x32v hvecBias4;
IVP_LSRN_2X32_XP(hvecBias4, pBias, 4); + + xb_vec2Nx24 dacc1, dacc2, dacc3, dacc4; + dacc1 = IVP_CVT24UNX32L(hvecBias1, hvecBias1); + IVP_CVT24UNX32H(dacc1, hvecBias1, hvecBias1); + dacc2 = IVP_CVT24UNX32L(hvecBias2, hvecBias2); + IVP_CVT24UNX32H(dacc2, hvecBias2, hvecBias2); + dacc3 = IVP_CVT24UNX32L(hvecBias3, hvecBias3); + IVP_CVT24UNX32H(dacc3, hvecBias3, hvecBias3); + dacc4 = IVP_CVT24UNX32L(hvecBias4, hvecBias4); + IVP_CVT24UNX32H(dacc4, hvecBias4, hvecBias4); + + /* boolean mask to gather coeffs, if all the four o/p channels + * are present 16 coeff are loaded. + */ + vboolN mask = IVP_LTNX16(IVP_SEQNX16(), (xb_vecNx16) XT_MIN(((numOutCh - outCh) * 4), 16)); + /* Assign valid address for predicated false lines */ + vecIdx2 = IVP_MOVNX16UT(vecIdx1, 0, mask); + + pdvecIn1 = (MORPH_IDT_2Nx8 *) (pInput); + pdvecIn2 = (MORPH_IDT_2Nx8 *) (pInput + 2 * inDataPitch2); + +#ifdef IS_VISION_130 + for (inCh = 0; inCh < numInCh - 3; inCh += 4) /* Loop across input depth */ + { + /* input vectors are read from 4 input depths at a time + * Scalar 32 bit coeff are extracted from the coeff vectors */ + + MORPH_IDT_2Nx8 dvecData1, dvecData2, dvecData3, dvecData4; + + /* Read vector input data from 1st depth */ + MORPH_OP_L2_2Nx8(dvecData1, pdvecIn1, inDataPitch2); + + /* Read vector input data from 2nd depth */ + MORPH_OP_L2_2Nx8(dvecData2, pdvecIn1, 3 * inDataPitch2); + + /* Read vector input data from 3rd depth */ + MORPH_OP_L2_2Nx8(dvecData3, pdvecIn2, inDataPitch2); + + /* Read vector input data from 4th depth */ + MORPH_OP_L2_2Nx8(dvecData4, pdvecIn2, 3 * inDataPitch2); + + /* gather the coeffs */ + gs0 = IVP_GATHERANX8S(pCoeff + inCh, vecIdx2); + xb_vec2Nx8 dvecCoeffData = IVP_GATHERD2NX8_L(gs0); + + /* extract scalar coeff from coeff vectors */ + int32_t coeff1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData)), 0); /* 1st o/p depth coeff */ + int32_t coeff2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ +
IVP_MOVNX16_FROM2NX8(dvecCoeffData)), 1); /* 2nd o/p depth coeff */ + int32_t coeff3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData)), 2); /* 3rd o/p depth coeff */ + int32_t coeff4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData)), 3); /* 4th o/p depth coeff */ + + MORPH_OP_MULQA(dacc1, dvecData4, dvecData3, dvecData2, dvecData1, coeff1); + MORPH_OP_MULQA(dacc2, dvecData4, dvecData3, dvecData2, dvecData1, coeff2); + MORPH_OP_MULQA(dacc3, dvecData4, dvecData3, dvecData2, dvecData1, coeff3); + MORPH_OP_MULQA(dacc4, dvecData4, dvecData3, dvecData2, dvecData1, coeff4); + } /* end of for (inCh = 0; inCh < numInCh - 3; inCh += 4)*/ + if (inCh < numInCh) + { + MORPH_IDT_2Nx8 dvecData1, dvecData2, dvecData3; + + pdvecIn2 = (MORPH_IDT_2Nx8 *) (((int8_t *) pdvecIn1) + 2 * inDataPitch2 * XT_SALT(inCh, numInCh - 2)); + /* Read vector input data from 1st depth */ + valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1); + MORPH_OP_LOAD_2Nx8(dvecData1, vaInData, pdvecIn1, inDataPitch2 * XT_SALT(inCh, numInCh - 1)); + + /* Read vector input data from 2nd depth */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1); + IVP_LAV2NX8_XP(dvecData2, vaInData, pdvecIn1, 2 * XCHAL_IVPN_SIMD_WIDTH * XT_SALT(inCh, numInCh - 1)); + + /* Read vector input data from 3rd depth */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn2); + IVP_LAV2NX8_XP(dvecData3, vaInData, pdvecIn2, 2 * XCHAL_IVPN_SIMD_WIDTH * XT_SALT(inCh, numInCh - 2)); + + /* Boolean mask for gather to handle the case where fewer than 4 input channels remain */ + vboolN mask1 = IVP_LTNX16(IVP_ANDNX16(IVP_SEQNX16(), 3), (numInCh - inCh)); + /* Assign valid address for predicated false lines */ + vecIdx2 = IVP_MOVNX16UT(vecIdx2, 0, mask1); + /* Gather coeffs */ + gs0 = IVP_GATHERANX8S(pCoeff + inCh, vecIdx2); + xb_vec2Nx8 dvecCoeffData = IVP_GATHERD2NX8_L(gs0); + + /* extract scalar coeff from coeff vectors */ + int32_t coeff1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData)), 0); /* 1st o/p
depth coeff */ + int32_t coeff2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData)), 1); /* 2nd o/p depth coeff */ + int32_t coeff3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData)), 2); /* 3rd o/p depth coeff */ + int32_t coeff4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData)), 3); /* 4th o/p depth coeff */ + + MORPH_OP_MULQA(dacc1, 0, dvecData3, dvecData2, dvecData1, coeff1); + MORPH_OP_MULQA(dacc2, 0, dvecData3, dvecData2, dvecData1, coeff2); + MORPH_OP_MULQA(dacc3, 0, dvecData3, dvecData2, dvecData1, coeff3); + MORPH_OP_MULQA(dacc4, 0, dvecData3, dvecData2, dvecData1, coeff4); + } /* end of if (inCh < numInCh)*/ +#else + for (inCh = 0; inCh < numInCh - 3; inCh += 4) /* Loop across input depth */ + { + /* input vectors are read from 4 input depths at a time + * Scalar 32 bit coeff are extracted from the coeff vectors */ + + MORPH_IDT_2Nx8 dvecData1, dvecData2, dvecData3, dvecData4; + + /* Read vector input data from 1st depth */ + valign vaInData; vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1); + MORPH_OP_LOAD_2Nx8(dvecData1, vaInData, pdvecIn1, inDataPitch2); + + /* Read vector input data from 2nd depth */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1); + MORPH_OP_LOAD_2Nx8(dvecData2, vaInData, pdvecIn1, 3 * inDataPitch2); + + /* Read vector input data from 3rd depth */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn2); + MORPH_OP_LOAD_2Nx8(dvecData3, vaInData, pdvecIn2, inDataPitch2); + + /* Read vector input data from 4th depth */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn2); + MORPH_OP_LOAD_2Nx8(dvecData4, vaInData, pdvecIn2, 3 * inDataPitch2); + + /* gather the coeffs */ + gs0 = IVP_GATHERANX8S(pCoeff + inCh, vecIdx2); + xb_vec2Nx8 dvecCoeffData = IVP_GATHERD2NX8_L(gs0); + + /* extract scalar coeff from coeff vectors */ + int32_t coeff1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData)), 0); /* 1st o/p depth coeff */ + int32_t coeff2 =
IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData)), 1); /* 2nd o/p depth coeff */ + int32_t coeff3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData)), 2); /* 3rd o/p depth coeff */ + int32_t coeff4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData)), 3); /* 4th o/p depth coeff */ + + MORPH_OP_MULQA(dacc1, dvecData4, dvecData3, dvecData2, dvecData1, coeff1); + MORPH_OP_MULQA(dacc2, dvecData4, dvecData3, dvecData2, dvecData1, coeff2); + MORPH_OP_MULQA(dacc3, dvecData4, dvecData3, dvecData2, dvecData1, coeff3); + MORPH_OP_MULQA(dacc4, dvecData4, dvecData3, dvecData2, dvecData1, coeff4); + } /* end of for (inCh = 0; inCh < numInCh - 3; inCh += 4)*/ + if (inCh < numInCh) + { + MORPH_IDT_2Nx8 dvecData1, dvecData2, dvecData3; + + pdvecIn2 = (MORPH_IDT_2Nx8 *) (((int8_t *) pdvecIn1) + 2 * inDataPitch2 * XT_SALT(inCh, numInCh - 2)); + /* Read vector input data from 1st depth */ + valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1); + MORPH_OP_LOAD_2Nx8(dvecData1, vaInData, pdvecIn1, inDataPitch2 * XT_SALT(inCh, numInCh - 1)); + + /* Read vector input data from 2nd depth */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1); + IVP_LAV2NX8_XP(dvecData2, vaInData, pdvecIn1, 2 * XCHAL_IVPN_SIMD_WIDTH * XT_SALT(inCh, numInCh - 1)); + + /* Read vector input data from 3rd depth */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn2); + IVP_LAV2NX8_XP(dvecData3, vaInData, pdvecIn2, 2 * XCHAL_IVPN_SIMD_WIDTH * XT_SALT(inCh, numInCh - 2)); + + /* Boolean mask for gather to handle the case where fewer than 4 input channels remain */ + vboolN mask1 = IVP_LTNX16(IVP_ANDNX16(IVP_SEQNX16(), 3), (numInCh - inCh)); + /* Assign valid address for predicated false lines */ + vecIdx2 = IVP_MOVNX16UT(vecIdx2, 0, mask1); + /* Gather coeffs */ + gs0 = IVP_GATHERANX8S(pCoeff + inCh, vecIdx2); + xb_vec2Nx8 dvecCoeffData = IVP_GATHERD2NX8_L(gs0); + + /* extract scalar coeff from coeff vectors */ + int32_t coeff1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ +
IVP_MOVNX16_FROM2NX8(dvecCoeffData)), 0); /* 1st o/p depth coeff */ + int32_t coeff2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData)), 1); /* 2nd o/p depth coeff */ + int32_t coeff3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData)), 2); /* 3rd o/p depth coeff */ + int32_t coeff4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData)), 3); /* 4th o/p depth coeff */ + + MORPH_OP_MULQA(dacc1, 0, dvecData3, dvecData2, dvecData1, coeff1); + MORPH_OP_MULQA(dacc2, 0, dvecData3, dvecData2, dvecData1, coeff2); + MORPH_OP_MULQA(dacc3, 0, dvecData3, dvecData2, dvecData1, coeff3); + MORPH_OP_MULQA(dacc4, 0, dvecData3, dvecData2, dvecData1, coeff4); + } /* end of if (inCh < numInCh)*/ +#endif + /* Pack, Output Scale, Output Shift and clamping */ + xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L; + xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H; +#if DILATED_VQ_CONV == VQ_TRUE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \ + pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \ + pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc3, packShiftAccU, \ + pOutScaleData[outCh + 2 * enable3rdCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc4, packShiftAccU, \ + pOutScaleData[outCh + 3 * enable4thCh], outShiftU, minLim, maxLim, typeFlag); +#elif DILATED_VQ_CONV == VQ_FALSE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc3, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim,
typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc4, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); +#endif + + /* Storing the first output depth, first row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput); + valign vaOutData = IVP_ZALIGN(); + IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * outW); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the first output depth, second row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch1 * enable2ndRow * bytesPerPixel); + IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut1H, dvecOut1L, IVP_ADD2NX8(IVP_SEQ2NX8(), inDataPitch1 * bytesPerPixel)), \ + vaOutData, pdvecOut, bytesPerPixel * enable2ndRow * outW); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + + /* Storing the second output depth, first row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch2 * enable2ndCh * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * enable2ndCh * outW); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the second output depth, second row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + (outDataPitch2 * enable2ndCh + \ + outDataPitch1 * enable2ndRow) * bytesPerPixel); + IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut2H, dvecOut2L, IVP_ADD2NX8(IVP_SEQ2NX8(), inDataPitch1 * bytesPerPixel)), + vaOutData, pdvecOut, bytesPerPixel * enable2ndCh * \ + enable2ndRow * outW); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the 3rd output depth, first row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + 2 * outDataPitch2 * enable3rdCh * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * enable3rdCh * outW); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the 3rd output depth, second row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + (2 * outDataPitch2 * enable3rdCh + \ + outDataPitch1 * enable2ndRow) * bytesPerPixel); + IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut3H, dvecOut3L, IVP_ADD2NX8(IVP_SEQ2NX8(), inDataPitch1 * bytesPerPixel)), + vaOutData, pdvecOut,
bytesPerPixel * enable3rdCh * \ + enable2ndRow * outW); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the 4th output depth, first row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + 3 * outDataPitch2 * enable4thCh * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * enable4thCh * outW); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the 4th output depth, second row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + (3 * outDataPitch2 * enable4thCh + \ + outDataPitch1 * enable2ndRow) * bytesPerPixel); + IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut4H, dvecOut4L, IVP_ADD2NX8(IVP_SEQ2NX8(), inDataPitch1 * bytesPerPixel)), + vaOutData, pdvecOut, bytesPerPixel * enable4thCh * \ + enable2ndRow * outW); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + pOutput += 4 * outDataPitch2 * bytesPerPixel; + pCoeff += 4 * coeffPitch3; + } /* end of for (outCh = 0; outCh < numOutCh; outCh += 4)*/ + } /* end of for (y = 0; y < outH; y += 2)*/ + } /* end of else part of if ((numInCh <= 2*XCHAL_IVPN_SIMD_WIDTH))*/ +} + +/****************** xaiConvolvedVQ3D_S_1x1j1d1_S8S8IX_MOW_WHD ******************/ +/****************** xaiConvolvedVQ3D_S_1x1j1d1_U8S8IX_MOW_WHD ******************/ +/****************** xaiConvolved3D_S_1x1j1d1_S8S8IX_MOW_WHD *****************/ +/****************** xaiConvolved3D_S_1x1j1d1_U8S8IX_MOW_WHD *****************/ + +XAI_ERR_TYPE MAKE_NAME(MAKE_NAME_VQ(xaiConvolved, 3D_S_1x1j1d1), S8IX_MOW_WHD) MAKE_ARGUMENTS(inTile, coeffTile, biasArray, outTile, param) +{ + /* Error Checks */ + XAI_ERROR_CHECKS() + { + MORPH_IDT_CHECK(inTile); + XAI_CHECK_CONV_OUTPUT_TILE3D(outTile); + XAI_CHECK_TILE4D_S8(coeffTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile); + XAI_CHECK_TILE4D_IN_DRAM_BOUNDARY(coeffTile); + XAI_CHECK_POINTER(param); + XAI_CHECK_ARRAY_S32(biasArray); + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTile); + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(coeffTile, outTile); + XAI_CHECK_KERNEL_SIZE(coeffTile,
1); + XAI_CHECK_TILE3D_DATA_ORDER(inTile, XAI_WHD); + XAI_CHECK_TILE3D_DATA_ORDER(outTile, XAI_WHD); + XAI_CHECK_TILE4D_DATA_ORDER(coeffTile, XAI_WHDN); + XAI_CHECK_STRIDE(param, 1); + XAI_CHECK_ERROR((XAI_CNN_CONV_GET_STRIDEX(param) == XAI_CNN_CONV_GET_STRIDEY(param)), \ + XAI_ERR_BADARG, "Stride along width = %hhu and height = %hhu\nStride along width and height should be equal", \ + XAI_CNN_CONV_GET_STRIDEX(param), XAI_CNN_CONV_GET_STRIDEY(param)); + XAI_CHECK_DILATION(param, 1); + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_DILATIONX(param) == XAI_CNN_CONV_GET_DILATIONY(param), \ + XAI_ERR_BADARG, "\nDilation along width = %hhu and height = %hhu\nDilation along width and height should be equal", \ + XAI_CNN_CONV_GET_DILATIONX(param), XAI_CNN_CONV_GET_DILATIONY(param)); + XAI_CHECK_CONSISTENCY_MOW_WHD(inTile, coeffTile, biasArray, outTile, param); + XAI_CHECK_COEFFTILE_CONTIGUOUS(coeffTile, param); + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_ACCUM_SHIFT(param) < 24, \ + XAI_ERR_NORM, "\nThe accumulator shift = %hhu, value should be less than 24", \ + XAI_CNN_CONV_GET_ACCUM_SHIFT(param)); + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_OUTPUT_SHIFT(param) < 32, \ + XAI_ERR_NORM, "\nThe output shift = %hhu, value should be less than 32", \ + XAI_CNN_CONV_GET_OUTPUT_SHIFT(param)); + XAI_CHECK_CONV_RELU_LIMITS_IX(param, outTile); +#if DILATED_VQ_CONV == VQ_TRUE + XAI_CHECK_ARRAY_U16(outputScaleArray); + XAI_CHECK_ERROR(XAI_ARRAY_GET_WIDTH(outputScaleArray) >= XAI_TILE4D_GET_DIM4(coeffTile), \ + XAI_ERR_DATASIZE, "\nWidth of Output Scale Array = %d, Number of Kernels = %d\nWidth of Output Scale Array should be greater than or equal to Number of Kernels", \ + XAI_ARRAY_GET_WIDTH(outputScaleArray), XAI_TILE4D_GET_DIM4(coeffTile)); + XAI_CHECK_ERROR((((uintptr_t) (XAI_ARRAY_GET_DATA_PTR(outputScaleArray)) & \ + 0x1) == 0), XAI_ERR_NORM, "The output scale array is not aligned to 2 byte boundary"); +#endif + } +#if DILATED_VQ_CONV == VQ_FALSE + if (XAI_CNN_CONV_GET_OUTPUT_SCALE(param) == 0) + { + int32_t 
fillValue; + int32_t reluFlag = XAI_CNN_CONV_GET_FLAG_RELU(param); + fillValue = reluFlag ? (CLAMP(0, XAI_CNN_CONV_GET_RELU_MIN(param), XAI_CNN_CONV_GET_RELU_MAX(param))) : 0; + return(xaiFillTile3D(outTile, fillValue, 0)); + } +#endif + /* Getting parameters from the tile structures */ + const int32_t outW = XAI_TILE3D_GET_DIM1(outTile); + const int32_t outH = XAI_TILE3D_GET_DIM2(outTile); + const int32_t numInCh = XAI_TILE3D_GET_DIM3(inTile); + const int32_t numOutCh = XAI_TILE3D_GET_DIM3(outTile); + const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile); + const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile); + const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile); + const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile); + const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile); + /* if pitch = width in input and output tile call the no edge variant*/ + int32_t enableFlatten = ((inDataPitch1 == XAI_TILE3D_GET_DIM1(inTile)) && \ + (outDataPitch1 == outW) && (inDataPitch1 == outDataPitch1)); + + XAI_ERROR_CHECKS_CONTINUE() + { + if ((XAI_TILE3D_IS_ALIGNED_2NX8(inTile) == 0) && (enableFlatten || (numInCh > 2 * XCHAL_IVPN_SIMD_WIDTH))) + { + XAI_CHECK_TILE4D_FITS_IN_SINGLE_DRAM(coeffTile); + if (numOutCh > 1) + { + /* Max value of Gather Offset is (min(numOutCh-1,3)*coeffPitch3 + min(numInCh-1, 3)) */ + XAI_CHECK_ERROR(coeffPitch3 < ((USHRT_MAX - XT_MIN(numInCh - 1, 3)) / XT_MIN(numOutCh - 1, 3)), \ + XAI_ERR_BADARG, "\ndim3Pitch value of coeffTile = %d, should be less than Gather Offset(16-bit limit) - %d", \ + coeffPitch3, ((USHRT_MAX - XT_MIN(numInCh - 1, 3)) / XT_MIN(numOutCh - 1, 3))); + } + } + } + /* if pitch = width in input and output tile call the no edge variant*/ + if (enableFlatten) + { + MAKE_NAME(MAKE_NAME_VQ(convolved, 3D_S_1x1j1d1), S8IX_MOW_WHD_NOEDGE) MAKE_PARAMS(inTile, coeffTile, biasArray, outTile, param); + return(XAI_ERROR_STATUS()); + } + /* check inDataPitch1, if it is less than or equal to 
16, + * call FOLD16 variant and if it's greater than + * 16 but less than or equal to 32 call FOLD32 variant otherwise continue + */ + if (inDataPitch1 <= 16) + { + MAKE_NAME(MAKE_NAME_VQ(convolved, 3D_S_1x1j1d1), S8IX_MOW_WHD_FOLD16) MAKE_PARAMS(inTile, coeffTile, biasArray, outTile, param); + return(XAI_ERROR_STATUS()); + } + + if (inDataPitch1 <= 32) + { + MAKE_NAME(MAKE_NAME_VQ(convolved, 3D_S_1x1j1d1), S8IX_MOW_WHD_FOLD32) MAKE_PARAMS(inTile, coeffTile, biasArray, outTile, param); + return(XAI_ERROR_STATUS()); + } + + /* CNN convolution parameters */ + const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param); + const uint8_t outShiftU = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param); + uint8_t enableReLu = XAI_CNN_CONV_GET_FLAG_RELU(param); + + /* Data Pointers of input, output, coefficient and bias data */ + MORPH_IDT_SCALAR* pInData = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(inTile); + int8_t* pOutData = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile); + int32_t* pBiasData = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray); + int8_t* pCoeffData = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile); +#if DILATED_VQ_CONV == VQ_TRUE + uint16_t* restrict pOutScaleData = (uint16_t *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray); +#elif DILATED_VQ_CONV == VQ_FALSE + const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param); +#endif + const int32_t vectorizationWidth = 2 * XCHAL_IVPN_SIMD_WIDTH; + + /* Setting the limits for output data according to ReLu Flag and outTileType */ + int32_t minLim, maxLim; + if (enableReLu) + { + minLim = XAI_CNN_CONV_GET_RELU_MIN(param); + maxLim = XAI_CNN_CONV_GET_RELU_MAX(param); + } + else + { + minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \ + SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0); + maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \ + : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX); + } + const int8_t typeFlag = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 
1 : 0; + const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile); + + int32_t inCh, outCh, x, y; + MORPH_IDT_2Nx8 *restrict pdvecIn1; + MORPH_IDT_2Nx8 *restrict pdvecIn2; + MORPH_IDT_2Nx8 *restrict pdvecIn3; + MORPH_IDT_2Nx8 *restrict pdvecIn4; + xb_vec2Nx8 * restrict pdvecOut; + xb_vec2Nx8 * restrict pdvecCoeff1; + xb_vec2Nx8 * restrict pdvecCoeff2; + xb_vec2Nx8 * restrict pdvecCoeff3; + xb_vec2Nx8 * restrict pdvecCoeff4; + xb_vecN_2x32v * restrict phvecCoeff1; + + /* The overall design approach is split into 2 sections, one + * with aligned input data and the other with unaligned input data. + * The implementation with aligned input data gives the best performance */ + + /* In the unaligned input data case, there are 2 implementations, one for + * input channels less than or equal to 64, and other for input channels + * greater than 64. + * Adding one more loop to support more than 64 input channels is causing + * significant overhead and degrades the the performance. + */ + + if (XAI_TILE3D_IS_ALIGNED_2NX8(inTile)) + { + for (x = 0; x < outW; x += vectorizationWidth) /* Loop across Output width */ + { + for (y = 0; y < outH; y++) /* Loop across Output height */ + { + /* initialize output data pointer */ + int8_t *pOutput = &pOutData[(y * outDataPitch1 + x) * bytesPerPixel]; + + /* initialize input data pointer */ + MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * y + x]; + + /* initialize coeff and Bias data pointer */ + int32_t *pBias = &pBiasData[0]; + + for (outCh = 0; outCh < numOutCh; outCh += 4) /* Loop across Output depth */ + { + /* In order to handle odd depths*/ + int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1); + int32_t enable3rdCh = XT_SALT(outCh, numOutCh - 2); + int32_t enable4thCh = XT_SALT(outCh, numOutCh - 3); + + /* Load the bias values corresponding to two output channels */ + xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4 * enable2ndCh); + xb_vecN_2x32v hvecBias2; IVP_LSRN_2X32_XP(hvecBias2, pBias, 4 * 
enable3rdCh); + xb_vecN_2x32v hvecBias3; IVP_LSRN_2X32_XP(hvecBias3, pBias, 4 * enable4thCh); + xb_vecN_2x32v hvecBias4; IVP_LSRN_2X32_XP(hvecBias4, pBias, 4); + + xb_vec2Nx24 dacc1, dacc2, dacc3, dacc4; + dacc1 = IVP_CVT24UNX32L(hvecBias1, hvecBias1); + IVP_CVT24UNX32H(dacc1, hvecBias1, hvecBias1); + dacc2 = IVP_CVT24UNX32L(hvecBias2, hvecBias2); + IVP_CVT24UNX32H(dacc2, hvecBias2, hvecBias2); + dacc3 = IVP_CVT24UNX32L(hvecBias3, hvecBias3); + IVP_CVT24UNX32H(dacc3, hvecBias3, hvecBias3); + dacc4 = IVP_CVT24UNX32L(hvecBias4, hvecBias4); + IVP_CVT24UNX32H(dacc4, hvecBias4, hvecBias4); + + + /* Coefficient and input pointers */ + int8_t *pCoeff = &pCoeffData[outCh * coeffPitch3]; + pdvecCoeff1 = (xb_vec2Nx8 *) pCoeff; + pdvecCoeff2 = (xb_vec2Nx8 *) (pCoeff + coeffPitch3 * enable2ndCh); + pdvecCoeff3 = (xb_vec2Nx8 *) (pCoeff + 2 * coeffPitch3 * enable3rdCh); + pdvecCoeff4 = (xb_vec2Nx8 *) (pCoeff + 3 * coeffPitch3 * enable4thCh); + pdvecIn1 = (MORPH_IDT_2Nx8 *) (pInput); + pdvecIn2 = (MORPH_IDT_2Nx8 *) (pInput + inDataPitch2); + pdvecIn3 = (MORPH_IDT_2Nx8 *) (pInput + 2 * inDataPitch2); + pdvecIn4 = (MORPH_IDT_2Nx8 *) (pInput + 3 * inDataPitch2); + + /* Priming Loads for Coefficients */ + valign vaCoeff1 = IVP_LA2NX8_PP(pdvecCoeff1); + valign vaCoeff2 = IVP_LA2NX8_PP(pdvecCoeff2); + valign vaCoeff3 = IVP_LA2NX8_PP(pdvecCoeff3); + valign vaCoeff4 = IVP_LA2NX8_PP(pdvecCoeff4); + + for (inCh = 0; inCh < numInCh - 3; inCh += 4) /* Loop across input depth */ + { + /* input vectors are read from 4 input depths at at time + * Scalar 32 bit coeff are extracted from the coeff vectors */ + MORPH_IDT_2Nx8 dvecData1, dvecData2, dvecData3, dvecData4; + + /* Read vector input data from 1st depth */ + MORPH_OP_ALIGN_LOAD_2Nx8(dvecData1, pdvecIn1, 4 * inDataPitch2); + + /* Read vector input data from 2nd depth */ + MORPH_OP_ALIGN_LOAD_2Nx8(dvecData2, pdvecIn2, 4 * inDataPitch2); + + /* Read vector input data from 3rd depth */ + MORPH_OP_ALIGN_LOAD_2Nx8(dvecData3, pdvecIn3, 4 * 
inDataPitch2); + + /* Read vector input data from 4th depth */ + MORPH_OP_ALIGN_LOAD_2Nx8(dvecData4, pdvecIn4, 4 * inDataPitch2); + + xb_vec2Nx8 dvecCoeff1, dvecCoeff2, dvecCoeff3, dvecCoeff4; + IVP_LAV2NX8_XP(dvecCoeff1, vaCoeff1, pdvecCoeff1, 4); + IVP_LAV2NX8_XP(dvecCoeff2, vaCoeff2, pdvecCoeff2, 4); + IVP_LAV2NX8_XP(dvecCoeff3, vaCoeff3, pdvecCoeff3, 4); + IVP_LAV2NX8_XP(dvecCoeff4, vaCoeff4, pdvecCoeff4, 4); + + int32_t coeff1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecCoeff1)), 0); + int32_t coeff2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecCoeff2)), 0); + int32_t coeff3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecCoeff3)), 0); + int32_t coeff4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecCoeff4)), 0); + + MORPH_OP_MULQA(dacc1, dvecData4, dvecData3, dvecData2, dvecData1, coeff1); + MORPH_OP_MULQA(dacc2, dvecData4, dvecData3, dvecData2, dvecData1, coeff2); + MORPH_OP_MULQA(dacc3, dvecData4, dvecData3, dvecData2, dvecData1, coeff3); + MORPH_OP_MULQA(dacc4, dvecData4, dvecData3, dvecData2, dvecData1, coeff4); + } /* end of for (inCh = 0; inCh < numInCh - 3; inCh += 4)*/ + + /* Corner case handling if number of inCh is not a multiple of 4 */ + if (inCh < numInCh) + { + int32_t remInCh = numInCh - inCh; + + /* input vectors are read from 4 input depths at at time + * Scalar 32 bit coeff are extracted from the coeff vectors */ + MORPH_IDT_2Nx8 dvecData1, dvecData2, dvecData3; + + /* Read vector input data from 1st depth */ + MORPH_OP_ALIGN_LOAD_2Nx8(dvecData1, pdvecIn1, inDataPitch2 * XT_SALT(1, remInCh)); + + /* Read vector input data from 2nd depth */ + MORPH_OP_ALIGN_LOAD_2Nx8(dvecData2, pdvecIn1, inDataPitch2 * XT_SALT(2, remInCh)); + + /* Read vector input data from 3rd depth */ + MORPH_OP_ALIGN_LOAD_2Nx8(dvecData3, pdvecIn1, 0); + + xb_vec2Nx8 dvecCoeff1, dvecCoeff2, dvecCoeff3, dvecCoeff4; + IVP_LAV2NX8_XP(dvecCoeff1, vaCoeff1, pdvecCoeff1, remInCh); + 
IVP_LAV2NX8_XP(dvecCoeff2, vaCoeff2, pdvecCoeff2, remInCh); + IVP_LAV2NX8_XP(dvecCoeff3, vaCoeff3, pdvecCoeff3, remInCh); + IVP_LAV2NX8_XP(dvecCoeff4, vaCoeff4, pdvecCoeff4, remInCh); + + int32_t coeff1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecCoeff1)), 0); + int32_t coeff2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecCoeff2)), 0); + int32_t coeff3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecCoeff3)), 0); + int32_t coeff4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecCoeff4)), 0); + + MORPH_OP_MULQA(dacc1, 0, dvecData3, dvecData2, dvecData1, coeff1); + MORPH_OP_MULQA(dacc2, 0, dvecData3, dvecData2, dvecData1, coeff2); + MORPH_OP_MULQA(dacc3, 0, dvecData3, dvecData2, dvecData1, coeff3); + MORPH_OP_MULQA(dacc4, 0, dvecData3, dvecData2, dvecData1, coeff4); + } /* end of corner case handling*/ + + /* Pack, Output Scale, Output Shift and clamping */ + xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L; + xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H; +#if DILATED_VQ_CONV == VQ_TRUE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \ + pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \ + pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc3, packShiftAccU, \ + pOutScaleData[outCh + 2 * enable3rdCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc4, packShiftAccU, \ + pOutScaleData[outCh + 3 * enable4thCh], outShiftU, minLim, maxLim, typeFlag); +#elif DILATED_VQ_CONV == VQ_FALSE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, 
typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc3, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc4, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); +#endif + /* variable store count */ + int32_t varLen = outW - x; + + /* store output to 1st output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOutput); + valign vaOutData; vaOutData = IVP_ZALIGN(); + IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * varLen); + IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * \ + 2 * (varLen - XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* store output to 2nd output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch2 * enable2ndCh * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * varLen * enable2ndCh); + IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * \ + 2 * (varLen - XCHAL_IVPN_SIMD_WIDTH) * enable2ndCh); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* store output to 3rd output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + 2 * outDataPitch2 * enable3rdCh * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * varLen * enable3rdCh); + IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * \ + 2 * (varLen - XCHAL_IVPN_SIMD_WIDTH) * enable3rdCh); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* store output to 4th output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + 3 * outDataPitch2 * enable4thCh * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * varLen * enable4thCh); + IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * \ + 2 * (varLen - XCHAL_IVPN_SIMD_WIDTH) * enable4thCh); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + pOutput += 4 * outDataPitch2 * bytesPerPixel; + } /* end of for (outCh = 0; outCh < numOutCh; outCh += 4)*/ + } /* end of for (y = 0; y < outH; y++)*/ + } /* end of for (x = 0; x < outW; 
x += vectorizationWidth)*/ + } + else + { + if ((numInCh <= 2 * XCHAL_IVPN_SIMD_WIDTH)) + { + for (x = 0; x < outW; x += vectorizationWidth) /* Loop across Output width */ + { + for (y = 0; y < outH; y++) /* Loop across Output height */ + { + /* initialize output data pointer */ + int8_t *pOutput = &pOutData[(y * outDataPitch1 + x) * bytesPerPixel]; + + /* initialize input data pointer */ + MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * y + x]; + + /* initialize coeff and Bias data pointer */ + int8_t *pCoeff = &pCoeffData[0]; + int32_t *pBias = &pBiasData[0]; + + for (outCh = 0; outCh < numOutCh; outCh += 4) /* Loop across Output depth */ + { + /* In order to handle odd depths*/ + int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1); + int32_t enable3rdCh = XT_SALT(outCh, numOutCh - 2); + int32_t enable4thCh = XT_SALT(outCh, numOutCh - 3); + + /* Load the bias values corresponding to two output channels */ + xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4 * enable2ndCh); + xb_vecN_2x32v hvecBias2; IVP_LSRN_2X32_XP(hvecBias2, pBias, 4 * enable3rdCh); + xb_vecN_2x32v hvecBias3; IVP_LSRN_2X32_XP(hvecBias3, pBias, 4 * enable4thCh); + xb_vecN_2x32v hvecBias4; IVP_LSRN_2X32_XP(hvecBias4, pBias, 4); + + + xb_vec2Nx24 dacc1, dacc2, dacc3, dacc4; + dacc1 = IVP_CVT24UNX32L(hvecBias1, hvecBias1); + IVP_CVT24UNX32H(dacc1, hvecBias1, hvecBias1); + dacc2 = IVP_CVT24UNX32L(hvecBias2, hvecBias2); + IVP_CVT24UNX32H(dacc2, hvecBias2, hvecBias2); + dacc3 = IVP_CVT24UNX32L(hvecBias3, hvecBias3); + IVP_CVT24UNX32H(dacc3, hvecBias3, hvecBias3); + dacc4 = IVP_CVT24UNX32L(hvecBias4, hvecBias4); + IVP_CVT24UNX32H(dacc4, hvecBias4, hvecBias4); + + + /* variables for coeff loads */ + xb_vecN_2x32v hvecCoeffData1, hvecCoeffData2, hvecCoeffData3, hvecCoeffData4; + + /* read coeff vectors , for 4 consecutive output depths */ + /* coeff vector for 1st output channel */ + phvecCoeff1 = (xb_vecN_2x32v *) (pCoeff); + valign vaCoeffData; vaCoeffData = IVP_LAN_2X32_PP(phvecCoeff1); + 
IVP_LAVN_2X32_XP(hvecCoeffData1, vaCoeffData, phvecCoeff1, coeffPitch3); + + /* coeff vector for 2nd output channel */ + vaCoeffData = IVP_LAN_2X32_PP(phvecCoeff1); + IVP_LAVN_2X32_XP(hvecCoeffData2, vaCoeffData, phvecCoeff1, coeffPitch3 * enable2ndCh); + + /* coeff vector for 3rd output channel */ + vaCoeffData = IVP_LAN_2X32_PP(phvecCoeff1); + IVP_LAVN_2X32_XP(hvecCoeffData3, vaCoeffData, phvecCoeff1, coeffPitch3 * enable3rdCh); + + /* coeff vector for 4th output channel */ + vaCoeffData = IVP_LAN_2X32_PP(phvecCoeff1); + IVP_LAVN_2X32_XP(hvecCoeffData4, vaCoeffData, phvecCoeff1, coeffPitch3 * enable4thCh); + + pdvecIn1 = (MORPH_IDT_2Nx8 *) (pInput); + pdvecIn2 = (MORPH_IDT_2Nx8 *) (pInput + 2 * inDataPitch2); + + for (inCh = 0; inCh < numInCh - 3; inCh += 4) /* Loop across input depth */ + { + /* input vectors are read from 4 input depths at at time + * Scalar 32 bit coeff are extracted from the coeff vectors */ + + MORPH_IDT_2Nx8 dvecData1, dvecData2, dvecData3, dvecData4; + + /* Read vector input data from 1st depth */ + valign vaInData; vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1); + MORPH_OP_LOAD_2Nx8(dvecData1, vaInData, pdvecIn1, inDataPitch2); + + /* Read vector input data from 2nd depth */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1); + MORPH_OP_LOAD_2Nx8(dvecData2, vaInData, pdvecIn1, 3 * inDataPitch2); + + /* Read vector input data from 3rd depth */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn2); + MORPH_OP_LOAD_2Nx8(dvecData3, vaInData, pdvecIn2, inDataPitch2); + + /* Read vector input data from 4th depth */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn2); + MORPH_OP_LOAD_2Nx8(dvecData4, vaInData, pdvecIn2, 3 * inDataPitch2); + + /* extract scalar coeff from coeff vectors */ + int32_t coeff1 = IVP_EXTRVRN_2X32(hvecCoeffData1, inCh); /* 1st o/p depth coeff */ + int32_t coeff2 = IVP_EXTRVRN_2X32(hvecCoeffData2, inCh); /* 2nd o/p depth coeff */ + int32_t coeff3 = IVP_EXTRVRN_2X32(hvecCoeffData3, inCh); /* 3rd o/p depth coeff */ + int32_t coeff4 = 
IVP_EXTRVRN_2X32(hvecCoeffData4, inCh); /* 4th o/p depth coeff */ + + MORPH_OP_MULQA(dacc1, dvecData4, dvecData3, dvecData2, dvecData1, coeff1); + MORPH_OP_MULQA(dacc2, dvecData4, dvecData3, dvecData2, dvecData1, coeff2); + MORPH_OP_MULQA(dacc3, dvecData4, dvecData3, dvecData2, dvecData1, coeff3); + MORPH_OP_MULQA(dacc4, dvecData4, dvecData3, dvecData2, dvecData1, coeff4); + } /* end of for (inCh = 0; inCh < numInCh - 3; inCh += 4)*/ + /* Corner case handling if number of inCh is not a multiple of 4 */ + if (inCh < numInCh) + { + MORPH_IDT_2Nx8 dvecData1, dvecData2, dvecData3; + + /* Read vector input data from 1st depth */ + valign vaInData; vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1); + MORPH_OP_LOAD_2Nx8(dvecData1, vaInData, pdvecIn1, inDataPitch2 * XT_SALT(inCh, numInCh - 1)); + + /* Read vector input data from 2nd depth */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1); + MORPH_OP_LOAD_2Nx8(dvecData2, vaInData, pdvecIn1, inDataPitch2 * XT_SALT(inCh, numInCh - 2)); + + /* Read vector input data from 3rd depth */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1); + MORPH_OP_LOAD_2Nx8(dvecData3, vaInData, pdvecIn1, inDataPitch2); + + + /* extract scalar coeff from coeff vectors */ + int32_t coeff1 = IVP_EXTRVRN_2X32(hvecCoeffData1, inCh); /* 1st o/p depth coeff */ + int32_t coeff2 = IVP_EXTRVRN_2X32(hvecCoeffData2, inCh); /* 2nd o/p depth coeff */ + int32_t coeff3 = IVP_EXTRVRN_2X32(hvecCoeffData3, inCh); /* 3rd o/p depth coeff */ + int32_t coeff4 = IVP_EXTRVRN_2X32(hvecCoeffData4, inCh); /* 4th o/p depth coeff */ + + MORPH_OP_MULQA(dacc1, 0, dvecData3, dvecData2, dvecData1, coeff1); + MORPH_OP_MULQA(dacc2, 0, dvecData3, dvecData2, dvecData1, coeff2); + MORPH_OP_MULQA(dacc3, 0, dvecData3, dvecData2, dvecData1, coeff3); + MORPH_OP_MULQA(dacc4, 0, dvecData3, dvecData2, dvecData1, coeff4); + } + + /* Pack, Output Scale, Output Shift and clamping */ + xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L; + xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H; +#if 
DILATED_VQ_CONV == VQ_TRUE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \ + pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \ + pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc3, packShiftAccU, \ + pOutScaleData[outCh + 2 * enable3rdCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc4, packShiftAccU, \ + pOutScaleData[outCh + 3 * enable4thCh], outShiftU, minLim, maxLim, typeFlag); +#elif DILATED_VQ_CONV == VQ_FALSE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc3, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc4, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); +#endif + /* variable store count */ + int32_t varLen = outW - x; + + /* store output to 1st output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOutput); + valign vaOutData; vaOutData = IVP_ZALIGN(); + IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * varLen); + IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * \ + 2 * (varLen - XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* store output to 2nd output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch2 * enable2ndCh * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * varLen * enable2ndCh); + IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * \ + 2 * (varLen - XCHAL_IVPN_SIMD_WIDTH) * enable2ndCh); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* store output to 3rd 
output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + 2 * outDataPitch2 * enable3rdCh * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * varLen * enable3rdCh); + IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * \ + 2 * (varLen - XCHAL_IVPN_SIMD_WIDTH) * enable3rdCh); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* store output to 4th output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + 3 * outDataPitch2 * enable4thCh * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * varLen * enable4thCh); + IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * \ + 2 * (varLen - XCHAL_IVPN_SIMD_WIDTH) * enable4thCh); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + pOutput += 4 * outDataPitch2 * bytesPerPixel; + pCoeff += 4 * coeffPitch3; + } /* end of for (outCh = 0; outCh < numOutCh; outCh += 4)*/ + } /* end of for (y = 0; y < outH; y++)*/ + } /* end of for (x = 0; x < outW; x += vectorizationWidth)*/ + } /* end of if ((numInCh <= 2*XCHAL_IVPN_SIMD_WIDTH))*/ + else + { +#ifdef __XCC__ + XT_MEMW(); /* Adding Memory Wait as Gather and Normal Load/Stores are not synchronized */ +#endif + + /* generate the sequence 0,1,2,3,0 + coeffPitch3, 1 + coeffPitch3, + * 2 + coeffPitch3, 3 + coeffPitch3, 0 + 2 * coeffPitch3, 1 + 2 * coeffPitch3, + * 2 + 2 * coeffPitch3, 3 + 2 * coeffPitch3, ..... + * + * for e.g, if coeffPitch3 is 32: + * 0,1,2,3,32,33,34,35,64,65,66,67,96,96,98,99,.. + * + * This sequence is used to gather coeff from 4 diff output channels, 4 each from + * every channel corresponding to 4 i/p channels, as innermost loop(inCh) is unrolled by + * 4 to make use of quad multipler. 
+ */ + xb_vecNx16U vecIdx1 = IVP_SEQNX16(); + vecIdx1 = IVP_PACKVRNRNX48(IVP_MULNX16(vecIdx1, coeffPitch3), 0); + vecIdx1 = IVP_SELNX16I(IVP_ADDNX16(vecIdx1, 1), vecIdx1, IVP_SELI_16B_INTERLEAVE_1_LO); + vecIdx1 = IVP_SELNX16I(IVP_ADDNX16(vecIdx1, 2), vecIdx1, IVP_SELI_32B_INTERLEAVE_1_LO); + xb_gsr gs0; + + for (x = 0; x < outW; x += vectorizationWidth) /* Loop across Output width */ + { + /* variable store count */ + int32_t varLen = outW - x; + xb_vecNx16U vecIdx2; + for (y = 0; y < outH; y++) /* Loop across Output height */ + { + /* initialize output data pointer */ + int8_t *pOutput = &pOutData[(y * outDataPitch1 + x) * bytesPerPixel]; + + /* initialize input data pointer */ + MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * y + x]; + + /* initialize Bias data pointer */ + + int32_t *pBias = &pBiasData[0]; + int8_t *pCoeff = &pCoeffData[0]; + + for (outCh = 0; outCh < numOutCh; outCh += 4) /* Loop across Output depth */ + { + /* In order to handle odd depths*/ + int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1); + int32_t enable3rdCh = XT_SALT(outCh, numOutCh - 2); + int32_t enable4thCh = XT_SALT(outCh, numOutCh - 3); + + /* load and replicate bias data for each output channel */ + xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4 * enable2ndCh); + xb_vecN_2x32v hvecBias2; IVP_LSRN_2X32_XP(hvecBias2, pBias, 4 * enable3rdCh); + xb_vecN_2x32v hvecBias3; IVP_LSRN_2X32_XP(hvecBias3, pBias, 4 * enable4thCh); + xb_vecN_2x32v hvecBias4; IVP_LSRN_2X32_XP(hvecBias4, pBias, 4); + + xb_vec2Nx24 dacc1, dacc2, dacc3, dacc4; + dacc1 = IVP_CVT24UNX32L(hvecBias1, hvecBias1); + IVP_CVT24UNX32H(dacc1, hvecBias1, hvecBias1); + dacc2 = IVP_CVT24UNX32L(hvecBias2, hvecBias2); + IVP_CVT24UNX32H(dacc2, hvecBias2, hvecBias2); + dacc3 = IVP_CVT24UNX32L(hvecBias3, hvecBias3); + IVP_CVT24UNX32H(dacc3, hvecBias3, hvecBias3); + dacc4 = IVP_CVT24UNX32L(hvecBias4, hvecBias4); + IVP_CVT24UNX32H(dacc4, hvecBias4, hvecBias4); + + /* boolean mask to gather coeffs, if all the four 
o/p channels + * are present 16 coeff are loaded. + */ + vboolN mask = IVP_LTNX16(IVP_SEQNX16(), (xb_vecNx16) XT_MIN(((numOutCh - outCh) * 4), 16)); + /* Assign valid address for predicated false lines */ + vecIdx2 = IVP_MOVNX16UT(vecIdx1, 0, mask); + + pdvecIn1 = (MORPH_IDT_2Nx8 *) (pInput); + pdvecIn2 = (MORPH_IDT_2Nx8 *) (pInput + 2 * inDataPitch2); + + for (inCh = 0; inCh < numInCh - 3; inCh += 4) /* Loop across input depth */ + { + /* input vectors are read from 4 input depths at at time + * Scalar 32 bit coeff are extracted from the coeff vectors */ + + MORPH_IDT_2Nx8 dvecData1, dvecData2, dvecData3, dvecData4; + + /* Read vector input data from 1st depth */ + valign vaInData; vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1); + MORPH_OP_LOAD_2Nx8(dvecData1, vaInData, pdvecIn1, inDataPitch2); + + /* Read vector input data from 2nd depth */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1); + MORPH_OP_LOAD_2Nx8(dvecData2, vaInData, pdvecIn1, 3 * inDataPitch2); + + /* Read vector input data from 3rd depth */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn2); + MORPH_OP_LOAD_2Nx8(dvecData3, vaInData, pdvecIn2, inDataPitch2); + + /* Read vector input data from 4th depth */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn2); + MORPH_OP_LOAD_2Nx8(dvecData4, vaInData, pdvecIn2, 3 * inDataPitch2); + + /* gather the coeffs */ + gs0 = IVP_GATHERANX8S(pCoeff + inCh, vecIdx2); + xb_vec2Nx8 dvecCoeffData = IVP_GATHERD2NX8_L(gs0); + + /* extract scalar coeff from coeff vectors */ + int32_t coeff1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData)), 0); /* 1st o/p depth coeff */ + int32_t coeff2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData)), 1); /* 2nd o/p depth coeff */ + int32_t coeff3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData)), 2); /* 3rd o/p depth coeff */ + int32_t coeff4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData)), 3); /* 4th o/p depth coeff */ + + 
MORPH_OP_MULQA(dacc1, dvecData4, dvecData3, dvecData2, dvecData1, coeff1); + MORPH_OP_MULQA(dacc2, dvecData4, dvecData3, dvecData2, dvecData1, coeff2); + MORPH_OP_MULQA(dacc3, dvecData4, dvecData3, dvecData2, dvecData1, coeff3); + MORPH_OP_MULQA(dacc4, dvecData4, dvecData3, dvecData2, dvecData1, coeff4); + } /* end of for (inCh = 0; inCh < numInCh; inCh += 4)*/ + if (inCh < numInCh) + { + MORPH_IDT_2Nx8 dvecData1, dvecData2, dvecData3; + + pdvecIn2 = (MORPH_IDT_2Nx8 *) (((int8_t *) pdvecIn1) + 2 * inDataPitch2 * XT_SALT(inCh, numInCh - 2)); + /* Read vector input data from 1st depth */ + valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1); + MORPH_OP_LOAD_2Nx8(dvecData1, vaInData, pdvecIn1, inDataPitch2 * XT_SALT(inCh, numInCh - 1)); + + /* Read vector input data from 2nd depth */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1); + IVP_LAV2NX8_XP(dvecData2, vaInData, pdvecIn1, varLen * XT_SALT(inCh, numInCh - 1)); + + /* Read vector input data from 3rd depth */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn2); + IVP_LAV2NX8_XP(dvecData3, vaInData, pdvecIn2, varLen * XT_SALT(inCh, numInCh - 2)); + + /* Boolean mask for gather to handle cases where inCh<4 */ + vboolN mask1 = IVP_LTNX16(IVP_ANDNX16(IVP_SEQNX16(), 3), (numInCh - inCh)); + /* Assign valid address for predicated false lines */ + vecIdx2 = IVP_MOVNX16UT(vecIdx2, 0, mask1); + /* Gather coeffs */ + gs0 = IVP_GATHERANX8S(pCoeff + inCh, vecIdx2); + xb_vec2Nx8 dvecCoeffData = IVP_GATHERD2NX8_L(gs0); + + /* extract scalar coeff from coeff vectors */ + int32_t coeff1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData)), 0); /* 1st o/p depth coeff */ + int32_t coeff2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData)), 1); /* 2nd o/p depth coeff */ + int32_t coeff3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData)), 2); /* 3rd o/p depth coeff */ + int32_t coeff4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + 
IVP_MOVNX16_FROM2NX8(dvecCoeffData)), 3); /* 4th o/p depth coeff */ + + MORPH_OP_MULQA(dacc1, 0, dvecData3, dvecData2, dvecData1, coeff1); + MORPH_OP_MULQA(dacc2, 0, dvecData3, dvecData2, dvecData1, coeff2); + MORPH_OP_MULQA(dacc3, 0, dvecData3, dvecData2, dvecData1, coeff3); + MORPH_OP_MULQA(dacc4, 0, dvecData3, dvecData2, dvecData1, coeff4); + } /* end of if (inCh < numInCh)*/ + + /* Pack, Output Scale, Output Shift and clamping */ + xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L; + xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H; +#if DILATED_VQ_CONV == VQ_TRUE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \ + pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \ + pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc3, packShiftAccU, \ + pOutScaleData[outCh + 2 * enable3rdCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc4, packShiftAccU, \ + pOutScaleData[outCh + 3 * enable4thCh], outShiftU, minLim, maxLim, typeFlag); +#elif DILATED_VQ_CONV == VQ_FALSE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc3, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc4, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); +#endif + /* store output to 1st output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOutput); + valign vaOutData; vaOutData = IVP_ZALIGN(); + IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * varLen); + IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, 
typeFlag * \ + 2 * (varLen - XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* store output to 2nd output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch2 * enable2ndCh * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * varLen * enable2ndCh); + IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * \ + 2 * (varLen - XCHAL_IVPN_SIMD_WIDTH) * enable2ndCh); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* store output to 3rd output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + 2 * outDataPitch2 * enable3rdCh * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * varLen * enable3rdCh); + IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * \ + 2 * (varLen - XCHAL_IVPN_SIMD_WIDTH) * enable3rdCh); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* store output to 4th output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + 3 * outDataPitch2 * enable4thCh * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * varLen * enable4thCh); + IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * \ + 2 * (varLen - XCHAL_IVPN_SIMD_WIDTH) * enable4thCh); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + pOutput += 4 * outDataPitch2 * bytesPerPixel; + pCoeff += 4 * coeffPitch3; + } /* end of for (outCh = 0; outCh < numOutCh; outCh += 4)*/ + } /* end of for (y = 0; y < outH; y++)*/ + } /* end of for (x = 0; x < outW; x += vectorizationWidth)*/ + } /* end of else part of if ((numInCh <= 2*XCHAL_IVPN_SIMD_WIDTH))*/ + } + return(XAI_ERROR_STATUS()); +} + +/***************************************************************************** +* xaiConvolved(VQ)3D_S_1x1j2d1I8S8IX_MOW_WHD +* **************************************************************************/ + +/******************************************************************************/ +/* Description : P6 optimized generic implementation for 1x1 3D convolution. 
*/ +/* Based on MORPH pre-processor specifiers, code implementation */ +/* is generated during preprocessing stage. This method can be */ +/* used to generate 1x1 3D MOW_WHD dilated convolution function */ +/* and 1x1 3D VQ MOW_WHD dilated convolution function for U8 */ +/* and S8 input data with input stride equal to 2 */ +/* Inputs : Input Data Tile, Coeff Data Tile, Bias Array, */ +/* Output scale array (VQ variants only), CNN convolution params structure */ +/* Outputs : XAI Error Code (XAI_ERR_TYPE) */ +/* InOuts : Output Tile */ +/* Assumptions : CoeffData is S8 */ +/* biasArray is signed 32b, value not exceeding signed 24b */ +/* Output scale array is U16 */ +/* OutData is S8 / U8 / S16 */ +/* Kernel Size is 1x1xDxN */ +/* Input and Output are in WHD format */ +/* Coeff is in WHDN format */ +/******************************************************************************/ + +/****************** xaiConvolvedVQ3D_S_1x1j2d1_S8S8IX_MOW_WHD ******************/ +/****************** xaiConvolvedVQ3D_S_1x1j2d1_U8S8IX_MOW_WHD ******************/ +/******************* xaiConvolved3D_S_1x1j2d1_S8S8IX_MOW_WHD *******************/ +/******************* xaiConvolved3D_S_1x1j2d1_U8S8IX_MOW_WHD *******************/ + +XAI_ERR_TYPE MAKE_NAME(MAKE_NAME_VQ(xaiConvolved, 3D_S_1x1j2d1), S8IX_MOW_WHD) MAKE_ARGUMENTS(inTile, coeffTile, biasArray, outTile, param) +{ + /* Error Checks */ + XAI_ERROR_CHECKS() + { + MORPH_IDT_CHECK(inTile); + XAI_CHECK_CONV_OUTPUT_TILE3D(outTile); + XAI_CHECK_TILE4D_S8(coeffTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile); + XAI_CHECK_TILE4D_IN_DRAM_BOUNDARY(coeffTile); + XAI_CHECK_POINTER(param); + XAI_CHECK_ARRAY_S32(biasArray); + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTile); + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(coeffTile, outTile); + XAI_CHECK_KERNEL_SIZE(coeffTile, 1); + XAI_CHECK_TILE3D_DATA_ORDER(inTile, XAI_WHD); + XAI_CHECK_TILE3D_DATA_ORDER(outTile, XAI_WHD); + XAI_CHECK_TILE4D_DATA_ORDER(coeffTile, XAI_WHDN); + 
XAI_CHECK_STRIDE(param, 2); + XAI_CHECK_ERROR((XAI_CNN_CONV_GET_STRIDEX(param) == XAI_CNN_CONV_GET_STRIDEY(param)), \ + XAI_ERR_BADARG, "Stride along width = %hhu and height = %hhu\nStride along width and height should be equal", \ + XAI_CNN_CONV_GET_STRIDEX(param), XAI_CNN_CONV_GET_STRIDEY(param)); + XAI_CHECK_DILATION(param, 1); + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_DILATIONX(param) == XAI_CNN_CONV_GET_DILATIONY(param), \ + XAI_ERR_BADARG, "\nDilation along width = %hhu and height = %hhu\nDilation along width and height should be equal", \ + XAI_CNN_CONV_GET_DILATIONX(param), XAI_CNN_CONV_GET_DILATIONY(param)); + XAI_CHECK_CONSISTENCY_MOW_WHD(inTile, coeffTile, biasArray, outTile, param); + XAI_CHECK_COEFFTILE_CONTIGUOUS(coeffTile, param); + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_ACCUM_SHIFT(param) < 24, \ + XAI_ERR_NORM, "\nThe accumulator shift = %hhu, value should be less than 24", \ + XAI_CNN_CONV_GET_ACCUM_SHIFT(param)); + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_OUTPUT_SHIFT(param) < 32, \ + XAI_ERR_NORM, "\nThe output shift = %hhu, value should be less than 32", \ + XAI_CNN_CONV_GET_OUTPUT_SHIFT(param)); + XAI_CHECK_CONV_RELU_LIMITS_IX(param, outTile); +#if DILATED_VQ_CONV == VQ_TRUE + XAI_CHECK_ARRAY_U16(outputScaleArray); + XAI_CHECK_ERROR(XAI_ARRAY_GET_WIDTH(outputScaleArray) >= XAI_TILE4D_GET_DIM4(coeffTile), \ + XAI_ERR_DATASIZE, "\nWidth of Output Scale Array = %d, Number of Kernels = %d\nWidth of Output Scale Array should be greater than or equal to Number of Kernels", \ + XAI_ARRAY_GET_WIDTH(outputScaleArray), XAI_TILE4D_GET_DIM4(coeffTile)); + XAI_CHECK_ERROR((((uintptr_t) (XAI_ARRAY_GET_DATA_PTR(outputScaleArray)) & \ + 0x1) == 0), XAI_ERR_NORM, "The output scale array is not aligned to 2 byte boundary"); +#endif + } +#if DILATED_VQ_CONV == VQ_FALSE + if (XAI_CNN_CONV_GET_OUTPUT_SCALE(param) == 0) /* zero output scale: fill outTile with 0 (clamped to the ReLU limits when ReLU is enabled) and return early */ + { + int32_t fillValue; + int32_t reluFlag = XAI_CNN_CONV_GET_FLAG_RELU(param); + fillValue = reluFlag ? 
(CLAMP(0, XAI_CNN_CONV_GET_RELU_MIN(param), XAI_CNN_CONV_GET_RELU_MAX(param))) : 0; + return(xaiFillTile3D(outTile, fillValue, 0)); + } +#endif + /* Getting parameters from the tile structures (inW includes both dim1 edges of the input tile) */ + const int32_t inW = XAI_TILE3D_GET_DIM1(inTile) + \ + XAI_TILE3D_GET_DIM1_EDGE1(inTile) + XAI_TILE3D_GET_DIM1_EDGE2(inTile); + const int32_t outW = XAI_TILE3D_GET_DIM1(outTile); + const int32_t outH = XAI_TILE3D_GET_DIM2(outTile); + const int32_t numInCh = XAI_TILE3D_GET_DIM3(inTile); + const int32_t numOutCh = XAI_TILE3D_GET_DIM3(outTile); + + XAI_ERROR_CHECKS_CONTINUE() + { + if (numInCh > 64) /* extra checks for the gather-based path; NOTE(review): the path itself is selected with numInCh > 2 * XCHAL_IVPN_SIMD_WIDTH - confirm that always matches this hard-coded 64 */ + { + XAI_CHECK_TILE4D_FITS_IN_SINGLE_DRAM(coeffTile); + if (numOutCh > 1) + { + /* Max value of Gather Offset is (min(numOutCh-1,3)*coeffPitch3 + min(numInCh-1, 3)) */ + XAI_CHECK_ERROR(XAI_TILE4D_GET_DIM3_PITCH(coeffTile) < \ + ((USHRT_MAX - XT_MIN(numInCh - 1, 3)) / XT_MIN(numOutCh - 1, 3)), XAI_ERR_BADARG, \ + "\ndim3Pitch value of coeffTile = %d, should be less than Gather Offset(16-bit limit) - %d", \ + XAI_TILE4D_GET_DIM3_PITCH(coeffTile), ((USHRT_MAX - XT_MIN(numInCh - 1, 3)) / XT_MIN(numOutCh - 1, 3))); + } + } + } + + /* CNN convolution parameters */ + const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param); + const uint8_t outShiftU = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param); + const uint8_t enableReLu = XAI_CNN_CONV_GET_FLAG_RELU(param); + const uint8_t stride = XAI_CNN_CONV_GET_STRIDE(param); + + /* Pitch of Coefficient Data (WHDN) in dim3 */ + const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile); + + /* Pitches of Input Data (WHD) in dim1 and dim2 */ + const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile); + const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile); + + /* Pitches of Output Data (WHD) in dim1 and dim2 */ + const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile); + const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile); + + /* Data Pointers of input, output, coefficient and bias data */ + 
MORPH_IDT_SCALAR* pInData = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(inTile); + int8_t* pOutData = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile); + int32_t* pBiasData = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray); + int8_t* pCoeffData = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile); +#if DILATED_VQ_CONV == VQ_TRUE + uint16_t* restrict pOutScaleData = (uint16_t *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray); +#elif DILATED_VQ_CONV == VQ_FALSE + const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param); +#endif + const int32_t vectorizationWidth = 2 * XCHAL_IVPN_SIMD_WIDTH; + + /* Setting the limits for output data according to ReLu Flag and outTileType */ + int32_t minLim, maxLim; + if (enableReLu) + { + minLim = XAI_CNN_CONV_GET_RELU_MIN(param); + maxLim = XAI_CNN_CONV_GET_RELU_MAX(param); + } + else + { + minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \ + SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0); + maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \ + : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX); + } + const int8_t typeFlag = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0; + const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile); + + /* variable declarations */ + int32_t inCh, outCh, x, y; + MORPH_IDT_2Nx8 *restrict pdvecIn1; + MORPH_IDT_2Nx8 *restrict pdvecIn2; + xb_vec2Nx8 * restrict pdvecOut; + xb_vecN_2x32v * restrict phvecCoeff; + + + /* The overall design approach is split into 2 sections, one handles + * optimal tile sizes for giving best performance, other handles rest + * of the tile sizes */ + + /* The if-section handles the optimal input tile size for best performance. + * If the input tile depth is less than or equal to 2 * XCHAL_IVPN_SIMD_WIDTH, use + * this design approach, otherwise jump to the else part. 
Adding one more loop + * to support more than 2 * XCHAL_IVPN_SIMD_WIDTH input channels causes significant overhead, + * damaging the performance */ + if ((numInCh <= 2 * XCHAL_IVPN_SIMD_WIDTH)) + { + /* Loop structure starts with loop across output width */ + for (x = 0; x < outW; x += vectorizationWidth) /* loop across output width */ + { + /* out of bound flag */ + int32_t flag = XT_SALT(2 * XCHAL_IVPN_SIMD_WIDTH, inW - stride * x); + + for (y = 0; y < outH; y++) /* Loop across Output height */ + { + /* initialize output data pointer */ + int8_t *pOutput = &pOutData[(y * outDataPitch1 + x) * bytesPerPixel]; + + /* initialize input data pointer */ + MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * stride * y + stride * x]; + + /* initialize coeff and Bias data pointer */ + int8_t *pCoeff = &pCoeffData[0]; + int32_t *pBias = &pBiasData[0]; + + for (outCh = 0; outCh < numOutCh; outCh += 4) /* Loop across Output depth */ + { + /* Enable flags for the 2nd, 3rd and 4th output channel of this group, to handle output depths that are not a multiple of 4 */ + int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1); + int32_t enable3rdCh = XT_SALT(outCh, numOutCh - 2); + int32_t enable4thCh = XT_SALT(outCh, numOutCh - 3); + + /* coeff and input data vector declaration */ + MORPH_IDT_2Nx8 dvecData1, dvecData2, dvecData3, dvecData4; + MORPH_IDT_2Nx8 dvecDataL, dvecDataU; + xb_vecN_2x32v hvecCoeffData1, hvecCoeffData2, hvecCoeffData3, hvecCoeffData4; + + /* read coeff vectors, for 4 consecutive output depths */ + /* coeff vector for 1st output channel */ + phvecCoeff = (xb_vecN_2x32v *) (pCoeff); + valign vaCoeffData; vaCoeffData = IVP_LAN_2X32_PP(phvecCoeff); + IVP_LAVN_2X32_XP(hvecCoeffData1, vaCoeffData, phvecCoeff, coeffPitch3); + + /* coeff vector for 2nd output channel */ + vaCoeffData = IVP_LAN_2X32_PP(phvecCoeff); + IVP_LAVN_2X32_XP(hvecCoeffData2, vaCoeffData, phvecCoeff, coeffPitch3 * enable2ndCh); + + /* coeff vector for 3rd output channel */ + vaCoeffData = IVP_LAN_2X32_PP(phvecCoeff); + IVP_LAVN_2X32_XP(hvecCoeffData3, vaCoeffData, phvecCoeff, coeffPitch3 * enable3rdCh); + 
+ /* coeff vector for 4th output channel */ + vaCoeffData = IVP_LAN_2X32_PP(phvecCoeff); + IVP_LAVN_2X32_XP(hvecCoeffData4, vaCoeffData, phvecCoeff, coeffPitch3 * enable4thCh); + + /* load and replicate bias data for each output channel */ + xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4 * enable2ndCh); + xb_vecN_2x32v hvecBias2; IVP_LSRN_2X32_XP(hvecBias2, pBias, 4 * enable3rdCh); + xb_vecN_2x32v hvecBias3; IVP_LSRN_2X32_XP(hvecBias3, pBias, 4 * enable4thCh); + xb_vecN_2x32v hvecBias4; IVP_LSRN_2X32_XP(hvecBias4, pBias, 4); + + /* Move bias data from the 32 bit hvecBias vectors (xb_vecN_2x32v) to + * 2N way 24 bit accumulators */ + xb_vec2Nx24 dacc1, dacc2, dacc3, dacc4; + dacc1 = IVP_CVT24UNX32L(hvecBias1, hvecBias1); + IVP_CVT24UNX32H(dacc1, hvecBias1, hvecBias1); + dacc2 = IVP_CVT24UNX32L(hvecBias2, hvecBias2); + IVP_CVT24UNX32H(dacc2, hvecBias2, hvecBias2); + dacc3 = IVP_CVT24UNX32L(hvecBias3, hvecBias3); + IVP_CVT24UNX32H(dacc3, hvecBias3, hvecBias3); + dacc4 = IVP_CVT24UNX32L(hvecBias4, hvecBias4); + IVP_CVT24UNX32H(dacc4, hvecBias4, hvecBias4); + + pdvecIn1 = (MORPH_IDT_2Nx8 *) (pInput); + pdvecIn2 = (MORPH_IDT_2Nx8 *) (pInput + 2 * inDataPitch2); + + for (inCh = 0; inCh < numInCh - 3; inCh += 4) /* loop across input channels */ + { + /* load data from 1st input channel: two consecutive vector loads combined with EXTRACT_1_OF_2_OFF_0 (presumably keeps every 2nd element, matching the input stride of 2) */ + valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1); + MORPH_OP_LOAD_2Nx8(dvecDataL, vaInData, pdvecIn1, vectorizationWidth * flag); + MORPH_OP_LOAD_2Nx8(dvecDataU, vaInData, pdvecIn1, \ + inDataPitch2 - vectorizationWidth * flag); + dvecData1 = IVP_SEL2NX8I(dvecDataU, dvecDataL, IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0); + + /* load data from 2nd input channel; assuming the last load argument is the pointer post-increment, pdvecIn1 advances by inDataPitch2 + 3 * inDataPitch2 per iteration, i.e. to channel inCh + 4 */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1); + MORPH_OP_LOAD_2Nx8(dvecDataL, vaInData, pdvecIn1, vectorizationWidth * flag); + MORPH_OP_LOAD_2Nx8(dvecDataU, vaInData, pdvecIn1, \ + 3 * inDataPitch2 - vectorizationWidth * flag); + dvecData2 = IVP_SEL2NX8I(dvecDataU, dvecDataL, IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0); + + + /* load data from 3rd input 
channel */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn2); + MORPH_OP_LOAD_2Nx8(dvecDataL, vaInData, pdvecIn2, vectorizationWidth * flag); + MORPH_OP_LOAD_2Nx8(dvecDataU, vaInData, pdvecIn2, \ + inDataPitch2 - vectorizationWidth * flag); + dvecData3 = IVP_SEL2NX8I(dvecDataU, dvecDataL, IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0); + + + /* load data from 4th input channel */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn2); + MORPH_OP_LOAD_2Nx8(dvecDataL, vaInData, pdvecIn2, vectorizationWidth * flag); + MORPH_OP_LOAD_2Nx8(dvecDataU, vaInData, pdvecIn2, \ + 3 * inDataPitch2 - vectorizationWidth * flag); + dvecData4 = IVP_SEL2NX8I(dvecDataU, dvecDataL, IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0); + + + /* extract scalar coeff from coeff vectors */ + int32_t coeff1 = IVP_EXTRVRN_2X32(hvecCoeffData1, inCh); /* 1st o/p depth coeff */ + int32_t coeff2 = IVP_EXTRVRN_2X32(hvecCoeffData2, inCh); /* 2nd o/p depth coeff */ + int32_t coeff3 = IVP_EXTRVRN_2X32(hvecCoeffData3, inCh); /* 3rd o/p depth coeff */ + int32_t coeff4 = IVP_EXTRVRN_2X32(hvecCoeffData4, inCh); /* 4th o/p depth coeff */ + + MORPH_OP_MULQA(dacc1, dvecData4, dvecData3, dvecData2, dvecData1, coeff1); + MORPH_OP_MULQA(dacc2, dvecData4, dvecData3, dvecData2, dvecData1, coeff2); + MORPH_OP_MULQA(dacc3, dvecData4, dvecData3, dvecData2, dvecData1, coeff3); + MORPH_OP_MULQA(dacc4, dvecData4, dvecData3, dvecData2, dvecData1, coeff4); + } /* end of for (inCh = 0; inCh < numInCh - 3; inCh += 4)*/ + + if (inCh < numInCh) /* remaining 1 to 3 input channels */ + { + /* load data from 1st input channel */ + valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1); + MORPH_OP_LOAD_2Nx8(dvecDataL, vaInData, pdvecIn1, vectorizationWidth * flag); + MORPH_OP_LOAD_2Nx8(dvecDataU, vaInData, pdvecIn1, \ + (inDataPitch2 - vectorizationWidth * flag) * XT_SALT(inCh, numInCh - 1)); + dvecData1 = IVP_SEL2NX8I(dvecDataU, dvecDataL, IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0); + + /* load data from 2nd input channel */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1); + MORPH_OP_LOAD_2Nx8(dvecDataL, vaInData, pdvecIn1, vectorizationWidth 
* flag); + MORPH_OP_LOAD_2Nx8(dvecDataU, vaInData, pdvecIn1, \ + (inDataPitch2 - vectorizationWidth * flag) * XT_SALT(inCh, numInCh - 2)); + dvecData2 = IVP_SEL2NX8I(dvecDataU, dvecDataL, IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0); + + + /* load data from 3rd input channel */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1); + MORPH_OP_LOAD_2Nx8(dvecDataL, vaInData, pdvecIn1, vectorizationWidth * flag); + MORPH_OP_LOAD_2Nx8(dvecDataU, vaInData, pdvecIn1, \ + inDataPitch2 - vectorizationWidth * flag); + dvecData3 = IVP_SEL2NX8I(dvecDataU, dvecDataL, IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0); + + + /* extract scalar coeff from coeff vectors */ + int32_t coeff1 = IVP_EXTRVRN_2X32(hvecCoeffData1, inCh); /* 1st o/p depth coeff */ + int32_t coeff2 = IVP_EXTRVRN_2X32(hvecCoeffData2, inCh); /* 2nd o/p depth coeff */ + int32_t coeff3 = IVP_EXTRVRN_2X32(hvecCoeffData3, inCh); /* 3rd o/p depth coeff */ + int32_t coeff4 = IVP_EXTRVRN_2X32(hvecCoeffData4, inCh); /* 4th o/p depth coeff */ + + MORPH_OP_MULQA(dacc1, 0, dvecData3, dvecData2, dvecData1, coeff1); /* 4th data operand is 0: at most 3 channels remain */ + MORPH_OP_MULQA(dacc2, 0, dvecData3, dvecData2, dvecData1, coeff2); + MORPH_OP_MULQA(dacc3, 0, dvecData3, dvecData2, dvecData1, coeff3); + MORPH_OP_MULQA(dacc4, 0, dvecData3, dvecData2, dvecData1, coeff4); + } /* end of if (inCh < numInCh)*/ + + /* Pack, Output Scale, Output Shift and clamping (per-channel scale from pOutScaleData in the VQ build, single outScale otherwise) */ + xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L; + xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H; +#if DILATED_VQ_CONV == VQ_TRUE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \ + pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \ + pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc3, packShiftAccU, \ + pOutScaleData[outCh + 2 * enable3rdCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc4, packShiftAccU, \ + 
pOutScaleData[outCh + 3 * enable4thCh], outShiftU, minLim, maxLim, typeFlag); +#elif DILATED_VQ_CONV == VQ_FALSE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc3, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc4, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); +#endif + /* variable store count: output pixels remaining in this row starting at x */ + int32_t varLen = outW - x; + + /* store output to 1st output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOutput); + valign vaOutData; vaOutData = IVP_ZALIGN(); + IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * varLen); + IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * \ + 2 * (varLen - XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* store output to 2nd output depth (pointer offset and store length are multiplied by enable2ndCh, so both become 0 when that channel is absent) */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch2 * enable2ndCh * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * varLen * enable2ndCh); + IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * \ + 2 * (varLen - XCHAL_IVPN_SIMD_WIDTH) * enable2ndCh); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* store output to 3rd output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + 2 * outDataPitch2 * enable3rdCh * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * varLen * enable3rdCh); + IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * \ + 2 * (varLen - XCHAL_IVPN_SIMD_WIDTH) * enable3rdCh); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* store output to 4th output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + 3 * outDataPitch2 * enable4thCh * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * varLen * 
enable4thCh); + IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * \ + 2 * (varLen - XCHAL_IVPN_SIMD_WIDTH) * enable4thCh); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + pOutput += 4 * outDataPitch2 * bytesPerPixel; + pCoeff += 4 * coeffPitch3; + } /* end of for (outCh = 0; outCh < numOutCh; outCh += 4)*/ + } /* end of for (y = 0; y < outH ; y++)*/ + } /* end of for (x = 0; x < outW; x += vectorizationWidth)*/ + } /* end of if ((numInCh <= 2*XCHAL_IVPN_SIMD_WIDTH))*/ + else + { +#ifdef __XCC__ + XT_MEMW(); /* Adding Memory Wait as Gather and Normal Load/Stores are not synchronized */ +#endif + + /* generate the sequence 0,1,2,3,0 + coeffPitch3, 1 + coeffPitch3, + * 2 + coeffPitch3, 3 + coeffPitch3, 0 + 2 * coeffPitch3, 1 + 2 * coeffPitch3, + * 2 + 2 * coeffPitch3, 3 + 2 * coeffPitch3, ..... + * + * e.g., if coeffPitch3 is 32: + * 0,1,2,3,32,33,34,35,64,65,66,67,96,97,98,99,.. + * + * This sequence is used to gather coeff from 4 different output channels, 4 each from + * every channel corresponding to 4 i/p channels, as innermost loop(inCh) is unrolled by + * 4 to make use of quad multiplier. 
+ */ + xb_vecNx16U vecIdx1 = IVP_SEQNX16(); + vecIdx1 = IVP_PACKVRNRNX48(IVP_MULNX16(vecIdx1, coeffPitch3), 0); + vecIdx1 = IVP_SELNX16I(IVP_ADDNX16(vecIdx1, 1), vecIdx1, IVP_SELI_16B_INTERLEAVE_1_LO); + vecIdx1 = IVP_SELNX16I(IVP_ADDNX16(vecIdx1, 2), vecIdx1, IVP_SELI_32B_INTERLEAVE_1_LO); + xb_gsr gs0; + + for (x = 0; x < outW; x += vectorizationWidth) /* Loop across Output width */ + { + /* out of bound flag */ + int32_t flag = XT_SALT(2 * XCHAL_IVPN_SIMD_WIDTH, inW - stride * x); + xb_vecNx16U vecIdx2; + /* variable store count: output pixels remaining in this row starting at x */ + int32_t varLen = outW - x; + + for (y = 0; y < outH; y++) /* Loop across Output height */ + { + /* initialize output data pointer */ + int8_t *pOutput = &pOutData[(y * outDataPitch1 + x) * bytesPerPixel]; + + /* initialize input data pointer */ + MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * stride * y + stride * x]; + + /* initialize Bias and coeff data pointers */ + + int32_t *pBias = &pBiasData[0]; + int8_t *pCoeff = &pCoeffData[0]; + + for (outCh = 0; outCh < numOutCh; outCh += 4) /* Loop across Output depth */ + { + /* Enable flags for the 2nd, 3rd and 4th output channel of this group, to handle output depths that are not a multiple of 4 */ + int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1); + int32_t enable3rdCh = XT_SALT(outCh, numOutCh - 2); + int32_t enable4thCh = XT_SALT(outCh, numOutCh - 3); + + /* load and replicate bias data for each output channel */ + xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4 * enable2ndCh); + xb_vecN_2x32v hvecBias2; IVP_LSRN_2X32_XP(hvecBias2, pBias, 4 * enable3rdCh); + xb_vecN_2x32v hvecBias3; IVP_LSRN_2X32_XP(hvecBias3, pBias, 4 * enable4thCh); + xb_vecN_2x32v hvecBias4; IVP_LSRN_2X32_XP(hvecBias4, pBias, 4); + + xb_vec2Nx24 dacc1, dacc2, dacc3, dacc4; /* 2N way 24 bit accumulators, initialized from the bias vectors below */ + dacc1 = IVP_CVT24UNX32L(hvecBias1, hvecBias1); + IVP_CVT24UNX32H(dacc1, hvecBias1, hvecBias1); + dacc2 = IVP_CVT24UNX32L(hvecBias2, hvecBias2); + IVP_CVT24UNX32H(dacc2, hvecBias2, hvecBias2); + dacc3 = IVP_CVT24UNX32L(hvecBias3, hvecBias3); + IVP_CVT24UNX32H(dacc3, hvecBias3, hvecBias3); + dacc4 = 
IVP_CVT24UNX32L(hvecBias4, hvecBias4); + IVP_CVT24UNX32H(dacc4, hvecBias4, hvecBias4); + + /* boolean mask to gather coeffs, if all the four o/p channels + * are present 16 coeff are loaded, otherwise 4 per remaining o/p channel. + */ + vboolN mask = IVP_LTNX16(IVP_SEQNX16(), (xb_vecNx16) XT_MIN(((numOutCh - outCh) * 4), 16)); + /* Assign valid address for predicated false lines */ + vecIdx2 = IVP_MOVNX16UT(vecIdx1, 0, mask); + pdvecIn1 = (MORPH_IDT_2Nx8 *) (pInput); + pdvecIn2 = (MORPH_IDT_2Nx8 *) (pInput + 2 * inDataPitch2); + + for (inCh = 0; inCh < numInCh - 3; inCh += 4) /* Loop across input depth */ + { + /* input vectors are read from 4 input depths at a time. + * Scalar 32 bit coeff are extracted from the coeff vectors */ + + MORPH_IDT_2Nx8 dvecData1, dvecData2, dvecData3, dvecData4; + MORPH_IDT_2Nx8 dvecDataL, dvecDataU; + + /* load data from 1st input channel */ + valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1); + MORPH_OP_LOAD_2Nx8(dvecDataL, vaInData, pdvecIn1, vectorizationWidth * flag); + MORPH_OP_LOAD_2Nx8(dvecDataU, vaInData, pdvecIn1, \ + inDataPitch2 - vectorizationWidth * flag); + dvecData1 = IVP_SEL2NX8I(dvecDataU, dvecDataL, IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0); + + + /* load data from 2nd input channel */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1); + MORPH_OP_LOAD_2Nx8(dvecDataL, vaInData, pdvecIn1, vectorizationWidth * flag); + MORPH_OP_LOAD_2Nx8(dvecDataU, vaInData, pdvecIn1, \ + 3 * inDataPitch2 - vectorizationWidth * flag); + dvecData2 = IVP_SEL2NX8I(dvecDataU, dvecDataL, IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0); + + + /* load data from 3rd input channel */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn2); + MORPH_OP_LOAD_2Nx8(dvecDataL, vaInData, pdvecIn2, vectorizationWidth * flag); + MORPH_OP_LOAD_2Nx8(dvecDataU, vaInData, pdvecIn2, \ + inDataPitch2 - vectorizationWidth * flag); + dvecData3 = IVP_SEL2NX8I(dvecDataU, dvecDataL, IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0); + + + /* load data from 4th input channel */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn2); + MORPH_OP_LOAD_2Nx8(dvecDataL, vaInData, 
pdvecIn2, vectorizationWidth * flag); + MORPH_OP_LOAD_2Nx8(dvecDataU, vaInData, pdvecIn2, \ + 3 * inDataPitch2 - vectorizationWidth * flag); + dvecData4 = IVP_SEL2NX8I(dvecDataU, dvecDataL, IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0); + + /* gather the coeffs */ + gs0 = IVP_GATHERANX8S(pCoeff + inCh, vecIdx2); + xb_vec2Nx8 dvecCoeffData = IVP_GATHERD2NX8_L(gs0); + + /* extract scalar coeff from coeff vectors */ + int32_t coeff1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData)), 0); /* 1st o/p depth coeff */ + int32_t coeff2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData)), 1); /* 2nd o/p depth coeff */ + int32_t coeff3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData)), 2); /* 3rd o/p depth coeff */ + int32_t coeff4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData)), 3); /* 4th o/p depth coeff */ + + MORPH_OP_MULQA(dacc1, dvecData4, dvecData3, dvecData2, dvecData1, coeff1); + MORPH_OP_MULQA(dacc2, dvecData4, dvecData3, dvecData2, dvecData1, coeff2); + MORPH_OP_MULQA(dacc3, dvecData4, dvecData3, dvecData2, dvecData1, coeff3); + MORPH_OP_MULQA(dacc4, dvecData4, dvecData3, dvecData2, dvecData1, coeff4); + } /* end of for (inCh = 0; inCh < numInCh - 3; inCh += 4)*/ + if (inCh < numInCh) /* remaining 1 to 3 input channels */ + { + MORPH_IDT_2Nx8 dvecData1, dvecData2, dvecData3; + MORPH_IDT_2Nx8 dvecDataL, dvecDataU; + + /* load data from 1st input channel */ + pdvecIn1 = (MORPH_IDT_2Nx8 *) (pInput + inCh * inDataPitch2); + valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1); + MORPH_OP_LOAD_2Nx8_VARIABLE(dvecDataL, vaInData, pdvecIn1, (inW - stride * x)); + MORPH_OP_LOAD_2Nx8_VARIABLE(dvecDataU, vaInData, pdvecIn1, \ + (inW - stride * x - vectorizationWidth)); + dvecData1 = IVP_SEL2NX8I(dvecDataU, dvecDataL, IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0); + + /* load data from 2nd input channel */ + pdvecIn1 = (MORPH_IDT_2Nx8 *) (pInput + ((inCh + 1) * inDataPitch2 * XT_SALT(inCh, numInCh - 1))); + vaInData = 
MORPH_OP_PRIME_2Nx8(pdvecIn1); + MORPH_OP_LOAD_2Nx8_VARIABLE(dvecDataL, vaInData, pdvecIn1, \ + (inW - stride * x) * XT_SALT(inCh, numInCh - 1)); + MORPH_OP_LOAD_2Nx8_VARIABLE(dvecDataU, vaInData, pdvecIn1, \ + (inW - stride * x - vectorizationWidth) * XT_SALT(inCh, numInCh - 1)); + dvecData2 = IVP_SEL2NX8I(dvecDataU, dvecDataL, IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0); + + + + /* load data from 3rd input channel */ + pdvecIn1 = (MORPH_IDT_2Nx8 *) (pInput + ((inCh + 2) * inDataPitch2 * XT_SALT(inCh, numInCh - 2))); + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1); + MORPH_OP_LOAD_2Nx8_VARIABLE(dvecDataL, vaInData, pdvecIn1, \ + (inW - stride * x) * XT_SALT(inCh, numInCh - 2)); + MORPH_OP_LOAD_2Nx8_VARIABLE(dvecDataU, vaInData, pdvecIn1, \ + (inW - stride * x - vectorizationWidth) * XT_SALT(inCh, numInCh - 2)); + dvecData3 = IVP_SEL2NX8I(dvecDataU, dvecDataL, IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0); + + + /* Boolean mask for gather to handle cases where fewer than 4 input channels remain (numInCh - inCh < 4) */ + vboolN mask1 = IVP_LTNX16(IVP_ANDNX16(IVP_SEQNX16(), 3), (numInCh - inCh)); + /* Assign valid address for predicated false lines */ + vecIdx2 = IVP_MOVNX16UT(vecIdx2, 0, mask1); + /* Gather coeffs */ + gs0 = IVP_GATHERANX8S(pCoeff + inCh, vecIdx2); + xb_vec2Nx8 dvecCoeffData = IVP_GATHERD2NX8_L(gs0); + + /* extract scalar coeff from coeff vectors */ + int32_t coeff1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData)), 0); /* 1st o/p depth coeff */ + int32_t coeff2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData)), 1); /* 2nd o/p depth coeff */ + int32_t coeff3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData)), 2); /* 3rd o/p depth coeff */ + int32_t coeff4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData)), 3); /* 4th o/p depth coeff */ + + MORPH_OP_MULQA(dacc1, 0, dvecData3, dvecData2, dvecData1, coeff1); /* 4th data operand is 0: at most 3 channels remain */ + MORPH_OP_MULQA(dacc2, 0, dvecData3, dvecData2, dvecData1, coeff2); + MORPH_OP_MULQA(dacc3, 0, 
dvecData3, dvecData2, dvecData1, coeff3); + MORPH_OP_MULQA(dacc4, 0, dvecData3, dvecData2, dvecData1, coeff4); + } /* end of if (inCh < numInCh)*/ + + /* Pack, Output Scale, Output Shift and clamping (per-channel scale from pOutScaleData in the VQ build, single outScale otherwise) */ + xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L; + xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H; +#if DILATED_VQ_CONV == VQ_TRUE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \ + pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \ + pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc3, packShiftAccU, \ + pOutScaleData[outCh + 2 * enable3rdCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc4, packShiftAccU, \ + pOutScaleData[outCh + 3 * enable4thCh], outShiftU, minLim, maxLim, typeFlag); +#elif DILATED_VQ_CONV == VQ_FALSE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc3, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc4, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); +#endif + /* store output to 1st output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOutput); + valign vaOutData; vaOutData = IVP_ZALIGN(); + IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * varLen); + IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * \ + 2 * (varLen - XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* store output to 2nd output depth (pointer offset and store length are multiplied by enable2ndCh, so both become 0 when that channel is absent) */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch2 * enable2ndCh * bytesPerPixel); + 
IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * varLen * enable2ndCh); + IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * \ + 2 * (varLen - XCHAL_IVPN_SIMD_WIDTH) * enable2ndCh); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* store output to 3rd output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + 2 * outDataPitch2 * enable3rdCh * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * varLen * enable3rdCh); + IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * \ + 2 * (varLen - XCHAL_IVPN_SIMD_WIDTH) * enable3rdCh); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* store output to 4th output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + 3 * outDataPitch2 * enable4thCh * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * varLen * enable4thCh); + IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * \ + 2 * (varLen - XCHAL_IVPN_SIMD_WIDTH) * enable4thCh); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + pOutput += 4 * outDataPitch2 * bytesPerPixel; + pCoeff += 4 * coeffPitch3; + } /* end of for (outCh = 0; outCh < numOutCh; outCh += 4)*/ + } /* end of for (y = 0; y < outH; y++)*/ + } /* end of for (x = 0; x < outW; x += vectorizationWidth)*/ + } /* end of else part of if ((numInCh <= 2*XCHAL_IVPN_SIMD_WIDTH))*/ + return(XAI_ERROR_STATUS()); +} + +/****************************************************************************************** +* xaiConvolved(VQ)3D_S_1x1j4d1I8S8IX_MOW_WHD +* ***************************************************************************************/ + +/******************************************************************************/ +/* Description : P6 optimized generic implementation for 1x1 3D convolution. */ +/* Based on MORPH pre-processor specifiers, code implementation */ +/* is generated during preprocessing stage. 
This method can be */ +/* used to generate 1x1 3D MOW_WHD dilated convolution function */ +/* and 1x1 3D VQ MOW_WHD dilated convolution function for U8 */ +/* bit and S8 bit input data with input stride equal to 4 */ +/* Inputs : Input Data Tile, Coeff Data Tile, Bias Array, */ +/* Output scale array, CNN convolution params structure */ +/* Outputs : XI Error Code */ +/* InOuts : Output Tile */ +/* Assumptions : CoeffData is S8 */ +/* biasArray is signed 32b, value not exceeding signed 24b */ +/* Output scale array is U16 */ +/* OutData is S8 / U8 / S16 */ +/* Kernel Size is 1x1xDxN */ +/* Input and Output are in WHD format */ +/* Coeff is in WHDN format */ +/******************************************************************************/ + +/****************** xaiConvolvedVQ3D_S_1x1j4d1_S8S8IX_MOW_WHD ******************/ +/****************** xaiConvolvedVQ3D_S_1x1j4d1_U8S8IX_MOW_WHD ******************/ +/******************* xaiConvolved3D_S_1x1j4d1_S8S8IX_MOW_WHD *******************/ +/******************* xaiConvolved3D_S_1x1j4d1_U8S8IX_MOW_WHD *******************/ +//#if 0 +XAI_ERR_TYPE MAKE_NAME(MAKE_NAME_VQ(xaiConvolved, 3D_S_1x1j4d1), S8IX_MOW_WHD) MAKE_ARGUMENTS(inTile, coeffTile, biasArray, outTile, param) +{ + /* Error Checks */ + XAI_ERROR_CHECKS() + { + MORPH_IDT_CHECK(inTile); + XAI_CHECK_CONV_OUTPUT_TILE3D(outTile); + XAI_CHECK_TILE4D_S8(coeffTile); + XAI_CHECK_TILE3D_FITS_IN_SINGLE_DRAM(inTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile); + XAI_CHECK_TILE4D_IN_DRAM_BOUNDARY(coeffTile); + XAI_CHECK_POINTER(param); + XAI_CHECK_ARRAY_S32(biasArray); + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTile); + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(coeffTile, outTile); + XAI_CHECK_KERNEL_SIZE(coeffTile, 1); + XAI_CHECK_TILE3D_DATA_ORDER(inTile, XAI_WHD); + XAI_CHECK_TILE3D_DATA_ORDER(outTile, XAI_WHD); + XAI_CHECK_TILE4D_DATA_ORDER(coeffTile, XAI_WHDN); + XAI_CHECK_STRIDE(param, 4); + XAI_CHECK_ERROR((XAI_CNN_CONV_GET_STRIDEX(param) == 
XAI_CNN_CONV_GET_STRIDEY(param)), \ + XAI_ERR_BADARG, "Stride along width = %hhu and height = %hhu\nStride along width and height should be equal", \ + XAI_CNN_CONV_GET_STRIDEX(param), XAI_CNN_CONV_GET_STRIDEY(param)); + XAI_CHECK_DILATION(param, 1); + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_DILATIONX(param) == XAI_CNN_CONV_GET_DILATIONY(param), \ + XAI_ERR_BADARG, "\nDilation along width = %hhu and height = %hhu\nDilation along width and height should be equal", \ + XAI_CNN_CONV_GET_DILATIONX(param), XAI_CNN_CONV_GET_DILATIONY(param)); + XAI_CHECK_CONSISTENCY_MOW_WHD(inTile, coeffTile, biasArray, outTile, param); + XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM3(inTile) == XAI_TILE4D_GET_DIM3(coeffTile), \ + XAI_ERR_BADARG, "\ninTile depth = %d, coeffTile depth = %d\ninTile depth should be same as coeffTile depth", \ + XAI_TILE3D_GET_DIM3(inTile), XAI_TILE4D_GET_DIM3(coeffTile)); + XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM3(outTile) == XAI_TILE4D_GET_DIM4(coeffTile), \ + XAI_ERR_BADARG, "\noutTile depth = %d, number of kernels = %d\noutTile depth should be same as number of kernels", \ + XAI_TILE3D_GET_DIM3(outTile), XAI_TILE4D_GET_DIM4(coeffTile)); + XAI_CHECK_COEFFTILE_CONTIGUOUS(coeffTile, param); + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_ACCUM_SHIFT(param) < 24, \ + XAI_ERR_NORM, "\nThe accumulator shift = %hhu, value should be less than 24", \ + XAI_CNN_CONV_GET_ACCUM_SHIFT(param)); + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_OUTPUT_SHIFT(param) < 32, \ + XAI_ERR_NORM, "\nThe output shift = %hhu, value should be less than 32", \ + XAI_CNN_CONV_GET_OUTPUT_SHIFT(param)); + XAI_CHECK_CONV_RELU_LIMITS_IX(param, outTile); +#if DILATED_VQ_CONV == VQ_TRUE + XAI_CHECK_ARRAY_U16(outputScaleArray); + XAI_CHECK_ERROR(XAI_ARRAY_GET_WIDTH(outputScaleArray) >= XAI_TILE4D_GET_DIM4(coeffTile), \ + XAI_ERR_DATASIZE, "\nWidth of Output Scale Array = %d, Number of Kernels = %d\nWidth of Output Scale Array should be greater than or equal to Number of Kernels", \ + XAI_ARRAY_GET_WIDTH(outputScaleArray), 
XAI_TILE4D_GET_DIM4(coeffTile)); + XAI_CHECK_ERROR((((uintptr_t) (XAI_ARRAY_GET_DATA_PTR(outputScaleArray)) & \ + 0x1) == 0), XAI_ERR_NORM, "The output scale array is not aligned to 2 byte boundary"); +#endif + } +#if DILATED_VQ_CONV == VQ_FALSE + if (XAI_CNN_CONV_GET_OUTPUT_SCALE(param) == 0) + { + int32_t fillValue; + int32_t reluFlag = XAI_CNN_CONV_GET_FLAG_RELU(param); + fillValue = reluFlag ? (CLAMP(0, XAI_CNN_CONV_GET_RELU_MIN(param), XAI_CNN_CONV_GET_RELU_MAX(param))) : 0; + return(xaiFillTile3D(outTile, fillValue, 0)); + } +#endif + /* Getting parameters from the tile structures */ + const int32_t outW = XAI_TILE3D_GET_DIM1(outTile); + const int32_t outH = XAI_TILE3D_GET_DIM2(outTile); + const int32_t numInCh = XAI_TILE3D_GET_DIM3(inTile); + const int32_t numOutCh = XAI_TILE3D_GET_DIM3(outTile); + + /* CNN convolution parameters */ + const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param); + const uint8_t outShiftU = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param); + const uint8_t enableReLu = XAI_CNN_CONV_GET_FLAG_RELU(param); + const uint8_t stride = XAI_CNN_CONV_GET_STRIDE(param); + + /* Pitches of Coefficient Data (WHDN) dim3 */ + const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile); + + /* Pitches of Input Data (WHD) in dim1 and dim2 */ + const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile); + const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile); + + /* Pitch of Output Data (WHD) in dim1 and dim2 */ + const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile); + const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile); + + /* Data Pointers of input, output, coefficient and bias data */ + MORPH_IDT_SCALAR* pInData = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(inTile); + int8_t* pOutData = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile); + int32_t* pBiasData = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray); + int8_t* pCoeffData = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile); +#if DILATED_VQ_CONV == VQ_TRUE + 
uint16_t* restrict pOutScaleData = (uint16_t *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray); +#elif DILATED_VQ_CONV == VQ_FALSE + const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param); +#endif + /* Setting the limits for output data according to ReLu Flag and outTileType */ + int32_t minLim, maxLim; + if (enableReLu) + { + minLim = XAI_CNN_CONV_GET_RELU_MIN(param); + maxLim = XAI_CNN_CONV_GET_RELU_MAX(param); + } + else + { + minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \ + SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0); + maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \ + : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX); + } + const int8_t typeFlag = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0; + const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile); + + xb_vec2Nx8* restrict pdvecOut; + xb_vecNx8* restrict pvecCoeff1, * restrict pvecCoeff2, \ + * restrict pvecCoeff3, * restrict pvecCoeff4; + + /* Variable Declarations */ + int32_t inCh, outCh, x, y; + const int32_t vectorizationWidth = XCHAL_IVPN_SIMD_WIDTH; + int32_t varLen; + + /* declare gather registers and compute the sequence + * 0,4,8,12, ....... 120, 124 required as offset for + * gahering data + */ + xb_gsr gs0, gs1; + xb_vecNx16U vecIdx1 = IVP_SEQNX16() << 2; + +#ifdef __XCC__ + XT_MEMW(); /* Adding Memory Wait as Gather and Normal Load/Stores are not synchronized */ +#endif + + /* loop across output depth is unrolled by 4 + * , producing lanes from 4 output channels + * in one iteration. Since vectorization width + * is just half the width of the accumulator, + * loop across output height is also unrolled by 2. + * Unrolling across output height makes it possible + * to utilize all the 64 MACs in the accumulator. 
+ * + * Data loaded from the 2 input rows is concatenated + * in such a manner that lower half of the output + * vector gives the first output row and the upper + * half of the */ + for (x = 0; x < outW; x += vectorizationWidth) /* Loop across Output width */ + { + /* variable load and store count */ + varLen = XT_MIN(outW - x, vectorizationWidth); + + for (y = 0; y < outH; y += 2) /* Loop across Output height */ + { + /* In order to handle odd heights */ + int32_t enable2ndRow = XT_SALT(y, outH - 1); + xb_vecNx16U vecIdx2; + /* Initialize o/p data pointer */ + int8_t *pOutput = &pOutData[(y * outDataPitch1 + x) * bytesPerPixel]; + + /* initialize coeff and Bias data pointer */ + int8_t *pCoeff = &pCoeffData[0]; + int32_t *pBias = &pBiasData[0]; + + for (outCh = 0; outCh < numOutCh; outCh += 4) /* Loop across Output depth */ + { + /* In order to handle odd depths*/ + int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1); + int32_t enable3rdCh = XT_SALT(outCh, numOutCh - 2); + int32_t enable4thCh = XT_SALT(outCh, numOutCh - 3); + + /* Initialize i/p data pointer */ + MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * y * stride + x * stride]; + + /* load and replicate bias data */ + xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4 * enable2ndCh); + xb_vecN_2x32v hvecBias2; IVP_LSRN_2X32_XP(hvecBias2, pBias, 4 * enable3rdCh); + xb_vecN_2x32v hvecBias3; IVP_LSRN_2X32_XP(hvecBias3, pBias, 4 * enable4thCh); + xb_vecN_2x32v hvecBias4; IVP_LSRN_2X32_XP(hvecBias4, pBias, 4); + + /* wide vectors(accumulators) initialized with bias */ + xb_vec2Nx24 dacc1, dacc2, dacc3, dacc4; + dacc1 = IVP_CVT24UNX32L(hvecBias1, hvecBias1); + IVP_CVT24UNX32H(dacc1, hvecBias1, hvecBias1); + dacc2 = IVP_CVT24UNX32L(hvecBias2, hvecBias2); + IVP_CVT24UNX32H(dacc2, hvecBias2, hvecBias2); + dacc3 = IVP_CVT24UNX32L(hvecBias3, hvecBias3); + IVP_CVT24UNX32H(dacc3, hvecBias3, hvecBias3); + dacc4 = IVP_CVT24UNX32L(hvecBias4, hvecBias4); + IVP_CVT24UNX32H(dacc4, hvecBias4, hvecBias4); + + /* 
priming of coeff load is done outside the innermost loop*/ + pvecCoeff1 = (xb_vecNx8 *) (pCoeff); + valign vaCoeffData1; vaCoeffData1 = IVP_LANX8S_PP(pvecCoeff1); + + pvecCoeff2 = (xb_vecNx8 *) (pCoeff + coeffPitch3 * enable2ndCh); + valign vaCoeffData2; vaCoeffData2 = IVP_LANX8S_PP(pvecCoeff2); + + pvecCoeff3 = (xb_vecNx8 *) (pCoeff + 2 * coeffPitch3 * enable3rdCh); + valign vaCoeffData3; vaCoeffData3 = IVP_LANX8S_PP(pvecCoeff3); + + pvecCoeff4 = (xb_vecNx8 *) (pCoeff + 3 * coeffPitch3 * enable4thCh); + valign vaCoeffData4; vaCoeffData4 = IVP_LANX8S_PP(pvecCoeff4); + + /* mask for gathering input data based on varLen */ + vboolN mask1 = IVP_LTNX16(IVP_SEQNX16(), (xb_vecNx16) varLen); + /* Assign valid address for predicated false lines */ + vecIdx2 = IVP_MOVNX16UT(vecIdx1, 0, mask1); + + /* loop acrosss input channels is unrolled by 2, + * enabling us to use paired multipliers + */ + + /* 32 elements are gathered from the 1st input height + * in the gs0 register and then 32 elements are gathered + * from the next input height(as loop across output height + * is unrolled by 2) in the gs1 register. 
So lower half of + * the dvecData1 hold the data from 1st input height and + * upper half holds the data from the 2nd input height + * + * Similarly dvecData2 holds data from the 2nd input channel, + * lower half hold 1st input height and upper half holds 2nd + * input height + */ + + for (inCh = 0; inCh < numInCh - 1; inCh += 2) /* Loop across input channels */ + { + /* variable declarations for input and coeff vectors */ + MORPH_IDT_2Nx8 dvecData1, dvecData2; + + /* loads data from 1st input channel */ + gs0 = MORPH_OP_GATHER(pInput, vecIdx2); + gs1 = MORPH_OP_GATHER(pInput + stride * inDataPitch1 * enable2ndRow, vecIdx2); + dvecData1 = MORPH_OP_GATHER_2Nx8_LOW(gs0); + MORPH_OP_GATHER_2Nx8_HIGH(dvecData1, gs1); + pInput += inDataPitch2; + + /* loads data from next input channel */ + gs0 = MORPH_OP_GATHER(pInput, vecIdx2); + gs1 = MORPH_OP_GATHER(pInput + stride * inDataPitch1 * enable2ndRow, vecIdx2); + dvecData2 = MORPH_OP_GATHER_2Nx8_LOW(gs0); + MORPH_OP_GATHER_2Nx8_HIGH(dvecData2, gs1); + pInput += inDataPitch2; + + /* load 2 coeff for all the 4 output channels, 8 to 16 bit + * conversion is taken care of by the load instruction */ + xb_vecNx16 vecCoeffData1, vecCoeffData2, vecCoeffData3, vecCoeffData4; + IVP_LAVNX8S_XP(vecCoeffData1, vaCoeffData1, pvecCoeff1, 2); + IVP_LAVNX8S_XP(vecCoeffData2, vaCoeffData2, pvecCoeff2, 2); + IVP_LAVNX8S_XP(vecCoeffData3, vaCoeffData3, pvecCoeff3, 2); + IVP_LAVNX8S_XP(vecCoeffData4, vaCoeffData4, pvecCoeff4, 2); + + /* multiply data from 1st input channel with 1st coeff + * and data from 2nd input channel with 2nd coeff and + * accumulate + */ + MORPH_OP_MULPA(dacc1, dvecData2, dvecData1, \ + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(vecCoeffData1), 0)); + MORPH_OP_MULPA(dacc2, dvecData2, dvecData1, \ + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(vecCoeffData2), 0)); + MORPH_OP_MULPA(dacc3, dvecData2, dvecData1, \ + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(vecCoeffData3), 0)); + MORPH_OP_MULPA(dacc4, dvecData2, dvecData1, \ + 
IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(vecCoeffData4), 0)); + } /* end of for (inCh = 0; inCh < numInCh - 1; inCh += 2)*/ + + /* handles left out odd input channel */ + if (inCh < numInCh) + { + MORPH_IDT_2Nx8 dvecData1; + + /* loads data from the left out input channel */ + gs0 = MORPH_OP_GATHER(pInput, vecIdx2); + gs1 = MORPH_OP_GATHER(pInput + stride * inDataPitch1 * enable2ndRow, vecIdx2); + dvecData1 = MORPH_OP_GATHER_2Nx8_LOW(gs0); + MORPH_OP_GATHER_2Nx8_HIGH(dvecData1, gs1); + + xb_vecNx16 vecCoeffData1, vecCoeffData2, vecCoeffData3, vecCoeffData4; + IVP_LAVNX8S_XP(vecCoeffData1, vaCoeffData1, pvecCoeff1, 1); + IVP_LAVNX8S_XP(vecCoeffData2, vaCoeffData2, pvecCoeff2, 1); + IVP_LAVNX8S_XP(vecCoeffData3, vaCoeffData3, pvecCoeff3, 1); + IVP_LAVNX8S_XP(vecCoeffData4, vaCoeffData4, pvecCoeff4, 1); + + MORPH_OP_MULPA(dacc1, 0, dvecData1, \ + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(vecCoeffData1), 0)); + MORPH_OP_MULPA(dacc2, 0, dvecData1, \ + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(vecCoeffData2), 0)); + MORPH_OP_MULPA(dacc3, 0, dvecData1, \ + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(vecCoeffData3), 0)); + MORPH_OP_MULPA(dacc4, 0, dvecData1, \ + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(vecCoeffData4), 0)); + } /* end of if(inCh < numInCh)*/ + + /* Pack, Output Scale, Output Shift and clamping */ + xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L; + xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H; +#if DILATED_VQ_CONV == VQ_TRUE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \ + pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \ + pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc3, packShiftAccU, \ + pOutScaleData[outCh + 2 * enable3rdCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc4, packShiftAccU, \ + 
pOutScaleData[outCh + 3 * enable4thCh], outShiftU, minLim, maxLim, typeFlag); +#elif DILATED_VQ_CONV == VQ_FALSE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc3, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc4, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); +#endif + /* store the first half of the output vectors + * dvecOut1, dvecOut2, dvecOut3, dvecOut4 + */ + + /* Storing the first row outputs, first channel */ + pdvecOut = (xb_vec2Nx8 *) (pOutput); + valign vaOutData = IVP_ZALIGN(); + IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * varLen); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the first row outputs, 2nd channel */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch2 * enable2ndCh * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * varLen * enable2ndCh); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the first row outputs, 3rd channel */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + 2 * outDataPitch2 * enable3rdCh * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * varLen * enable3rdCh); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the first row outputs, 4th channel */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + 3 * outDataPitch2 * enable4thCh * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * varLen * enable4thCh); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* extract the upper half of the output vectors + * dvecOut1, dvecOut2, dvecOut3, dvecOut4 and store + * in the next output height + */ + dvecOut1L = IVP_SEL2NX8I(0, dvecOut1L, IVP_SELI_8B_EXTRACT_HI_HALVES); + 
dvecOut2L = IVP_SEL2NX8I(0, dvecOut2L, IVP_SELI_8B_EXTRACT_HI_HALVES); + dvecOut3L = IVP_SEL2NX8I(0, dvecOut3L, IVP_SELI_8B_EXTRACT_HI_HALVES); + dvecOut4L = IVP_SEL2NX8I(0, dvecOut4L, IVP_SELI_8B_EXTRACT_HI_HALVES); + + /* Storing the 2nd row outputs, 1st channel */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch1 * enable2ndRow * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, (-typeFlag + 1) * varLen * enable2ndRow); + IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * 2 * varLen * enable2ndRow); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the 2nd row outputs, 2nd channel */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + (outDataPitch2 * enable2ndCh + \ + outDataPitch1 * enable2ndRow) * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, (-typeFlag + 1) * \ + varLen * enable2ndCh * enable2ndRow); + IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * 2 * \ + varLen * enable2ndCh * enable2ndRow); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the 2nd row outputs, 3rd channel */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + (2 * outDataPitch2 * enable3rdCh + \ + outDataPitch1 * enable2ndRow) * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, (-typeFlag + 1) * \ + varLen * enable3rdCh * enable2ndRow); + IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * 2 * \ + varLen * enable3rdCh * enable2ndRow); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the 2nd row outputs, 4th channel */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + (3 * outDataPitch2 * enable4thCh + \ + outDataPitch1 * enable2ndRow) * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, (-typeFlag + 1) * \ + varLen * enable4thCh * enable2ndRow); + IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * 2 * \ + varLen * enable4thCh * enable2ndRow); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + pOutput += 4 * outDataPitch2 * bytesPerPixel; + pCoeff += 4 * coeffPitch3; + } /* end of (outCh = 0; outCh < numOutCh; outCh 
+= 4)*/ + } /* end of for (y = 0; y < outH; y += 2)*/ + } /* end of for (x = 0; x < outW; x += vectorizationWidth)*/ + return(XAI_ERROR_STATUS()); +} + +/****************************************************************************************** +* MOW fold 16 Stride 1 * +* If inDataPitch1 is lesser than or equal to * +* 16 this function is called. * +******************************************************************************************/ + +static _XAI_INLINE_ void MAKE_NAME(MAKE_NAME_VQ(convolved, 3D_S_2x2j1d1), S8IX_MOW_WHD_FOLD16) MAKE_ARGUMENTS(inTile, coeffTile, biasArray, outTile, param) +{ + /* Getting parameters from the tile structures */ + const int32_t outW = XAI_TILE3D_GET_DIM1(outTile); + const int32_t outH = XAI_TILE3D_GET_DIM2(outTile); + const int32_t numInCh = XAI_TILE3D_GET_DIM3(inTile); + const int32_t numOutCh = XAI_TILE3D_GET_DIM3(outTile); + + /* Pitches of Coefficient Data (WHDN) in dim1 and dim3 */ + const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile); + + /* Pitches of Input Data (WHD) in dim1 and dim2 */ + const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile); + const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile); + + /* Pitch of Output Data (WHD) in dim1 and dim2 */ + const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile); + const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile); + + /* Kernel Size (WHDN)*/ + const int32_t kSizeU = XAI_TILE4D_GET_DIM1(coeffTile); + + /* CNN convolution parameters */ + const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param); + const uint8_t outShiftU = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param); + const uint8_t enableReLu = XAI_CNN_CONV_GET_FLAG_RELU(param); + + /* Data Pointers of input, output, coefficient and bias data */ + MORPH_IDT_SCALAR* pInData = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(inTile); + int8_t* pOutData = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile); + int32_t* pBiasData = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray); + 
int8_t* pCoeffData = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile); +#if DILATED_VQ_CONV == VQ_TRUE + uint16_t* restrict pOutScaleData = (uint16_t *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray); +#elif DILATED_VQ_CONV == VQ_FALSE + const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param); +#endif + const uint8_t leftEdgeFlag = XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param); + const uint8_t topEdgeFlag = XAI_CNN_CONV_GET_FLAG_TOPEDGE(param); + int32_t leftEdge, topEdge; + + leftEdge = leftEdgeFlag ? (kSizeU / 2) : ((kSizeU / 2) - 1); + topEdge = topEdgeFlag ? (kSizeU / 2) : ((kSizeU / 2) - 1); + + /* Move pointer to the start of the data (including edge) */ + pInData = &pInData[-(topEdge * inDataPitch1 + leftEdge)]; + + + /* Setting the limits for output data according to ReLu Flag and outTileType */ + int32_t minLim, maxLim; + if (enableReLu) + { + minLim = XAI_CNN_CONV_GET_RELU_MIN(param); + maxLim = XAI_CNN_CONV_GET_RELU_MAX(param); + } + else + { + minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \ + SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0); + maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \ + : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX); + } + const int8_t typeFlag = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 
1 : 0; + const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile); + + MORPH_IDT_2Nx8 *restrict pdvecIn1; + MORPH_IDT_2Nx8 *restrict pdvecIn2; + xb_vec2Nx8* restrict pdvecOut; + xb_vec2Nx8* restrict pdvecCoeff1; + xb_vec2Nx8* restrict pdvecCoeff2; + + /* Variable Declarations */ + int32_t inCh, outCh, y; + + /* loop across output channels and output height are unrolled twice + * to produce four output vectors in 1 iteration + */ + for (y = 0; y < outH - 3; y += 4) /* Loop across output height */ + { + /* initialize output data pointer */ + int8_t *pOutput = &pOutData[(y * outDataPitch1) * bytesPerPixel]; + + /* initialize coeff and Bias data pointer */ + int8_t *pCoeff = &pCoeffData[0]; + int32_t *pBias = &pBiasData[0]; + + for (outCh = 0; outCh < numOutCh; outCh += 2) /* Loop across Output depth */ + { + /* In order to handle odd output depths */ + int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1); + + /* Load the bias values corresponding to two output channels */ + xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4 * enable2ndCh); + xb_vecN_2x32v hvecBias2; IVP_LSRN_2X32_XP(hvecBias2, pBias, 4); + + /* wide vectors(accumulators) initialized with bias */ + xb_vec2Nx24 dacc1, dacc2; + + dacc1 = IVP_CVT24UNX32L(hvecBias1, hvecBias1); + IVP_CVT24UNX32H(dacc1, hvecBias1, hvecBias1); + + dacc2 = IVP_CVT24UNX32L(hvecBias2, hvecBias2); + IVP_CVT24UNX32H(dacc2, hvecBias2, hvecBias2); + + /* priming of coeff load is done outside the innermost loop*/ + pdvecCoeff1 = (xb_vec2Nx8 *) (pCoeff); + valign vaCoeffData1; vaCoeffData1 = IVP_LA2NX8_PP(pdvecCoeff1); + + pdvecCoeff2 = (xb_vec2Nx8 *) (pCoeff + coeffPitch3 * enable2ndCh); + valign vaCoeffData2; vaCoeffData2 = IVP_LA2NX8_PP(pdvecCoeff2); + + /* initialize input data pointer */ + MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * y]; + + pdvecIn1 = (MORPH_IDT_2Nx8 *) (pInput); + pdvecIn2 = (MORPH_IDT_2Nx8 *) (pInput + inDataPitch1); + + for (inCh = 0; inCh < numInCh - 1; inCh += 2) /* Loop across 
input channels */ + { + /* vectors for coeff and input loads */ + xb_vec2Nx8 dvecCoeffData1, dvecCoeffData2; + MORPH_IDT_2Nx8 dvecInData1, dvecInData2; + MORPH_IDT_2Nx8 dvecInData1temp, dvecInData2temp; + + /* Process first input channel */ + /* load data from 4 input rows [Row0 | Row1 | Row2 | Row3] */ + valign vaInData1 = MORPH_OP_PRIME_2Nx8(pdvecIn1); + MORPH_OP_LOAD_2Nx8(dvecInData1, vaInData1, pdvecIn1, inDataPitch2); + dvecInData1temp = IVP_SEL2NX8I(dvecInData1, dvecInData1, IVP_SELI_8B_ROTATE_RIGHT_1); + + /* load data from next 4 input rows [Row1 | Row2 | Row3 | Row4] */ + valign vaInData2 = MORPH_OP_PRIME_2Nx8(pdvecIn2); + MORPH_OP_LOAD_2Nx8(dvecInData2, vaInData2, pdvecIn2, inDataPitch2); + dvecInData2temp = IVP_SEL2NX8I(dvecInData2, dvecInData2, IVP_SELI_8B_ROTATE_RIGHT_1); + + /* load all the 2x2 coefficients for 2 output depths*/ + IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, 8); + IVP_LAV2NX8_XP(dvecCoeffData2, vaCoeffData2, pdvecCoeff2, 8); + + /* Get co-efficients for first channel */ + int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0); + int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 0); + + /* Compute Row 1 of 1st output channel */ + MORPH_OP_MULQA(dacc1, dvecInData2temp, dvecInData2, dvecInData1temp, dvecInData1, qmulScalar1); + + /* Compute Row 1 of 2nd output channel */ + MORPH_OP_MULQA(dacc2, dvecInData2temp, dvecInData2, dvecInData1temp, dvecInData1, qmulScalar2); + + /* Process second input channel */ + /* load data from 4 input rows [Row0 | Row1 | Row2 | Row3] */ + vaInData1 = MORPH_OP_PRIME_2Nx8(pdvecIn1); + MORPH_OP_LOAD_2Nx8(dvecInData1, vaInData1, pdvecIn1, inDataPitch2); + dvecInData1temp = IVP_SEL2NX8I(dvecInData1, dvecInData1, IVP_SELI_8B_ROTATE_RIGHT_1); + + /* load data from 4 input rows [Row1 | Row2 | Row3 | Row4] */ + vaInData2 = MORPH_OP_PRIME_2Nx8(pdvecIn2); + MORPH_OP_LOAD_2Nx8(dvecInData2, vaInData2, pdvecIn2, 
inDataPitch2); + dvecInData2temp = IVP_SEL2NX8I(dvecInData2, dvecInData2, IVP_SELI_8B_ROTATE_RIGHT_1); + + /* Get co-efficients for second channel */ + qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1); + qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 1); + + /* Compute Row 1 of 1st output channel */ + MORPH_OP_MULQA(dacc1, dvecInData2temp, dvecInData2, dvecInData1temp, dvecInData1, qmulScalar1); + + /* Compute Row 1 of 2nd output channel */ + MORPH_OP_MULQA(dacc2, dvecInData2temp, dvecInData2, dvecInData1temp, dvecInData1, qmulScalar2); + } /* end of for (inCh = 0; inCh < numInCh; inCh++)*/ + + if (inCh < numInCh) + { + /* vectors for coeff and input loads */ + xb_vec2Nx8 dvecCoeffData1, dvecCoeffData2; + MORPH_IDT_2Nx8 dvecInData1, dvecInData2; + MORPH_IDT_2Nx8 dvecInData1temp, dvecInData2temp; + + /* load data from 2 input rows [Row0 | Row1 | Row2 | Row3] */ + valign vaInData1 = MORPH_OP_PRIME_2Nx8(pdvecIn1); + MORPH_OP_LOAD_2Nx8(dvecInData1, vaInData1, pdvecIn1, inDataPitch2); + dvecInData1temp = IVP_SEL2NX8I(dvecInData1, dvecInData1, IVP_SELI_8B_ROTATE_RIGHT_1); + + /* load data from 2 input rows [Row1 | Row2 | Row3 | Row4] */ + valign vaInData2 = MORPH_OP_PRIME_2Nx8(pdvecIn2); + MORPH_OP_LOAD_2Nx8(dvecInData2, vaInData2, pdvecIn2, inDataPitch2); + dvecInData2temp = IVP_SEL2NX8I(dvecInData2, dvecInData2, IVP_SELI_8B_ROTATE_RIGHT_1); + + /* load all the 2x2 coefficients for 2 output depths */ + IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, 4); + IVP_LAV2NX8_XP(dvecCoeffData2, vaCoeffData2, pdvecCoeff2, 4); + + int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0); + int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 0); + + /* Compute Row 1 of 1st output channel */ + MORPH_OP_MULQA(dacc1, dvecInData2temp, dvecInData2, dvecInData1temp, dvecInData1, qmulScalar1); + + /* 
Compute Row 1 of 2nd output channel */ + MORPH_OP_MULQA(dacc2, dvecInData2temp, dvecInData2, dvecInData1temp, dvecInData1, qmulScalar2); + } + + /* Pack, Output Scale, Output Shift and clamping */ + xb_vec2Nx8 dvecOut1L, dvecOut2L; + xb_vec2Nx8 dvecOut1H, dvecOut2H; +#if DILATED_VQ_CONV == VQ_TRUE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \ + pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \ + pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag); +#elif DILATED_VQ_CONV == VQ_FALSE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); +#endif + /* Storing the first output depth, first row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput); + valign vaOutData = IVP_ZALIGN(); + IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * outW); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the first output depth, second row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch1 * bytesPerPixel); + IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut1H, dvecOut1L, IVP_ADD2NX8(IVP_SEQ2NX8(), inDataPitch1 * bytesPerPixel)), \ + vaOutData, pdvecOut, bytesPerPixel * outW); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the first output depth, third row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + 2 * outDataPitch1 * bytesPerPixel); + IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut1H, dvecOut1L, IVP_ADD2NX8(IVP_SEQ2NX8(), 2 * inDataPitch1 * bytesPerPixel)), \ + vaOutData, pdvecOut, bytesPerPixel * outW); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the first output depth, fourth row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + 3 * outDataPitch1 * bytesPerPixel); + IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut1H, dvecOut1L, IVP_ADD2NX8(IVP_SEQ2NX8(), 3 * 
inDataPitch1 * bytesPerPixel)), + vaOutData, pdvecOut, bytesPerPixel * outW); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the second output depth, first row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch2 * enable2ndCh * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * enable2ndCh * outW); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the second output depth, second row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + (outDataPitch2 * enable2ndCh + \ + outDataPitch1) * bytesPerPixel); + IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut2H, dvecOut2L, IVP_ADD2NX8(IVP_SEQ2NX8(), inDataPitch1 * bytesPerPixel)), \ + vaOutData, pdvecOut, bytesPerPixel * enable2ndCh * outW); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the second output depth, third row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + (outDataPitch2 * enable2ndCh + \ + 2 * outDataPitch1) * bytesPerPixel); + IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut2H, dvecOut2L, IVP_ADD2NX8(IVP_SEQ2NX8(), 2 * inDataPitch1 * bytesPerPixel)), \ + vaOutData, pdvecOut, bytesPerPixel * enable2ndCh * outW); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the second output depth, fourth row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + (outDataPitch2 * enable2ndCh + \ + 3 * outDataPitch1) * bytesPerPixel); + IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut2H, dvecOut2L, IVP_ADD2NX8(IVP_SEQ2NX8(), 3 * inDataPitch1 * bytesPerPixel)), \ + vaOutData, pdvecOut, bytesPerPixel * enable2ndCh * outW); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + pOutput += 2 * outDataPitch2 * bytesPerPixel; + pCoeff += 2 * coeffPitch3; + } /* end of for (outCh = 0; outCh < numOutCh; outCh += 2)*/ + } /* end of for (y = 0; y < outH-3; y += 4)*/ + + /* handle left out output rows */ + if (y < outH) + { + int32_t enable2ndRow = XT_SALT(y, outH - 1); + int32_t enable3rdRow = XT_SALT(y, outH - 2); + + /* initialize output data pointer */ + int8_t *pOutput = &pOutData[(y * outDataPitch1) * bytesPerPixel]; + + /* initialize coeff and 
Bias data pointer */ + int8_t *pCoeff = &pCoeffData[0]; + int32_t *pBias = &pBiasData[0]; + + for (outCh = 0; outCh < numOutCh; outCh += 2) /* Loop across Output depth */ + { + /* In order to handle odd output depths and heights */ + int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1); + + /* Load the bias values corresponding to two output channels */ + xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4 * enable2ndCh); + xb_vecN_2x32v hvecBias2; IVP_LSRN_2X32_XP(hvecBias2, pBias, 4); + + /* wide vectors(accumulators) initialized with bias */ + xb_vec2Nx24 dacc1, dacc2; + + dacc1 = IVP_CVT24UNX32L(hvecBias1, hvecBias1); + IVP_CVT24UNX32H(dacc1, hvecBias1, hvecBias1); + + dacc2 = IVP_CVT24UNX32L(hvecBias2, hvecBias2); + IVP_CVT24UNX32H(dacc2, hvecBias2, hvecBias2); + + /* priming of coeff load is done outside the innermost loop*/ + pdvecCoeff1 = (xb_vec2Nx8 *) (pCoeff); + valign vaCoeffData1; vaCoeffData1 = IVP_LA2NX8_PP(pdvecCoeff1); + + pdvecCoeff2 = (xb_vec2Nx8 *) (pCoeff + coeffPitch3 * enable2ndCh); + valign vaCoeffData2; vaCoeffData2 = IVP_LA2NX8_PP(pdvecCoeff2); + + /* initialize input data pointer */ + MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * y]; + + for (inCh = 0; inCh < numInCh - 1; inCh += 2) /* Loop across input channels */ + { + /* vectors for coeff and input loads */ + xb_vec2Nx8 dvecCoeffData1, dvecCoeffData2; + MORPH_IDT_2Nx8 dvecInData1, dvecInData2; + MORPH_IDT_2Nx8 dvecInData1temp, dvecInData2temp; + + pdvecIn1 = (MORPH_IDT_2Nx8 *) (pInput); + pdvecIn2 = (MORPH_IDT_2Nx8 *) (pInput + inDataPitch1); + + /* load data from first 4 input rows [Row0 | Row1 | Row2 | Row3] */ + valign vaInData1 = MORPH_OP_PRIME_2Nx8(pdvecIn1); + MORPH_OP_LOAD_2Nx8_VARIABLE(dvecInData1, vaInData1, pdvecIn1, (2 + enable2ndRow + enable3rdRow) * inDataPitch1); + dvecInData1temp = IVP_SEL2NX8I(dvecInData1, dvecInData1, IVP_SELI_8B_ROTATE_RIGHT_1); + + /* load data from next 4 input rows [Row1 | Row2 | Row3 | Row4]*/ + valign vaInData2 = 
MORPH_OP_PRIME_2Nx8(pdvecIn2); + MORPH_OP_LOAD_2Nx8_VARIABLE(dvecInData2, vaInData2, pdvecIn2, (1 + enable2ndRow + enable3rdRow) * inDataPitch1); + dvecInData2temp = IVP_SEL2NX8I(dvecInData2, dvecInData2, IVP_SELI_8B_ROTATE_RIGHT_1); + pInput += inDataPitch2; + + /* load all the 2x2 coefficients for 2 output depths*/ + IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, 8); + IVP_LAV2NX8_XP(dvecCoeffData2, vaCoeffData2, pdvecCoeff2, 8 * enable2ndCh); + + int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0); + int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 0); + + /* Compute Row 1 of 1st output channel */ + MORPH_OP_MULQA(dacc1, dvecInData2temp, dvecInData2, dvecInData1temp, dvecInData1, qmulScalar1); + + /* Compute Row 1 of 2nd output channel */ + MORPH_OP_MULQA(dacc2, dvecInData2temp, dvecInData2, dvecInData1temp, dvecInData1, qmulScalar2); + + /* Process next input channel */ + /* load data from 2 input rows [Row0 | Row1 | Row2 | Row3] */ + pdvecIn1 = (MORPH_IDT_2Nx8 *) (pInput); + pdvecIn2 = (MORPH_IDT_2Nx8 *) (pInput + inDataPitch1); + vaInData1 = MORPH_OP_PRIME_2Nx8(pdvecIn1); + MORPH_OP_LOAD_2Nx8_VARIABLE(dvecInData1, vaInData1, pdvecIn1, (2 + enable2ndRow + enable3rdRow) * inDataPitch1); + dvecInData1temp = IVP_SEL2NX8I(dvecInData1, dvecInData1, IVP_SELI_8B_ROTATE_RIGHT_1); + + /* load data from 2 input rows [Row4 | Row5 | Row6 | Row7] */ + vaInData2 = MORPH_OP_PRIME_2Nx8(pdvecIn2); + MORPH_OP_LOAD_2Nx8_VARIABLE(dvecInData2, vaInData2, pdvecIn2, (1 + enable2ndRow + enable3rdRow) * inDataPitch1); + dvecInData2temp = IVP_SEL2NX8I(dvecInData2, dvecInData2, IVP_SELI_8B_ROTATE_RIGHT_1); + pInput += inDataPitch2; + + qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1); + qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 1); + + /* Compute Row 1 of 1st output channel */ + 
MORPH_OP_MULQA(dacc1, dvecInData2temp, dvecInData2, dvecInData1temp, dvecInData1, qmulScalar1); + + /* Compute Row 1 of 2nd output channel */ + MORPH_OP_MULQA(dacc2, dvecInData2temp, dvecInData2, dvecInData1temp, dvecInData1, qmulScalar2); + } /* end of for (inCh = 0; inCh < numInCh; inCh++)*/ + + if (inCh < numInCh) + { + /* vectors for coeff and input loads */ + xb_vec2Nx8 dvecCoeffData1, dvecCoeffData2; + MORPH_IDT_2Nx8 dvecInData1, dvecInData2; + MORPH_IDT_2Nx8 dvecInData1temp, dvecInData2temp; + + pdvecIn1 = (MORPH_IDT_2Nx8 *) (pInput); + pdvecIn2 = (MORPH_IDT_2Nx8 *) (pInput + inDataPitch1); + + /* load data from 2 input rows [Row0 | Row1 | Row2 | Row3] */ + valign vaInData1 = MORPH_OP_PRIME_2Nx8(pdvecIn1); + MORPH_OP_LOAD_2Nx8_VARIABLE(dvecInData1, vaInData1, pdvecIn1, (2 + enable2ndRow + enable3rdRow) * inDataPitch1); + dvecInData1temp = IVP_SEL2NX8I(dvecInData1, dvecInData1, IVP_SELI_8B_ROTATE_RIGHT_1); + + /* load data from 2 input rows [Row4 | Row5 | Row6 | Row7] */ + valign vaInData2 = MORPH_OP_PRIME_2Nx8(pdvecIn2); + MORPH_OP_LOAD_2Nx8_VARIABLE(dvecInData2, vaInData2, pdvecIn2, (1 + enable2ndRow + enable3rdRow) * inDataPitch1); + dvecInData2temp = IVP_SEL2NX8I(dvecInData2, dvecInData2, IVP_SELI_8B_ROTATE_RIGHT_1); + pInput += inDataPitch2; + + /* load all the 2x2 coefficients for 2 output depths*/ + IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, 4); + IVP_LAV2NX8_XP(dvecCoeffData2, vaCoeffData2, pdvecCoeff2, 4); + + int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0); + int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 0); + + /* Compute Row 1 of 1st output channel */ + MORPH_OP_MULQA(dacc1, dvecInData2temp, dvecInData2, dvecInData1temp, dvecInData1, qmulScalar1); + + /* Compute Row 1 of 2nd output channel */ + MORPH_OP_MULQA(dacc2, dvecInData2temp, dvecInData2, dvecInData1temp, dvecInData1, qmulScalar2); + } + + /* Pack, Output Scale, Output 
Shift and clamping */ + xb_vec2Nx8 dvecOut1L, dvecOut2L; + xb_vec2Nx8 dvecOut1H, dvecOut2H; +#if DILATED_VQ_CONV == VQ_TRUE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \ + pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \ + pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag); +#elif DILATED_VQ_CONV == VQ_FALSE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); +#endif + /* Storing the first output depth, first row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput); + valign vaOutData = IVP_ZALIGN(); + IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * outW); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the first output depth, second row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch1 * enable2ndRow * bytesPerPixel); + IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut1H, dvecOut1L, IVP_ADD2NX8(IVP_SEQ2NX8(), inDataPitch1 * bytesPerPixel)), \ + vaOutData, pdvecOut, bytesPerPixel * enable2ndRow * outW); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the first output depth, 3rd row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + 2 * outDataPitch1 * enable3rdRow * bytesPerPixel); + IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut1H, dvecOut1L, IVP_ADD2NX8(IVP_SEQ2NX8(), 2 * inDataPitch1 * bytesPerPixel)), \ + vaOutData, pdvecOut, bytesPerPixel * enable3rdRow * outW); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the second output depth, first row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch2 * enable2ndCh * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * enable2ndCh * outW); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the second output depth, second row */ + pdvecOut = 
(xb_vec2Nx8 *) (pOutput + (outDataPitch2 * enable2ndCh + \ + outDataPitch1 * enable2ndRow) * bytesPerPixel); + IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut2H, dvecOut2L, IVP_ADD2NX8(IVP_SEQ2NX8(), inDataPitch1 * bytesPerPixel)), \ + vaOutData, pdvecOut, bytesPerPixel * enable2ndCh * enable2ndRow * outW); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the second output depth, 3rd row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + (outDataPitch2 * enable2ndCh + \ + 2 * outDataPitch1 * enable3rdRow) * bytesPerPixel); + IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut2H, dvecOut2L, IVP_ADD2NX8(IVP_SEQ2NX8(), 2 * inDataPitch1 * bytesPerPixel)), \ + vaOutData, pdvecOut, bytesPerPixel * enable2ndCh * enable3rdRow * outW); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + pOutput += 2 * outDataPitch2 * bytesPerPixel; + pCoeff += 2 * coeffPitch3; + } /* end of for (outCh = 0; outCh < numOutCh; outCh += 2)*/ + } /* end of if(y < outH)*/ +} + +/****************************************************************************************** +* MOW fold 32 Stride 1 * +* If inDataPitch1 is lesser than or equal to * +* 32 this function is called. 
* +******************************************************************************************/ + +static _XAI_INLINE_ void MAKE_NAME(MAKE_NAME_VQ(convolved, 3D_S_2x2j1d1), S8IX_MOW_WHD_FOLD32) MAKE_ARGUMENTS(inTile, coeffTile, biasArray, outTile, param) +{ + /* Getting parameters from the tile structures */ + const int32_t outW = XAI_TILE3D_GET_DIM1(outTile); + const int32_t outH = XAI_TILE3D_GET_DIM2(outTile); + const int32_t numInCh = XAI_TILE3D_GET_DIM3(inTile); + const int32_t numOutCh = XAI_TILE3D_GET_DIM3(outTile); + + /* Pitches of Coefficient Data (WHDN) in dim1 and dim3 */ + const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile); + + /* Pitches of Input Data (WHD) in dim1 and dim2 */ + const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile); + const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile); + + /* Pitch of Output Data (WHD) in dim1 and dim2 */ + const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile); + const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile); + + /* Kernel Size (WHDN)*/ + const int32_t kSizeU = XAI_TILE4D_GET_DIM1(coeffTile); + + /* CNN convolution parameters */ + const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param); + const uint8_t outShiftU = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param); + const uint8_t enableReLu = XAI_CNN_CONV_GET_FLAG_RELU(param); + + /* Data Pointers of input, output, coefficient and bias data */ + MORPH_IDT_SCALAR* pInData = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(inTile); + int8_t* pOutData = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile); + int32_t* pBiasData = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray); + int8_t* pCoeffData = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile); +#if DILATED_VQ_CONV == VQ_TRUE + uint16_t* restrict pOutScaleData = (uint16_t *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray); +#elif DILATED_VQ_CONV == VQ_FALSE + const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param); +#endif + const uint8_t leftEdgeFlag = 
XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param); + const uint8_t topEdgeFlag = XAI_CNN_CONV_GET_FLAG_TOPEDGE(param); + int32_t leftEdge, topEdge; + + leftEdge = leftEdgeFlag ? (kSizeU / 2) : ((kSizeU / 2) - 1); + topEdge = topEdgeFlag ? (kSizeU / 2) : ((kSizeU / 2) - 1); + + /* Move pointer to the start of the data (including edge) */ + pInData = &pInData[-(topEdge * inDataPitch1 + leftEdge)]; + + + /* Setting the limits for output data according to ReLu Flag and outTileType */ + int32_t minLim, maxLim; + if (enableReLu) + { + minLim = XAI_CNN_CONV_GET_RELU_MIN(param); + maxLim = XAI_CNN_CONV_GET_RELU_MAX(param); + } + else + { + minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \ + SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0); + maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \ + : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX); + } + const int8_t typeFlag = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0; + const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile); + + MORPH_IDT_2Nx8 *restrict pdvecIn1; + MORPH_IDT_2Nx8 *restrict pdvecIn2; + xb_vec2Nx8* restrict pdvecOut; + xb_vec2Nx8* restrict pdvecCoeff1; + xb_vec2Nx8* restrict pdvecCoeff2; + + /* Variable Declarations */ + int32_t inCh, outCh, y; + + /* loop across output channels and output height are unrolled twice + * to produce four output vectors in 1 iteration + */ + for (y = 0; y < outH; y += 2) /* Loop across output height */ + { + /* in order to hanlde odd output height */ + int32_t enable2Row = XT_SALT(y, outH - 1); + + /* initialize output data pointer */ + int8_t *pOutput = &pOutData[(y * outDataPitch1) * bytesPerPixel]; + + /* initialize coeff and Bias data pointer */ + int8_t *pCoeff = &pCoeffData[0]; + int32_t *pBias = &pBiasData[0]; + + for (outCh = 0; outCh < numOutCh; outCh += 2) /* Loop across Output depth */ + { + /* In order to handle odd output depths */ + int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1); + + /* Load the bias values 
corresponding to two output channels */ + xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4 * enable2ndCh); + xb_vecN_2x32v hvecBias2; IVP_LSRN_2X32_XP(hvecBias2, pBias, 4); + + /* wide vectors(accumulators) initialized with bias */ + xb_vec2Nx24 dacc1, dacc2; + + dacc1 = IVP_CVT24UNX32L(hvecBias1, hvecBias1); + IVP_CVT24UNX32H(dacc1, hvecBias1, hvecBias1); + + dacc2 = IVP_CVT24UNX32L(hvecBias2, hvecBias2); + IVP_CVT24UNX32H(dacc2, hvecBias2, hvecBias2); + + /* priming of coeff load is done outside the innermost loop*/ + pdvecCoeff1 = (xb_vec2Nx8 *) (pCoeff); + valign vaCoeffData1; vaCoeffData1 = IVP_LA2NX8_PP(pdvecCoeff1); + + pdvecCoeff2 = (xb_vec2Nx8 *) (pCoeff + coeffPitch3 * enable2ndCh); + valign vaCoeffData2; vaCoeffData2 = IVP_LA2NX8_PP(pdvecCoeff2); + + /* initialize input data pointer */ + MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * y]; + + /* load data from first 2 input rows */ + pdvecIn1 = (MORPH_IDT_2Nx8 *) (pInput); + pdvecIn2 = (MORPH_IDT_2Nx8 *) (pInput + inDataPitch1); + + + for (inCh = 0; inCh < numInCh - 1; inCh += 2) /* Loop across input channels */ + { + /* vectors for coeff and input loads */ + xb_vec2Nx8 dvecCoeffData1, dvecCoeffData2; + MORPH_IDT_2Nx8 dvecInData1, dvecInData2; + MORPH_IDT_2Nx8 dvecInData1temp, dvecInData2temp; + + /* load data from 2 input rows [Row0 | Row1] */ + valign vaInData1 = MORPH_OP_PRIME_2Nx8(pdvecIn1); + MORPH_OP_LOAD_2Nx8(dvecInData1, vaInData1, pdvecIn1, inDataPitch2); + dvecInData1temp = IVP_SEL2NX8I(dvecInData1, dvecInData1, IVP_SELI_8B_ROTATE_RIGHT_1); + + /* load data from 2 input rows [Row1 | Row2] */ + valign vaInData2 = MORPH_OP_PRIME_2Nx8(pdvecIn2); + MORPH_OP_LOAD_2Nx8(dvecInData2, vaInData2, pdvecIn2, inDataPitch2); + dvecInData2temp = IVP_SEL2NX8I(dvecInData2, dvecInData2, IVP_SELI_8B_ROTATE_RIGHT_1); + + /* load all the 2x2 coefficients for 2 output depths*/ + IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, 8); + IVP_LAV2NX8_XP(dvecCoeffData2, vaCoeffData2, pdvecCoeff2, 
8); + + int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0); + int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 0); + + /* Compute Row 1 of 1st channel */ + MORPH_OP_MULQA(dacc1, dvecInData2temp, dvecInData2, dvecInData1temp, dvecInData1, qmulScalar1); + + /* Compute Row 2 of 1st channel */ + MORPH_OP_MULQA(dacc2, dvecInData2temp, dvecInData2, dvecInData1temp, dvecInData1, qmulScalar2); + + /* Process input channel2 */ + /* load data from 2 input rows [Row0 | Row1] */ + vaInData1 = MORPH_OP_PRIME_2Nx8(pdvecIn1); + MORPH_OP_LOAD_2Nx8(dvecInData1, vaInData1, pdvecIn1, inDataPitch2); + dvecInData1temp = IVP_SEL2NX8I(dvecInData1, dvecInData1, IVP_SELI_8B_ROTATE_RIGHT_1); + + /* load data from 2 input rows [Row1 | Row2] */ + vaInData2 = MORPH_OP_PRIME_2Nx8(pdvecIn2); + MORPH_OP_LOAD_2Nx8(dvecInData2, vaInData2, pdvecIn2, inDataPitch2); + dvecInData2temp = IVP_SEL2NX8I(dvecInData2, dvecInData2, IVP_SELI_8B_ROTATE_RIGHT_1); + + qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1); + qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 1); + + /* Compute Row 1 of 1st channel */ + MORPH_OP_MULQA(dacc1, dvecInData2temp, dvecInData2, dvecInData1temp, dvecInData1, qmulScalar1); + + /* Compute Row 2 of 1st channel */ + MORPH_OP_MULQA(dacc2, dvecInData2temp, dvecInData2, dvecInData1temp, dvecInData1, qmulScalar2); + } /* end of for (inCh = 0; inCh < numInCh; inCh++)*/ + + if (inCh < numInCh) + { + /* vectors for coeff and input loads */ + xb_vec2Nx8 dvecCoeffData1, dvecCoeffData2; + MORPH_IDT_2Nx8 dvecInData1, dvecInData2; + MORPH_IDT_2Nx8 dvecInData1temp, dvecInData2temp; + + /* load data from 2 input rows [Row0 | Row1] */ + valign vaInData1 = MORPH_OP_PRIME_2Nx8(pdvecIn1); + MORPH_OP_LOAD_2Nx8(dvecInData1, vaInData1, pdvecIn1, inDataPitch2); + dvecInData1temp = IVP_SEL2NX8I(dvecInData1, dvecInData1, 
IVP_SELI_8B_ROTATE_RIGHT_1); + + /* load data from 2 input rows [Row1 | Row2] */ + valign vaInData2 = MORPH_OP_PRIME_2Nx8(pdvecIn2); + MORPH_OP_LOAD_2Nx8(dvecInData2, vaInData2, pdvecIn2, inDataPitch2); + dvecInData2temp = IVP_SEL2NX8I(dvecInData2, dvecInData2, IVP_SELI_8B_ROTATE_RIGHT_1); + + /* load all the 2x2 coefficients for 2 output depths*/ + IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, 8); + IVP_LAV2NX8_XP(dvecCoeffData2, vaCoeffData2, pdvecCoeff2, 8); + + int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0); + int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 0); + + /* Compute Row 1 of 1st channel */ + MORPH_OP_MULQA(dacc1, dvecInData2temp, dvecInData2, dvecInData1temp, dvecInData1, qmulScalar1); + + /* Compute Row 2 of 1st channel */ + MORPH_OP_MULQA(dacc2, dvecInData2temp, dvecInData2, dvecInData1temp, dvecInData1, qmulScalar2); + } + + /* Pack, Output Scale, Output Shift and clamping */ + xb_vec2Nx8 dvecOut1L, dvecOut2L; + xb_vec2Nx8 dvecOut1H, dvecOut2H; +#if DILATED_VQ_CONV == VQ_TRUE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \ + pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \ + pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag); +#elif DILATED_VQ_CONV == VQ_FALSE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); +#endif + /* Storing the first output depth, first row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput); + valign vaOutData = IVP_ZALIGN(); + IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * outW); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the first output depth, 
second row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch1 * enable2Row * bytesPerPixel); + IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut1H, dvecOut1L, IVP_ADD2NX8(IVP_SEQ2NX8(), inDataPitch1 * bytesPerPixel)), \ + vaOutData, pdvecOut, bytesPerPixel * enable2Row * outW); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the second output depth, first row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch2 * enable2ndCh * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * enable2ndCh * outW); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the second output depth, second row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + (outDataPitch2 * enable2ndCh + \ + outDataPitch1 * enable2Row) * bytesPerPixel); + IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut2H, dvecOut2L, IVP_ADD2NX8(IVP_SEQ2NX8(), inDataPitch1 * bytesPerPixel)), + vaOutData, pdvecOut, bytesPerPixel * enable2ndCh * \ + enable2Row * outW); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + pOutput += 2 * outDataPitch2 * bytesPerPixel; + pCoeff += 2 * coeffPitch3; + } /* end of for (outCh = 0; outCh < numOutCh; outCh += 2)*/ + } /* end of for (y = 0; y < outH; y += 2)*/ +} + + +/***************************************************************************** +* xaiConvolved(VQ)3D_S_2x2j1d1I8S8IX_MOW_WHD +* **************************************************************************/ +/********************************************************************************/ +/* Description : P6 optimized generic implementation for 2x2 3D convolution with*/ +/* dilation = 1. Based on MORPH pre-processor specifiers, code */ +/* implementation is generated during preprocessing stage. 
This */ +/* method can be used to generate 2x2 3D MOW_WHD convolution */ +/* function and 2x2 3D VQ MOW_WHD convolution function for U8 bit */ +/* and S8 bit input data with input stride equal to 1 */ +/* Inputs : Input Data Tile, Coeff Data Tile, Bias Array, */ +/* Output scale array, CNN convolution params structure */ +/* Outputs : XI Error Code */ +/* InOuts : Output Tile */ +/* Assumptions : CoeffData is S8 */ +/* biasArray is signed 32b, value not exceeding signed 24b */ +/* OutData is S8 / U8 / S16 */ +/* Kernel Size is 2x2xDxN */ +/* Input and Output are in WHD format */ +/* Coeff is in WHDN format */ +/********************************************************************************/ + +/****************** xaiConvolvedVQ3D_S_2x2j1d1_S8S8IX_MOW_WHD ******************/ +/****************** xaiConvolvedVQ3D_S_2x2j1d1_U8S8IX_MOW_WHD ******************/ +/******************* xaiConvolved3D_S_2x2j1d1_S8S8IX_MOW_WHD *******************/ +/******************* xaiConvolved3D_S_2x2j1d1_U8S8IX_MOW_WHD *******************/ + +XAI_ERR_TYPE MAKE_NAME(MAKE_NAME_VQ(xaiConvolved, 3D_S_2x2j1d1), S8IX_MOW_WHD) MAKE_ARGUMENTS(inTile, coeffTile, biasArray, outTile, param) +{ + /* Error Checks */ + XAI_ERROR_CHECKS() + { + MORPH_IDT_CHECK(inTile); + XAI_CHECK_CONV_OUTPUT_TILE3D(outTile); + XAI_CHECK_TILE4D_S8(coeffTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile); + XAI_CHECK_TILE4D_IN_DRAM_BOUNDARY(coeffTile); + XAI_CHECK_POINTER(param); + XAI_CHECK_ARRAY_S32(biasArray); + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTile); + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(coeffTile, outTile); + XAI_CHECK_KERNEL_SIZE(coeffTile, 2); + XAI_CHECK_TILE3D_DATA_ORDER(inTile, XAI_WHD); + XAI_CHECK_TILE3D_DATA_ORDER(outTile, XAI_WHD); + XAI_CHECK_TILE4D_DATA_ORDER(coeffTile, XAI_WHDN); + XAI_CHECK_DILATION(param, 1); + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_DILATIONX(param) == XAI_CNN_CONV_GET_DILATIONY(param), \ + XAI_ERR_BADARG, "Dilation along width = %hhu 
and height = %hhu\nDilation along width and height should be equal", \ + XAI_CNN_CONV_GET_DILATIONX(param), XAI_CNN_CONV_GET_DILATIONY(param)); + XAI_CHECK_STRIDE(param, 1); + XAI_CHECK_ERROR((XAI_CNN_CONV_GET_STRIDEX(param) == XAI_CNN_CONV_GET_STRIDEY(param)), \ + XAI_ERR_BADARG, "\nStride along width = %hhu and height = %hhu\nStride along width and height should be equal", \ + XAI_CNN_CONV_GET_STRIDEX(param), XAI_CNN_CONV_GET_STRIDEY(param)); + XAI_CHECK_EDGES_MOW_WHD(inTile, coeffTile, param); + XAI_CHECK_CONSISTENCY_MOW_WHD(inTile, coeffTile, biasArray, outTile, param); + XAI_CHECK_COEFFTILE_CONTIGUOUS(coeffTile, param); + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_ACCUM_SHIFT(param) < 24, \ + XAI_ERR_NORM, "\nThe accumulator shift = %hhu, value should be less than 24", \ + XAI_CNN_CONV_GET_ACCUM_SHIFT(param)); + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_OUTPUT_SHIFT(param) < 32, \ + XAI_ERR_NORM, "\nThe output shift = %hhu, value should be less than 32", \ + XAI_CNN_CONV_GET_OUTPUT_SHIFT(param)); + XAI_CHECK_CONV_RELU_LIMITS_IX(param, outTile); +#if DILATED_VQ_CONV == VQ_TRUE + XAI_CHECK_ARRAY_U16(outputScaleArray); + XAI_CHECK_ERROR(XAI_ARRAY_GET_WIDTH(outputScaleArray) >= XAI_TILE4D_GET_DIM4(coeffTile), \ + XAI_ERR_DATASIZE, "\nWidth of Output Scale Array = %d, Number of Kernels = %d\nWidth of Output Scale Array should be greater than or equal to Number of Kernels", \ + XAI_ARRAY_GET_WIDTH(outputScaleArray), XAI_TILE4D_GET_DIM4(coeffTile)); + XAI_CHECK_ERROR((((uintptr_t) (XAI_ARRAY_GET_DATA_PTR(outputScaleArray)) & \ + 0x1) == 0), XAI_ERR_NORM, "The output scale array is not aligned to 2 byte boundary"); +#endif + } +#if DILATED_VQ_CONV == VQ_FALSE + if (XAI_CNN_CONV_GET_OUTPUT_SCALE(param) == 0) + { + int32_t fillValue; + int32_t reluFlag = XAI_CNN_CONV_GET_FLAG_RELU(param); + fillValue = reluFlag ? 
(CLAMP(0, XAI_CNN_CONV_GET_RELU_MIN(param), XAI_CNN_CONV_GET_RELU_MAX(param))) : 0; + return(xaiFillTile3D(outTile, fillValue, 0)); + } +#endif + + /* check inDataPitch1, if it is less than or equal to 32, + * call FOLD32 variant otherwise continue + */ + + if (XAI_TILE3D_GET_DIM1_PITCH(inTile) <= 16) + { + MAKE_NAME(MAKE_NAME_VQ(convolved, 3D_S_2x2j1d1), S8IX_MOW_WHD_FOLD16) MAKE_PARAMS(inTile, coeffTile, biasArray, outTile, param); + return(XAI_ERROR_STATUS()); + } + + if (XAI_TILE3D_GET_DIM1_PITCH(inTile) <= 32) + { + MAKE_NAME(MAKE_NAME_VQ(convolved, 3D_S_2x2j1d1), S8IX_MOW_WHD_FOLD32) MAKE_PARAMS(inTile, coeffTile, biasArray, outTile, param); + return(XAI_ERROR_STATUS()); + } + + /* Getting parameters from the tile structures */ + const int32_t outW = XAI_TILE3D_GET_DIM1(outTile); + const int32_t outH = XAI_TILE3D_GET_DIM2(outTile); + const int32_t numInCh = XAI_TILE3D_GET_DIM3(inTile); + const int32_t numOutCh = XAI_TILE3D_GET_DIM3(outTile); + + /* Pitches of Coefficient Data (WHDN) in dim1 and dim3 */ + const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile); + + /* Pitches of Input Data (WHD) in dim1 and dim2 */ + const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile); + const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile); + + /* Pitch of Output Data (WHD) in dim1 and dim2 */ + const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile); + const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile); + + /* Kernel Size (WHDN)*/ + const int32_t kSizeU = XAI_TILE4D_GET_DIM1(coeffTile); + + /* CNN convolution parameters */ + const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param); + const uint8_t outShiftU = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param); + const uint8_t enableReLu = XAI_CNN_CONV_GET_FLAG_RELU(param); + + /* Data Pointers of input, output, coefficient and bias data */ + MORPH_IDT_SCALAR* pInData = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(inTile); + int8_t* pOutData = (int8_t *) 
XAI_TILE3D_GET_DATA_PTR(outTile); + int32_t* pBiasData = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray); + int8_t* pCoeffData = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile); +#if DILATED_VQ_CONV == VQ_TRUE + uint16_t* restrict pOutScaleData = (uint16_t *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray); +#elif DILATED_VQ_CONV == VQ_FALSE + const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param); +#endif + const uint8_t leftEdgeFlag = XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param); + const uint8_t topEdgeFlag = XAI_CNN_CONV_GET_FLAG_TOPEDGE(param); + int32_t leftEdge, topEdge; + + leftEdge = leftEdgeFlag ? (kSizeU / 2) : ((kSizeU / 2) - 1); + topEdge = topEdgeFlag ? (kSizeU / 2) : ((kSizeU / 2) - 1); + + /* Move pointer to the start of the data (including edge) */ + pInData = &pInData[-(topEdge * inDataPitch1 + leftEdge)]; + + /* Setting the limits for output data according to ReLu Flag and outTileType */ + int32_t minLim, maxLim; + if (enableReLu) + { + minLim = XAI_CNN_CONV_GET_RELU_MIN(param); + maxLim = XAI_CNN_CONV_GET_RELU_MAX(param); + } + else + { + minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \ + SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0); + maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \ + : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX); + } + const int8_t typeFlag = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 
1 : 0; + const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile); + + MORPH_IDT_2Nx8 *restrict pdvecIn1; + MORPH_IDT_2Nx8 *restrict pdvecIn2; + MORPH_IDT_2Nx8 *restrict pdvecIn3; + xb_vec2Nx8* restrict pdvecOut; + xb_vec2Nx8* restrict pdvecCoeff1; + xb_vec2Nx8* restrict pdvecCoeff2; + + /* Variable Declarations */ + int32_t inCh, outCh, x, y; + /* In order to make the loop multiply-bound we are reducing the vectorization width + by extra values required for the kernel */ + const int32_t vectorizationWidth = ((2 * XCHAL_IVPN_SIMD_WIDTH) - kSizeU) + 1; + + /* loop across output channels and output height are unrolled twice + * to produce four output vectors in 1 iteration + */ + for (x = 0; x < outW; x += vectorizationWidth) /* Loop across output width */ + { + for (y = 0; y < outH; y += 2) /* Loop across output height */ + { + /* in order to handle odd output height */ + int32_t enable2Row = XT_SALT(y, outH - 1); + + /* initialize output data pointer */ + int8_t *pOutput = &pOutData[(y * outDataPitch1 + x) * bytesPerPixel]; + + /* initialize input data pointer */ + MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * y + x]; + + /* initialize coeff and Bias data pointer */ + int8_t *pCoeff = &pCoeffData[0]; + int32_t *pBias = &pBiasData[0]; + + for (outCh = 0; outCh < numOutCh; outCh += 2) /* Loop across Output depth */ + { + /* In order to handle odd output depths */ + int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1); + + /* Load the bias values corresponding to two output channels */ + xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4 * enable2ndCh); + xb_vecN_2x32v hvecBias2; IVP_LSRN_2X32_XP(hvecBias2, pBias, 4); + + /* wide vectors(accumulators) initialized with bias */ + xb_vec2Nx24 daccSum11, daccSum21, daccSum12, daccSum22; + + daccSum11 = IVP_CVT24UNX32L(hvecBias1, hvecBias1); + IVP_CVT24UNX32H(daccSum11, hvecBias1, hvecBias1); + + daccSum12 = IVP_CVT24UNX32L(hvecBias1, hvecBias1); + IVP_CVT24UNX32H(daccSum12, hvecBias1, hvecBias1); + 
+ daccSum21 = IVP_CVT24UNX32L(hvecBias2, hvecBias2); + IVP_CVT24UNX32H(daccSum21, hvecBias2, hvecBias2); + + daccSum22 = IVP_CVT24UNX32L(hvecBias2, hvecBias2); + IVP_CVT24UNX32H(daccSum22, hvecBias2, hvecBias2); + + /* priming of coeff load is done outside the innermost loop*/ + pdvecCoeff1 = (xb_vec2Nx8 *) (pCoeff); + pdvecCoeff2 = (xb_vec2Nx8 *) (pCoeff + (coeffPitch3 * enable2ndCh)); + + /* Input vector pointer initialization- 1st input channel */ + pdvecIn1 = (MORPH_IDT_2Nx8 *) (pInput); + pdvecIn2 = (MORPH_IDT_2Nx8 *) (pInput + inDataPitch1); + pdvecIn3 = (MORPH_IDT_2Nx8 *) (pInput + 2 * inDataPitch1 * enable2Row); + + for (inCh = 0; inCh < numInCh - 1; inCh += 2) /* Loop across input channels */ + { + /* vectors for coeff and input loads */ + xb_vec2Nx8 dvecCoeffData1, dvecCoeffData2; + MORPH_IDT_2Nx8 dvecInData1, dvecInData2, dvecInData3; + + /* load all the 2x2 coefficients for 1st output channel*/ + valign vaCoeffData; + vaCoeffData = IVP_LA2NX8_PP(pdvecCoeff1); + IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData, pdvecCoeff1, 8); + + /* load all the 2x2 coefficients for 2nd output channel*/ + vaCoeffData = IVP_LA2NX8_PP(pdvecCoeff2); + IVP_LAV2NX8_XP(dvecCoeffData2, vaCoeffData, pdvecCoeff2, 8); + + /* load data from first input row */ + valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1); + MORPH_OP_LOAD_2Nx8(dvecInData1, vaInData, pdvecIn1, inDataPitch2); + + /* load data from 2nd input row */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn2); + MORPH_OP_LOAD_2Nx8(dvecInData2, vaInData, pdvecIn2, inDataPitch2); + + /* load data from 3rd input row */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn3); + MORPH_OP_LOAD_2Nx8(dvecInData3, vaInData, pdvecIn3, inDataPitch2); + + MORPH_IDT_2Nx8 dvecInData1temp, dvecInData2temp, dvecInData3temp; + + /* Reorder/ rotate the input required for filter kernel computation */ + dvecInData1temp = IVP_SEL2NX8I(dvecInData1, dvecInData1, IVP_SELI_8B_ROTATE_RIGHT_1); + dvecInData2temp = IVP_SEL2NX8I(dvecInData2, dvecInData2, 
IVP_SELI_8B_ROTATE_RIGHT_1); + dvecInData3temp = IVP_SEL2NX8I(dvecInData3, dvecInData3, IVP_SELI_8B_ROTATE_RIGHT_1); + + int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0); + + /* Compute Row 1 of 1st channel */ + MORPH_OP_MULQA(daccSum11, dvecInData2temp, dvecInData2, dvecInData1temp, dvecInData1, qmulScalar1); + /* Compute Row 2 of 1st channel */ + MORPH_OP_MULQA(daccSum12, dvecInData3temp, dvecInData3, dvecInData2temp, dvecInData2, qmulScalar1); + + int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 0); + + /* Compute Row 1 of 2nd channel */ + MORPH_OP_MULQA(daccSum21, dvecInData2temp, dvecInData2, dvecInData1temp, dvecInData1, qmulScalar2); + /* Compute Row 2 of 2nd channel */ + MORPH_OP_MULQA(daccSum22, dvecInData3temp, dvecInData3, dvecInData2temp, dvecInData2, qmulScalar2); + + /* load data from first input row */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1); + MORPH_OP_LOAD_2Nx8(dvecInData1, vaInData, pdvecIn1, inDataPitch2); + + /* load data from 2nd input row */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn2); + MORPH_OP_LOAD_2Nx8(dvecInData2, vaInData, pdvecIn2, inDataPitch2); + + /* load data from 3rd input row */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn3); + MORPH_OP_LOAD_2Nx8(dvecInData3, vaInData, pdvecIn3, inDataPitch2); + + /* Reorder/ rotate the input required for filter kernel computation */ + dvecInData1temp = IVP_SEL2NX8I(dvecInData1, dvecInData1, IVP_SELI_8B_ROTATE_RIGHT_1); + dvecInData2temp = IVP_SEL2NX8I(dvecInData2, dvecInData2, IVP_SELI_8B_ROTATE_RIGHT_1); + dvecInData3temp = IVP_SEL2NX8I(dvecInData3, dvecInData3, IVP_SELI_8B_ROTATE_RIGHT_1); + + qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1); + + /* Compute Row 1 of 1st channel */ + MORPH_OP_MULQA(daccSum11, dvecInData2temp, dvecInData2, dvecInData1temp, dvecInData1, qmulScalar1); + /* Compute Row 2 of 1st channel */ + MORPH_OP_MULQA(daccSum12, 
dvecInData3temp, dvecInData3, dvecInData2temp, dvecInData2, qmulScalar1); + + qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 1); + + /* Compute Row 1 of 2nd channel */ + MORPH_OP_MULQA(daccSum21, dvecInData2temp, dvecInData2, dvecInData1temp, dvecInData1, qmulScalar2); + /* Compute Row 2 of 2nd channel */ + MORPH_OP_MULQA(daccSum22, dvecInData3temp, dvecInData3, dvecInData2temp, dvecInData2, qmulScalar2); + } /* end of for (inCh = 0; inCh < numInCh; inCh++)*/ + if (inCh < numInCh) /*Control flow to handle final row for odd input channel count*/ + { + xb_vec2Nx8 dvecCoeffData1, dvecCoeffData2; + MORPH_IDT_2Nx8 dvecInData1, dvecInData2, dvecInData3; + + /* load all the 2x2 coefficients for 1st output channel*/ + valign vaCoeffData = IVP_LA2NX8_PP(pdvecCoeff1); + IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData, pdvecCoeff1, 4); + + /* load all the 2x2 coefficients for 2nd output channel*/ + vaCoeffData = IVP_LA2NX8_PP(pdvecCoeff2); + IVP_LAV2NX8_XP(dvecCoeffData2, vaCoeffData, pdvecCoeff2, 4); + + /* load data from first input row */ + valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1); + MORPH_OP_LOAD_2Nx8(dvecInData1, vaInData, pdvecIn1, inDataPitch2); + + /* load data from 2nd input row */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn2); + MORPH_OP_LOAD_2Nx8(dvecInData2, vaInData, pdvecIn2, inDataPitch2); + + /* load data from 3rd input row */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn3); + MORPH_OP_LOAD_2Nx8(dvecInData3, vaInData, pdvecIn3, inDataPitch2); + + MORPH_IDT_2Nx8 dvecInData1temp, dvecInData2temp, dvecInData3temp; + + /* Reorder/ rotate the input required for filter kernel computation */ + dvecInData1temp = IVP_SEL2NX8I(dvecInData1, dvecInData1, IVP_SELI_8B_ROTATE_RIGHT_1); + dvecInData2temp = IVP_SEL2NX8I(dvecInData2, dvecInData2, IVP_SELI_8B_ROTATE_RIGHT_1); + dvecInData3temp = IVP_SEL2NX8I(dvecInData3, dvecInData3, IVP_SELI_8B_ROTATE_RIGHT_1); + + int32_t qmulScalar1 = 
IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0); + + /* Compute Row 1 of 1st channel */ + MORPH_OP_MULQA(daccSum11, dvecInData2temp, dvecInData2, dvecInData1temp, dvecInData1, qmulScalar1); + /* Compute Row 2 of 1st channel */ + MORPH_OP_MULQA(daccSum12, dvecInData3temp, dvecInData3, dvecInData2temp, dvecInData2, qmulScalar1); + + int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 0); + + /* Compute Row 1 of 2nd channel */ + MORPH_OP_MULQA(daccSum21, dvecInData2temp, dvecInData2, dvecInData1temp, dvecInData1, qmulScalar2); + /* Compute Row 2 of 2nd channel */ + MORPH_OP_MULQA(daccSum22, dvecInData3temp, dvecInData3, dvecInData2temp, dvecInData2, qmulScalar2); + } + /* Pack, Output Scale, Output Shift and clamping */ + xb_vec2Nx8 dvecOut11L, dvecOut12L, dvecOut21L, dvecOut22L; + xb_vec2Nx8 dvecOut11H, dvecOut12H, dvecOut21H, dvecOut22H; +#if DILATED_VQ_CONV == VQ_TRUE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut11L, dvecOut11H, daccSum11, packShiftAccU, \ + pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut12L, dvecOut12H, daccSum12, packShiftAccU, \ + pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut21L, dvecOut21H, daccSum21, packShiftAccU, \ + pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut22L, dvecOut22H, daccSum22, packShiftAccU, \ + pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag); +#elif DILATED_VQ_CONV == VQ_FALSE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut11L, dvecOut11H, daccSum11, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut12L, dvecOut12H, daccSum12, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut21L, dvecOut21H, daccSum21, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, 
typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut22L, dvecOut22H, daccSum22, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); +#endif + /* variable length for output stores */ + int32_t varLen = XT_MIN(vectorizationWidth, outW - x); + + /* Storing the first output depth, first row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput); + valign vaOutData = IVP_ZALIGN(); + IVP_SAV2NX8_XP(dvecOut11L, vaOutData, pdvecOut, bytesPerPixel * varLen); + IVP_SAV2NX8_XP(dvecOut11H, vaOutData, pdvecOut, typeFlag * \ + 2 * (varLen - XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the first output depth, second row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch1 * enable2Row * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut12L, vaOutData, pdvecOut, bytesPerPixel * varLen * enable2Row); + IVP_SAV2NX8_XP(dvecOut12H, vaOutData, pdvecOut, typeFlag * \ + enable2Row * 2 * (varLen - XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + + /* Storing the second output depth, first row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch2 * enable2ndCh * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut21L, vaOutData, pdvecOut, bytesPerPixel * varLen * enable2ndCh); + IVP_SAV2NX8_XP(dvecOut21H, vaOutData, pdvecOut, typeFlag * enable2ndCh * \ + 2 * (varLen - XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the second output depth, second row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + (outDataPitch2 * enable2ndCh + \ + outDataPitch1 * enable2Row) * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut22L, vaOutData, pdvecOut, bytesPerPixel * enable2ndCh * enable2Row * varLen); + IVP_SAV2NX8_XP(dvecOut22H, vaOutData, pdvecOut, typeFlag * enable2ndCh * \ + enable2Row * 2 * (varLen - XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + pOutput += 2 * outDataPitch2 * bytesPerPixel; + pCoeff += 2 * coeffPitch3; + } /* end of for (outCh = 0; outCh < numOutCh; outCh += 2)*/ + } /* end of for (y = 0; y < outH; y += 
2)*/ + } /* end of for (x = 0; x < outW; x += vectorizationWidth)*/ + return(XAI_ERROR_STATUS()); +} + +/***************************************************************************** +* xaiConvolved(VQ)3D_S_3x3j1d1I8S8IX_MOW_WHD +* **************************************************************************/ + +/******************************************************************************/ +/* Description : P6 optimized generic implementation for 3x3 3D VQ convolution*/ +/* Based on MORPH pre-processor specifiers, code implementation */ +/* is generated during preprocessing stage. This method can be */ +/* used to generate 3x3 MOW_WHD 3D dilated convolution function */ +/* and 3x3 MOW_WHD 3D VQ dilated convolution function for U8 */ +/* bit and S8 bit input data with input stride equal to 1 */ +/* Inputs : Input Data Tile, Coeff Data Tile, Bias Array, */ +/* Output scale array, CNN convolution params structure */ +/* Outputs : XI Error Code */ +/* InOuts : Output Tile */ +/* Assumptions : CoeffData is S8 */ +/* biasArray is signed 32b, value not exceeding signed 24b */ +/* Output scale array is U16 */ +/* OutData is S8 / U8 / S16 */ +/* Kernel Size is 3x3xDxN */ +/* Input and Output are in WHD format */ +/* Coeff is in WHDN format */ +/******************************************************************************/ +/****************************************************************************************** +* MOW fold 16 Stride 1 variant * +* If inDataPitch1 is less than or equal to * +* 16 this function is called. 
* +******************************************************************************************/ + +/* FOLD16 kernel body. Produces the 3x3, stride-1 convolution output for two output channels and four output rows per main-loop iteration: the accumulators start from the per-channel bias, every input channel adds three MORPH_OP_MUL4TA steps per output channel (coefficient words 0, 1 and 2 applied to the input vector shifted down by 0, 1 and 2 rows via dvecSeq), and the packed result is scaled, shifted and clamped to [minLim, maxLim]. Output rows 2-4 are cut out of the packed vectors with IVP_SEL2NX8 at offsets of k * inDataPitch1 * bytesPerPixel. A tail block after the loop handles the 1-3 leftover rows when outH is not a multiple of 4. Only outW pixels are stored per row, with no loop over x. The parameter list comes from MAKE_ARGUMENTS, defined outside this chunk; NOTE(review): outputScaleArray is presumably supplied by that macro in the VQ build - confirm. */ static _XAI_INLINE_ void MAKE_NAME(MAKE_NAME_VQ(convolved, 3D_S_3x3j1d1), S8IX_MOW_WHD_FOLD16) MAKE_ARGUMENTS(inTile, coeffTile, biasArray, outTile, param) +{ + /* Getting parameters from the tile structures */ + const int32_t outW = XAI_TILE3D_GET_DIM1(outTile); + const int32_t outH = XAI_TILE3D_GET_DIM2(outTile); + const int32_t numInCh = XAI_TILE3D_GET_DIM3(inTile); + const int32_t numOutCh = XAI_TILE3D_GET_DIM3(outTile); + + /* Pitches of Coefficient Data (WHDN) in dim1 and dim3 */ + const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile); + + /* Pitches of Input Data (WHD) in dim1 and dim2 */ + const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile); + const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile); + + /* Pitch of Output Data (WHD) in dim1 and dim2 */ + const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile); + const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile); + + /* Kernel Size (WHDN)*/ + const int32_t kSizeU = XAI_TILE4D_GET_DIM1(coeffTile); + + /* CNN convolution parameters */ + const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param); + + const uint8_t outShiftU = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param); + const uint8_t enableReLu = XAI_CNN_CONV_GET_FLAG_RELU(param); + + /* Data Pointers of input, output, coefficient and bias data */ + MORPH_IDT_SCALAR* pInData = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(inTile); + int8_t* pOutData = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile); + int32_t* pBiasData = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray); + int8_t* pCoeffData = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile); +#if DILATED_VQ_CONV == VQ_TRUE + uint16_t* restrict pOutScaleData = (uint16_t *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray); +#elif DILATED_VQ_CONV == VQ_FALSE + const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param); +#endif + /* Move pointer to the start of the data 
(including edge) */ + pInData = &pInData[-((kSizeU / 2) * inDataPitch1 + (kSizeU / 2))]; + + /* Setting the limits for output data according to ReLu Flag and outTileType */ + int32_t minLim, maxLim; + if (enableReLu) + { + minLim = XAI_CNN_CONV_GET_RELU_MIN(param); + maxLim = XAI_CNN_CONV_GET_RELU_MAX(param); + } + else + { + minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \ + SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0); + maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \ + : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX); + } + const int8_t typeFlag = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0; + const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile); + + MORPH_IDT_2Nx8 *restrict pdvecIn; + xb_vec2Nx8* restrict pdvecOut; + xb_vec2Nx8* restrict pdvecCoeff1; + xb_vec2Nx8* restrict pdvecCoeff2; + + /* Variable Declarations */ + int32_t inCh, outCh, y; + + /* Generating the shuffle pattern for coefficient loads. + The idea is to populate zero value where the MUL4T should not affect + Pattern : 0 1 2 32 3 4 5 32 6 7 8 32 X X X .... 
*/ + xb_vec2Nx8 dvecIdx = IVP_SEL2NX8I(32, IVP_MOV2NX8_FROMNX16( \ + IVP_MOVNX16T(IVP_ADDNX16(IVP_ANDNX16(IVP_SEQNX16(), 3), + IVP_MULNX16PACKL(IVP_SRLINX16(IVP_SEQNX16(), 2), 3)), 32, + IVP_NEQNX16(IVP_ANDNX16(IVP_SEQNX16(), 3), 3))), + IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0); + + + /* Select sequence to re-arrange input data */ + xb_vec2Nx8 dvecSeq = 0; + IVP_ADD2NX8T(dvecSeq, IVP_SEQ2NX8(), inDataPitch1, IVP_LT2NX8(IVP_SEQ2NX8(), 3 * inDataPitch1)); + IVP_ADD2NX8T(dvecSeq, IVP_SUB2NX8(IVP_SEQ2NX8(), 3 * inDataPitch1), (2 * XCHAL_IVPN_SIMD_WIDTH), \ + IVP_NOTB2N(IVP_LT2NX8(IVP_SEQ2NX8(), 3 * inDataPitch1))); + + /* loop across output channels is unrolled twice and + * loop across output height is unrolled 4 times + */ + for (y = 0; y < outH - 3; y += 4) /* Loop across output height */ + { + /* initialize output data pointer */ + int8_t *pOutput = &pOutData[(y * outDataPitch1) * bytesPerPixel]; + + /* initialize input data pointer */ + MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * y]; + + /* initialize coeff and Bias data pointer */ + int8_t *pCoeff = &pCoeffData[0]; + int32_t *pBias = &pBiasData[0]; + + for (outCh = 0; outCh < numOutCh; outCh += 2) /* Loop across Output depth */ + { + /* In order to handle odd output depths and heights */ + int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1); /* NOTE(review): used below as a 0/1 multiplier on second-channel offsets and store lengths; presumably 1 only when outCh + 1 < numOutCh - confirm XT_SALT semantics */ + + /* Load the bias values corresponding to two output channels */ + xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4 * enable2ndCh); + xb_vecN_2x32v hvecBias2; IVP_LSRN_2X32_XP(hvecBias2, pBias, 4); + + /* wide vectors(accumulators) initialized with bias */ + xb_vec2Nx24 dacc1, dacc2; + + dacc1 = IVP_CVT24UNX32L(hvecBias1, hvecBias1); + IVP_CVT24UNX32H(dacc1, hvecBias1, hvecBias1); + + dacc2 = IVP_CVT24UNX32L(hvecBias2, hvecBias2); + IVP_CVT24UNX32H(dacc2, hvecBias2, hvecBias2); + + /* priming of coeff load is done outside the innermost loop*/ + pdvecCoeff1 = (xb_vec2Nx8 *) (pCoeff); + valign vaCoeffData1; vaCoeffData1 = IVP_LA2NX8_PP(pdvecCoeff1); + + 
pdvecCoeff2 = (xb_vec2Nx8 *) (pCoeff + coeffPitch3 * enable2ndCh); + valign vaCoeffData2; vaCoeffData2 = IVP_LA2NX8_PP(pdvecCoeff2); + + /* Input vector pointer initialization */ + pdvecIn = (MORPH_IDT_2Nx8 *) (pInput); + + for (inCh = 0; inCh < numInCh; inCh++) /* Loop across input channels */ + { + /* vectors for coeff and input loads */ + xb_vec2Nx8 dvecCoeffData1, dvecCoeffData2; + MORPH_IDT_2Nx8 dvecInData1, dvecInData2; + + /* load data from first 4 input rows */ + valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn); + MORPH_OP_LOAD_2Nx8(dvecInData1, vaInData, pdvecIn, 4 * inDataPitch1); + + /* load data from next 4 input rows */ + MORPH_OP_LOAD_2Nx8(dvecInData2, vaInData, pdvecIn, inDataPitch2 - (4 * inDataPitch1)); + + /* load all the 3x3 coefficients for 2 output depths*/ + IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, 9); + IVP_LAV2NX8_XP(dvecCoeffData2, vaCoeffData2, pdvecCoeff2, 9); + + /* Rearrange them so that zero is inserted where the MUL4T should not have effect */ + dvecCoeffData1 = IVP_SHFL2NX8(dvecCoeffData1, dvecIdx); + dvecCoeffData2 = IVP_SHFL2NX8(dvecCoeffData2, dvecIdx); + + /* dvecInData1 contains first 4 input rows and + * dvecInData2 contains next 4 input rows. 
+ * dvecInData1: row0 | row1 | row2 | row3 + * dvecInData2: row4 | row5 | row6 | row7 + * + * Input data is re arranged in such a manner that + * dvecTemp1 contains: row1 | row2 | row3 | row4 + * dvecTemp2 contains: row2 | row3 | row4 | row5 + */ + xb_vec2Nx8 dvecTemp1, dvecTemp2; + dvecTemp1 = IVP_SEL2NX8(dvecInData2, dvecInData1, dvecSeq); + dvecTemp2 = IVP_SEL2NX8(IVP_SEL2NX8(0, dvecInData2, dvecSeq), dvecTemp1, dvecSeq); + + /* Multiply input data with coefficients from 1st output channel */ + MORPH_OP_MUL4TA(dacc1, dvecInData1, dvecInData1, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc1, dvecTemp1, dvecTemp1, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + MORPH_OP_MUL4TA(dacc1, dvecTemp2, dvecTemp2, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 2)); + + /* Multiply input data with coefficients from 2nd output channel */ + MORPH_OP_MUL4TA(dacc2, dvecInData1, dvecInData1, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 0)); + MORPH_OP_MUL4TA(dacc2, dvecTemp1, dvecTemp1, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 1)); + MORPH_OP_MUL4TA(dacc2, dvecTemp2, dvecTemp2, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 2)); + } /* end of for (inCh = 0; inCh < numInCh; inCh++)*/ + + /* Pack, Output Scale, Output Shift and clamping */ + xb_vec2Nx8 dvecOut1L, dvecOut2L; + xb_vec2Nx8 dvecOut1H, dvecOut2H; +#if DILATED_VQ_CONV == VQ_TRUE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \ + pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \ + pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag); +#elif DILATED_VQ_CONV == VQ_FALSE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, 
packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); +#endif + /* Storing the first output depth, first row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput); + valign vaOutData = IVP_ZALIGN(); + IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * outW); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the first output depth, second row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch1 * bytesPerPixel); + IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut1H, dvecOut1L, IVP_ADD2NX8(IVP_SEQ2NX8(), inDataPitch1 * bytesPerPixel)), \ + vaOutData, pdvecOut, bytesPerPixel * outW); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the first output depth, 3rd row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + 2 * outDataPitch1 * bytesPerPixel); + IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut1H, dvecOut1L, IVP_ADD2NX8(IVP_SEQ2NX8(), 2 * inDataPitch1 * bytesPerPixel)), \ + vaOutData, pdvecOut, bytesPerPixel * outW); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the first output depth, 4th row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + 3 * outDataPitch1 * bytesPerPixel); + IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut1H, dvecOut1L, IVP_ADD2NX8(IVP_SEQ2NX8(), 3 * inDataPitch1 * bytesPerPixel)), + vaOutData, pdvecOut, bytesPerPixel * outW); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the second output depth, first row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch2 * enable2ndCh * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * enable2ndCh * outW); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the second output depth, second row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + (outDataPitch2 * enable2ndCh + \ + outDataPitch1) * bytesPerPixel); + IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut2H, dvecOut2L, IVP_ADD2NX8(IVP_SEQ2NX8(), inDataPitch1 * bytesPerPixel)), \ + vaOutData, pdvecOut, bytesPerPixel * 
enable2ndCh * outW); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the second output depth, 3rd row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + (outDataPitch2 * enable2ndCh + \ + 2 * outDataPitch1) * bytesPerPixel); + IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut2H, dvecOut2L, IVP_ADD2NX8(IVP_SEQ2NX8(), 2 * inDataPitch1 * bytesPerPixel)), \ + vaOutData, pdvecOut, bytesPerPixel * enable2ndCh * outW); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the second output depth, 4th row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + (outDataPitch2 * enable2ndCh + \ + 3 * outDataPitch1) * bytesPerPixel); + IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut2H, dvecOut2L, IVP_ADD2NX8(IVP_SEQ2NX8(), 3 * inDataPitch1 * bytesPerPixel)), \ + vaOutData, pdvecOut, bytesPerPixel * enable2ndCh * outW); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + + pOutput += 2 * outDataPitch2 * bytesPerPixel; + pCoeff += 2 * coeffPitch3; + } /* end of for (outCh = 0; outCh < numOutCh; outCh += 2)*/ + } /* end of for (y = 0; y < outH - 3; y += 4)*/ + /* handle left out output rows */ + if (y < outH) + { + int32_t enable2ndRow = XT_SALT(y, outH - 1); + int32_t enable3rdRow = XT_SALT(y, outH - 2); /* enable2ndRow and enable3rdRow scale the row offsets and store byte counts of the 2nd and 3rd leftover rows below */ + /* initialize output data pointer */ + int8_t *pOutput = &pOutData[(y * outDataPitch1) * bytesPerPixel]; + + /* initialize coeff and Bias data pointer */ + int8_t *pCoeff = &pCoeffData[0]; + int32_t *pBias = &pBiasData[0]; + + for (outCh = 0; outCh < numOutCh; outCh += 2) /* Loop across Output depth */ + { + /* In order to handle odd output depths and heights */ + int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1); + + /* Load the bias values corresponding to two output channels */ + xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4 * enable2ndCh); + xb_vecN_2x32v hvecBias2; IVP_LSRN_2X32_XP(hvecBias2, pBias, 4); + + /* wide vectors(accumulators) initialized with bias */ + xb_vec2Nx24 dacc1, dacc2; + + dacc1 = IVP_CVT24UNX32L(hvecBias1, hvecBias1); + IVP_CVT24UNX32H(dacc1, hvecBias1, hvecBias1); + + dacc2 = 
IVP_CVT24UNX32L(hvecBias2, hvecBias2); + IVP_CVT24UNX32H(dacc2, hvecBias2, hvecBias2); + + /* priming of coeff load is done outside the innermost loop*/ + pdvecCoeff1 = (xb_vec2Nx8 *) (pCoeff); + valign vaCoeffData1; vaCoeffData1 = IVP_LA2NX8_PP(pdvecCoeff1); + + pdvecCoeff2 = (xb_vec2Nx8 *) (pCoeff + coeffPitch3 * enable2ndCh); + valign vaCoeffData2; vaCoeffData2 = IVP_LA2NX8_PP(pdvecCoeff2); + + /* initialize input data pointer */ + MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * y]; + + for (inCh = 0; inCh < numInCh; inCh++) /* Loop across input channels */ + { + /* vectors for coeff and input loads */ + xb_vec2Nx8 dvecCoeffData1, dvecCoeffData2; + MORPH_IDT_2Nx8 dvecInData1, dvecInData2; + + /* load data from first 4 input rows */ + pdvecIn = (MORPH_IDT_2Nx8 *) (pInput); + valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn); + MORPH_OP_LOAD_2Nx8_VARIABLE(dvecInData1, vaInData, pdvecIn, (3 + enable2ndRow) * inDataPitch1); + + /* load data from next 4 input rows */ + MORPH_OP_LOAD_2Nx8_VARIABLE(dvecInData2, vaInData, pdvecIn, (enable2ndRow + enable3rdRow) * inDataPitch1); + pInput += inDataPitch2; + + /* load all the 3x3 coefficients for 2 output depths*/ + IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, 9); + IVP_LAV2NX8_XP(dvecCoeffData2, vaCoeffData2, pdvecCoeff2, 9); + + /* Rearrange them so that zero is inserted where the MUL4T should not have effect */ + dvecCoeffData1 = IVP_SHFL2NX8(dvecCoeffData1, dvecIdx); + dvecCoeffData2 = IVP_SHFL2NX8(dvecCoeffData2, dvecIdx); + + /* dvecInData1 contains first 4 input rows and + * dvecInData2 contains next 4 input rows. 
+ * dvecInData1: row0 | row1 | row2 | row3 + * dvecInData2: row4 | row5 | row6 | row7 + * + * Input data is re arranged in such a manner that + * dvecTemp1 contains: row1 | row2 | row3 | row4 + * dvecTemp2 contains: row2 | row3 | row4 | row5 + */ + xb_vec2Nx8 dvecTemp1, dvecTemp2; + dvecTemp1 = IVP_SEL2NX8(dvecInData2, dvecInData1, dvecSeq); + dvecTemp2 = IVP_SEL2NX8(IVP_SEL2NX8(0, dvecInData2, dvecSeq), dvecTemp1, dvecSeq); + + /* Multiply input data with coefficients from 1st output channel */ + MORPH_OP_MUL4TA(dacc1, dvecInData1, dvecInData1, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc1, dvecTemp1, dvecTemp1, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + MORPH_OP_MUL4TA(dacc1, dvecTemp2, dvecTemp2, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 2)); + + /* Multiply input data with coefficients from 2nd output channel */ + MORPH_OP_MUL4TA(dacc2, dvecInData1, dvecInData1, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 0)); + MORPH_OP_MUL4TA(dacc2, dvecTemp1, dvecTemp1, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 1)); + MORPH_OP_MUL4TA(dacc2, dvecTemp2, dvecTemp2, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 2)); + } /* end of for (inCh = 0; inCh < numInCh; inCh++)*/ + + /* Pack, Output Scale, Output Shift and clamping */ + xb_vec2Nx8 dvecOut1L, dvecOut2L; + xb_vec2Nx8 dvecOut1H, dvecOut2H; +#if DILATED_VQ_CONV == VQ_TRUE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \ + pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \ + pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag); +#elif DILATED_VQ_CONV == VQ_FALSE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, 
packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); +#endif + /* Storing the first output depth, first row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput); + valign vaOutData = IVP_ZALIGN(); + IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * outW); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the first output depth, second row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch1 * enable2ndRow * bytesPerPixel); + IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut1H, dvecOut1L, IVP_ADD2NX8(IVP_SEQ2NX8(), inDataPitch1 * bytesPerPixel)), \ + vaOutData, pdvecOut, bytesPerPixel * enable2ndRow * outW); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the first output depth, 3rd row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + 2 * outDataPitch1 * enable3rdRow * bytesPerPixel); + IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut1H, dvecOut1L, IVP_ADD2NX8(IVP_SEQ2NX8(), 2 * inDataPitch1 * bytesPerPixel)), \ + vaOutData, pdvecOut, bytesPerPixel * enable3rdRow * outW); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the second output depth, first row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch2 * enable2ndCh * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * enable2ndCh * outW); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the second output depth, second row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + (outDataPitch2 * enable2ndCh + \ + outDataPitch1 * enable2ndRow) * bytesPerPixel); + IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut2H, dvecOut2L, IVP_ADD2NX8(IVP_SEQ2NX8(), inDataPitch1 * bytesPerPixel)), \ + vaOutData, pdvecOut, bytesPerPixel * enable2ndCh * enable2ndRow * outW); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the second output depth, 3rd row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + (outDataPitch2 * enable2ndCh + \ + 2 * outDataPitch1 * enable3rdRow) * bytesPerPixel); + 
IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut2H, dvecOut2L, IVP_ADD2NX8(IVP_SEQ2NX8(), 2 * inDataPitch1 * bytesPerPixel)), \ + vaOutData, pdvecOut, bytesPerPixel * enable2ndCh * enable3rdRow * outW); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + + pOutput += 2 * outDataPitch2 * bytesPerPixel; + pCoeff += 2 * coeffPitch3; + } /* end of for (outCh = 0; outCh < numOutCh; outCh += 2)*/ + } /* end of if(y < outH)*/ +} + +/****************************************************************************************** +* MOW fold 32 Stride 1 * +* If inDataPitch1 is less than or equal to * +* 32 this function is called. * +******************************************************************************************/ + +/* FOLD32 kernel body. Same 3x3, stride-1 computation as the FOLD16 variant but with two output rows and two output channels per iteration. Per input channel, dvecInData1 (input rows 0-1), dvecTemp1 (rows 1-2, built with dvecSeq) and dvecInData2 (rows 2-3) are multiplied by coefficient words 0, 1 and 2 respectively. enable2Row and enable2ndCh are 0/1 multipliers applied to the offset and byte count of the second-row and second-channel stores, to cope with odd outH or numOutCh. The parameter list comes from MAKE_ARGUMENTS, defined outside this chunk. */ static _XAI_INLINE_ void MAKE_NAME(MAKE_NAME_VQ(convolved, 3D_S_3x3j1d1), S8IX_MOW_WHD_FOLD32) MAKE_ARGUMENTS(inTile, coeffTile, biasArray, outTile, param) +{ + /* Getting parameters from the tile structures */ + const int32_t outW = XAI_TILE3D_GET_DIM1(outTile); + const int32_t outH = XAI_TILE3D_GET_DIM2(outTile); + const int32_t numInCh = XAI_TILE3D_GET_DIM3(inTile); + const int32_t numOutCh = XAI_TILE3D_GET_DIM3(outTile); + + /* Pitches of Coefficient Data (WHDN) in dim1 and dim3 */ + const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile); + + /* Pitches of Input Data (WHD) in dim1 and dim2 */ + const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile); + const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile); + + /* Pitch of Output Data (WHD) in dim1 and dim2 */ + const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile); + const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile); + + /* Kernel Size (WHDN)*/ + const int32_t kSizeU = XAI_TILE4D_GET_DIM1(coeffTile); + + /* CNN convolution parameters */ + const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param); + + const uint8_t outShiftU = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param); + const uint8_t enableReLu = XAI_CNN_CONV_GET_FLAG_RELU(param); + + /* Data Pointers of input, 
output, coefficient and bias data */ + MORPH_IDT_SCALAR* pInData = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(inTile); + int8_t* pOutData = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile); + int32_t* pBiasData = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray); + int8_t* pCoeffData = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile); +#if DILATED_VQ_CONV == VQ_TRUE + uint16_t* restrict pOutScaleData = (uint16_t *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray); +#elif DILATED_VQ_CONV == VQ_FALSE + const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param); +#endif + /* Move pointer to the start of the data (including edge) */ + pInData = &pInData[-((kSizeU / 2) * inDataPitch1 + (kSizeU / 2))]; + + /* Setting the limits for output data according to ReLu Flag and outTileType */ + int32_t minLim, maxLim; + if (enableReLu) + { + minLim = XAI_CNN_CONV_GET_RELU_MIN(param); + maxLim = XAI_CNN_CONV_GET_RELU_MAX(param); + } + else + { + minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \ + SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0); + maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \ + : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX); + } + const int8_t typeFlag = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0; + const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile); + + MORPH_IDT_2Nx8 *restrict pdvecIn; + xb_vec2Nx8* restrict pdvecOut; + xb_vec2Nx8* restrict pdvecCoeff1; + xb_vec2Nx8* restrict pdvecCoeff2; + + /* Variable Declarations */ + int32_t inCh, outCh, y; + + /* Generating the shuffle pattern for coefficient loads. + The idea is to populate zero value where the MUL4T should not affect + Pattern : 0 1 2 32 3 4 5 32 6 7 8 32 X X X .... 
*/ + xb_vec2Nx8 dvecIdx = IVP_SEL2NX8I(32, IVP_MOV2NX8_FROMNX16( \ + IVP_MOVNX16T(IVP_ADDNX16(IVP_ANDNX16(IVP_SEQNX16(), 3), + IVP_MULNX16PACKL(IVP_SRLINX16(IVP_SEQNX16(), 2), 3)), 32, + IVP_NEQNX16(IVP_ANDNX16(IVP_SEQNX16(), 3), 3))), + IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0); + + + /* Select sequence to re-arrange input data */ + xb_vec2Nx8 dvecSeq = 0; + IVP_ADD2NX8T(dvecSeq, IVP_SEQ2NX8(), inDataPitch1, IVP_LT2NX8(IVP_SEQ2NX8(), inDataPitch1)); + IVP_ADD2NX8T(dvecSeq, IVP_SUB2NX8(IVP_SEQ2NX8(), inDataPitch1), (2 * XCHAL_IVPN_SIMD_WIDTH), \ + IVP_NOTB2N(IVP_LT2NX8(IVP_SEQ2NX8(), inDataPitch1))); + + + /* loop across output channels and output height are unrolled twice + * to produce four output vectors in 1 iteration + */ + for (y = 0; y < outH; y += 2) /* Loop across output height */ + { + /* in order to handle odd output height */ + int32_t enable2Row = XT_SALT(y, outH - 1); + /* initialize output data pointer */ + int8_t *pOutput = &pOutData[(y * outDataPitch1) * bytesPerPixel]; + + /* initialize coeff and Bias data pointer */ + int8_t *pCoeff = &pCoeffData[0]; + int32_t *pBias = &pBiasData[0]; + + for (outCh = 0; outCh < numOutCh; outCh += 2) /* Loop across Output depth */ + { + /* In order to handle odd output depths */ + int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1); + + /* Load the bias values corresponding to two output channels */ + xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4 * enable2ndCh); + xb_vecN_2x32v hvecBias2; IVP_LSRN_2X32_XP(hvecBias2, pBias, 4); + + /* wide vectors(accumulators) initialized with bias */ + xb_vec2Nx24 dacc1, dacc2; + + dacc1 = IVP_CVT24UNX32L(hvecBias1, hvecBias1); + IVP_CVT24UNX32H(dacc1, hvecBias1, hvecBias1); + + dacc2 = IVP_CVT24UNX32L(hvecBias2, hvecBias2); + IVP_CVT24UNX32H(dacc2, hvecBias2, hvecBias2); + + /* priming of coeff load is done outside the innermost loop*/ + pdvecCoeff1 = (xb_vec2Nx8 *) (pCoeff); + valign vaCoeffData1; vaCoeffData1 = IVP_LA2NX8_PP(pdvecCoeff1); + + pdvecCoeff2 = (xb_vec2Nx8 
*) (pCoeff + coeffPitch3 * enable2ndCh); + valign vaCoeffData2; vaCoeffData2 = IVP_LA2NX8_PP(pdvecCoeff2); + + /* initialize input data pointer */ + MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * y]; + + for (inCh = 0; inCh < numInCh; inCh++) /* Loop across input channels */ + { + /* vectors for coeff and input loads */ + xb_vec2Nx8 dvecCoeffData1, dvecCoeffData2; + MORPH_IDT_2Nx8 dvecInData1, dvecInData2; + + /* load data from first 2 input rows */ + pdvecIn = (MORPH_IDT_2Nx8 *) (pInput); + valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn); + MORPH_OP_LOAD_2Nx8(dvecInData1, vaInData, pdvecIn, 2 * inDataPitch1); + + /* load data from next 2 input rows */ + MORPH_OP_LOAD_2Nx8_VARIABLE(dvecInData2, vaInData, pdvecIn, inDataPitch1 \ + + inDataPitch1 * enable2Row); + pInput += inDataPitch2; + + /* load all the 3x3 coefficients for 2 output depths*/ + IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, 9); + IVP_LAV2NX8_XP(dvecCoeffData2, vaCoeffData2, pdvecCoeff2, 9); + + /* Rearrange them so that zero is inserted where the MUL4T should not have effect */ + dvecCoeffData1 = IVP_SHFL2NX8(dvecCoeffData1, dvecIdx); + dvecCoeffData2 = IVP_SHFL2NX8(dvecCoeffData2, dvecIdx); + + /* dvecInData1 contains first 2 input rows and + * dvecInData2 contains next 2 input rows. + * dvecInData1: row0 | row1 + * dvecInData2: row2 | row3 + * + * dvecInData1 is multiplied with 1st row of coefficient and + * dvecInData2 is multiplied with 3rd row of coefficient. + * + * To multiply input data with 2nd coefficient row, it is required + * to store row1 and row 2 in another vector + * + * dvecTemp: row1 | row2 + * + * So first inDataPitch1 elements in the accumulator corresponds to + * first output row and next inDataPitch1 number of elements corresponds + * to 2nd output row. 
+ */ + xb_vec2Nx8 dvecTemp1; + dvecTemp1 = IVP_SEL2NX8(dvecInData2, dvecInData1, dvecSeq); + + /* Multiply input data with coefficients from 1st output channel */ + MORPH_OP_MUL4TA(dacc1, dvecInData1, dvecInData1, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc1, dvecTemp1, dvecTemp1, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + MORPH_OP_MUL4TA(dacc1, dvecInData2, dvecInData2, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 2)); + + /* Multiply input data with coefficients from 2nd output channel */ + MORPH_OP_MUL4TA(dacc2, dvecInData1, dvecInData1, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 0)); + MORPH_OP_MUL4TA(dacc2, dvecTemp1, dvecTemp1, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 1)); + MORPH_OP_MUL4TA(dacc2, dvecInData2, dvecInData2, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 2)); + } /* end of for (inCh = 0; inCh < numInCh; inCh++)*/ + + /* Pack, Output Scale, Output Shift and clamping */ + xb_vec2Nx8 dvecOut1L, dvecOut2L; + xb_vec2Nx8 dvecOut1H, dvecOut2H; +#if DILATED_VQ_CONV == VQ_TRUE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \ + pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \ + pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag); +#elif DILATED_VQ_CONV == VQ_FALSE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); +#endif + + /* Storing the first output depth, first row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput); + valign vaOutData = IVP_ZALIGN(); + 
IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * outW); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the first output depth, second row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch1 * enable2Row * bytesPerPixel); + IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut1H, dvecOut1L, IVP_ADD2NX8(IVP_SEQ2NX8(), inDataPitch1 * bytesPerPixel)), \ + vaOutData, pdvecOut, bytesPerPixel * enable2Row * outW); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + + /* Storing the second output depth, first row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch2 * enable2ndCh * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * enable2ndCh * outW); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the second output depth, second row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + (outDataPitch2 * enable2ndCh + \ + outDataPitch1 * enable2Row) * bytesPerPixel); + IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut2H, dvecOut2L, IVP_ADD2NX8(IVP_SEQ2NX8(), inDataPitch1 * bytesPerPixel)), + vaOutData, pdvecOut, bytesPerPixel * enable2ndCh * \ + enable2Row * outW); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + pOutput += 2 * outDataPitch2 * bytesPerPixel; + pCoeff += 2 * coeffPitch3; + } /* end of for (outCh = 0; outCh < numOutCh; outCh += 2)*/ + } /* end of for (y = 0; y < outH; y += 2)*/ +} + +/****************** xaiConvolvedVQ3D_S_3x3j1d1_S8S8IX_MOW_WHD ******************/ +/****************** xaiConvolvedVQ3D_S_3x3j1d1_U8S8IX_MOW_WHD ******************/ +/******************* xaiConvolved3D_S_3x3j1d1_S8S8IX_MOW_WHD *******************/ +/******************* xaiConvolved3D_S_3x3j1d1_U8S8IX_MOW_WHD *******************/ + +XAI_ERR_TYPE MAKE_NAME(MAKE_NAME_VQ(xaiConvolved, 3D_S_3x3j1d1), S8IX_MOW_WHD) MAKE_ARGUMENTS(inTile, coeffTile, biasArray, outTile, param) +{ + /* Error Checks */ + XAI_ERROR_CHECKS() + { + MORPH_IDT_CHECK(inTile); + XAI_CHECK_CONV_OUTPUT_TILE3D(outTile); + XAI_CHECK_TILE4D_S8(coeffTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile); + 
XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile); + XAI_CHECK_TILE4D_IN_DRAM_BOUNDARY(coeffTile); + XAI_CHECK_POINTER(param); + XAI_CHECK_ARRAY_S32(biasArray); + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTile); + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(coeffTile, outTile); + XAI_CHECK_KERNEL_SIZE(coeffTile, 3); + XAI_CHECK_TILE3D_DATA_ORDER(inTile, XAI_WHD); + XAI_CHECK_TILE3D_DATA_ORDER(outTile, XAI_WHD); + XAI_CHECK_TILE4D_DATA_ORDER(coeffTile, XAI_WHDN); + XAI_CHECK_DILATION(param, 1); + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_DILATIONX(param) == XAI_CNN_CONV_GET_DILATIONY(param), \ + XAI_ERR_BADARG, "Dilation along width = %hhu and height = %hhu\nDilation along width and height should be equal", \ + XAI_CNN_CONV_GET_DILATIONX(param), XAI_CNN_CONV_GET_DILATIONX(param)); + XAI_CHECK_STRIDE(param, 1); + XAI_CHECK_ERROR((XAI_CNN_CONV_GET_STRIDEX(param) == XAI_CNN_CONV_GET_STRIDEY(param)), \ + XAI_ERR_BADARG, "\nStride along width = %hhu and height = %hhu\nStride along width and height should be equal", \ + XAI_CNN_CONV_GET_STRIDEX(param), XAI_CNN_CONV_GET_STRIDEY(param)); + XAI_CHECK_TILE3D_EDGE(inTile, 1); + XAI_CHECK_CONSISTENCY_MOW_WHD(inTile, coeffTile, biasArray, outTile, param); + XAI_CHECK_COEFFTILE_CONTIGUOUS(coeffTile, param); + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_ACCUM_SHIFT(param) < 24, \ + XAI_ERR_NORM, "\nThe accumulator shift = %hhu, value should be less than 24", \ + XAI_CNN_CONV_GET_ACCUM_SHIFT(param)); + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_OUTPUT_SHIFT(param) < 32, \ + XAI_ERR_NORM, "\nThe output shift = %hhu, value should be less than 32", \ + XAI_CNN_CONV_GET_OUTPUT_SHIFT(param)); + XAI_CHECK_CONV_RELU_LIMITS_IX(param, outTile); +#if DILATED_VQ_CONV == VQ_TRUE + XAI_CHECK_ARRAY_U16(outputScaleArray); + XAI_CHECK_ERROR(XAI_ARRAY_GET_WIDTH(outputScaleArray) >= XAI_TILE4D_GET_DIM4(coeffTile), \ + XAI_ERR_DATASIZE, "\nWidth of Output Scale Array = %d, Number of Kernels = %d\nWidth of Output Scale Array should be greater than or equal to Number of Kernels", \ + 
XAI_ARRAY_GET_WIDTH(outputScaleArray), XAI_TILE4D_GET_DIM4(coeffTile)); + XAI_CHECK_ERROR((((uintptr_t) (XAI_ARRAY_GET_DATA_PTR(outputScaleArray)) & \ + 0x1) == 0), XAI_ERR_NORM, "The output scale array is not aligned to 2 byte boundary"); +#endif + } +#if DILATED_VQ_CONV == VQ_FALSE + if (XAI_CNN_CONV_GET_OUTPUT_SCALE(param) == 0) + { + int32_t fillValue; + int32_t reluFlag = XAI_CNN_CONV_GET_FLAG_RELU(param); + fillValue = reluFlag ? (CLAMP(0, XAI_CNN_CONV_GET_RELU_MIN(param), XAI_CNN_CONV_GET_RELU_MAX(param))) : 0; + return(xaiFillTile3D(outTile, fillValue, 0)); + } +#endif + + /* check inDataPitch1, if it is less than or equal to 16, + * call FOLD16 varaint and if it's greater than + * 16 but less than or equal to 32 call FOLD32 variant otherwise continue + */ + + if (XAI_TILE3D_GET_DIM1_PITCH(inTile) <= 16) + { + MAKE_NAME(MAKE_NAME_VQ(convolved, 3D_S_3x3j1d1), S8IX_MOW_WHD_FOLD16) MAKE_PARAMS(inTile, coeffTile, biasArray, outTile, param); + return(XAI_ERROR_STATUS()); + } + + if (XAI_TILE3D_GET_DIM1_PITCH(inTile) <= 32) + { + MAKE_NAME(MAKE_NAME_VQ(convolved, 3D_S_3x3j1d1), S8IX_MOW_WHD_FOLD32) MAKE_PARAMS(inTile, coeffTile, biasArray, outTile, param); + return(XAI_ERROR_STATUS()); + } + + /* Getting parameters from the tile structures */ + const int32_t outW = XAI_TILE3D_GET_DIM1(outTile); + const int32_t outH = XAI_TILE3D_GET_DIM2(outTile); + const int32_t numInCh = XAI_TILE3D_GET_DIM3(inTile); + const int32_t numOutCh = XAI_TILE3D_GET_DIM3(outTile); + + /* Pitches of Coefficient Data (WHDN) in dim1 and dim3 */ + const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile); + + /* Pitches of Input Data (WHD) in dim1 and dim2 */ + const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile); + const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile); + + /* Pitch of Output Data (WHD) in dim1 and dim2 */ + const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile); + const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile); + + /* 
Kernel Size (WHDN)*/ + const int32_t kSizeU = XAI_TILE4D_GET_DIM1(coeffTile); + + /* CNN convolution parameters */ + const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param); + + const uint8_t outShiftU = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param); + const uint8_t enableReLu = XAI_CNN_CONV_GET_FLAG_RELU(param); + + /* Data Pointers of input, output, coefficient and bias data */ + MORPH_IDT_SCALAR* pInData = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(inTile); + int8_t* pOutData = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile); + int32_t* pBiasData = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray); + int8_t* pCoeffData = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile); +#if DILATED_VQ_CONV == VQ_TRUE + uint16_t* restrict pOutScaleData = (uint16_t *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray); +#elif DILATED_VQ_CONV == VQ_FALSE + const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param); +#endif + /* Move pointer to the start of the data (including edge) */ + pInData = &pInData[-((kSizeU / 2) * inDataPitch1 + (kSizeU / 2))]; + + /* Setting the limits for output data according to ReLu Flag and outTileType */ + int32_t minLim, maxLim; + if (enableReLu) + { + minLim = XAI_CNN_CONV_GET_RELU_MIN(param); + maxLim = XAI_CNN_CONV_GET_RELU_MAX(param); + } + else + { + minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \ + SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0); + maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \ + : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX); + } + const int8_t typeFlag = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 
1 : 0; + const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile); + + MORPH_IDT_2Nx8 *restrict pdvecIn; + xb_vec2Nx8* restrict pdvecOut; + xb_vec2Nx8* restrict pdvecCoeff1; + xb_vec2Nx8* restrict pdvecCoeff2; + + /* Variable Declarations */ + int32_t inCh, outCh, x, y; + /* In order to make the loop multiply-bound we are reducing the vectorization width + by extra values required for the kernel */ + const int32_t vectorizationWidth = ((2 * XCHAL_IVPN_SIMD_WIDTH) - kSizeU) + 1; + + /* Generating the shuffle pattern for coefficent loads. + The idea is to populate zero value where the MUL4T should not affect + Pattern : 0 1 2 32 3 4 5 32 6 7 8 32 X X X .... */ + xb_vec2Nx8 dvecIdx = IVP_SEL2NX8I(32, IVP_MOV2NX8_FROMNX16( \ + IVP_MOVNX16T(IVP_ADDNX16(IVP_ANDNX16(IVP_SEQNX16(), 3), + IVP_MULNX16PACKL(IVP_SRLINX16(IVP_SEQNX16(), 2), 3)), 32, + IVP_NEQNX16(IVP_ANDNX16(IVP_SEQNX16(), 3), 3))), + IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0); + + /* loop across output channels and output height are unrolled twice + * to produce four output vectors in 1 iteration + */ + for (x = 0; x < outW; x += vectorizationWidth) /* Loop across output width */ + { + for (y = 0; y < outH; y += 2) /* Loop across output height */ + { + /* in order to handle odd output height */ + int32_t enable2Row = XT_SALT(y, outH - 1); + /* initialize output data pointer */ + int8_t *pOutput = &pOutData[(y * outDataPitch1 + x) * bytesPerPixel]; + + /* initialize input data pointer */ + MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * y + x]; + + /* initialize coeff and Bias data pointer */ + int8_t *pCoeff = &pCoeffData[0]; + int32_t *pBias = &pBiasData[0]; + + for (outCh = 0; outCh < numOutCh; outCh += 2) /* Loop across Output depth */ + { + /* In order to handle odd output depths */ + int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1); + + /* Load the bias values corresponding to two output channels */ + xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4 * enable2ndCh); + xb_vecN_2x32v 
hvecBias2; IVP_LSRN_2X32_XP(hvecBias2, pBias, 4); + + /* wide vectors(accumulators) initialized with bias */ + xb_vec2Nx24 dacc11, dacc12, dacc21, dacc22; + + dacc12 = dacc11 = IVP_CVT24UNX32L(hvecBias1, hvecBias1); + IVP_CVT24UNX32H(dacc11, hvecBias1, hvecBias1); + IVP_CVT24UNX32H(dacc12, hvecBias1, hvecBias1); + + dacc22 = dacc21 = IVP_CVT24UNX32L(hvecBias2, hvecBias2); + IVP_CVT24UNX32H(dacc21, hvecBias2, hvecBias2); + IVP_CVT24UNX32H(dacc22, hvecBias2, hvecBias2); + + /* priming of coeff load is done outside the innermost loop*/ + pdvecCoeff1 = (xb_vec2Nx8 *) (pCoeff); + valign vaCoeffData1; vaCoeffData1 = IVP_LA2NX8_PP(pdvecCoeff1); + + pdvecCoeff2 = (xb_vec2Nx8 *) (pCoeff + coeffPitch3 * enable2ndCh); + valign vaCoeffData2; vaCoeffData2 = IVP_LA2NX8_PP(pdvecCoeff2); + + /* Input vector pointer initialization */ + pdvecIn = (MORPH_IDT_2Nx8 *) (pInput); + + for (inCh = 0; inCh < numInCh; inCh++) /* Loop across input channels */ + { + /* vectors for coeff and input loads */ + xb_vec2Nx8 dvecCoeffData1, dvecCoeffData2; + MORPH_IDT_2Nx8 dvecInData1, dvecInData2, dvecInData3, dvecInData4; + + /* load data from first input row */ + valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn); + MORPH_OP_LOAD_2Nx8(dvecInData1, vaInData, pdvecIn, inDataPitch1); + + /* load data from 2nd input row */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn); + MORPH_OP_LOAD_2Nx8(dvecInData2, vaInData, pdvecIn, inDataPitch1); + + /* load data from 3rd input row */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn); + MORPH_OP_LOAD_2Nx8(dvecInData3, vaInData, pdvecIn, inDataPitch1 * enable2Row); + + /* load data from 4th input row */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn); + MORPH_OP_LOAD_2Nx8(dvecInData4, vaInData, pdvecIn, inDataPitch2 - (2 + enable2Row) * inDataPitch1); + + /* load all the 3x3 coefficients for 2 output depths*/ + IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, 9); + IVP_LAV2NX8_XP(dvecCoeffData2, vaCoeffData2, pdvecCoeff2, 9); + + /* Rearrange them so that zero is inserted 
where the MUL4T should not have effect */ + dvecCoeffData1 = IVP_SHFL2NX8(dvecCoeffData1, dvecIdx); + dvecCoeffData2 = IVP_SHFL2NX8(dvecCoeffData2, dvecIdx); + + /* Multiply and accumulate 1st set of 3 coefficients for all the outputs */ + MORPH_OP_MUL4TA(dacc11, dvecInData1, dvecInData1, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc12, dvecInData2, dvecInData2, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc21, dvecInData1, dvecInData1, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 0)); + MORPH_OP_MUL4TA(dacc22, dvecInData2, dvecInData2, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 0)); + + /* Multiply and accumulate 2nd set of 3 coefficients for all the outputs */ + MORPH_OP_MUL4TA(dacc11, dvecInData2, dvecInData2, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + MORPH_OP_MUL4TA(dacc12, dvecInData3, dvecInData3, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + MORPH_OP_MUL4TA(dacc21, dvecInData2, dvecInData2, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 1)); + MORPH_OP_MUL4TA(dacc22, dvecInData3, dvecInData3, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 1)); + + /* Multiply and accumulate 3rd set of 3 coefficients for all the outputs */ + MORPH_OP_MUL4TA(dacc11, dvecInData3, dvecInData3, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 2)); + MORPH_OP_MUL4TA(dacc12, dvecInData4, dvecInData4, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 2)); + MORPH_OP_MUL4TA(dacc21, dvecInData3, dvecInData3, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 2)); + MORPH_OP_MUL4TA(dacc22, dvecInData4, dvecInData4, IVP_EXTRN_2X32( \ + 
IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 2)); + } /* end of for (inCh = 0; inCh < numInCh; inCh++)*/ + + /* Pack, Output Scale, Output Shift and clamping */ + xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L; + xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H; +#if DILATED_VQ_CONV == VQ_TRUE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc11, packShiftAccU, \ + pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc12, packShiftAccU, \ + pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc21, packShiftAccU, \ + pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc22, packShiftAccU, \ + pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag); +#elif DILATED_VQ_CONV == VQ_FALSE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc11, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc12, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc21, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc22, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); +#endif + /* variable length for output stores */ + int32_t varLen = XT_MIN(vectorizationWidth, outW - x); + + /* Storing the first output depth, first row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput); + valign vaOutData = IVP_ZALIGN(); + IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * varLen); + IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * \ + 2 * (varLen - XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the first output depth, second row */ + pdvecOut = (xb_vec2Nx8 *) 
(pOutput + outDataPitch1 * enable2Row * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * enable2Row * varLen); + IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * enable2Row * \ + 2 * (varLen - XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the second output depth, first row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch2 * enable2ndCh * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * enable2ndCh * varLen); + IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * enable2ndCh * \ + 2 * (varLen - XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the second output depth, second row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + (outDataPitch2 * enable2ndCh + \ + outDataPitch1 * enable2Row) * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * enable2ndCh * \ + enable2Row * varLen); + IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * enable2ndCh * \ + enable2Row * 2 * (varLen - XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + pOutput += 2 * outDataPitch2 * bytesPerPixel; + pCoeff += 2 * coeffPitch3; + } /* end of for (outCh = 0; outCh < numOutCh; outCh += 2)*/ + } /* end of for (y = 0; y < outH; y += 2)*/ + } /* end of for (x = 0; x < outW; x += vectorizationWidth)*/ + return(XAI_ERROR_STATUS()); +} + +/***************************************************************************** +* xaiConvolved(VQ)3D_S_3x3j2d1I8S8IX_MOW_WHD +* **************************************************************************/ + +/******************************************************************************/ +/* Description : P6 optimized implementation for 3x3 3D convolution with */ +/* stride = 2. Based on MORPH pre-processor specifiers, code */ +/* implementation is generated during preprocessing stage. 
This */ +/* method can be used to generate 3x3 3D dilated convolution */ +/* function and 3x3 3D VQ dilated convolution function for U8 */ +/* bit and S8 bit input data. */ +/* Inputs : Input Data Tile, Coeff Data Tile, Bias Array, */ +/* Output scale array, CNN convolution params structure */ +/* Outputs : XI Error Code */ +/* InOuts : Output Tile */ +/* Assumptions : CoeffData is S8 */ +/* biasArray is signed 32b, value not exceeding signed 24b */ +/* Output scale array is U16 */ +/* OutData is S8 / U8 / S16 */ +/* Kernel Size is 3x3xDxN */ +/* Input and Output are in WHD format */ +/* Coeff is in WHDN format */ +/******************************************************************************/ + +/******************* convolvedVQ3D_S_3x3j2d1_S8S8IX_MOW_WHD_INCHANNEL3 *****************/ +/******************* convolvedVQ3D_S_3x3j2d1_U8S8IX_MOW_WHD_INCHANNEL3 *****************/ +/******************* convolved3D_S_3x3j2d1_S8S8IX_MOW_WHD_INCHANNEL3 *****************/ +/******************* convolved3D_S_3x3j2d1_U8S8IX_MOW_WHD_INCHANNEL3 *****************/ +/* If number of input channel is 3 this function is called */ +/*****************************************************************************************/ +static _XAI_INLINE_ void MAKE_NAME(MAKE_NAME_VQ(convolved, 3D_S_3x3j2d1), S8IX_MOW_WHD_INCHANNEL3) \ + MAKE_ARGUMENTS(inTile, coeffTile, biasArray, outTile, param) +{ + /* Getting parameters from the tile structures */ + const int32_t outW = XAI_TILE3D_GET_DIM1(outTile); + const int32_t outH = XAI_TILE3D_GET_DIM2(outTile); + const int32_t numOutCh = XAI_TILE3D_GET_DIM3(outTile); + + /* Pitches of Coefficient Data (WHDN) in dim1 and dim3 */ + const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile); + + /* Pitches of Input Data (WHD) in dim1 and dim2 */ + const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile); + const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile); + + /* Pitch of Output Data (WHD) in dim1 and dim2 */ + const int32_t 
outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile); + const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile); + + /* Kernel Size (WHDN)*/ + const int32_t kSizeU = XAI_TILE4D_GET_DIM1(coeffTile); + + /* CNN convolution parameters */ + const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param); + const uint8_t outShiftU = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param); + const uint8_t enableReLu = XAI_CNN_CONV_GET_FLAG_RELU(param); + const uint8_t stride = XAI_CNN_CONV_GET_STRIDE(param); + + /* Data Pointers of input, output, coefficient and bias data */ + MORPH_IDT_SCALAR* pInData = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(inTile); + int8_t* pOutData = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile); + int32_t* pBiasData = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray); + int8_t* pCoeffData = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile); +#if DILATED_VQ_CONV == VQ_TRUE + uint16_t* restrict pOutScaleData = (uint16_t *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray); +#elif DILATED_VQ_CONV == VQ_FALSE + const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param); +#endif + /* Move pointer to the start of the data (including edge) */ + pInData = &pInData[-((kSizeU / 2) * inDataPitch1 + (kSizeU / 2))]; + + /* Setting the limits for output data according to ReLu Flag and outTileType */ + int32_t minLim, maxLim; + if (enableReLu) + { + minLim = XAI_CNN_CONV_GET_RELU_MIN(param); + maxLim = XAI_CNN_CONV_GET_RELU_MAX(param); + } + else + { + minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \ + SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0); + maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \ + : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX); + } + const int8_t typeFlag = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 
1 : 0; + const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile); + + MORPH_IDT_2Nx8* restrict pdvecIn1; + MORPH_IDT_2Nx8* restrict pdvecIn2; + MORPH_IDT_2Nx8* restrict pdvecIn3; + xb_vec2Nx8* restrict pdvecOut; + xb_vec2Nx8* restrict pdvecCoeff; + xb_vecN_2x32v* restrict phvecBias; + int32_t outCh, y, x; + + xb_vec2Nx8 dvecCoeffData1, dvecCoeffData2, dvecCoeffData3; + /* inCh = 1 */ + MORPH_IDT_2Nx8 dvecDataCh101, dvecDataCh102, dvecDataCh103; + MORPH_IDT_2Nx8 dvecDataCh111, dvecDataCh112, dvecDataCh113; + MORPH_IDT_2Nx8 dvecDataCh121, dvecDataCh122, dvecDataCh123; + /* inCh = 2 */ + MORPH_IDT_2Nx8 dvecDataCh201, dvecDataCh202, dvecDataCh203; + MORPH_IDT_2Nx8 dvecDataCh211, dvecDataCh212, dvecDataCh213; + MORPH_IDT_2Nx8 dvecDataCh221, dvecDataCh222, dvecDataCh223; + /* inCh = 3 */ + MORPH_IDT_2Nx8 dvecDataCh301, dvecDataCh302, dvecDataCh303; + MORPH_IDT_2Nx8 dvecDataCh311, dvecDataCh312, dvecDataCh313; + MORPH_IDT_2Nx8 dvecDataCh321, dvecDataCh322, dvecDataCh323; + + /* input vectors for inCh = 1 */ + xb_vec2Nx8 dvecInDataCh1_1, dvecInDataCh1_2, dvecInDataCh1_3, dvecInDataCh1_4, dvecInDataCh1_5; + /* input vectors for inCh = 2 */ + xb_vec2Nx8 dvecInDataCh2_1, dvecInDataCh2_2, dvecInDataCh2_3, dvecInDataCh2_4, dvecInDataCh2_5; + /* input vectors for inCh = 3 */ + xb_vec2Nx8 dvecInDataCh3_1, dvecInDataCh3_2, dvecInDataCh3_3, dvecInDataCh3_4, dvecInDataCh3_5; + + const int32_t vectorizationWidth = (((2 * XCHAL_IVPN_SIMD_WIDTH) - kSizeU) / stride) + 1; + + /* Generating the select pattern for coefficent loads. + * Pattern : 27, 28, 29, 30, .... 
+ */ + xb_vec2Nx8 dvecSeq = IVP_ADD2NX8(IVP_SEQ2NX8(), coeffPitch3); + + for (x = 0; x < outW; x += vectorizationWidth) /* Loop across output width */ + { + /* variable length for output stores */ + int32_t varLen = XT_MIN(vectorizationWidth, outW - x); + int32_t remX = bytesPerPixel * varLen; + + for (y = 0; y < outH; y += 2) + { + /* in order to handle odd output height */ + int32_t enable2outH = XT_SALT(y, outH - 1); + int32_t remLoad = inDataPitch1 * enable2outH; + + /* variables used for store */ + int32_t outVarLen = varLen * enable2outH; + int32_t outVarFlag = outVarLen * typeFlag; + int32_t outVarFlagx2 = outVarFlag * 2; + enable2outH = outDataPitch1 * enable2outH * bytesPerPixel; + + /* Initialize input and output data pointers */ + int8_t *pOutput = &pOutData[(y * outDataPitch1 + x) * bytesPerPixel]; + + /* input pointer */ + MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * y * stride + x * stride]; + + /* Load input data */ + /* InCh =1 */ + pdvecIn1 = (MORPH_IDT_2Nx8 *) (pInput); + /* load data from 1st input row */ + valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1); + IVP_LA2NX8_XP(dvecInDataCh1_1, vaInData, pdvecIn1, inDataPitch1); + + /* load data from 2nd input row */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1); + IVP_LA2NX8_XP(dvecInDataCh1_2, vaInData, pdvecIn1, inDataPitch1); + + /* load data from 3rd input row */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1); + IVP_LA2NX8_XP(dvecInDataCh1_3, vaInData, pdvecIn1, remLoad); + + /* load data from 4th input row */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1); + IVP_LA2NX8_XP(dvecInDataCh1_4, vaInData, pdvecIn1, remLoad); + + /* load data from 5th input row */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1); + IVP_LA2NX8_XP(dvecInDataCh1_5, vaInData, pdvecIn1, remLoad); + + /* 32 elements from 1st row and 32 elements from 2nd row are concatenated here + * If 1st input row is 0,1,2,3,4,5,6,7,8,9,...63, and the 2nd input row is + * 64,65,66,67.........126,127, Data should be arranged as + * + * dvecIn100 : 0, 2, 
4,...58,60,62,64,66,68,...122,124,126 + * dvecIn101 : 1, 3, 5,...59,61,63,65,67,69,...123,125,127 + * dvecIn102 : 2, 4, 6,...60,62,0 ,66,68,70,...124,126,0 + * + * + * Lower half of the vectors contain data from 1st input row and + * upper half of the vectors contain data from 3rd input row. + * + */ + /* Form 2 vectors from the 2 output height rows - row 1 and row3 */ + IVP_DSEL2NX8I(dvecDataCh102, dvecDataCh101, dvecInDataCh1_3, dvecInDataCh1_1, IVP_DSELI_8B_DEINTERLEAVE_1); + dvecDataCh103 = IVP_SEL2NX8I(dvecDataCh101, dvecDataCh101, IVP_SELI_8B_ROTATE_RIGHT_1); + + /* Form 2 vectors from the 2 output height rows - row 2 and row4 */ + IVP_DSEL2NX8I(dvecDataCh112, dvecDataCh111, dvecInDataCh1_4, dvecInDataCh1_2, IVP_DSELI_8B_DEINTERLEAVE_1); + dvecDataCh113 = IVP_SEL2NX8I(dvecDataCh111, dvecDataCh111, IVP_SELI_8B_ROTATE_RIGHT_1); + + /* Form 2 vectors from the 2 output height rows - row 3 and row5 */ + IVP_DSEL2NX8I(dvecDataCh122, dvecDataCh121, dvecInDataCh1_5, dvecInDataCh1_3, IVP_DSELI_8B_DEINTERLEAVE_1); + dvecDataCh123 = IVP_SEL2NX8I(dvecDataCh121, dvecDataCh121, IVP_SELI_8B_ROTATE_RIGHT_1); + + /* InCh = 2 */ + pdvecIn2 = (MORPH_IDT_2Nx8 *) (pInput + inDataPitch2); + /* load data from 1st input row */ + valign vaInData2 = MORPH_OP_PRIME_2Nx8(pdvecIn2); + IVP_LA2NX8_XP(dvecInDataCh2_1, vaInData2, pdvecIn2, inDataPitch1); + + /* load data from 2nd input row */ + vaInData2 = MORPH_OP_PRIME_2Nx8(pdvecIn2); + IVP_LA2NX8_XP(dvecInDataCh2_2, vaInData2, pdvecIn2, inDataPitch1); + + /* load data from 3rd input row */ + vaInData2 = MORPH_OP_PRIME_2Nx8(pdvecIn2); + IVP_LA2NX8_XP(dvecInDataCh2_3, vaInData2, pdvecIn2, remLoad); + + /* load data from 4th input row */ + vaInData2 = MORPH_OP_PRIME_2Nx8(pdvecIn2); + IVP_LA2NX8_XP(dvecInDataCh2_4, vaInData2, pdvecIn2, remLoad); + + /* load data from 5th input row */ + vaInData2 = MORPH_OP_PRIME_2Nx8(pdvecIn2); + IVP_LA2NX8_XP(dvecInDataCh2_5, vaInData2, pdvecIn2, remLoad); + + /* Form 2 vectors from the 2 output height rows 
- row 1 and row3 */ + IVP_DSEL2NX8I(dvecDataCh202, dvecDataCh201, dvecInDataCh2_3, dvecInDataCh2_1, IVP_DSELI_8B_DEINTERLEAVE_1); + dvecDataCh203 = IVP_SEL2NX8I(dvecDataCh201, dvecDataCh201, IVP_SELI_8B_ROTATE_RIGHT_1); + + /* Form 2 vectors from the 2 output height rows - row 2 and row4 */ + IVP_DSEL2NX8I(dvecDataCh212, dvecDataCh211, dvecInDataCh2_4, dvecInDataCh2_2, IVP_DSELI_8B_DEINTERLEAVE_1); + dvecDataCh213 = IVP_SEL2NX8I(dvecDataCh211, dvecDataCh211, IVP_SELI_8B_ROTATE_RIGHT_1); + + /* Form 2 vectors from the 2 output height rows - row 3 and row5 */ + IVP_DSEL2NX8I(dvecDataCh222, dvecDataCh221, dvecInDataCh2_5, dvecInDataCh2_3, IVP_DSELI_8B_DEINTERLEAVE_1); + dvecDataCh223 = IVP_SEL2NX8I(dvecDataCh221, dvecDataCh221, IVP_SELI_8B_ROTATE_RIGHT_1); + + /* InCh = 3 */ + pdvecIn3 = (MORPH_IDT_2Nx8 *) (pInput + 2 * inDataPitch2); + /* load data from 1st input row */ + valign vaInData3 = MORPH_OP_PRIME_2Nx8(pdvecIn3); + IVP_LA2NX8_XP(dvecInDataCh3_1, vaInData3, pdvecIn3, inDataPitch1); + + /* load data from 2nd input row */ + vaInData3 = MORPH_OP_PRIME_2Nx8(pdvecIn3); + IVP_LA2NX8_XP(dvecInDataCh3_2, vaInData3, pdvecIn3, inDataPitch1); + + /* load data from 3rd input row */ + vaInData3 = MORPH_OP_PRIME_2Nx8(pdvecIn3); + IVP_LA2NX8_XP(dvecInDataCh3_3, vaInData3, pdvecIn3, remLoad); + + /* load data from 4th input row */ + vaInData3 = MORPH_OP_PRIME_2Nx8(pdvecIn3); + IVP_LA2NX8_XP(dvecInDataCh3_4, vaInData3, pdvecIn3, remLoad); + + /* load data from 5th input row */ + vaInData3 = MORPH_OP_PRIME_2Nx8(pdvecIn3); + IVP_LA2NX8_XP(dvecInDataCh3_5, vaInData3, pdvecIn3, remLoad); + + /* Form 2 vectors from the 2 output height rows - row 1 and row3 */ + IVP_DSEL2NX8I(dvecDataCh302, dvecDataCh301, dvecInDataCh3_3, dvecInDataCh3_1, IVP_DSELI_8B_DEINTERLEAVE_1); + dvecDataCh303 = IVP_SEL2NX8I(dvecDataCh301, dvecDataCh301, IVP_SELI_8B_ROTATE_RIGHT_1); + + /* Form 2 vectors from the 2 output height rows - row 2 and row4 */ + IVP_DSEL2NX8I(dvecDataCh312, dvecDataCh311, 
dvecInDataCh3_4, dvecInDataCh3_2, IVP_DSELI_8B_DEINTERLEAVE_1); + dvecDataCh313 = IVP_SEL2NX8I(dvecDataCh311, dvecDataCh311, IVP_SELI_8B_ROTATE_RIGHT_1); + + /* Form 2 vectors from the 2 output height rows - row 3 and row5 */ + IVP_DSEL2NX8I(dvecDataCh322, dvecDataCh321, dvecInDataCh3_5, dvecInDataCh3_3, IVP_DSELI_8B_DEINTERLEAVE_1); + dvecDataCh323 = IVP_SEL2NX8I(dvecDataCh321, dvecDataCh321, IVP_SELI_8B_ROTATE_RIGHT_1); + + /* priming of coeff load is done outside the innermost loop*/ + pdvecCoeff = (xb_vec2Nx8 *) (pCoeffData); + valign vaCoeffData = IVP_LA2NX8_PP(pdvecCoeff); + + /* priming of bias load is done outside the innermost loop*/ + phvecBias = (xb_vecN_2x32v *) (pBiasData); + valign vaBias = IVP_LAN_2X32_PP(phvecBias); + + for (outCh = 0; outCh < numOutCh; outCh += 3) + { + /* In order to handle output depths*/ + int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1); + int32_t enable3rdCh = XT_SALT(outCh, numOutCh - 2); + int32_t out2Ch = outDataPitch2 * enable2ndCh * bytesPerPixel; + int32_t out3Ch = outDataPitch2 * enable3rdCh * bytesPerPixel * 2; + + /* Load the bias values corresponding to three output channels */ + xb_vecN_2x32v hvecBias; IVP_LAVN_2X32_XP(hvecBias, vaBias, phvecBias, 3 * 4); + xb_vecN_2x32v hvecBias1 = IVP_REPN_2X32(hvecBias, 0); + xb_vecN_2x32v hvecBias2 = IVP_REPN_2X32(hvecBias, 1); + xb_vecN_2x32v hvecBias3 = IVP_REPN_2X32(hvecBias, 2); + + /* load all the 3x3 coefficients for outChannel - 1 and outChannel - 2 */ + IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData, pdvecCoeff, 2 * coeffPitch3); + /* select 3x3 coefficients for outChannel - 2 */ + dvecCoeffData2 = IVP_SEL2NX8(dvecCoeffData1, dvecCoeffData1, dvecSeq); + /* load all the 3x3 coefficients for outChannel - 3*/ + IVP_LAV2NX8_XP(dvecCoeffData3, vaCoeffData, pdvecCoeff, coeffPitch3 * enable3rdCh); + + /* wide vectors(accumulators) initialized with bias */ + xb_vec2Nx24 dacc1 = IVP_CVT24UNX32L(hvecBias1, hvecBias1); + IVP_CVT24UNX32H(dacc1, hvecBias1, hvecBias1); + + xb_vec2Nx24 
dacc2 = IVP_CVT24UNX32L(hvecBias2, hvecBias2); + IVP_CVT24UNX32H(dacc2, hvecBias2, hvecBias2); + + xb_vec2Nx24 dacc3 = IVP_CVT24UNX32L(hvecBias3, hvecBias3); + IVP_CVT24UNX32H(dacc3, hvecBias3, hvecBias3); + + /* Values corresponding to first and second row are packed in one register + so that same coefficient will get multiplied to them */ + /* Multiply and accumulate 1st set of 4 coefficients for all the outputs */ + MORPH_OP_MULQA(dacc1, dvecDataCh111, dvecDataCh103, dvecDataCh102, dvecDataCh101, + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + + MORPH_OP_MULQA(dacc2, dvecDataCh111, dvecDataCh103, dvecDataCh102, dvecDataCh101, + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 0)); + + MORPH_OP_MULQA(dacc3, dvecDataCh111, dvecDataCh103, dvecDataCh102, dvecDataCh101, + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 0)); + + /* Multiply and accumulate 2nd set of 4 coefficients for all the outputs */ + MORPH_OP_MULQA(dacc1, dvecDataCh122, dvecDataCh121, dvecDataCh113, dvecDataCh112, + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + MORPH_OP_MULQA(dacc2, dvecDataCh122, dvecDataCh121, dvecDataCh113, dvecDataCh112, + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 1)); + + MORPH_OP_MULQA(dacc3, dvecDataCh122, dvecDataCh121, dvecDataCh113, dvecDataCh112, + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 1)); + + /* Multiply and accumulate 3rd set of 4 coefficients for all the outputs */ + MORPH_OP_MULQA(dacc1, dvecDataCh203, dvecDataCh202, dvecDataCh201, dvecDataCh123, + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 2)); + + MORPH_OP_MULQA(dacc2, dvecDataCh203, dvecDataCh202, dvecDataCh201, dvecDataCh123, + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 2)); + + 
MORPH_OP_MULQA(dacc3, dvecDataCh203, dvecDataCh202, dvecDataCh201, dvecDataCh123, + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 2)); + + /* Multiply and accumulate 4th set of 4 coefficients for all the outputs */ + MORPH_OP_MULQA(dacc1, dvecDataCh221, dvecDataCh213, dvecDataCh212, dvecDataCh211, + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 3)); + + MORPH_OP_MULQA(dacc2, dvecDataCh221, dvecDataCh213, dvecDataCh212, dvecDataCh211, + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 3)); + + MORPH_OP_MULQA(dacc3, dvecDataCh221, dvecDataCh213, dvecDataCh212, dvecDataCh211, + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 3)); + + /* Multiply and accumulate 5th set of 4 coefficients for all the outputs */ + MORPH_OP_MULQA(dacc1, dvecDataCh302, dvecDataCh301, dvecDataCh223, dvecDataCh222, + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 4)); + + MORPH_OP_MULQA(dacc2, dvecDataCh302, dvecDataCh301, dvecDataCh223, dvecDataCh222, + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 4)); + + MORPH_OP_MULQA(dacc3, dvecDataCh302, dvecDataCh301, dvecDataCh223, dvecDataCh222, + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 4)); + + /* Multiply and accumulate 6th set of 4 coefficients for all the outputs */ + MORPH_OP_MULQA(dacc1, dvecDataCh313, dvecDataCh312, dvecDataCh311, dvecDataCh303, + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 5)); + + MORPH_OP_MULQA(dacc2, dvecDataCh313, dvecDataCh312, dvecDataCh311, dvecDataCh303, + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 5)); + + MORPH_OP_MULQA(dacc3, dvecDataCh313, dvecDataCh312, dvecDataCh311, dvecDataCh303, + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 5)); + + /* Multiply and 
accumulate 7th set of 4 coefficients for all the outputs */ + MORPH_OP_MULQA(dacc1, 0, dvecDataCh323, dvecDataCh322, dvecDataCh321, + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 6)); + + MORPH_OP_MULQA(dacc2, 0, dvecDataCh323, dvecDataCh322, dvecDataCh321, + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 6)); + + MORPH_OP_MULQA(dacc3, 0, dvecDataCh323, dvecDataCh322, dvecDataCh321, + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 6)); + + /* Pack, Output Scale, Output Shift and clamping */ + xb_vec2Nx8 dvecOut1L, dvecOut1H; + xb_vec2Nx8 dvecOut2L, dvecOut2H; + xb_vec2Nx8 dvecOut3L, dvecOut3H; + +#if DILATED_VQ_CONV == VQ_TRUE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \ + pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \ + pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc3, packShiftAccU, \ + pOutScaleData[outCh + 2 * enable3rdCh], outShiftU, minLim, maxLim, typeFlag); +#elif DILATED_VQ_CONV == VQ_FALSE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc3, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); +#endif + + /* Storing the first output depth, first row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput); + valign vaOutData = IVP_ZALIGN(); + IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, remX); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the second output depth, first row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + out2Ch); + IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, remX 
* enable2ndCh); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the third output depth, first row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + out3Ch); + IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, remX * enable3rdCh); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the first output depth, second row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + enable2outH); + IVP_SAV2NX8_XP(IVP_SEL2NX8I(dvecOut1L, dvecOut1L, IVP_SELI_EXTRACT_HI_HALVES), + vaOutData, pdvecOut, (outVarLen - outVarFlag)); + IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, outVarFlagx2); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the second output depth, second row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + out2Ch + enable2outH); + IVP_SAV2NX8_XP(IVP_SEL2NX8I(dvecOut2L, dvecOut2L, IVP_SELI_EXTRACT_HI_HALVES), + vaOutData, pdvecOut, enable2ndCh * (outVarLen - outVarFlag)); + IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, enable2ndCh * outVarFlagx2); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the third output depth, second row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + out3Ch + enable2outH); + IVP_SAV2NX8_XP(IVP_SEL2NX8I(dvecOut3L, dvecOut3L, IVP_SELI_EXTRACT_HI_HALVES), + vaOutData, pdvecOut, enable3rdCh * (outVarLen - outVarFlag)); + IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, enable3rdCh * outVarFlagx2); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + pOutput += 3 * outDataPitch2 * bytesPerPixel; + } + } + } +} +/****************** xaiConvolvedVQ3D_S_3x3j2d1_S8S8IX_MOW_WHD ******************/ +/****************** xaiConvolvedVQ3D_S_3x3j2d1_U8S8IX_MOW_WHD ******************/ +/******************* xaiConvolved3D_S_3x3j2d1_S8S8IX_MOW_WHD *******************/ +/******************* xaiConvolved3D_S_3x3j2d1_U8S8IX_MOW_WHD *******************/ + +XAI_ERR_TYPE MAKE_NAME(MAKE_NAME_VQ(xaiConvolved, 3D_S_3x3j2d1), S8IX_MOW_WHD) MAKE_ARGUMENTS(inTile, coeffTile, biasArray, outTile, param) +{ + /* Error Checks: the kernel must be 3x3 with stride 2 and dilation 1, input/output in WHD and coefficients in WHDN order */ + XAI_ERROR_CHECKS() + { + MORPH_IDT_CHECK(inTile); +
XAI_CHECK_CONV_OUTPUT_TILE3D(outTile); + XAI_CHECK_TILE4D_S8(coeffTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile); + XAI_CHECK_TILE4D_IN_DRAM_BOUNDARY(coeffTile); + XAI_CHECK_POINTER(param); + XAI_CHECK_ARRAY_S32(biasArray); + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTile); + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(coeffTile, outTile); + XAI_CHECK_KERNEL_SIZE(coeffTile, 3); + XAI_CHECK_TILE3D_DATA_ORDER(inTile, XAI_WHD); + XAI_CHECK_TILE3D_DATA_ORDER(outTile, XAI_WHD); + XAI_CHECK_TILE4D_DATA_ORDER(coeffTile, XAI_WHDN); + XAI_CHECK_DILATION(param, 1); + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_DILATIONX(param) == XAI_CNN_CONV_GET_DILATIONY(param), \ + XAI_ERR_BADARG, "Dilation along width = %hhu and height = %hhu\nDilation along width and height should be equal", \ + XAI_CNN_CONV_GET_DILATIONX(param), XAI_CNN_CONV_GET_DILATIONY(param)); + XAI_CHECK_TILE3D_EDGE(inTile, 1); + XAI_CHECK_STRIDE(param, 2); + XAI_CHECK_ERROR((XAI_CNN_CONV_GET_STRIDEX(param) == XAI_CNN_CONV_GET_STRIDEY(param)), \ + XAI_ERR_BADARG, "\nStride along width = %hhu and height = %hhu\nStride along width and height should be equal", \ + XAI_CNN_CONV_GET_STRIDEX(param), XAI_CNN_CONV_GET_STRIDEY(param)); + XAI_CHECK_CONSISTENCY_MOW_WHD(inTile, coeffTile, biasArray, outTile, param); + XAI_CHECK_COEFFTILE_CONTIGUOUS(coeffTile, param); + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_ACCUM_SHIFT(param) < 24, \ + XAI_ERR_NORM, "\nThe accumulator shift = %hhu, value should be less than 24", \ + XAI_CNN_CONV_GET_ACCUM_SHIFT(param)); + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_OUTPUT_SHIFT(param) < 32, \ + XAI_ERR_NORM, "\nThe output shift = %hhu, value should be less than 32", \ + XAI_CNN_CONV_GET_OUTPUT_SHIFT(param)); + XAI_CHECK_CONV_RELU_LIMITS_IX(param, outTile); +#if DILATED_VQ_CONV == VQ_TRUE + XAI_CHECK_ARRAY_U16(outputScaleArray); + XAI_CHECK_ERROR(XAI_ARRAY_GET_WIDTH(outputScaleArray) >= XAI_TILE4D_GET_DIM4(coeffTile), \ + XAI_ERR_DATASIZE, "\nWidth of Output Scale Array = %d,
Number of Kernels = %d\nWidth of Output Scale Array should be greater than or equal to Number of Kernels", \ + XAI_ARRAY_GET_WIDTH(outputScaleArray), XAI_TILE4D_GET_DIM4(coeffTile)); + XAI_CHECK_ERROR((((uintptr_t) (XAI_ARRAY_GET_DATA_PTR(outputScaleArray)) & \ + 0x1) == 0), XAI_ERR_NORM, "The output scale array is not aligned to 2 byte boundary"); +#endif + } +#if DILATED_VQ_CONV == VQ_FALSE + /* Output scale of zero: skip the convolution and fill the output tile with the (ReLU-clamped) zero value */ if (XAI_CNN_CONV_GET_OUTPUT_SCALE(param) == 0) + { + int32_t fillValue; + int32_t reluFlag = XAI_CNN_CONV_GET_FLAG_RELU(param); + fillValue = reluFlag ? (CLAMP(0, XAI_CNN_CONV_GET_RELU_MIN(param), XAI_CNN_CONV_GET_RELU_MAX(param))) : 0; + return(xaiFillTile3D(outTile, fillValue, 0)); + } +#endif + /* Getting parameters from the tile structures */ + const int32_t outW = XAI_TILE3D_GET_DIM1(outTile); + const int32_t outH = XAI_TILE3D_GET_DIM2(outTile); + const int32_t numInCh = XAI_TILE3D_GET_DIM3(inTile); + const int32_t numOutCh = XAI_TILE3D_GET_DIM3(outTile); + + /* Pitches of Coefficient Data (WHDN) in dim2 and dim3 */ + const int32_t coeffPitch2 = XAI_TILE4D_GET_DIM2_PITCH(coeffTile); + const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile); + + /* Pitches of Input Data (WHD) in dim1 and dim2 */ + const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile); + const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile); + + /* Pitch of Output Data (WHD) in dim1 and dim2 */ + const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile); + const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile); + + /* Kernel Size (WHDN)*/ + const int32_t kSizeU = XAI_TILE4D_GET_DIM1(coeffTile); + + /* CNN convolution parameters */ + const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param); + const uint8_t outShiftU = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param); + const uint8_t enableReLu = XAI_CNN_CONV_GET_FLAG_RELU(param); + const uint8_t stride = XAI_CNN_CONV_GET_STRIDE(param); + + /* Exactly three input channels: delegate to the dedicated INCHANNEL3 variant */ if (XAI_TILE3D_GET_DIM3(inTile) == 3) + { + MAKE_NAME(MAKE_NAME_VQ(convolved,
3D_S_3x3j2d1), S8IX_MOW_WHD_INCHANNEL3) MAKE_PARAMS(inTile, coeffTile, biasArray, outTile, param); + return(XAI_ERROR_STATUS()); + } + + /* Data Pointers of input, output, coefficient and bias data */ + MORPH_IDT_SCALAR* pInData = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(inTile); + int8_t* pOutData = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile); + int32_t* pBiasData = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray); + int8_t* pCoeffData = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile); +#if DILATED_VQ_CONV == VQ_TRUE + uint16_t* restrict pOutScaleData = (uint16_t *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray); +#elif DILATED_VQ_CONV == VQ_FALSE + const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param); +#endif + /* Move pointer to the start of the data (including edge) */ + pInData = &pInData[-((kSizeU / 2) * inDataPitch1 + (kSizeU / 2))]; + + /* Setting the limits for output data according to ReLu Flag and outTileType */ + int32_t minLim, maxLim; + if (enableReLu) + { + minLim = XAI_CNN_CONV_GET_RELU_MIN(param); + maxLim = XAI_CNN_CONV_GET_RELU_MAX(param); + } + else + { + minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \ + SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0); + maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \ + : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX); + } + const int8_t typeFlag = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ?
1 : 0; + const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile); + + MORPH_IDT_2Nx8* restrict pdvecIn1; + + xb_vec2Nx8* restrict pdvecOut; + xb_vec2Nx8* restrict pdvecCoeff1; + xb_vec2Nx8* restrict pdvecCoeff2; + xb_vec2Nx8* restrict pdvecCoeff3; + + /* Variable Declarations */ + int32_t inCh, outCh, x, y; + /* In order to make the loop multiply-bound we are reducing the vectorization width + by extra values required for the kernel */ + const int32_t vectorizationWidth = (((2 * XCHAL_IVPN_SIMD_WIDTH) - kSizeU) / stride) + 1; + + /* Generating the shuffle pattern for coefficient loads. + The idea is to populate zero value where the MUL4T should not affect + Pattern : 0 1 2 32 3 4 5 32 6 7 8 32 X X X .... */ + xb_vec2Nx8 dvecIdx = IVP_SEL2NX8I(32, IVP_MOV2NX8_FROMNX16( \ + IVP_MOVNX16T(IVP_ADDNX16(IVP_ANDNX16(IVP_SEQNX16(), 3), + IVP_MULNX16PACKL(IVP_SRLINX16(IVP_SEQNX16(), 2), 3)), 32, + IVP_NEQNX16(IVP_ANDNX16(IVP_SEQNX16(), 3), 3))), + IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0); + + /* loop across output depth is unrolled by 3 + * , producing lanes from 3 output channels + * in one iteration. Since vectorization width + * is just half the width of the accumulator, + * loop across output height is also unrolled by 2. + * Unrolling across output height makes it possible + * to utilize all the 64 MACs in the accumulator. + * + * Data loaded from the 2 input rows is concatenated + * in such a manner that lower half of the output + * vector gives the first output row and the upper + * half of the output vector gives the next output row.
+ */ + for (x = 0; x < outW; x += vectorizationWidth) /* Loop across output width */ + { + for (y = 0; y < outH - 1; y += 2) /* Loop across output height */ + { + /* Initialize i/p and o/p data pointers */ + int8_t *pOutput = &pOutData[(y * outDataPitch1 + x) * bytesPerPixel]; + MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * y * stride + x * stride]; + + /* initialize coeff and Bias data pointer */ + int8_t *pCoeff = &pCoeffData[0]; + int32_t *pBias = &pBiasData[0]; + + for (outCh = 0; outCh < numOutCh; outCh += 3) /* Loop across Output depth */ + { + /* In order to handle output depths that are not a multiple of 3 */ + int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1); + int32_t enable3rdCh = XT_SALT(outCh, numOutCh - 2); + + /* Load the bias values corresponding to three output channels; the pointer only advances past a bias when the next channel is enabled */ + xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4 * enable2ndCh); + xb_vecN_2x32v hvecBias2; IVP_LSRN_2X32_XP(hvecBias2, pBias, 4 * enable3rdCh); + xb_vecN_2x32v hvecBias3; IVP_LSRN_2X32_XP(hvecBias3, pBias, 4); + + /* wide vectors(accumulators) initialized with bias */ + xb_vec2Nx24 dacc1, dacc2, dacc3; + dacc1 = IVP_CVT24UNX32L(hvecBias1, hvecBias1); + IVP_CVT24UNX32H(dacc1, hvecBias1, hvecBias1); + dacc2 = IVP_CVT24UNX32L(hvecBias2, hvecBias2); + IVP_CVT24UNX32H(dacc2, hvecBias2, hvecBias2); + dacc3 = IVP_CVT24UNX32L(hvecBias3, hvecBias3); + IVP_CVT24UNX32H(dacc3, hvecBias3, hvecBias3); + + /* priming of coeff load is done outside the innermost loop*/ + /* Coeff for 1st output channel */ + pdvecCoeff1 = (xb_vec2Nx8 *) (pCoeff); + valign vaCoeffData1; vaCoeffData1 = IVP_LA2NX8_PP(pdvecCoeff1); + /* Coeff for 2nd output channel */ + pdvecCoeff2 = (xb_vec2Nx8 *) (pCoeff + coeffPitch3 * enable2ndCh); + valign vaCoeffData2; vaCoeffData2 = IVP_LA2NX8_PP(pdvecCoeff2); + /* Coeff for 3rd output channel */ + pdvecCoeff3 = (xb_vec2Nx8 *) (pCoeff + 2 * coeffPitch3 * enable3rdCh); + valign vaCoeffData3; vaCoeffData3 = IVP_LA2NX8_PP(pdvecCoeff3); + + /* Input vector pointer initialization */ +
pdvecIn1 = (MORPH_IDT_2Nx8 *) (pInput); + + + for (inCh = 0; inCh < numInCh; inCh++) /* Loop across input channels */ + { + /* vectors for coeff and input loads */ + xb_vec2Nx8 dvecCoeffData1, dvecCoeffData2, dvecCoeffData3; + xb_vec2Nx8 dvecInData1, dvecInData2, dvecInData3, dvecInData4, dvecInData5; + MORPH_IDT_2Nx8 dvecData1, dvecData2, dvecData3; + + /* load data from 1st input row */ + valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1); + IVP_LA2NX8_XP(dvecInData1, vaInData, pdvecIn1, inDataPitch1); + + /* load data from 2nd input row */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1); + IVP_LA2NX8_XP(dvecInData2, vaInData, pdvecIn1, inDataPitch1); + + /* load data from 3rd input row */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1); + IVP_LA2NX8_XP(dvecInData3, vaInData, pdvecIn1, inDataPitch1); + + /* load data from 4th input row */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1); + IVP_LA2NX8_XP(dvecInData4, vaInData, pdvecIn1, inDataPitch1); + + /* load data from 5th input row */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1); + IVP_LA2NX8_XP(dvecInData5, vaInData, pdvecIn1, inDataPitch2 - (4 * inDataPitch1)); + + /* load all the 3x3 coefficients for 3 output depths*/ + IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch2); + IVP_LAV2NX8_XP(dvecCoeffData2, vaCoeffData2, pdvecCoeff2, coeffPitch2); + IVP_LAV2NX8_XP(dvecCoeffData3, vaCoeffData3, pdvecCoeff3, coeffPitch2); + + /* Rearrange them so that zero is inserted where the MULQ should not have effect */ + dvecCoeffData1 = IVP_SHFL2NX8(dvecCoeffData1, dvecIdx); + dvecCoeffData2 = IVP_SHFL2NX8(dvecCoeffData2, dvecIdx); + dvecCoeffData3 = IVP_SHFL2NX8(dvecCoeffData3, dvecIdx); + + /* 32 elements from 1st row and 32 elements from 2nd row are concatenated here + * If 1st input row is 0,1,2,3,4,5,6,7,8,9,...63, and the 2nd input row is + * 64,65,66,67.........126,127, Data should be arranged as + * + * dvecData1 : 0, 2, 4,...58,60,62,64,66,68,...122,124,126 + * dvecData2 : 1, 3,
5,...59,61,63,65,67,69,...123,125,127 + * dvecData3 : 2, 4, 6,...60,62,0 ,66,68,70,...124,126,0 + * + * + * Lower half of the vectors contain data from 1st input row and + * upper half of the vectors contain data from 2nd input row. + * + */ + + /* Form 2 vectors from the 2 output height rows - row 1 and row3 */ + IVP_DSEL2NX8I(dvecData2, dvecData1, dvecInData3, dvecInData1, IVP_DSELI_8B_DEINTERLEAVE_1); + dvecData3 = IVP_SEL2NX8I(dvecData1, dvecData1, IVP_SELI_8B_ROTATE_RIGHT_1); + + /* Values corresponding to first and second row are packed in one register + so that same coefficient will get multiplied to them */ + /* Multiply and accumulate 1st set of 3 coefficients for all the outputs */ + MORPH_OP_MULQA(dacc1, 0, dvecData3, dvecData2, dvecData1, + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MULQA(dacc2, 0, dvecData3, dvecData2, dvecData1, + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 0)); + MORPH_OP_MULQA(dacc3, 0, dvecData3, dvecData2, dvecData1, + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 0)); + + + /* Form 2 vectors from the 2 output height rows - row 2 and row4 */ + IVP_DSEL2NX8I(dvecData2, dvecData1, dvecInData4, dvecInData2, IVP_DSELI_8B_DEINTERLEAVE_1); + dvecData3 = IVP_SEL2NX8I(dvecData1, dvecData1, IVP_SELI_8B_ROTATE_RIGHT_1); + /* Multiply and accumulate 2nd set of 3 coefficients for all the outputs */ + MORPH_OP_MULQA(dacc1, 0, dvecData3, dvecData2, dvecData1, + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + MORPH_OP_MULQA(dacc2, 0, dvecData3, dvecData2, dvecData1, + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 1)); + MORPH_OP_MULQA(dacc3, 0, dvecData3, dvecData2, dvecData1, + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 1)); + + + /* Form 2 vectors from the 2 output height rows - row 3 and row5 */ +
IVP_DSEL2NX8I(dvecData2, dvecData1, dvecInData5, dvecInData3, IVP_DSELI_8B_DEINTERLEAVE_1); + dvecData3 = IVP_SEL2NX8I(dvecData1, dvecData1, IVP_SELI_8B_ROTATE_RIGHT_1); + /* Multiply and accumulate 3rd set of 3 coefficients for all the outputs */ + MORPH_OP_MULQA(dacc1, 0, dvecData3, dvecData2, dvecData1, + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 2)); + MORPH_OP_MULQA(dacc2, 0, dvecData3, dvecData2, dvecData1, + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 2)); + MORPH_OP_MULQA(dacc3, 0, dvecData3, dvecData2, dvecData1, + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 2)); + } /* end of for (inCh = 0; inCh < numInCh; inCh++) */ + + /* Pack, Output Scale, Output Shift and clamping */ + xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L; + xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H; +#if DILATED_VQ_CONV == VQ_TRUE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \ + pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \ + pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc3, packShiftAccU, \ + pOutScaleData[outCh + 2 * enable3rdCh], outShiftU, minLim, maxLim, typeFlag); +#elif DILATED_VQ_CONV == VQ_FALSE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc3, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); +#endif + /* variable length for output stores */ + int32_t varLen = XT_MIN(vectorizationWidth, outW - x); + + /* Storing the first output depth, first row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput);
+ valign vaOutData = IVP_ZALIGN(); + IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * varLen); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the second output depth, first row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch2 * enable2ndCh * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * enable2ndCh * varLen); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the third output depth, first row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + 2 * outDataPitch2 * enable3rdCh * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * enable3rdCh * varLen); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + + /* Storing the first output depth, second row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch1 * bytesPerPixel); + IVP_SAV2NX8_XP(IVP_SEL2NX8I(dvecOut1L, dvecOut1L, IVP_SELI_EXTRACT_HI_HALVES), + vaOutData, pdvecOut, (-typeFlag + 1) * varLen); + IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * 2 * varLen); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the second output depth, second row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + (outDataPitch2 * enable2ndCh + \ + outDataPitch1) * bytesPerPixel); + IVP_SAV2NX8_XP(IVP_SEL2NX8I(dvecOut2L, dvecOut2L, IVP_SELI_EXTRACT_HI_HALVES), + vaOutData, pdvecOut, enable2ndCh * (-typeFlag + 1) * varLen); + IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, enable2ndCh * typeFlag * 2 * varLen); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the third output depth, second row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + (2 * outDataPitch2 * enable3rdCh + \ + outDataPitch1) * bytesPerPixel); + IVP_SAV2NX8_XP(IVP_SEL2NX8I(dvecOut3L, dvecOut3L, IVP_SELI_EXTRACT_HI_HALVES), + vaOutData, pdvecOut, enable3rdCh * (-typeFlag + 1) * varLen); + IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, enable3rdCh * typeFlag * 2 * varLen); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + pOutput += 3 * outDataPitch2 * bytesPerPixel; + pCoeff += 3 *
coeffPitch3; + } /* end of for (outCh = 0; outCh < numOutCh; outCh += 3) */ + } /* end of for (y = 0; y < outH - 1; y += 2) */ + if (y < outH) + { + /* Initialize i/p and o/p data pointers */ + int8_t *pOutput = &pOutData[(y * outDataPitch1 + x) * bytesPerPixel]; + MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * y * stride + x * stride]; + + /* initialize coeff and Bias data pointer */ + int8_t *pCoeff = &pCoeffData[0]; + int32_t *pBias = &pBiasData[0]; + + for (outCh = 0; outCh < numOutCh; outCh += 3) /* Loop across Output depth */ + { + /* In order to handle output depths that are not a multiple of 3 */ + int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1); + int32_t enable3rdCh = XT_SALT(outCh, numOutCh - 2); + + /* Load the bias values corresponding to three output channels; the pointer only advances past a bias when the next channel is enabled */ + xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4 * enable2ndCh); + xb_vecN_2x32v hvecBias2; IVP_LSRN_2X32_XP(hvecBias2, pBias, 4 * enable3rdCh); + xb_vecN_2x32v hvecBias3; IVP_LSRN_2X32_XP(hvecBias3, pBias, 4); + + /* wide vectors(accumulators) initialized with bias */ + xb_vec2Nx24 dacc1, dacc2, dacc3; + dacc1 = IVP_CVT24UNX32L(hvecBias1, hvecBias1); + IVP_CVT24UNX32H(dacc1, hvecBias1, hvecBias1); + dacc2 = IVP_CVT24UNX32L(hvecBias2, hvecBias2); + IVP_CVT24UNX32H(dacc2, hvecBias2, hvecBias2); + dacc3 = IVP_CVT24UNX32L(hvecBias3, hvecBias3); + IVP_CVT24UNX32H(dacc3, hvecBias3, hvecBias3); + + /* priming of coeff load is done outside the innermost loop*/ + /* Coeff for 1st output channel */ + pdvecCoeff1 = (xb_vec2Nx8 *) (pCoeff); + valign vaCoeffData1; vaCoeffData1 = IVP_LA2NX8_PP(pdvecCoeff1); + /* Coeff for 2nd output channel */ + pdvecCoeff2 = (xb_vec2Nx8 *) (pCoeff + coeffPitch3 * enable2ndCh); + valign vaCoeffData2; vaCoeffData2 = IVP_LA2NX8_PP(pdvecCoeff2); + /* Coeff for 3rd output channel */ + pdvecCoeff3 = (xb_vec2Nx8 *) (pCoeff + 2 * coeffPitch3 * enable3rdCh); + valign vaCoeffData3; vaCoeffData3 = IVP_LA2NX8_PP(pdvecCoeff3); + + /* Input vector pointer initialization */ + pdvecIn1 =
(MORPH_IDT_2Nx8 *) (pInput); + + + for (inCh = 0; inCh < numInCh; inCh++) /* Loop across input channels */ + { + /* vectors for coeff and input loads */ + xb_vec2Nx8 dvecCoeffData1, dvecCoeffData2, dvecCoeffData3; + xb_vec2Nx8 dvecInData1, dvecInData2, dvecInData3; + MORPH_IDT_2Nx8 dvecData1, dvecData2, dvecData3; + + /* load data from 1st input row */ + valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1); + IVP_LA2NX8_XP(dvecInData1, vaInData, pdvecIn1, inDataPitch1); + + /* load data from 2nd input row */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1); + IVP_LA2NX8_XP(dvecInData2, vaInData, pdvecIn1, inDataPitch1); + + /* load data from 3rd input row */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1); + IVP_LA2NX8_XP(dvecInData3, vaInData, pdvecIn1, inDataPitch2 - 2 * inDataPitch1); + + /* load all the 3x3 coefficients for 3 output depths*/ + IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch2); + IVP_LAV2NX8_XP(dvecCoeffData2, vaCoeffData2, pdvecCoeff2, coeffPitch2); + IVP_LAV2NX8_XP(dvecCoeffData3, vaCoeffData3, pdvecCoeff3, coeffPitch2); + + /* Rearrange them so that zero is inserted where the MULQ should not have effect */ + dvecCoeffData1 = IVP_SHFL2NX8(dvecCoeffData1, dvecIdx); + dvecCoeffData2 = IVP_SHFL2NX8(dvecCoeffData2, dvecIdx); + dvecCoeffData3 = IVP_SHFL2NX8(dvecCoeffData3, dvecIdx); + + /* 32 elements from 1st row and 32 elements from 2nd row are concatenated here + * If 1st input row is 0,1,2,3,4,5,6,7,8,9,...63, and the 2nd input row is + * 64,65,66,67.........126,127, Data should be arranged as + * + * dvecData1 : 0, 2, 4,...58,60,62,64,66,68,...122,124,126 + * dvecData2 : 1, 3, 5,...59,61,63,65,67,69,...123,125,127 + * dvecData3 : 2, 4, 6,...60,62,0 ,66,68,70,...124,126,0 + * + * + */ + + /* Form 2 vectors from the 2 output height rows - row 1 and row3 */ + IVP_DSEL2NX8I(dvecData2, dvecData1, dvecInData3, dvecInData1, IVP_DSELI_8B_DEINTERLEAVE_1); + dvecData3 = IVP_SEL2NX8I(dvecData1, dvecData1, IVP_SELI_8B_ROTATE_RIGHT_1); + + /* Values
corresponding to first and second row are packed in one register + so that same coefficient will get multiplied to them */ + /* Multiply and accumulate 1st set of 3 coefficients for all the outputs */ + MORPH_OP_MULQA(dacc1, 0, dvecData3, dvecData2, dvecData1, + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MULQA(dacc2, 0, dvecData3, dvecData2, dvecData1, + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 0)); + MORPH_OP_MULQA(dacc3, 0, dvecData3, dvecData2, dvecData1, + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 0)); + + + /* Form 2 vectors from input row 2 only - the row 4 operand is replaced by zero in this single-output-row tail */ + IVP_DSEL2NX8I(dvecData2, dvecData1, 0, dvecInData2, IVP_DSELI_8B_DEINTERLEAVE_1); + dvecData3 = IVP_SEL2NX8I(dvecData1, dvecData1, IVP_SELI_8B_ROTATE_RIGHT_1); + /* Multiply and accumulate 2nd set of 3 coefficients for all the outputs */ + MORPH_OP_MULQA(dacc1, 0, dvecData3, dvecData2, dvecData1, + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + MORPH_OP_MULQA(dacc2, 0, dvecData3, dvecData2, dvecData1, + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 1)); + MORPH_OP_MULQA(dacc3, 0, dvecData3, dvecData2, dvecData1, + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 1)); + + + /* Form 2 vectors from input row 3 only - the row 5 operand is replaced by zero in this single-output-row tail */ + IVP_DSEL2NX8I(dvecData2, dvecData1, 0, dvecInData3, IVP_DSELI_8B_DEINTERLEAVE_1); + dvecData3 = IVP_SEL2NX8I(dvecData1, dvecData1, IVP_SELI_8B_ROTATE_RIGHT_1); + /* Multiply and accumulate 3rd set of 3 coefficients for all the outputs */ + MORPH_OP_MULQA(dacc1, 0, dvecData3, dvecData2, dvecData1, + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 2)); + MORPH_OP_MULQA(dacc2, 0, dvecData3, dvecData2, dvecData1, + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ +
IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 2)); + MORPH_OP_MULQA(dacc3, 0, dvecData3, dvecData2, dvecData1, + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 2)); + } /* end of for (inCh = 0; inCh < numInCh; inCh++) */ + + /* Pack, Output Scale, Output Shift and clamping */ + xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L; + xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H; +#if DILATED_VQ_CONV == VQ_TRUE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \ + pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \ + pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc3, packShiftAccU, \ + pOutScaleData[outCh + 2 * enable3rdCh], outShiftU, minLim, maxLim, typeFlag); +#elif DILATED_VQ_CONV == VQ_FALSE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc3, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); +#endif + /* variable length for output stores */ + int32_t varLen = XT_MIN(vectorizationWidth, outW - x); + + /* Storing the first output depth, first row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput); + valign vaOutData = IVP_ZALIGN(); + IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * varLen); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the second output depth, first row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch2 * enable2ndCh * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * enable2ndCh * varLen); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the third output depth, first row */ + pdvecOut = (xb_vec2Nx8 *)
(pOutput + 2 * outDataPitch2 * enable3rdCh * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * enable3rdCh * varLen); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + pOutput += 3 * outDataPitch2 * bytesPerPixel; + pCoeff += 3 * coeffPitch3; + } /* end of for (outCh = 0; outCh < numOutCh; outCh += 3) */ + } /* end of if(y < outH) */ + } /* end of for (x = 0; x < outW; x += vectorizationWidth) */ + return(XAI_ERROR_STATUS()); +} + +/****************************************************************************************** +* xaiConvolved(VQ)3D_S_3x3j4d1I8S8IX_MOW_WHD +* ***************************************************************************************/ + +/******************************************************************************/ +/* Description : P6 optimized generic implementation for 3x3 3D convolution. */ +/* Based on MORPH pre-processor specifiers, code implementation */ +/* is generated during preprocessing stage. This method can be */ +/* used to generate 3x3 3D dilated convolution function and 3x3 */ +/* 3D VQ dilated convolution function for U8 bit and S8 bit */ +/* input data with input stride equal to 4 */ +/* Inputs : Input Data Tile, Coeff Data Tile, Bias Array, */ +/* Output scale array, CNN convolution params structure */ +/* Outputs : XI Error Code */ +/* InOuts : Output Tile */ +/* Assumptions : CoeffData is S8 */ +/* biasArray is signed 32b, value not exceeding signed 24b */ +/* Output scale array is U16 */ +/* OutData is S8 / U8 / S16 */ +/* Kernel Size is 3x3xDxN */ +/* Input and Output are in WHD format */ +/* Coeff is in WHDN format */ +/******************************************************************************/ + +/****************** xaiConvolvedVQ3D_S_3x3j4d1_S8S8IX_MOW_WHD ******************/ +/****************** xaiConvolvedVQ3D_S_3x3j4d1_U8S8IX_MOW_WHD ******************/ +/******************* xaiConvolved3D_S_3x3j4d1_S8S8IX_MOW_WHD *******************/ +/*******************
xaiConvolved3D_S_3x3j4d1_U8S8IX_MOW_WHD *******************/ + +XAI_ERR_TYPE MAKE_NAME(MAKE_NAME_VQ(xaiConvolved, 3D_S_3x3j4d1), S8IX_MOW_WHD) MAKE_ARGUMENTS(inTile, coeffTile, biasArray, outTile, param) +{ + /* Error Checks: validate the tiles, bias array and params; this variant requires a 3x3 kernel, WHD / WHDN data order, dilation 1 and stride 4 */ + XAI_ERROR_CHECKS() + { + MORPH_IDT_CHECK(inTile); + XAI_CHECK_CONV_OUTPUT_TILE3D(outTile); + XAI_CHECK_TILE4D_S8(coeffTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile); + XAI_CHECK_TILE4D_IN_DRAM_BOUNDARY(coeffTile); + XAI_CHECK_POINTER(param); + XAI_CHECK_ARRAY_S32(biasArray); + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTile); + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(coeffTile, outTile); + XAI_CHECK_KERNEL_SIZE(coeffTile, 3); + XAI_CHECK_TILE3D_DATA_ORDER(inTile, XAI_WHD); + XAI_CHECK_TILE3D_DATA_ORDER(outTile, XAI_WHD); + XAI_CHECK_TILE4D_DATA_ORDER(coeffTile, XAI_WHDN); + XAI_CHECK_DILATION(param, 1); + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_DILATIONX(param) == XAI_CNN_CONV_GET_DILATIONY(param), \ + XAI_ERR_BADARG, "Dilation along width = %hhu and height = %hhu\nDilation along width and height should be equal", \ + XAI_CNN_CONV_GET_DILATIONX(param), XAI_CNN_CONV_GET_DILATIONY(param)); + XAI_CHECK_TILE3D_EDGE(inTile, 1); + XAI_CHECK_STRIDE(param, 4); + XAI_CHECK_ERROR((XAI_CNN_CONV_GET_STRIDEX(param) == XAI_CNN_CONV_GET_STRIDEY(param)), \ + XAI_ERR_BADARG, "\nStride along width = %hhu and height = %hhu\nStride along width and height should be equal", \ + XAI_CNN_CONV_GET_STRIDEX(param), XAI_CNN_CONV_GET_STRIDEY(param)); + XAI_CHECK_CONSISTENCY_MOW_WHD(inTile, coeffTile, biasArray, outTile, param); + XAI_CHECK_COEFFTILE_CONTIGUOUS(coeffTile, param); + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_ACCUM_SHIFT(param) < 24, \ + XAI_ERR_NORM, "\nThe accumulator shift = %hhu, value should be less than 24", \ + XAI_CNN_CONV_GET_ACCUM_SHIFT(param)); + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_OUTPUT_SHIFT(param) < 32, \ + XAI_ERR_NORM, "\nThe output shift = %hhu, value should be less than 32", \ + 
XAI_CNN_CONV_GET_OUTPUT_SHIFT(param)); + XAI_CHECK_CONV_RELU_LIMITS_IX(param, outTile); +#if DILATED_VQ_CONV == VQ_TRUE + XAI_CHECK_ARRAY_U16(outputScaleArray); + XAI_CHECK_ERROR(XAI_ARRAY_GET_WIDTH(outputScaleArray) >= XAI_TILE4D_GET_DIM4(coeffTile), \ + XAI_ERR_DATASIZE, "\nWidth of Output Scale Array = %d, Number of Kernels = %d\nWidth of Output Scale Array should be greater than or equal to Number of Kernels", \ + XAI_ARRAY_GET_WIDTH(outputScaleArray), XAI_TILE4D_GET_DIM4(coeffTile)); + XAI_CHECK_ERROR((((uintptr_t) (XAI_ARRAY_GET_DATA_PTR(outputScaleArray)) & \ + 0x1) == 0), XAI_ERR_NORM, "The output scale array is not aligned to 2 byte boundary"); +#endif + } +#if DILATED_VQ_CONV == VQ_FALSE + if (XAI_CNN_CONV_GET_OUTPUT_SCALE(param) == 0) /* zero output scale (non-VQ build only): fill the output tile with 0, clamped to the ReLU range when ReLU is enabled, and return early */ + { + int32_t fillValue; + int32_t reluFlag = XAI_CNN_CONV_GET_FLAG_RELU(param); + fillValue = reluFlag ? (CLAMP(0, XAI_CNN_CONV_GET_RELU_MIN(param), XAI_CNN_CONV_GET_RELU_MAX(param))) : 0; + return(xaiFillTile3D(outTile, fillValue, 0)); + } +#endif + + /* Getting parameters from the tile structures; inW is the dim1 size of the input tile plus both of its dim1 edges */ + const int32_t inW = XAI_TILE3D_GET_DIM1(inTile) + \ + XAI_TILE3D_GET_DIM1_EDGE1(inTile) + XAI_TILE3D_GET_DIM1_EDGE2(inTile); + const int32_t outW = XAI_TILE3D_GET_DIM1(outTile); + const int32_t outH = XAI_TILE3D_GET_DIM2(outTile); + const int32_t numInCh = XAI_TILE3D_GET_DIM3(inTile); + const int32_t numOutCh = XAI_TILE3D_GET_DIM3(outTile); + + /* Kernel Size (WHDN), read from dim1 of the coefficient tile */ + const int32_t kSizeU = XAI_TILE4D_GET_DIM1(coeffTile); + + /* CNN convolution parameters */ + const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param); + + const uint8_t outShiftU = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param); + const uint8_t enableReLu = XAI_CNN_CONV_GET_FLAG_RELU(param); + const uint8_t stride = XAI_CNN_CONV_GET_STRIDE(param); + + /* Data Pointers of input, output, coefficient and bias data */ + MORPH_IDT_SCALAR* pInData = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(inTile); + int8_t* pOutData = (int8_t *) 
XAI_TILE3D_GET_DATA_PTR(outTile); + int32_t* pBiasData = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray); + int8_t* pCoeffData = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile); +#if DILATED_VQ_CONV == VQ_TRUE + uint16_t* restrict pOutScaleData = (uint16_t *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray); +#elif DILATED_VQ_CONV == VQ_FALSE + const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param); +#endif + /* Pitches of Coefficient Data (WHDN) in dim2 and dim3 */ + const int32_t coeffPitch2 = XAI_TILE4D_GET_DIM2_PITCH(coeffTile); + const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile); + + /* Pitches of Input Data (WHD) in dim1 and dim2 */ + const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile); + const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile); + + /* Pitch of Output Data (WHD) in dim1 and dim2 */ + const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile); + const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile); + + /* Move pointer back by kSizeU / 2 rows and kSizeU / 2 columns, to the start of the data (including edge) */ + pInData = &pInData[-((kSizeU / 2) * inDataPitch1 + (kSizeU / 2))]; + + MORPH_IDT_2Nx8 *restrict pdvecInp1; + MORPH_IDT_2Nx8 *restrict pdvecInp2; + xb_vec2Nx8* restrict pdvecOut; + xb_vec2Nx8* restrict pdvecCoeff1; + xb_vec2Nx8* restrict pdvecCoeff2; + + /* accumulators for 2 output channels */ + xb_vec2Nx24 dacc1, dacc2; + + /* Variable Declarations */ + int32_t inCh, outCh, x, y; + const int32_t vectorizationWidth = XCHAL_IVPN_SIMD_WIDTH; + int32_t varLen; + + /* Setting the limits for output data according to ReLu Flag and outTileType: with ReLU the limits come from param, otherwise they are the full range of the output type (S16, S8, or 0 .. UCHAR_MAX) */ + int32_t minLim, maxLim; + if (enableReLu) + { + minLim = XAI_CNN_CONV_GET_RELU_MIN(param); + maxLim = XAI_CNN_CONV_GET_RELU_MAX(param); + } + else + { + minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \ + SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0); + maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \ + : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? 
SCHAR_MAX : UCHAR_MAX); + } + const int8_t typeFlag = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0; /* 1 only when the output tile is S16 */ + const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile); + + /* Generating the shuffle pattern for coefficient loads. + The idea is to populate zero value where the MULQ should not have any effect + Pattern : 0 1 2 32 3 4 5 32 6 7 8 32 X X X .... */ + xb_vec2Nx8 dvecIdx = IVP_SEL2NX8I(32, IVP_MOV2NX8_FROMNX16( \ + IVP_MOVNX16T(IVP_ADDNX16(IVP_ANDNX16(IVP_SEQNX16(), 3), + IVP_MULNX16PACKL(IVP_SRLINX16(IVP_SEQNX16(), 2), 3)), 32, + IVP_NEQNX16(IVP_ANDNX16(IVP_SEQNX16(), 3), 3))), + IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0); + + /* The loop across output depth is unrolled by 2, + * producing lanes from 2 output channels + * in one iteration. Since vectorization width + * is just half the width of the accumulator, + * loop across output height is also unrolled by 2. + * Unrolling across output height makes it possible + * to utilize all the 64 MACs in the accumulator. + * + * Data loaded from the 2 input rows is concatenated + * in such a manner that lower half of the output + * vector gives the first output row and the upper + * half of the output vector gives the next output row. 
+ */ + + /* Loop structure: output width (x) outermost, then output height (y), then output channels, then input channels */ + for (x = 0; x < outW; x += vectorizationWidth) /* Loop across output width */ + { + /* out of bound flag: scales the pointer advance of the first vector load of every input row below, so that advance is 0 when flag is 0 - NOTE(review): presumably XT_SALT(a, b) is 1 when a < b, i.e. flag is set only while more than 2 * XCHAL_IVPN_SIMD_WIDTH input elements remain from stride * x; confirm */ + int32_t flag = XT_SALT(2 * XCHAL_IVPN_SIMD_WIDTH, inW - stride * x); + + for (y = 0; y < outH; y += 2) /* Loop across output Height */ + { + /* To handle the odd number of output height: enable2ndRow multiplies every 2nd row offset and store count below */ + int32_t enable2ndRow = XT_SALT(y, outH - 1); + + /* Initialize i/p and o/p data pointers */ + int8_t *pOutput = &pOutData[(y * outDataPitch1 + x) * bytesPerPixel]; + MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * y * stride + x * stride]; + + /* initialize coeff and Bias data pointer */ + int8_t *pCoeff = &pCoeffData[0]; + int32_t *pBias = &pBiasData[0]; + + for (outCh = 0; outCh < numOutCh; outCh += 2) /* Loop across output channels */ + { + /* To handle cases where numOutCh is a non-multiple of 2: enable2ndCh multiplies every 2nd channel offset and store count below */ + int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1); + + /* Input data vectors to generate 2 rows of output */ + MORPH_IDT_2Nx8 dvecData1, dvecData2, dvecData3; + /* Input data vectors for 1st row of output */ + xb_vec2Nx8 dvecIn11, dvecIn12; + /* Input data vectors for 2nd row of output */ + xb_vec2Nx8 dvecIn21, dvecIn22; + /* vectors for coefficients */ + xb_vec2Nx8 dvecCoeffData1, dvecCoeffData2; + valign vaData, vaOutData; + + /* load and replicate bias data for each output channel; the first advance is 4 * enable2ndCh bytes, so it is 0 when there is no 2nd channel */ + xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4 * enable2ndCh); + xb_vecN_2x32v hvecBias2; IVP_LSRN_2X32_XP(hvecBias2, pBias, 4); + + /* Initialize all the accumulators with bias values */ + dacc1 = IVP_CVT24UNX32L(hvecBias1, hvecBias1); + IVP_CVT24UNX32H(dacc1, hvecBias1, hvecBias1); + dacc2 = IVP_CVT24UNX32L(hvecBias2, hvecBias2); + IVP_CVT24UNX32H(dacc2, hvecBias2, hvecBias2); + + /* priming of 1st channel coeffs load */ + pdvecCoeff1 = (xb_vec2Nx8 *) (pCoeff); + valign vaCoeffData1; vaCoeffData1 = IVP_LA2NX8_PP(pdvecCoeff1); + /* priming of 2nd channel coeffs load */ + pdvecCoeff2 = (xb_vec2Nx8 *) 
(pCoeff + coeffPitch3 * enable2ndCh); + valign vaCoeffData2; vaCoeffData2 = IVP_LA2NX8_PP(pdvecCoeff2); + + /* Starting location initialized for the input data; the 2nd pointer starts stride input rows below the 1st when a 2nd output row exists */ + pdvecInp1 = (MORPH_IDT_2Nx8 *) (pInput); + pdvecInp2 = (MORPH_IDT_2Nx8 *) (pInput + stride * inDataPitch1 * enable2ndRow); + + for (inCh = 0; inCh < numInCh; inCh++) /* Loop across input channels */ + { + /* Loading of coefficients for 2 output channels */ + IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch2); + IVP_LAV2NX8_XP(dvecCoeffData2, vaCoeffData2, pdvecCoeff2, coeffPitch2); + + /* Rearrange them so that zero is inserted where the MULQ should not have effect */ + dvecCoeffData1 = IVP_SHFL2NX8(dvecCoeffData1, dvecIdx); + dvecCoeffData2 = IVP_SHFL2NX8(dvecCoeffData2, dvecIdx); + + /* Loading first output row input data */ + vaData = MORPH_OP_PRIME_2Nx8(pdvecInp1); + MORPH_OP_LOAD_2Nx8(dvecIn11, vaData, pdvecInp1, 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + MORPH_OP_LOAD_2Nx8(dvecIn12, vaData, pdvecInp1, inDataPitch1 - (2 * XCHAL_IVPN_SIMD_WIDTH * flag)); + + /* Loading second output row input data */ + vaData = MORPH_OP_PRIME_2Nx8(pdvecInp2); + MORPH_OP_LOAD_2Nx8(dvecIn21, vaData, pdvecInp2, 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + MORPH_OP_LOAD_2Nx8(dvecIn22, vaData, pdvecInp2, inDataPitch1 - (2 * XCHAL_IVPN_SIMD_WIDTH * flag)); + + /* 32 elements from 1st row and 32 elements from 2nd row are concatenated here + * If 1st input row is 0,1,2,3,4,5,6,7,8,9,...127, and the 2nd input row is + * 128,129,130,131.........252,253,254,255, Data should be arranged as + * + * dvecData1 : 0, 4, 8,...124,128,132,136,...252 + * dvecData2 : 1, 5, 9,...125,129,133,137,...253 + * dvecData3 : 2, 6,10,...126,130,134,138,...254 + * + * Lower half of the vectors contain data from 1st input row and + * upper half of the vectors contain data from 2nd input row. 
+ * + */ + IVP_DSEL2NX8I(dvecIn12, dvecIn11, dvecIn12, dvecIn11, IVP_DSELI_8B_DEINTERLEAVE_1); + IVP_DSEL2NX8I(dvecIn22, dvecIn21, dvecIn22, dvecIn21, IVP_DSELI_8B_DEINTERLEAVE_1); + IVP_DSEL2NX8I(dvecData3, dvecData1, \ + dvecIn21, dvecIn11, IVP_DSELI_8B_DEINTERLEAVE_1); + dvecData2 = IVP_SEL2NX8I(dvecIn22, dvecIn12, IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0); + + /* Multiply and accumulate 1st set of 3 coefficients for all the outputs */ + MORPH_OP_MULQA(dacc1, 0, dvecData3, dvecData2, dvecData1, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MULQA(dacc2, 0, dvecData3, dvecData2, dvecData1, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 0)); + + /* Load second kernel row input data for 1st output row */ + vaData = MORPH_OP_PRIME_2Nx8(pdvecInp1); + MORPH_OP_LOAD_2Nx8(dvecIn11, vaData, pdvecInp1, 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + MORPH_OP_LOAD_2Nx8(dvecIn12, vaData, pdvecInp1, inDataPitch1 - (2 * XCHAL_IVPN_SIMD_WIDTH * flag)); + + /* Load second kernel row input data for 2nd output row */ + vaData = MORPH_OP_PRIME_2Nx8(pdvecInp2); + MORPH_OP_LOAD_2Nx8(dvecIn21, vaData, pdvecInp2, 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + MORPH_OP_LOAD_2Nx8(dvecIn22, vaData, pdvecInp2, inDataPitch1 - (2 * XCHAL_IVPN_SIMD_WIDTH * flag)); + + IVP_DSEL2NX8I(dvecIn12, dvecIn11, dvecIn12, dvecIn11, IVP_DSELI_8B_DEINTERLEAVE_1); + IVP_DSEL2NX8I(dvecIn22, dvecIn21, dvecIn22, dvecIn21, IVP_DSELI_8B_DEINTERLEAVE_1); + IVP_DSEL2NX8I(dvecData3, dvecData1, \ + dvecIn21, dvecIn11, IVP_DSELI_8B_DEINTERLEAVE_1); + dvecData2 = IVP_SEL2NX8I(dvecIn22, dvecIn12, IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0); + + /* Multiply and accumulate 2nd set of 3 coefficients for all the outputs */ + MORPH_OP_MULQA(dacc1, 0, dvecData3, dvecData2, dvecData1, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + MORPH_OP_MULQA(dacc2, 0, dvecData3, dvecData2, dvecData1, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 1)); 
+ + /* Load third kernel row input data for 1st output row; the advance values passed for the three rows sum to inDataPitch2 - NOTE(review): presumably this leaves the pointer at the same row of the next input channel; confirm that the load macro post-increments by its last argument */ + vaData = MORPH_OP_PRIME_2Nx8(pdvecInp1); + MORPH_OP_LOAD_2Nx8(dvecIn11, vaData, pdvecInp1, 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + MORPH_OP_LOAD_2Nx8(dvecIn12, vaData, pdvecInp1, inDataPitch2 - 2 * inDataPitch1 - \ + (2 * XCHAL_IVPN_SIMD_WIDTH * flag)); + + /* Load third kernel row input data for 2nd output row (same advance scheme as above) */ + vaData = MORPH_OP_PRIME_2Nx8(pdvecInp2); + MORPH_OP_LOAD_2Nx8(dvecIn21, vaData, pdvecInp2, 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + MORPH_OP_LOAD_2Nx8(dvecIn22, vaData, pdvecInp2, inDataPitch2 - 2 * inDataPitch1 - \ + (2 * XCHAL_IVPN_SIMD_WIDTH * flag)); + + IVP_DSEL2NX8I(dvecIn12, dvecIn11, dvecIn12, dvecIn11, IVP_DSELI_8B_DEINTERLEAVE_1); + IVP_DSEL2NX8I(dvecIn22, dvecIn21, dvecIn22, dvecIn21, IVP_DSELI_8B_DEINTERLEAVE_1); + IVP_DSEL2NX8I(dvecData3, dvecData1, \ + dvecIn21, dvecIn11, IVP_DSELI_8B_DEINTERLEAVE_1); + dvecData2 = IVP_SEL2NX8I(dvecIn22, dvecIn12, IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0); + + /* Multiply and accumulate 3rd set of 3 coefficients for all the outputs */ + MORPH_OP_MULQA(dacc1, 0, dvecData3, dvecData2, dvecData1, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 2)); + MORPH_OP_MULQA(dacc2, 0, dvecData3, dvecData2, dvecData1, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 2)); + } /* END for (inCh = 0; inCh < numInCh; inCh++) */ + + /* Pack, Output Scale, Output Shift and clamping; the VQ build reads a per-channel scale from pOutScaleData, the non-VQ build uses the single outScale */ + xb_vec2Nx8 dvecOut1L, dvecOut2L; + xb_vec2Nx8 dvecOut1H, dvecOut2H; +#if DILATED_VQ_CONV == VQ_TRUE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \ + pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \ + pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag); +#elif DILATED_VQ_CONV == VQ_FALSE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + 
PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); +#endif + /* variable store count: outputs remaining in this column block, capped at vectorizationWidth */ + varLen = XT_MIN(outW - x, vectorizationWidth); + + /* Store the first row , first output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOutput); + vaOutData = IVP_ZALIGN(); + IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * varLen); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Store the first row , 2nd output depth; the byte count is 0 when there is no 2nd channel */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch2 * enable2ndCh * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * varLen * enable2ndCh); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Get the 2nd output row elements which are in the upper half of output vectors */ + dvecOut1L = IVP_SEL2NX8I((xb_vec2Nx8) 0, dvecOut1L, IVP_SELI_8B_EXTRACT_HI_HALVES); + dvecOut2L = IVP_SEL2NX8I((xb_vec2Nx8) 0, dvecOut2L, IVP_SELI_8B_EXTRACT_HI_HALVES); + + /* Store the 2nd row , 1st output depth; the byte counts select dvecOut1L when typeFlag is 0 and dvecOut1H when typeFlag is 1 (S16 output), and both are 0 when there is no 2nd row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch1 * enable2ndRow * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, (-typeFlag + 1) * varLen * enable2ndRow); + IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * 2 * varLen * enable2ndRow); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Store the 2nd row outputs from 2nd channel; same selection as above, additionally gated by enable2ndCh */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + (outDataPitch2 * enable2ndCh + \ + outDataPitch1 * enable2ndRow) * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, (-typeFlag + 1) * \ + varLen * enable2ndCh * enable2ndRow); + IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * 2 * \ + varLen * enable2ndCh * enable2ndRow); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + pOutput += 2 * outDataPitch2 * bytesPerPixel; + pCoeff += 2 * coeffPitch3; + } /* END for (outCh = 0; outCh < numOutCh; outCh += 2) */ + } /* END for (y = 0; y < outH; y += 2) */ + } /* END for (x = 0; x < outW; x += vectorizationWidth ) */ 
+ return(XAI_ERROR_STATUS()); +} + +/***************************************************************************** +* xaiConvolved(VQ)3D_S_3x3j1d2I8S8IX_MOW_WHD +* **************************************************************************/ + +/********************************************************************************/ +/* Description : P6 optimized generic implementation for 3x3 3D convolution with*/ +/* dilation = 2. Based on MORPH pre-processor specifiers, code */ +/* implementation is generated during preprocessing stage. This */ +/* method can be used to generate 3x3 3D dilated convolution */ +/* function and 3x3 3D VQ dilated convolution function for U8 bit */ +/* and S8 bit input data with input stride equal to 1 */ +/* Inputs : Input Data Tile, Coeff Data Tile, Bias Array, */ +/* Output scale array, CNN convolution params structure */ +/* Outputs : XI Error Code */ +/* InOuts : Output Tile */ +/* Assumptions : CoeffData is S8 */ +/* biasArray is signed 32b, value not exceeding signed 24b */ +/* Output scale array is U16 */ +/* OutData is S8 / U8 / S16 */ +/* Kernel Size is 3x3xDxN */ +/* Input and Output are in WHD format */ +/* Coeff is in WHDN format */ +/********************************************************************************/ + +/****************** xaiConvolvedVQ3D_S_3x3j1d2_S8S8IX_MOW_WHD ******************/ +/****************** xaiConvolvedVQ3D_S_3x3j1d2_U8S8IX_MOW_WHD ******************/ +/******************* xaiConvolved3D_S_3x3j1d2_S8S8IX_MOW_WHD *******************/ +/******************* xaiConvolved3D_S_3x3j1d2_U8S8IX_MOW_WHD *******************/ + +XAI_ERR_TYPE MAKE_NAME(MAKE_NAME_VQ(xaiConvolved, 3D_S_3x3j1d2), S8IX_MOW_WHD) MAKE_ARGUMENTS(inTile, coeffTile, biasArray, outTile, param) +{ + /* Error Checks */ + XAI_ERROR_CHECKS() + { + MORPH_IDT_CHECK(inTile); + XAI_CHECK_CONV_OUTPUT_TILE3D(outTile); + XAI_CHECK_TILE4D_S8(coeffTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile); + 
XAI_CHECK_TILE4D_IN_DRAM_BOUNDARY(coeffTile); + XAI_CHECK_POINTER(param); + XAI_CHECK_ARRAY_S32(biasArray); + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTile); + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(coeffTile, outTile); + XAI_CHECK_KERNEL_SIZE(coeffTile, 3); + XAI_CHECK_TILE3D_DATA_ORDER(inTile, XAI_WHD); + XAI_CHECK_TILE3D_DATA_ORDER(outTile, XAI_WHD); + XAI_CHECK_TILE4D_DATA_ORDER(coeffTile, XAI_WHDN); + XAI_CHECK_TILE3D_EDGE(inTile, 2); + XAI_CHECK_STRIDE(param, 1); + XAI_CHECK_ERROR((XAI_CNN_CONV_GET_STRIDEX(param) == XAI_CNN_CONV_GET_STRIDEY(param)), \ + XAI_ERR_BADARG, "Stride along width = %hhu and height = %hhu\nStride along width and height should be equal", \ + XAI_CNN_CONV_GET_STRIDEX(param), XAI_CNN_CONV_GET_STRIDEY(param)); + XAI_CHECK_DILATION(param, 2); + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_DILATIONX(param) == XAI_CNN_CONV_GET_DILATIONY(param), \ + XAI_ERR_BADARG, "\nDilation along width = %hhu and height = %hhu\nDilation along width and height should be equal", \ + XAI_CNN_CONV_GET_DILATIONX(param), XAI_CNN_CONV_GET_DILATIONY(param)); + XAI_CHECK_CONSISTENCY_MOW_WHD(inTile, coeffTile, biasArray, outTile, param); + XAI_CHECK_COEFFTILE_CONTIGUOUS(coeffTile, param); + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_ACCUM_SHIFT(param) < 24, \ + XAI_ERR_NORM, "\nThe accumulator shift = %hhu, value should be less than 24", \ + XAI_CNN_CONV_GET_ACCUM_SHIFT(param)); + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_OUTPUT_SHIFT(param) < 32, \ + XAI_ERR_NORM, "\nThe output shift = %hhu, value should be less than 32", \ + XAI_CNN_CONV_GET_OUTPUT_SHIFT(param)); + XAI_CHECK_CONV_RELU_LIMITS_IX(param, outTile); +#if DILATED_VQ_CONV == VQ_TRUE + XAI_CHECK_ARRAY_U16(outputScaleArray); + XAI_CHECK_ERROR(XAI_ARRAY_GET_WIDTH(outputScaleArray) >= XAI_TILE4D_GET_DIM4(coeffTile), \ + XAI_ERR_DATASIZE, "\nWidth of Output Scale Array = %d, Number of Kernels = %d\nWidth of Output Scale Array should be greater than or equal to Number of Kernels", \ + XAI_ARRAY_GET_WIDTH(outputScaleArray), 
XAI_TILE4D_GET_DIM4(coeffTile)); + XAI_CHECK_ERROR((((uintptr_t) (XAI_ARRAY_GET_DATA_PTR(outputScaleArray)) & \ + 0x1) == 0), XAI_ERR_NORM, "The output scale array is not aligned to 2 byte boundary"); +#endif + } +#if DILATED_VQ_CONV == VQ_FALSE + if (XAI_CNN_CONV_GET_OUTPUT_SCALE(param) == 0) + { + int32_t fillValue; + int32_t reluFlag = XAI_CNN_CONV_GET_FLAG_RELU(param); + fillValue = reluFlag ? (CLAMP(0, XAI_CNN_CONV_GET_RELU_MIN(param), XAI_CNN_CONV_GET_RELU_MAX(param))) : 0; + return(xaiFillTile3D(outTile, fillValue, 0)); + } +#endif + /* Getting parameters from the tile structures */ + const int32_t outW = XAI_TILE3D_GET_DIM1(outTile); + const int32_t outH = XAI_TILE3D_GET_DIM2(outTile); + const int32_t numInCh = XAI_TILE3D_GET_DIM3(inTile); + const int32_t numOutCh = XAI_TILE3D_GET_DIM3(outTile); + + /* Pitches of Coefficient Data (WHDN) in dim1 and dim3 */ + const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile); + + /* Pitches of Input Data (WHD) in dim1 and dim2 */ + const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile); + const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile); + + /* Pitch of Output Data (WHD) in dim1 and dim2 */ + const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile); + const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile); + + /* Kernel Size (WHDN)*/ + const int32_t kSizeU = XAI_TILE4D_GET_DIM1(coeffTile); + + /* CNN convolution parameters */ + const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param); + const uint8_t outShiftU = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param); + const uint8_t enableReLu = XAI_CNN_CONV_GET_FLAG_RELU(param); + const uint8_t dilationU = XAI_CNN_CONV_GET_DILATION(param); + + /* Data Pointers of input, output, coefficient and bias data */ + MORPH_IDT_SCALAR* pInData = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(inTile); + int8_t* pOutData = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile); + int32_t* pBiasData = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray); + 
int8_t* pCoeffData = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile); +#if DILATED_VQ_CONV == VQ_TRUE + uint16_t* restrict pOutScaleData = (uint16_t *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray); +#elif DILATED_VQ_CONV == VQ_FALSE + const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param); +#endif + /* Since the dilation value > 1 , */ + /* Effective Kernel size = dilation(KernelSize - 1) + 1 */ + /* Effective kernel size is used for calculating the min required edge */ + int32_t dilatedKSizeU = dilationU * (kSizeU - 1) + 1; + + /* Move pointer to the start of the data (including edge) */ + pInData = &pInData[-((dilatedKSizeU / 2) * inDataPitch1 + (dilatedKSizeU / 2))]; + + /* Setting the limits for output data according to ReLu Flag and outTileType */ + int32_t minLim, maxLim; + if (enableReLu) + { + minLim = XAI_CNN_CONV_GET_RELU_MIN(param); + maxLim = XAI_CNN_CONV_GET_RELU_MAX(param); + } + else + { + minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \ + SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0); + maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \ + : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX); + } + const int8_t typeFlag = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0; + const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile); + + MORPH_IDT_2Nx8 *restrict pdvecIn; + xb_vec2Nx8* restrict pdvecOut; + xb_vec2Nx8* restrict pdvecCoeff1; + xb_vec2Nx8* restrict pdvecCoeff2; + + /* Variable Declarations */ + int32_t inCh, outCh, x, y; + + + /* In order to make the loop multiply-bound we are reducing the vectorization width + by extra values required for the kernel */ + const int32_t vectorizationWidth = 4 * XCHAL_IVPN_SIMD_WIDTH - dilatedKSizeU + 1; + + /* Generating the shuffle pattern for coefficient loads. + The idea is to populate zero value where the MUL4T should not affect + Pattern : 0 1 2 32 3 4 5 32 6 7 8 32 X X X .... 
*/ + xb_vec2Nx8 dvecIdx = IVP_SEL2NX8I(32, IVP_MOV2NX8_FROMNX16( \ + IVP_MOVNX16T(IVP_ADDNX16(IVP_ANDNX16(IVP_SEQNX16(), 3), + IVP_MULNX16PACKL(IVP_SRLINX16(IVP_SEQNX16(), 2), 3)), 32, + IVP_NEQNX16(IVP_ANDNX16(IVP_SEQNX16(), 3), 3))), + IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0); + + /* Generating two select interleave pattern to apply on accumulator values just before storing + * For 8 bit output + * Pattern1 = 0 64 1 65 2 66 .... 31 95 + * Pattern2 = 32 96 33 97 34 98 ... 63 127 + * For 16 bit output + * Pattern1 = 0 1 64 65 2 3 66 67 .... 30 31 94 95 + * Pattern2 = 32 33 96 97 34 35 98 99 ... 62 63 126 127 + */ + /* 0 1 2 3 .. 62 63*/ + xb_vec2Nx8 dvecPattern1 = IVP_SEQ2NX8(); + /* 65 66 67 ...126 127*/ + xb_vec2Nx8 dvecPattern2 = IVP_ADD2NX8(dvecPattern1, 2 * XCHAL_IVPN_SIMD_WIDTH); + + if (!typeFlag) + { + MORPH_OP_DSELI(dvecPattern2, dvecPattern1, \ + dvecPattern2, dvecPattern1, \ + IVP_DSELI_8B_INTERLEAVE_1); + } + else + { + MORPH_OP_DSELI(dvecPattern2, dvecPattern1, \ + dvecPattern2, dvecPattern1, \ + IVP_DSELI_INTERLEAVE_1); + } + + + /* 4 * XCHAL_IVPN_SIMD_WIDTH bytes of input are loaded at a time + * into two vectors. Also loop across output channels is unrolled twice, + * thereby producing four output vectors in 1 iteration + */ + for (x = 0; x < outW; x += vectorizationWidth) /* Loop across output width */ + { + /* variable length for output stores */ + int32_t remX = XT_MIN(vectorizationWidth, outW - x); + + /* If (remX + kSizeEffU - 1) <= 2 * XCHAL_IVPN_SIMD_WIDTH, + * i.e. if the number of input data bytes corresponding to remX number of outputs + * is less than or equal to 2 * XCHAL_IVPN_SIMD_WIDTH, there is no need to load + * the next 64 input bytes*/ + int32_t remXLoad = ((remX + dilatedKSizeU - 1) > 2 * XCHAL_IVPN_SIMD_WIDTH) ? 
1 : 0; + + for (y = 0; y < outH; y++) /* Loop across output height */ + { + /* initialize output data pointer */ + int8_t *pOutput = &pOutData[(y * outDataPitch1 + x) * bytesPerPixel]; + + /* initialize input data pointer */ + MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * y + x]; + + /* initialize coeff and Bias data pointer */ + int8_t *pCoeff = &pCoeffData[0]; + int32_t *pBias = &pBiasData[0]; + + for (outCh = 0; outCh < numOutCh; outCh += 2) /* Loop across Output depth */ + { + /* In order to handle odd output depths */ + int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1); + + /* Load the bias values corresponding to two output channels */ + xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4 * enable2ndCh); + xb_vecN_2x32v hvecBias2; IVP_LSRN_2X32_XP(hvecBias2, pBias, 4); + + /* wide vectors(accumulators) initialized with bias */ + xb_vec2Nx24 dacc11, dacc12, dacc21, dacc22; + + dacc12 = dacc11 = IVP_CVT24UNX32L(hvecBias1, hvecBias1); + IVP_CVT24UNX32H(dacc11, hvecBias1, hvecBias1); + IVP_CVT24UNX32H(dacc12, hvecBias1, hvecBias1); + + dacc22 = dacc21 = IVP_CVT24UNX32L(hvecBias2, hvecBias2); + IVP_CVT24UNX32H(dacc21, hvecBias2, hvecBias2); + IVP_CVT24UNX32H(dacc22, hvecBias2, hvecBias2); + + /* priming of coeff load is done outside the innermost loop*/ + pdvecCoeff1 = (xb_vec2Nx8 *) (pCoeff); + valign vaCoeffData1; vaCoeffData1 = IVP_LA2NX8_PP(pdvecCoeff1); + + pdvecCoeff2 = (xb_vec2Nx8 *) (pCoeff + coeffPitch3 * enable2ndCh); + valign vaCoeffData2; vaCoeffData2 = IVP_LA2NX8_PP(pdvecCoeff2); + + /* Input vector pointer initialization */ + pdvecIn = (MORPH_IDT_2Nx8 *) (pInput); + + /* Load 128 bytes from row corresponding to each ky + * dvecInData11 = a0 a1 a2 a3.... a63 + * dvecInData12 = a64 a65 a66 .... a127 + * + * Separate odd and even indices + * dvecInData11 = a0 a2 a4 a6.... a126 + * dvecInData12 = a1 a3 a5 a7.... a127 + * + * Let the coefficients be + * C0 C1 C2 + * C3 C4 C5 + * C6 C7 C8 + * + * acc11 = [a0 a2 a4 a6.... 
a126] * C0 + + * [a2 a4 a6.... a126 X ] * C1 + + * [a4 a6.... a126 X X ] * C2 + * + * acc12 = [a1 a3 a5 a7.... a127] * C0 + + * [a3 a5 a7.... a127 X ] * C1 + + * [a5 a7.... a127 X X ] * C2 + * Continue the same multiplication steps for ky = 1 [C3 C4 C5] and ky =2 [C6 C7 C8]. + * acc11 and acc12 contains convolved output corresponding to even and odd indices + * respectively at the end of inchannel loop iterations. + * + * acc11 and acc12 are interleaved to obtain the outputs in correct order. + * + */ + for (inCh = 0; inCh < numInCh; inCh++) /* Loop across input channels */ + { + /* vectors for coeff and input loads */ + xb_vec2Nx8 dvecCoeffData1, dvecCoeffData2; + MORPH_IDT_2Nx8 dvecInData11, dvecInData12; + MORPH_IDT_2Nx8 dvecInData21, dvecInData22; + MORPH_IDT_2Nx8 dvecInData31, dvecInData32; + + /* load data 128 bytes from first input row */ + valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn); + MORPH_OP_LOAD_2Nx8(dvecInData11, vaInData, pdvecIn, remXLoad * 2 * XCHAL_IVPN_SIMD_WIDTH); + MORPH_OP_LOAD_2Nx8(dvecInData12, vaInData, pdvecIn, \ + dilationU * inDataPitch1 - remXLoad * 2 * XCHAL_IVPN_SIMD_WIDTH); + /* Separate odd and even indices */ + MORPH_OP_DSELI(dvecInData12, dvecInData11, dvecInData12, dvecInData11, \ + IVP_DSELI_8B_DEINTERLEAVE_1); + + /* load data 128 bytes from 2nd input row */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn); + MORPH_OP_LOAD_2Nx8(dvecInData21, vaInData, pdvecIn, remXLoad * 2 * XCHAL_IVPN_SIMD_WIDTH); + MORPH_OP_LOAD_2Nx8(dvecInData22, vaInData, pdvecIn, \ + dilationU * inDataPitch1 - remXLoad * 2 * XCHAL_IVPN_SIMD_WIDTH); + /* Separate odd and even indices */ + MORPH_OP_DSELI(dvecInData22, dvecInData21, dvecInData22, dvecInData21, \ + IVP_DSELI_8B_DEINTERLEAVE_1); + + /* load data 128 bytes from 3rd input row */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn); + MORPH_OP_LOAD_2Nx8(dvecInData31, vaInData, pdvecIn, remXLoad * 2 * XCHAL_IVPN_SIMD_WIDTH); + MORPH_OP_LOAD_2Nx8(dvecInData32, vaInData, pdvecIn, inDataPitch2 - \ + dilationU * 2 * 
inDataPitch1 - remXLoad * 2 * XCHAL_IVPN_SIMD_WIDTH); + /* Separate odd and even indices */ + MORPH_OP_DSELI(dvecInData32, dvecInData31, dvecInData32, dvecInData31, \ + IVP_DSELI_8B_DEINTERLEAVE_1); + + /* load all the 3x3 coefficients for 2 output depths*/ + IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, 9); + IVP_LAV2NX8_XP(dvecCoeffData2, vaCoeffData2, pdvecCoeff2, 9); + + /* Rearrange them so that zero is inserted where the MUL4T should not have effect */ + dvecCoeffData1 = IVP_SHFL2NX8(dvecCoeffData1, dvecIdx); + dvecCoeffData2 = IVP_SHFL2NX8(dvecCoeffData2, dvecIdx); + + /* Multiply and accumulate 1st set of 3 coefficients for all the outputs */ + MORPH_OP_MUL4TA(dacc11, dvecInData11, dvecInData11, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc12, dvecInData12, dvecInData12, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc21, dvecInData11, dvecInData11, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 0)); + MORPH_OP_MUL4TA(dacc22, dvecInData12, dvecInData12, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 0)); + + /* Multiply and accumulate 2nd set of 3 coefficients for all the outputs */ + MORPH_OP_MUL4TA(dacc11, dvecInData21, dvecInData21, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + MORPH_OP_MUL4TA(dacc12, dvecInData22, dvecInData22, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + MORPH_OP_MUL4TA(dacc21, dvecInData21, dvecInData21, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 1)); + MORPH_OP_MUL4TA(dacc22, dvecInData22, dvecInData22, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 1)); + + /* Multiply and accumulate 3rd set of 3 coefficients for all the outputs */ + MORPH_OP_MUL4TA(dacc11, dvecInData31, 
dvecInData31, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 2)); + MORPH_OP_MUL4TA(dacc12, dvecInData32, dvecInData32, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 2)); + MORPH_OP_MUL4TA(dacc21, dvecInData31, dvecInData31, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 2)); + MORPH_OP_MUL4TA(dacc22, dvecInData32, dvecInData32, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 2)); + } /* end of for (inCh = 0; inCh < numInCh; inCh++)*/ + + /* Pack, Output Scale, Output Shift and clamping */ + xb_vec2Nx8 dvec1L, dvec2L, dvec3L, dvec4L; + xb_vec2Nx8 dvec1H, dvec2H, dvec3H, dvec4H; +#if DILATED_VQ_CONV == VQ_TRUE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvec1L, dvec1H, dacc11, packShiftAccU, \ + pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvec2L, dvec2H, dacc12, packShiftAccU, \ + pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvec3L, dvec3H, dacc21, packShiftAccU, \ + pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvec4L, dvec4H, dacc22, packShiftAccU, \ + pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag); +#elif DILATED_VQ_CONV == VQ_FALSE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvec1L, dvec1H, dacc11, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvec2L, dvec2H, dacc12, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvec3L, dvec3H, dacc21, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvec4L, dvec4H, dacc22, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); +#endif + /* Interleave odd and even indices */ + xb_vec2Nx8 dvecOut1L = MORPH_OP_SEL(dvec2L, dvec1L, dvecPattern1); + xb_vec2Nx8 dvecOut2L = 
MORPH_OP_SEL(dvec2L, dvec1L, dvecPattern2); + xb_vec2Nx8 dvecOut1H = MORPH_OP_SEL(dvec2H, dvec1H, dvecPattern1); + xb_vec2Nx8 dvecOut2H = MORPH_OP_SEL(dvec2H, dvec1H, dvecPattern2); + xb_vec2Nx8 dvecOut3L = MORPH_OP_SEL(dvec4L, dvec3L, dvecPattern1); + xb_vec2Nx8 dvecOut4L = MORPH_OP_SEL(dvec4L, dvec3L, dvecPattern2); + xb_vec2Nx8 dvecOut3H = MORPH_OP_SEL(dvec4H, dvec3H, dvecPattern1); + xb_vec2Nx8 dvecOut4H = MORPH_OP_SEL(dvec4H, dvec3H, dvecPattern2); + + /* Storing the first output depth, first row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput); + valign vaOutData = IVP_ZALIGN(); + IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * remX); + IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * remX - \ + 2 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * \ + (2 * remX - 4 * XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * \ + (2 * remX - 6 * XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the second output depth, first row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch2 * enable2ndCh * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * remX * enable2ndCh); + IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, \ + ((bytesPerPixel * remX) - 2 * XCHAL_IVPN_SIMD_WIDTH) * enable2ndCh); + IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * \ + (2 * remX - 4 * XCHAL_IVPN_SIMD_WIDTH) * enable2ndCh); + IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * \ + (2 * remX - 6 * XCHAL_IVPN_SIMD_WIDTH) * enable2ndCh); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + pOutput += 2 * outDataPitch2 * bytesPerPixel; + pCoeff += 2 * coeffPitch3; + } /* end of for (outCh = 0; outCh < numOutCh; outCh += 2)*/ + } /* end of for (y = 0; y < outH; y ++)*/ + } /* end of for (x = 0; x < outW; x += vectorizationWidth)*/ + + + return(XAI_ERROR_STATUS()); +} + 
+/***************************************************************************** +* xaiConvolved(VQ)3D_S_3x3j1d4I8S8IX_MOW_WHD +* **************************************************************************/ + +/********************************************************************************/ +/* Description : P6 optimized generic implementation for 3x3 3D convolution with*/ +/* dilation = 4. Based on MORPH pre-processor specifiers, code */ +/* implementation is generated during preprocessing stage. This */ +/* method can be used to generate 3x3 3D convolution dilated */ +/* function and 3x3 3D VQ convolution dilated function for U8 bit */ +/* and S8 bit input data with input stride equal to 1 */ +/* Inputs : Input Data Tile, Coeff Data Tile, Bias Array, */ +/* Output scale array, CNN convolution params structure */ +/* Outputs : XAI error code (the XAI_ERR_TYPE return value) */ +/* InOuts : Output Tile */ +/* Assumptions : CoeffData is S8 */ +/* biasArray is signed 32b, value not exceeding signed 24b */ +/* Output scale array is U16 */ +/* OutData is S8 / U8 / S16 */ +/* Kernel Size is 3x3xDxN; stride 1 and dilation 4 are checked */ +/* Input and Output are in WHD format; input edge checked vs 4 */ +/* Coeff is in WHDN format */ +/********************************************************************************/ + +/****************** xaiConvolvedVQ3D_S_3x3j1d4_S8S8IX_MOW_WHD ******************/ +/****************** xaiConvolvedVQ3D_S_3x3j1d4_U8S8IX_MOW_WHD ******************/ +/******************* xaiConvolved3D_S_3x3j1d4_S8S8IX_MOW_WHD *******************/ +/******************* xaiConvolved3D_S_3x3j1d4_U8S8IX_MOW_WHD *******************/ + +XAI_ERR_TYPE MAKE_NAME(MAKE_NAME_VQ(xaiConvolved, 3D_S_3x3j1d4), S8IX_MOW_WHD) MAKE_ARGUMENTS(inTile, coeffTile, biasArray, outTile, param) +{ + /* Error Checks: validate tile types, data order, edge, stride, dilation and shift ranges before any data is touched */ + XAI_ERROR_CHECKS() + { + MORPH_IDT_CHECK(inTile); + XAI_CHECK_CONV_OUTPUT_TILE3D(outTile); + XAI_CHECK_TILE4D_S8(coeffTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile); + 
XAI_CHECK_TILE4D_IN_DRAM_BOUNDARY(coeffTile); + XAI_CHECK_POINTER(param); + XAI_CHECK_ARRAY_S32(biasArray); + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTile); + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(coeffTile, outTile); + XAI_CHECK_KERNEL_SIZE(coeffTile, 3); + XAI_CHECK_TILE3D_DATA_ORDER(inTile, XAI_WHD); + XAI_CHECK_TILE3D_DATA_ORDER(outTile, XAI_WHD); + XAI_CHECK_TILE4D_DATA_ORDER(coeffTile, XAI_WHDN); + XAI_CHECK_TILE3D_EDGE(inTile, 4); + XAI_CHECK_STRIDE(param, 1); + XAI_CHECK_ERROR((XAI_CNN_CONV_GET_STRIDEX(param) == XAI_CNN_CONV_GET_STRIDEY(param)), \ + XAI_ERR_BADARG, "Stride along width = %hhu and height = %hhu\nStride along width and height should be equal", \ + XAI_CNN_CONV_GET_STRIDEX(param), XAI_CNN_CONV_GET_STRIDEY(param)); + XAI_CHECK_DILATION(param, 4); + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_DILATIONX(param) == XAI_CNN_CONV_GET_DILATIONY(param), \ + XAI_ERR_BADARG, "\nDilation along width = %hhu and height = %hhu\nDilation along width and height should be equal", \ + XAI_CNN_CONV_GET_DILATIONX(param), XAI_CNN_CONV_GET_DILATIONY(param)); + XAI_CHECK_CONSISTENCY_MOW_WHD(inTile, coeffTile, biasArray, outTile, param); + XAI_CHECK_COEFFTILE_CONTIGUOUS(coeffTile, param); + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_ACCUM_SHIFT(param) < 24, \ + XAI_ERR_NORM, "\nThe accumulator shift = %hhu, value should be less than 24", \ + XAI_CNN_CONV_GET_ACCUM_SHIFT(param)); + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_OUTPUT_SHIFT(param) < 32, \ + XAI_ERR_NORM, "\nThe output shift = %hhu, value should be less than 32", \ + XAI_CNN_CONV_GET_OUTPUT_SHIFT(param)); + XAI_CHECK_CONV_RELU_LIMITS_IX(param, outTile); +#if DILATED_VQ_CONV == VQ_TRUE + XAI_CHECK_ARRAY_U16(outputScaleArray); + XAI_CHECK_ERROR(XAI_ARRAY_GET_WIDTH(outputScaleArray) >= XAI_TILE4D_GET_DIM4(coeffTile), \ + XAI_ERR_DATASIZE, "\nWidth of Output Scale Array = %d, Number of Kernels = %d\nWidth of Output Scale Array should be greater than or equal to Number of Kernels", \ + XAI_ARRAY_GET_WIDTH(outputScaleArray), 
XAI_TILE4D_GET_DIM4(coeffTile)); + XAI_CHECK_ERROR((((uintptr_t) (XAI_ARRAY_GET_DATA_PTR(outputScaleArray)) & \ + 0x1) == 0), XAI_ERR_NORM, "The output scale array is not aligned to 2 byte boundary"); +#endif + } +#if DILATED_VQ_CONV == VQ_FALSE + if (XAI_CNN_CONV_GET_OUTPUT_SCALE(param) == 0) /* zero output scale: no convolution is run, the output tile is filled with a constant */ + { + int32_t fillValue; + int32_t reluFlag = XAI_CNN_CONV_GET_FLAG_RELU(param); + fillValue = reluFlag ? (CLAMP(0, XAI_CNN_CONV_GET_RELU_MIN(param), XAI_CNN_CONV_GET_RELU_MAX(param))) : 0; /* 0 passed through CLAMP with the ReLU min/max when the ReLU flag is set, otherwise 0 */ + return(xaiFillTile3D(outTile, fillValue, 0)); + } +#endif + /* Getting parameters from the tile structures */ + const int32_t outW = XAI_TILE3D_GET_DIM1(outTile); + const int32_t outH = XAI_TILE3D_GET_DIM2(outTile); + const int32_t numInCh = XAI_TILE3D_GET_DIM3(inTile); + const int32_t numOutCh = XAI_TILE3D_GET_DIM3(outTile); + + /* Pitch of Coefficient Data (WHDN) in dim3 (the only coefficient pitch read here) */ + const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile); + + /* Pitches of Input Data (WHD) in dim1 and dim2 */ + const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile); + const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile); + + /* Pitch of Output Data (WHD) in dim1 and dim2 */ + const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile); + const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile); + + /* Kernel Size (WHDN): dim1 of the coefficient tile */ + const int32_t kSizeU = XAI_TILE4D_GET_DIM1(coeffTile); + + /* CNN convolution parameters */ + const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param); + const uint8_t outShiftU = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param); + const uint8_t enableReLu = XAI_CNN_CONV_GET_FLAG_RELU(param); + const uint8_t dilationU = XAI_CNN_CONV_GET_DILATION(param); + + /* Data Pointers of input, output, coefficient and bias data */ + MORPH_IDT_SCALAR* pInData = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(inTile); + int8_t* pOutData = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile); + int32_t* pBiasData = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray); + 
int8_t* pCoeffData = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile); +#if DILATED_VQ_CONV == VQ_TRUE + uint16_t* restrict pOutScaleData = (uint16_t *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray); +#elif DILATED_VQ_CONV == VQ_FALSE + const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param); +#endif + /* Since the dilation value > 1 , */ + /* Effective Kernel size = dilation * (KernelSize - 1) + 1 */ + /* Effective kernel size is used for calculating the min required edge */ + int32_t dilatedKSizeU = dilationU * (kSizeU - 1) + 1; + + /* Move pointer back by half the effective kernel size in rows and in columns, i.e. to the start of the data (including edge) */ + pInData = &pInData[-((dilatedKSizeU / 2) * inDataPitch1 + (dilatedKSizeU / 2))]; + + /* Setting the limits for output data according to ReLu Flag and outTileType */ + int32_t minLim, maxLim; + if (enableReLu) + { + minLim = XAI_CNN_CONV_GET_RELU_MIN(param); + maxLim = XAI_CNN_CONV_GET_RELU_MAX(param); + } + else + { + minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \ + SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0); + maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \ + : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX); + } + const int8_t typeFlag = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0; /* 1 when the output tile is S16, 0 otherwise; used below as a multiplier on the H-vector store sizes */ + const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile); + + MORPH_IDT_2Nx8 *restrict pdvecIn; + xb_vec2Nx8* restrict pdvecOut; + xb_vec2Nx8* restrict pdvecCoeff1; + xb_vec2Nx8* restrict pdvecCoeff2; + + /* Variable Declarations */ + int32_t inCh, outCh, x, y; + + + /* In order to make the loop multiply-bound we are reducing the vectorization width + by extra values required for the kernel */ + const int32_t vectorizationWidth = 4 * XCHAL_IVPN_SIMD_WIDTH - dilatedKSizeU + 1; + + /* Generating the shuffle pattern for coefficient loads. + The idea is to populate zero value where the MUL4T should not affect + Pattern : 0 1 2 32 3 4 5 32 6 7 8 32 X X X .... 
*/ + xb_vec2Nx8 dvecIdx = IVP_SEL2NX8I(32, IVP_MOV2NX8_FROMNX16( \ + IVP_MOVNX16T(IVP_ADDNX16(IVP_ANDNX16(IVP_SEQNX16(), 3), + IVP_MULNX16PACKL(IVP_SRLINX16(IVP_SEQNX16(), 2), 3)), 32, + IVP_NEQNX16(IVP_ANDNX16(IVP_SEQNX16(), 3), 3))), + IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0); + + /* 4 * XCHAL_IVPN_SIMD_WIDTH bytes of input are loaded at a time + * into two vectors. Also loop across output channels is unrolled twice, + * thereby producing four output vectors in 1 iteration + */ + for (x = 0; x < outW; x += vectorizationWidth) /* Loop across output width */ + { + /* variable length for output stores */ + int32_t remX = XT_MIN(vectorizationWidth, outW - x); + + /* If (remX + dilatedKSizeU - 1) <= 2 * XCHAL_IVPN_SIMD_WIDTH, + * i.e. if the number of input data bytes corresponding to remX number of outputs + * is less than or equal to 2 * XCHAL_IVPN_SIMD_WIDTH, there is no need to load + * the next 2 * XCHAL_IVPN_SIMD_WIDTH input bytes: remXLoad = 0 makes the first load of each row advance the pointer by 0 bytes */ + int32_t remXLoad = ((remX + dilatedKSizeU - 1) > 2 * XCHAL_IVPN_SIMD_WIDTH) ? 1 : 0; + + for (y = 0; y < outH; y++) /* Loop across output height */ + { + /* initialize output data pointer */ + int8_t *pOutput = &pOutData[(y * outDataPitch1 + x) * bytesPerPixel]; + + /* initialize input data pointer */ + MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * y + x]; + + /* initialize coeff and Bias data pointer */ + int8_t *pCoeff = &pCoeffData[0]; + int32_t *pBias = &pBiasData[0]; + + for (outCh = 0; outCh < numOutCh; outCh += 2) /* Loop across Output depth */ + { + /* In order to handle odd output depths: enable2ndCh multiplies the 2nd channel's bias/coeff/output offsets and store byte counts below (presumably 1 while outCh < numOutCh - 1, else 0 - confirm against XT_SALT semantics) */ + int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1); + + /* Load the bias values corresponding to two output channels */ + xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4 * enable2ndCh); + xb_vecN_2x32v hvecBias2; IVP_LSRN_2X32_XP(hvecBias2, pBias, 4); + + /* wide vectors(accumulators) initialized with bias */ + xb_vec2Nx24 dacc11, dacc12, dacc21, dacc22; + + dacc12 = dacc11 = IVP_CVT24UNX32L(hvecBias1, hvecBias1); + IVP_CVT24UNX32H(dacc11, 
hvecBias1, hvecBias1); + IVP_CVT24UNX32H(dacc12, hvecBias1, hvecBias1); + + dacc22 = dacc21 = IVP_CVT24UNX32L(hvecBias2, hvecBias2); + IVP_CVT24UNX32H(dacc21, hvecBias2, hvecBias2); + IVP_CVT24UNX32H(dacc22, hvecBias2, hvecBias2); + + /* priming of coeff load is done outside the innermost loop*/ + pdvecCoeff1 = (xb_vec2Nx8 *) (pCoeff); + valign vaCoeffData1; vaCoeffData1 = IVP_LA2NX8_PP(pdvecCoeff1); + + pdvecCoeff2 = (xb_vec2Nx8 *) (pCoeff + coeffPitch3 * enable2ndCh); + valign vaCoeffData2; vaCoeffData2 = IVP_LA2NX8_PP(pdvecCoeff2); + + /* Input vector pointer initialization */ + pdvecIn = (MORPH_IDT_2Nx8 *) (pInput); + + /* Load 128 bytes from row corresponding to each ky + * dvecInData11 = a0 a1 a2 a3 ... a63 + * dvecInData12 = a64 a65 a66 ... a127 + * + * Deinterleave the indices + * dvecInData11 = a0 a2 a4 a6 ... a126 + * dvecInData12 = a1 a3 a5 a7 ... a127 + * + * Deinterleave the indices + * dvecInData11 = a0 a4 a8 ... a124 ... a1 a5 ... a125 + * dvecInData12 = a2 a6 a10 ... a126 ... a3 a7 ... a127 + * + * Let the coefficients be + * C0 C1 C2 + * C3 C4 C5 + * C6 C7 C8 + * + * acc11 = [a0 a4 a8 ... a124 ... a1 a5 ... a125] * C0 + + * [a4 a8 ... a124 ... a1 a5 ... a125 X ] * C1 + + * [a8 ... a124 ... a1 a5 ... a125 X X ] * C2 + * + * acc12 = [a2 a6 a10 ... a126 ... a3 a7 ... a127] * C0 + + * [a6 a10 ... a126 ... a3 a7 ... a127 X ] * C1 + + * [a10 ... a126 ... a3 a7 ... a127 X X ] * C2 + * Continue the same multiplication steps for ky = 1 [C3 C4 C5] and ky =2 [C6 C7 C8]. + * acc11 and acc12 contain the convolved outputs for the index groups shown above (0, 4, 8 ... 1, 5 ... and 2, 6, 10 ... 3, 7 ...) + * respectively at the end of inchannel loop iterations. + * + * acc11 and acc12 are interleaved (in two steps) to obtain the outputs in correct order. 
+ * + * Follow same steps for obtaining outputs corresponding to next output channel + */ + for (inCh = 0; inCh < numInCh; inCh++) /* Loop across input channels */ + { + /* vectors for coeff and input loads */ + xb_vec2Nx8 dvecCoeffData1, dvecCoeffData2; + MORPH_IDT_2Nx8 dvecInData11, dvecInData12; + MORPH_IDT_2Nx8 dvecInData21, dvecInData22; + MORPH_IDT_2Nx8 dvecInData31, dvecInData32; + + /* load data 128 bytes from first input row */ + valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn); + MORPH_OP_LOAD_2Nx8(dvecInData11, vaInData, pdvecIn, remXLoad * 2 * XCHAL_IVPN_SIMD_WIDTH); + MORPH_OP_LOAD_2Nx8(dvecInData12, vaInData, pdvecIn, \ + dilationU * inDataPitch1 - remXLoad * 2 * XCHAL_IVPN_SIMD_WIDTH); + /* Deinterleave twice (see the layout comment above the inCh loop) */ + MORPH_OP_DSELI(dvecInData12, dvecInData11, dvecInData12, dvecInData11, \ + IVP_DSELI_8B_DEINTERLEAVE_1); + MORPH_OP_DSELI(dvecInData12, dvecInData11, dvecInData12, dvecInData11, \ + IVP_DSELI_8B_DEINTERLEAVE_1); + + /* load data 128 bytes from 2nd input row */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn); + MORPH_OP_LOAD_2Nx8(dvecInData21, vaInData, pdvecIn, remXLoad * 2 * XCHAL_IVPN_SIMD_WIDTH); + MORPH_OP_LOAD_2Nx8(dvecInData22, vaInData, pdvecIn, \ + dilationU * inDataPitch1 - remXLoad * 2 * XCHAL_IVPN_SIMD_WIDTH); + /* Deinterleave twice (see the layout comment above the inCh loop) */ + MORPH_OP_DSELI(dvecInData22, dvecInData21, dvecInData22, dvecInData21, \ + IVP_DSELI_8B_DEINTERLEAVE_1); + MORPH_OP_DSELI(dvecInData22, dvecInData21, dvecInData22, dvecInData21, \ + IVP_DSELI_8B_DEINTERLEAVE_1); + + /* load data 128 bytes from 3rd input row */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn); + MORPH_OP_LOAD_2Nx8(dvecInData31, vaInData, pdvecIn, remXLoad * 2 * XCHAL_IVPN_SIMD_WIDTH); + MORPH_OP_LOAD_2Nx8(dvecInData32, vaInData, pdvecIn, inDataPitch2 - \ + dilationU * 2 * inDataPitch1 - remXLoad * 2 * XCHAL_IVPN_SIMD_WIDTH); + /* Deinterleave twice (see the layout comment above the inCh loop) */ + MORPH_OP_DSELI(dvecInData32, dvecInData31, dvecInData32, dvecInData31, \ + IVP_DSELI_8B_DEINTERLEAVE_1); + 
MORPH_OP_DSELI(dvecInData32, dvecInData31, dvecInData32, dvecInData31, \ + IVP_DSELI_8B_DEINTERLEAVE_1); + + /* load all the 3x3 coefficients for 2 output depths*/ + IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, 9); + IVP_LAV2NX8_XP(dvecCoeffData2, vaCoeffData2, pdvecCoeff2, 9); + + /* Rearrange them so that zero is inserted where the MUL4T should not have effect */ + dvecCoeffData1 = IVP_SHFL2NX8(dvecCoeffData1, dvecIdx); + dvecCoeffData2 = IVP_SHFL2NX8(dvecCoeffData2, dvecIdx); + + /* Multiply and accumulate 1st set of 3 coefficients for all the outputs */ + MORPH_OP_MUL4TA(dacc11, dvecInData11, dvecInData11, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc12, dvecInData12, dvecInData12, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc21, dvecInData11, dvecInData11, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 0)); + MORPH_OP_MUL4TA(dacc22, dvecInData12, dvecInData12, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 0)); + + /* Multiply and accumulate 2nd set of 3 coefficients for all the outputs */ + MORPH_OP_MUL4TA(dacc11, dvecInData21, dvecInData21, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + MORPH_OP_MUL4TA(dacc12, dvecInData22, dvecInData22, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + MORPH_OP_MUL4TA(dacc21, dvecInData21, dvecInData21, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 1)); + MORPH_OP_MUL4TA(dacc22, dvecInData22, dvecInData22, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 1)); + + /* Multiply and accumulate 3rd set of 3 coefficients for all the outputs */ + MORPH_OP_MUL4TA(dacc11, dvecInData31, dvecInData31, IVP_EXTRN_2X32( \ + 
IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 2)); + MORPH_OP_MUL4TA(dacc12, dvecInData32, dvecInData32, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 2)); + MORPH_OP_MUL4TA(dacc21, dvecInData31, dvecInData31, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 2)); + MORPH_OP_MUL4TA(dacc22, dvecInData32, dvecInData32, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 2)); + } /* end of for (inCh = 0; inCh < numInCh; inCh++)*/ + + /* Pack, Output Scale, Output Shift and clamping */ + xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L; + xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H; +#if DILATED_VQ_CONV == VQ_TRUE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc11, packShiftAccU, \ + pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc12, packShiftAccU, \ + pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc21, packShiftAccU, \ + pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc22, packShiftAccU, \ + pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag); +#elif DILATED_VQ_CONV == VQ_FALSE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc11, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc12, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc21, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc22, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); +#endif + /* Interleave in two steps to undo the double deinterleave of the inputs and restore output order */ + /* 8 bit output */ + if (!typeFlag) + { + /* + * dacc11 and 
dacc12 contain accumulated values corresponding to same output row. + * For 8bit output, dvecOutL contains the required output elements + * dvecOut1L = [A0 A4 A8 ... A116 X X A1 A5 ... A117 X X] - 64 elements + * dvecOut2L = [A2 A6 A10 ...A118 X X A3 A7 ... A119 X X] - 64 elements + * Interleave the elements + * dvecOut1L = [A0 A2 A4 ... A116 A118 X X X X ] - 64 elements + * dvecOut2L = [A1 A3 A5 ... A117 A119 X X X X ] - 64 elements + * Interleave the elements + * dvecOut1L = [A0 A1 A2 A3 ... ]- 64 elements + * dvecOut2L = [ ... A116 A117 A118 A119 X X X X X X X X ]- 64 elements + * + * Same steps for outputs corresponding to second output channel. + */ + MORPH_OP_DSELI(dvecOut2L, dvecOut1L, dvecOut2L, dvecOut1L, \ + IVP_DSELI_8B_INTERLEAVE_1); + MORPH_OP_DSELI(dvecOut2L, dvecOut1L, dvecOut2L, dvecOut1L, \ + IVP_DSELI_8B_INTERLEAVE_1); + MORPH_OP_DSELI(dvecOut4L, dvecOut3L, dvecOut4L, dvecOut3L, \ + IVP_DSELI_8B_INTERLEAVE_1); + MORPH_OP_DSELI(dvecOut4L, dvecOut3L, dvecOut4L, dvecOut3L, \ + IVP_DSELI_8B_INTERLEAVE_1); + } + else /* 16bit output */ + { + /* + * dacc11 and dacc12 contain accumulated values corresponding to same output row. + * dvecOut1L = [A0 A4 A8 ... A116 X X] - 32 16b elements + * dvecOut1H = [A1 A5 A9 ... A117 X X] - 32 16b elements + * dvecOut2L = [A2 A6 A10 ... A118 X X] - 32 16b elements + * dvecOut2H = [A3 A7 A11 ... A119 X X] - 32 16b elements + * Interleave the elements of dvecOut1L and dvecOut1H + * dvecOut1L = [A0 A1 A4 A5 ... ] + * dvecOut1H = [ ... A116 A117 X X] + * Interleave the elements of dvecOut2L and dvecOut2H + * dvecOut2L = [A2 A3 A6 A7 ...] + * dvecOut2H = [ ... A118 A119 X X] + * Interleave2 the elements of dvecOut2L and dvecOut1L + * dvecOut1L = [A0 A1 A2 A3 ... ] + * dvecOut2L = [A32 A33 A34 A35 ... ] + * Interleave2 the elements of dvecOut2H and dvecOut1H + * dvecOut1H = [A64 A65 A66 A67 ... ] + * dvecOut2H = [ ... A116 A117 A118 A119 X X X X X X X X] + * + * Same steps for outputs corresponding to second output channel. 
+ */ + MORPH_OP_DSELI(dvecOut1H, dvecOut1L, dvecOut1H, dvecOut1L, \ + IVP_DSELI_INTERLEAVE_1); + MORPH_OP_DSELI(dvecOut2H, dvecOut2L, dvecOut2H, dvecOut2L, \ + IVP_DSELI_INTERLEAVE_1); + MORPH_OP_DSELI(dvecOut2L, dvecOut1L, dvecOut2L, dvecOut1L, \ + IVP_DSELI_INTERLEAVE_2); + MORPH_OP_DSELI(dvecOut2H, dvecOut1H, dvecOut2H, dvecOut1H, \ + IVP_DSELI_INTERLEAVE_2); + MORPH_OP_DSELI(dvecOut3H, dvecOut3L, dvecOut3H, dvecOut3L, \ + IVP_DSELI_INTERLEAVE_1); + MORPH_OP_DSELI(dvecOut4H, dvecOut4L, dvecOut4H, dvecOut4L, \ + IVP_DSELI_INTERLEAVE_1); + MORPH_OP_DSELI(dvecOut4L, dvecOut3L, dvecOut4L, dvecOut3L, \ + IVP_DSELI_INTERLEAVE_2); + MORPH_OP_DSELI(dvecOut4H, dvecOut3H, dvecOut4H, dvecOut3H, \ + IVP_DSELI_INTERLEAVE_2); + } + + + /* Storing the first output depth, first row: variable-length stores sized from remX; the H-vector byte counts are multiplied by typeFlag */ + pdvecOut = (xb_vec2Nx8 *) (pOutput); + valign vaOutData = IVP_ZALIGN(); + IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * remX); + IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * remX - \ + 2 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * \ + (2 * remX - 4 * XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * \ + (2 * remX - 6 * XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the second output depth, first row: same pattern, with the output offset and every byte count also multiplied by enable2ndCh */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch2 * enable2ndCh * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * remX * enable2ndCh); + IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, ((bytesPerPixel * remX) - \ + 2 * XCHAL_IVPN_SIMD_WIDTH) * enable2ndCh); + IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * \ + (2 * remX - 4 * XCHAL_IVPN_SIMD_WIDTH) * enable2ndCh); + IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * \ + (2 * remX - 6 * XCHAL_IVPN_SIMD_WIDTH) * enable2ndCh); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + pOutput += 2 * outDataPitch2 * bytesPerPixel; + pCoeff += 2 * coeffPitch3; + } /* end 
of for (outCh = 0; outCh < numOutCh; outCh += 2)*/ + } /* end of for (y = 0; y < outH; y ++)*/ + } /* end of for (x = 0; x < outW; x += vectorizationWidth)*/ + + + return(XAI_ERROR_STATUS()); +} + +/****************************************************************************************** +* MOW fold 16 Stride 1 * +* If inDataPitch1 is lesser than or equal to * +* 16 this function is called. * +******************************************************************************************/ + +static _XAI_INLINE_ void MAKE_NAME(MAKE_NAME_VQ(convolved, 3D_S_4x4j1d1), S8IX_MOW_WHD_FOLD16) MAKE_ARGUMENTS(inTile, coeffTile, biasArray, outTile, param) +{ + /* Getting parameters from the tile structures */ + const int32_t outW = XAI_TILE3D_GET_DIM1(outTile); + const int32_t outH = XAI_TILE3D_GET_DIM2(outTile); + const int32_t numInCh = XAI_TILE3D_GET_DIM3(inTile); + const int32_t numOutCh = XAI_TILE3D_GET_DIM3(outTile); + + /* Pitches of Coefficient Data (WHDN) in dim1 and dim3 */ + const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile); + + /* Pitches of Input Data (WHD) in dim1 and dim2 */ + const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile); + const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile); + + /* Pitch of Output Data (WHD) in dim1 and dim2 */ + const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile); + const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile); + + /* Kernel Size (WHDN)*/ + const int32_t kSizeU = XAI_TILE4D_GET_DIM1(coeffTile); + + /* CNN convolution parameters */ + const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param); + const uint8_t outShiftU = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param); + const uint8_t enableReLu = XAI_CNN_CONV_GET_FLAG_RELU(param); + + /* Data Pointers of input, output, coefficient and bias data */ + MORPH_IDT_SCALAR* pInData = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(inTile); + int8_t* pOutData = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile); + int32_t* pBiasData = (int32_t 
*) XAI_ARRAY_GET_DATA_PTR(biasArray); + int8_t* pCoeffData = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile); +#if DILATED_VQ_CONV == VQ_TRUE + uint16_t* restrict pOutScaleData = (uint16_t *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray); +#elif DILATED_VQ_CONV == VQ_FALSE + const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param); +#endif + const uint8_t leftEdgeFlag = XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param); + const uint8_t topEdgeFlag = XAI_CNN_CONV_GET_FLAG_TOPEDGE(param); + int32_t leftEdge, topEdge; + + leftEdge = leftEdgeFlag ? (kSizeU / 2) : ((kSizeU / 2) - 1); + topEdge = topEdgeFlag ? (kSizeU / 2) : ((kSizeU / 2) - 1); + + /* Move pointer to the start of the data (including edge) */ + pInData = &pInData[-(topEdge * inDataPitch1 + leftEdge)]; + + + /* Setting the limits for output data according to ReLu Flag and outTileType */ + int32_t minLim, maxLim; + if (enableReLu) + { + minLim = XAI_CNN_CONV_GET_RELU_MIN(param); + maxLim = XAI_CNN_CONV_GET_RELU_MAX(param); + } + else + { + minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \ + SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0); + maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \ + : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX); + } + const int8_t typeFlag = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 
1 : 0; + const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile); + + MORPH_IDT_2Nx8 *restrict pdvecIn1; + xb_vec2Nx8* restrict pdvecOut; + xb_vec2Nx8* restrict pdvecCoeff1; + xb_vec2Nx8* restrict pdvecCoeff2; + + /* Variable Declarations */ + int32_t inCh, outCh, y; + + /* Select sequence to re-arrange input data */ + xb_vec2Nx8 dvecSeq = 0; + IVP_ADD2NX8T(dvecSeq, IVP_SEQ2NX8(), inDataPitch1, IVP_LT2NX8(IVP_SEQ2NX8(), 3 * inDataPitch1)); + IVP_ADD2NX8T(dvecSeq, IVP_SUB2NX8(IVP_SEQ2NX8(), 3 * inDataPitch1), 64, \ + IVP_NOTB2N(IVP_LT2NX8(IVP_SEQ2NX8(), 3 * inDataPitch1))); + + /* loop across output channels are unrolled twice and 4 rows are accessed simultaneously + * to produce four output vectors in 1 iteration + */ + for (y = 0; y < outH - 3; y += 4) /* Loop across output height */ + { + /* initialize output data pointer */ + int8_t *pOutput = &pOutData[(y * outDataPitch1) * bytesPerPixel]; + + /* initialize coeff and Bias data pointer */ + int8_t *pCoeff = &pCoeffData[0]; + int32_t *pBias = &pBiasData[0]; + + for (outCh = 0; outCh < numOutCh; outCh += 2) /* Loop across Output depth */ + { + /* In order to handle odd output depths */ + int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1); + + /* Load the bias values corresponding to two output channels */ + xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4 * enable2ndCh); + xb_vecN_2x32v hvecBias2; IVP_LSRN_2X32_XP(hvecBias2, pBias, 4); + + /* wide vectors(accumulators) initialized with bias */ + xb_vec2Nx24 dacc1, dacc2; + + dacc1 = IVP_CVT24UNX32L(hvecBias1, hvecBias1); + IVP_CVT24UNX32H(dacc1, hvecBias1, hvecBias1); + + dacc2 = IVP_CVT24UNX32L(hvecBias2, hvecBias2); + IVP_CVT24UNX32H(dacc2, hvecBias2, hvecBias2); + + /* priming of coeff load is done outside the innermost loop*/ + pdvecCoeff1 = (xb_vec2Nx8 *) (pCoeff); + valign vaCoeffData1; vaCoeffData1 = IVP_LA2NX8_PP(pdvecCoeff1); + + pdvecCoeff2 = (xb_vec2Nx8 *) (pCoeff + coeffPitch3 * enable2ndCh); + valign vaCoeffData2; vaCoeffData2 
= IVP_LA2NX8_PP(pdvecCoeff2); + + /* initialize input data pointer */ + MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * y]; + + for (inCh = 0; inCh < numInCh; inCh++) /* Loop across input channels */ + { + /* vectors for coeff and input loads */ + xb_vec2Nx8 dvecCoeffData1, dvecCoeffData2; + MORPH_IDT_2Nx8 dvecInData1, dvecInData2, dvecInData3, dvecInData4, dvecInData5; + + /* Initialize input data pointer */ + pdvecIn1 = (MORPH_IDT_2Nx8 *) (pInput); + pInput += inDataPitch2; + + /* load data from 4 input rows [Row0 | Row1 | Row2 | Row3] */ + valign vaInData1 = MORPH_OP_PRIME_2Nx8(pdvecIn1); + MORPH_OP_LOAD_2Nx8(dvecInData1, vaInData1, pdvecIn1, 4 * inDataPitch1); + + /* load data from next 4 input rows [Row4 | Row5 | Row6 | Row7] */ + MORPH_OP_LOAD_2Nx8(dvecInData5, vaInData1, pdvecIn1, 4 * inDataPitch1); + + /* dvecInData1 contains first 4 input rows + * dvecInData1: row0 | row1 | row2 | row3 + * + * dvecInData5 contains next 4 input rows + * dvecInData5: row4 | row5 | row6 | row7 + * + * Input data is re arranged in such a manner that + * dvecInData2 contains: row1 | row2 | row3 | row4 + * dvecInData3 contains: row2 | row3 | row4 | row5 + * dvecInData4 contains: row3 | row4 | row5 | row6 + */ + + /*Compute row [Row1 | Row2 | Row3 | Row4] */ + dvecInData2 = IVP_SEL2NX8(dvecInData5, dvecInData1, dvecSeq); + + /*Compute row [Row2 | Row3 | Row4 | Row5] */ + dvecInData3 = IVP_SEL2NX8(IVP_SEL2NX8(0, dvecInData5, dvecSeq), dvecInData2, dvecSeq); + + /*Compute row [Row3 | Row4 | Row5 | Row6] */ + dvecInData4 = IVP_SEL2NX8(IVP_SEL2NX8(0, dvecInData5, IVP_ADD2NX8(dvecSeq, inDataPitch1)), dvecInData3, dvecSeq); + + /* load all the 4x4 coefficients for 2 output depths*/ + IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, 16); + IVP_LAV2NX8_XP(dvecCoeffData2, vaCoeffData2, pdvecCoeff2, 16); + + /* Compute the output of 4 output rows, for the 1st output depth */ + MORPH_OP_MUL4TA(dacc1, dvecInData1, dvecInData1, IVP_EXTRN_2X32( \ + 
IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc1, dvecInData2, dvecInData2, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + MORPH_OP_MUL4TA(dacc1, dvecInData3, dvecInData3, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 2)); + MORPH_OP_MUL4TA(dacc1, dvecInData4, dvecInData4, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 3)); + + /* Compute the output of 4 output rows, for the 2nd output depth */ + MORPH_OP_MUL4TA(dacc2, dvecInData1, dvecInData1, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 0)); + MORPH_OP_MUL4TA(dacc2, dvecInData2, dvecInData2, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 1)); + MORPH_OP_MUL4TA(dacc2, dvecInData3, dvecInData3, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 2)); + MORPH_OP_MUL4TA(dacc2, dvecInData4, dvecInData4, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 3)); + } /* end of for (inCh = 0; inCh < numInCh; inCh++)*/ + + /* Pack, Output Scale, Output Shift and clamping */ + xb_vec2Nx8 dvecOut1L, dvecOut2L; + xb_vec2Nx8 dvecOut1H, dvecOut2H; + +#if DILATED_VQ_CONV == VQ_TRUE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \ + pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \ + pOutScaleData[outCh + enable2ndCh ], outShiftU, minLim, maxLim, typeFlag); +#elif DILATED_VQ_CONV == VQ_FALSE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); +#endif + /* Storing the first output depth, first row */ + pdvecOut = (xb_vec2Nx8 *) 
(pOutput); + valign vaOutData = IVP_ZALIGN(); + IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * outW); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the first output depth, second row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch1 * bytesPerPixel); + IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut1H, dvecOut1L, IVP_ADD2NX8(IVP_SEQ2NX8(), inDataPitch1 * bytesPerPixel)), \ + vaOutData, pdvecOut, bytesPerPixel * outW); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the first output depth, third row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + 2 * outDataPitch1 * bytesPerPixel); + IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut1H, dvecOut1L, IVP_ADD2NX8(IVP_SEQ2NX8(), 2 * inDataPitch1 * bytesPerPixel)), \ + vaOutData, pdvecOut, bytesPerPixel * outW); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the first output depth, fourth row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + 3 * outDataPitch1 * bytesPerPixel); + IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut1H, dvecOut1L, IVP_ADD2NX8(IVP_SEQ2NX8(), 3 * inDataPitch1 * bytesPerPixel)), \ + vaOutData, pdvecOut, bytesPerPixel * outW); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the second output depth, first row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch2 * enable2ndCh * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * enable2ndCh * outW); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the second output depth, second row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + (outDataPitch2 * enable2ndCh + \ + outDataPitch1) * bytesPerPixel); + IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut2H, dvecOut2L, IVP_ADD2NX8(IVP_SEQ2NX8(), inDataPitch1 * bytesPerPixel)), + vaOutData, pdvecOut, bytesPerPixel * enable2ndCh \ + * outW); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the second output depth, second row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + (outDataPitch2 * enable2ndCh + \ + 2 * outDataPitch1) * bytesPerPixel); + IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut2H, dvecOut2L, 
IVP_ADD2NX8(IVP_SEQ2NX8(), 2 * inDataPitch1 * bytesPerPixel)), + vaOutData, pdvecOut, bytesPerPixel * enable2ndCh \ + * outW); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the second output depth, second row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + (outDataPitch2 * enable2ndCh + \ + 3 * outDataPitch1) * bytesPerPixel); + IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut2H, dvecOut2L, IVP_ADD2NX8(IVP_SEQ2NX8(), 3 * inDataPitch1 * bytesPerPixel)), + vaOutData, pdvecOut, bytesPerPixel * enable2ndCh \ + * outW); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + pOutput += 2 * outDataPitch2 * bytesPerPixel; + pCoeff += 2 * coeffPitch3; + } /* end of for (outCh = 0; outCh < numOutCh; outCh += 2)*/ + } /* end of for (y = 0; y < outH; y += 2)*/ + + if (y < outH) /* Handle the case when less than 4 output rows need to be processed */ + { + int32_t enable2ndRow = XT_SALT(y, outH - 1); + int32_t enable3rdRow = XT_SALT(y, outH - 2); + + /* initialize output data pointer */ + int8_t *pOutput = &pOutData[(y * outDataPitch1) * bytesPerPixel]; + + /* initialize coeff and Bias data pointer */ + int8_t *pCoeff = &pCoeffData[0]; + int32_t *pBias = &pBiasData[0]; + + for (outCh = 0; outCh < numOutCh; outCh += 2) /* Loop across Output depth */ + { + /* In order to handle odd output depths and heights */ + int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1); + + /* Load the bias values corresponding to two output channels */ + xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4 * enable2ndCh); + xb_vecN_2x32v hvecBias2; IVP_LSRN_2X32_XP(hvecBias2, pBias, 4); + + /* wide vectors(accumulators) initialized with bias */ + xb_vec2Nx24 dacc1, dacc2; + + dacc1 = IVP_CVT24UNX32L(hvecBias1, hvecBias1); + IVP_CVT24UNX32H(dacc1, hvecBias1, hvecBias1); + + dacc2 = IVP_CVT24UNX32L(hvecBias2, hvecBias2); + IVP_CVT24UNX32H(dacc2, hvecBias2, hvecBias2); + + /* priming of coeff load is done outside the innermost loop*/ + pdvecCoeff1 = (xb_vec2Nx8 *) (pCoeff); + valign vaCoeffData1; vaCoeffData1 = 
IVP_LA2NX8_PP(pdvecCoeff1); + + pdvecCoeff2 = (xb_vec2Nx8 *) (pCoeff + coeffPitch3 * enable2ndCh); + valign vaCoeffData2; vaCoeffData2 = IVP_LA2NX8_PP(pdvecCoeff2); + + /* initialize input data pointer */ + MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * y]; + + for (inCh = 0; inCh < numInCh; inCh++) /* Loop across input channels */ + { + /* vectors for coeff and input loads */ + xb_vec2Nx8 dvecCoeffData1, dvecCoeffData2; + MORPH_IDT_2Nx8 dvecInData1, dvecInData2, dvecInData3, dvecInData4, dvecInData5; + + /* load the remaining input rows */ + pdvecIn1 = (MORPH_IDT_2Nx8 *) (pInput); + + valign vaInData1 = MORPH_OP_PRIME_2Nx8(pdvecIn1); + + MORPH_OP_LOAD_2Nx8(dvecInData1, vaInData1, pdvecIn1, 4 * inDataPitch1); + MORPH_OP_LOAD_2Nx8_VARIABLE(dvecInData5, vaInData1, pdvecIn1, (enable3rdRow + enable2ndRow) * inDataPitch1); + pInput += inDataPitch2; + + /* load all the 4x4 coefficients for 2 output depths*/ + IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, 16); + IVP_LAV2NX8_XP(dvecCoeffData2, vaCoeffData2, pdvecCoeff2, 16); + + + /* dvecInData1 contains first 4 input rows and + * dvecInData1: row0 | row1 | row2 | row3 + * dvecInData5: row4 | row5 | row6 | row7 + * row6 and row7 of dvecInData5 are always disabled. 
+ * + * Input data is re arranged in such a manner that + * dvecInData2 contains: row1 | row2 | row3 | row4 + * dvecInData3 contains: row2 | row3 | row4 | row5 + * dvecInData4 contains: row3 | row4 | row5 | row6 + */ + /*Compute row [Row1 | Row2 | Row3 | Row4] */ + dvecInData2 = IVP_SEL2NX8(dvecInData5, dvecInData1, dvecSeq); + + /*Compute row [Row2 | Row3 | Row4 | Row5] */ + dvecInData3 = IVP_SEL2NX8(IVP_SEL2NX8(0, dvecInData5, dvecSeq), dvecInData2, dvecSeq); + + /*Compute row [Row3 | Row4 | Row5 | Row6] */ + dvecInData4 = IVP_SEL2NX8(IVP_SEL2NX8(0, dvecInData5, IVP_ADD2NX8(dvecSeq, inDataPitch1)), dvecInData3, dvecSeq); + + + /* Multiply input data with coefficients from 1st output channel */ + MORPH_OP_MUL4TA(dacc1, dvecInData1, dvecInData1, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc1, dvecInData2, dvecInData2, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + MORPH_OP_MUL4TA(dacc1, dvecInData3, dvecInData3, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 2)); + MORPH_OP_MUL4TA(dacc1, dvecInData4, dvecInData4, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 3)); + + /* Multiply input data with coefficients from 2nd output channel */ + MORPH_OP_MUL4TA(dacc2, dvecInData1, dvecInData1, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 0)); + MORPH_OP_MUL4TA(dacc2, dvecInData2, dvecInData2, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 1)); + MORPH_OP_MUL4TA(dacc2, dvecInData3, dvecInData3, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 2)); + MORPH_OP_MUL4TA(dacc2, dvecInData4, dvecInData4, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 3)); + } /* end of for (inCh = 0; inCh < numInCh; inCh++)*/ + + /* Pack, Output Scale, Output Shift and clamping */ + 
xb_vec2Nx8 dvecOut1L, dvecOut2L; + xb_vec2Nx8 dvecOut1H, dvecOut2H; +#if DILATED_VQ_CONV == VQ_TRUE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \ + pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \ + pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag); +#elif DILATED_VQ_CONV == VQ_FALSE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); +#endif + /* Storing the first output depth, first row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput); + valign vaOutData = IVP_ZALIGN(); + IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * outW); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the first output depth, second row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch1 * enable2ndRow * bytesPerPixel); + IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut1H, dvecOut1L, IVP_ADD2NX8(IVP_SEQ2NX8(), inDataPitch1 * bytesPerPixel)), \ + vaOutData, pdvecOut, bytesPerPixel * enable2ndRow * outW); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the first output depth, 3rd row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + 2 * outDataPitch1 * enable3rdRow * bytesPerPixel); + IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut1H, dvecOut1L, IVP_ADD2NX8(IVP_SEQ2NX8(), 2 * inDataPitch1 * bytesPerPixel)), \ + vaOutData, pdvecOut, bytesPerPixel * enable3rdRow * outW); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the second output depth, first row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch2 * enable2ndCh * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * enable2ndCh * outW); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the second output depth, second row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + 
(outDataPitch2 * enable2ndCh + \ + outDataPitch1 * enable2ndRow) * bytesPerPixel); + IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut2H, dvecOut2L, IVP_ADD2NX8(IVP_SEQ2NX8(), inDataPitch1 * bytesPerPixel)), \ + vaOutData, pdvecOut, bytesPerPixel * enable2ndCh * enable2ndRow * outW); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the second output depth, 3rd row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + (outDataPitch2 * enable2ndCh + \ + 2 * outDataPitch1 * enable3rdRow) * bytesPerPixel); + IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut2H, dvecOut2L, IVP_ADD2NX8(IVP_SEQ2NX8(), 2 * inDataPitch1 * bytesPerPixel)), \ + vaOutData, pdvecOut, bytesPerPixel * enable2ndCh * enable3rdRow * outW); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + pOutput += 2 * outDataPitch2 * bytesPerPixel; + pCoeff += 2 * coeffPitch3; + } /* end of for (outCh = 0; outCh < numOutCh; outCh += 2)*/ + } //if(y < outH) +} + +/****************************************************************************************** +* MOW fold 32 Stride 1 * +* If inDataPitch1 is lesser than or equal to * +* 32 this function is called. 
*
+******************************************************************************************/
+/* Folded 4x4 kernel: two output rows and two output channels per pass; the public entry point calls it when 16 < inDataPitch1 <= 32. */
+static _XAI_INLINE_ void MAKE_NAME(MAKE_NAME_VQ(convolved, 3D_S_4x4j1d1), S8IX_MOW_WHD_FOLD32) MAKE_ARGUMENTS(inTile, coeffTile, biasArray, outTile, param)
+{
+  /* Tile dimensions: output width/height and the input/output channel counts */
+  const int32_t outW = XAI_TILE3D_GET_DIM1(outTile);
+  const int32_t outH = XAI_TILE3D_GET_DIM2(outTile);
+  const int32_t numInCh = XAI_TILE3D_GET_DIM3(inTile);
+  const int32_t numOutCh = XAI_TILE3D_GET_DIM3(outTile);
+
+  /* Dim3 pitch of the coefficient data (WHDN): offset between the kernels of consecutive output channels */
+  const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile);
+
+  /* Pitches of Input Data (WHD) in dim1 and dim2 */
+  const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile); /* step between input rows */
+  const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile); /* step between input channels */
+
+  /* Pitch of Output Data (WHD) in dim1 and dim2 */
+  const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile); /* step between output rows */
+  const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile); /* step between output channels */
+
+  /* Kernel width (dim1 of the WHDN coefficient tile); only used to derive the edge sizes below */
+  const int32_t kSizeU = XAI_TILE4D_GET_DIM1(coeffTile);
+
+  /* CNN convolution parameters: accumulator pack shift, output shift and ReLU enable flag */
+  const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param);
+  const uint8_t outShiftU = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param);
+  const uint8_t enableReLu = XAI_CNN_CONV_GET_FLAG_RELU(param);
+
+  /* Data Pointers of input, output, coefficient and bias data */
+  MORPH_IDT_SCALAR* pInData = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  int8_t* pOutData = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  int32_t* pBiasData = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray);
+  int8_t* pCoeffData = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile);
+#if DILATED_VQ_CONV == VQ_TRUE
+  uint16_t* restrict pOutScaleData = (uint16_t *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray); /* one scale per output channel (indexed by outCh) */
+#elif DILATED_VQ_CONV == VQ_FALSE
+  const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param); /* single scale applied to every output channel */
+#endif
+  const uint8_t leftEdgeFlag = XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param);
+  const uint8_t topEdgeFlag = XAI_CNN_CONV_GET_FLAG_TOPEDGE(param);
+  int32_t leftEdge, topEdge;
+  /* Edge size to the left of / above the tile data: kSizeU / 2 when the flag is set, otherwise kSizeU / 2 - 1 */
+  leftEdge = leftEdgeFlag ? (kSizeU / 2) : ((kSizeU / 2) - 1);
+  topEdge = topEdgeFlag ? (kSizeU / 2) : ((kSizeU / 2) - 1);
+
+  /* Move pointer back by topEdge rows and leftEdge columns, to the start of the data (including edge) */
+  pInData = &pInData[-(topEdge * inDataPitch1 + leftEdge)];
+
+
+  /* Output clamp limits: the ReLU min/max when ReLU is enabled, otherwise the range selected by the output tile type */
+  int32_t minLim, maxLim;
+  if (enableReLu)
+  {
+    minLim = XAI_CNN_CONV_GET_RELU_MIN(param);
+    maxLim = XAI_CNN_CONV_GET_RELU_MAX(param);
+  }
+  else
+  {
+    minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \
+             SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0);
+    maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \
+             : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX);
+  }
+  const int8_t typeFlag = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0; /* 1 when the output tile is S16, else 0 */
+  const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile); /* scales every output offset and store length */
+
+  MORPH_IDT_2Nx8 *restrict pdvecIn1;
+  MORPH_IDT_2Nx8 *restrict pdvecIn2;
+  xb_vec2Nx8* restrict pdvecOut;
+  xb_vec2Nx8* restrict pdvecCoeff1;
+  xb_vec2Nx8* restrict pdvecCoeff2;
+
+  /* Loop counters and the two-row input step */
+  int32_t inCh, outCh, y;
+  int32_t inDataPitch1_2X = 2 * inDataPitch1; /* passed as the step of every input vector load (two rows) */
+
+  /* The loops over output height and output channels are each unrolled by two:
+   * one pass accumulates two output rows (dacc1, dacc2) for two output channels.
+   */
+  for (y = 0; y < outH; y += 2) /* Loop across output height */
+  {
+    /* in order to handle odd output height: used below as a 0/1 factor on the 2nd-row store offset and length */
+    int32_t enable2Row = XT_SALT(y, outH - 1);
+    /* initialize output data pointer to row y of the first output channel */
+    int8_t *pOutput = &pOutData[(y * outDataPitch1) * bytesPerPixel];
+
+    /* initialize coeff and Bias data pointer (restart at channel 0 for every row pair) */
+    int8_t *pCoeff = &pCoeffData[0];
+    int32_t *pBias = &pBiasData[0];
+
+    for (outCh = 0; outCh < numOutCh; outCh += 2) /* Loop across Output depth */
+    {
+      /* In order to handle odd output depths: 0/1 factor applied to every 2nd-channel offset and store length */
+      int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1);
+
+      /* Load the bias values corresponding to two output channels; the first load's step is 0 when there is no 2nd channel */
+      xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4 * enable2ndCh);
+      xb_vecN_2x32v hvecBias2; IVP_LSRN_2X32_XP(hvecBias2, pBias, 4);
+
+      /* wide vectors(accumulators) initialized with bias: dacc1 for channel outCh, dacc2 for the 2nd channel */
+      xb_vec2Nx24 dacc1, dacc2;
+
+      dacc1 = IVP_CVT24UNX32L(hvecBias1, hvecBias1);
+      IVP_CVT24UNX32H(dacc1, hvecBias1, hvecBias1);
+
+      dacc2 = IVP_CVT24UNX32L(hvecBias2, hvecBias2);
+      IVP_CVT24UNX32H(dacc2, hvecBias2, hvecBias2);
+
+      /* priming of coeff load is done outside the innermost loop */
+      pdvecCoeff1 = (xb_vec2Nx8 *) (pCoeff);
+      valign vaCoeffData1; vaCoeffData1 = IVP_LA2NX8_PP(pdvecCoeff1);
+
+      pdvecCoeff2 = (xb_vec2Nx8 *) (pCoeff + coeffPitch3 * enable2ndCh); /* same kernel as channel 1 when there is no 2nd channel */
+      valign vaCoeffData2; vaCoeffData2 = IVP_LA2NX8_PP(pdvecCoeff2);
+
+      /* initialize input data pointer to input row y of the first input channel */
+      MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * y];
+
+      for (inCh = 0; inCh < numInCh; inCh++) /* Loop across input channels */
+      {
+        /* vectors for coeff and input loads */
+        xb_vec2Nx8 dvecCoeffData1, dvecCoeffData2;
+        MORPH_IDT_2Nx8 dvecInData1, dvecInData2, dvecInData3, dvecInData4;
+
+        /* two load streams: pdvecIn1 starts at Row0, pdvecIn2 at Row1; pInput then moves to the next input channel */
+        pdvecIn1 = (MORPH_IDT_2Nx8 *) (pInput);
+        pdvecIn2 = (MORPH_IDT_2Nx8 *) (pInput + inDataPitch1);
+        pInput += inDataPitch2;
+
+        /* load data from 2 input rows [Row0 | Row1] */
+        valign vaInData1 = MORPH_OP_PRIME_2Nx8(pdvecIn1);
+        MORPH_OP_LOAD_2Nx8(dvecInData1, vaInData1, pdvecIn1, inDataPitch1_2X);
+
+        /* load data from 2 input rows [Row1 | Row2] */
+        valign vaInData2 = MORPH_OP_PRIME_2Nx8(pdvecIn2);
+        MORPH_OP_LOAD_2Nx8(dvecInData2, vaInData2, pdvecIn2, inDataPitch1_2X);
+
+        /* load data from next 2 input rows [Row2 | Row3] */
+        MORPH_OP_LOAD_2Nx8(dvecInData3, vaInData1, pdvecIn1, inDataPitch1_2X);
+
+        /* load data from next 2 input rows [Row3 | Row4] */
+        MORPH_OP_LOAD_2Nx8(dvecInData4, vaInData2, pdvecIn2, inDataPitch1_2X);
+
+        /* load all the 4x4 coefficients (16 bytes) for 2 output depths */
+        IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, 16);
+        IVP_LAV2NX8_XP(dvecCoeffData2, vaCoeffData2, pdvecCoeff2, 16);
+
+        /* 4 vector loads cover 5 input rows. Row-pair vector k is multiplied with 32-bit coefficient word k
+           (4 coefficients, presumably kernel row k); done for both output channels. */
+        MORPH_OP_MUL4TA(dacc1, dvecInData1, dvecInData1, IVP_EXTRN_2X32( \
+                          IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+        MORPH_OP_MUL4TA(dacc1, dvecInData2, dvecInData2, IVP_EXTRN_2X32( \
+                          IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+        MORPH_OP_MUL4TA(dacc1, dvecInData3, dvecInData3, IVP_EXTRN_2X32( \
+                          IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 2));
+        MORPH_OP_MUL4TA(dacc1, dvecInData4, dvecInData4, IVP_EXTRN_2X32( \
+                          IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 3));
+
+        MORPH_OP_MUL4TA(dacc2, dvecInData1, dvecInData1, IVP_EXTRN_2X32( \
+                          IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 0));
+        MORPH_OP_MUL4TA(dacc2, dvecInData2, dvecInData2, IVP_EXTRN_2X32( \
+                          IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 1));
+        MORPH_OP_MUL4TA(dacc2, dvecInData3, dvecInData3, IVP_EXTRN_2X32( \
+                          IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 2));
+        MORPH_OP_MUL4TA(dacc2, dvecInData4, dvecInData4, IVP_EXTRN_2X32( \
+                          IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 3));
+      } /* end of for (inCh = 0; inCh < numInCh; inCh++)*/
+
+      /* Pack, Output Scale, Output Shift and clamping: each accumulator yields a low (L) and high (H) output vector */
+      xb_vec2Nx8 dvecOut1L, dvecOut2L;
+      xb_vec2Nx8 dvecOut1H, dvecOut2H;
+#if DILATED_VQ_CONV == VQ_TRUE
+      PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \
+                                    pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag);
+      PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \
+                                    pOutScaleData[outCh + enable2ndCh ], outShiftU, minLim, maxLim, typeFlag);
+#elif DILATED_VQ_CONV == VQ_FALSE
+      PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \
+                                    outScale, outShiftU, minLim, maxLim, typeFlag);
+      PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \
+                                    outScale, outShiftU, minLim, maxLim, typeFlag);
+#endif
+      /* Storing the first output depth, first row (outW pixels from the start of the packed result) */
+      pdvecOut = (xb_vec2Nx8 *) (pOutput);
+      valign vaOutData = IVP_ZALIGN();
+      IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * outW);
+      IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+      /* Storing the first output depth, second row: selected from lane inDataPitch1 * bytesPerPixel on; length is 0 when enable2Row is 0 */
+      pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch1 * enable2Row * bytesPerPixel);
+      IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut1H, dvecOut1L, IVP_ADD2NX8(IVP_SEQ2NX8(), inDataPitch1 * bytesPerPixel)), \
+                     vaOutData, pdvecOut, bytesPerPixel * enable2Row * outW);
+      IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+      /* Storing the second output depth, first row: length is 0 when enable2ndCh is 0 */
+      pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch2 * enable2ndCh * bytesPerPixel);
+      IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * enable2ndCh * outW);
+      IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+      /* Storing the second output depth, second row: length is 0 unless both enable2ndCh and enable2Row are 1 */
+      pdvecOut = (xb_vec2Nx8 *) (pOutput + (outDataPitch2 * enable2ndCh + \
+                                            outDataPitch1 * enable2Row) * bytesPerPixel);
+      IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut2H, dvecOut2L, IVP_ADD2NX8(IVP_SEQ2NX8(), inDataPitch1 * bytesPerPixel)),
+                     vaOutData, pdvecOut, bytesPerPixel * enable2ndCh * \
+                     enable2Row * outW);
+      IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+      pOutput += 2 * outDataPitch2 * bytesPerPixel; /* advance two output channels */
+      pCoeff += 2 * coeffPitch3; /* advance two kernels */
+    } /* end of for (outCh = 0; outCh < numOutCh; outCh += 2)*/
+  } /* end of for (y = 0; y < outH; y += 2)*/
+}
+
+/*****************************************************************************
+* xaiConvolved(VQ)3D_S_4x4j1d1I8S8IX_MOW_WHD
+* **************************************************************************/
+/********************************************************************************/
+/* Description : P6 optimized generic implementation for 4x4 3D convolution with*/
+/* 
dilation = 1. Based on MORPH pre-processor specifiers, code */ +/* implementation is generated during preprocessing stage. This */ +/* method can be used to generate 4x4 3D convolution dilated */ +/* function and 4x4 3D VQ convolution dilated function for U8 bit */ +/* and S8 bit input data with input stride equal to 1 */ +/* Inputs : Input Data Tile, Coeff Data Tile, Bias Array, */ +/* Output scale array, CNN convolution params structure */ +/* Outputs : XI Error Code */ +/* InOuts : Output Tile */ +/* Assumptions : CoeffData is S8 */ +/* biasArray is signed 32b, value not exceeding signed 24b */ +/* Output scale array is U16 */ +/* OutData is S8 / U8 / S16 */ +/* Kernel Size is 4x4xDxN */ +/* Input and Output are in WHD format */ +/* Coeff is in WHDN format */ +/********************************************************************************/ + +/****************** xaiConvolvedVQ3D_S_4x4j1d1_S8S8IX_MOW_WHD ******************/ +/****************** xaiConvolvedVQ3D_S_4x4j1d1_U8S8IX_MOW_WHD ******************/ +/******************* xaiConvolved3D_S_4x4j1d1_S8S8IX_MOW_WHD *******************/ +/******************* xaiConvolved3D_S_4x4j1d1_U8S8IX_MOW_WHD *******************/ + +XAI_ERR_TYPE MAKE_NAME(MAKE_NAME_VQ(xaiConvolved, 3D_S_4x4j1d1), S8IX_MOW_WHD) MAKE_ARGUMENTS(inTile, coeffTile, biasArray, outTile, param) +{ + /* Error Checks */ + XAI_ERROR_CHECKS() + { + MORPH_IDT_CHECK(inTile); + XAI_CHECK_CONV_OUTPUT_TILE3D(outTile); + XAI_CHECK_TILE4D_S8(coeffTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile); + XAI_CHECK_TILE4D_IN_DRAM_BOUNDARY(coeffTile); + XAI_CHECK_POINTER(param); + XAI_CHECK_ARRAY_S32(biasArray); + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTile); + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(coeffTile, outTile); + XAI_CHECK_KERNEL_SIZE(coeffTile, 4); + XAI_CHECK_TILE3D_DATA_ORDER(inTile, XAI_WHD); + XAI_CHECK_TILE3D_DATA_ORDER(outTile, XAI_WHD); + XAI_CHECK_TILE4D_DATA_ORDER(coeffTile, XAI_WHDN); + 
XAI_CHECK_DILATION(param, 1); + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_DILATIONX(param) == XAI_CNN_CONV_GET_DILATIONY(param), \ + XAI_ERR_BADARG, "Dilation along width = %hhu and height = %hhu\nDilation along width and height should be equal", \ + XAI_CNN_CONV_GET_DILATIONX(param), XAI_CNN_CONV_GET_DILATIONY(param)); + XAI_CHECK_STRIDE(param, 1); + XAI_CHECK_ERROR((XAI_CNN_CONV_GET_STRIDEX(param) == XAI_CNN_CONV_GET_STRIDEY(param)), \ + XAI_ERR_BADARG, "\nStride along width = %hhu and height = %hhu\nStride along width and height should be equal", \ + XAI_CNN_CONV_GET_STRIDEX(param), XAI_CNN_CONV_GET_STRIDEY(param)); + XAI_CHECK_EDGES_MOW_WHD(inTile, coeffTile, param); + XAI_CHECK_CONSISTENCY_MOW_WHD(inTile, coeffTile, biasArray, outTile, param); + XAI_CHECK_COEFFTILE_CONTIGUOUS(coeffTile, param); + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_ACCUM_SHIFT(param) < 24, \ + XAI_ERR_NORM, "\nThe accumulator shift = %hhu, value should be less than 24", \ + XAI_CNN_CONV_GET_ACCUM_SHIFT(param)); + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_OUTPUT_SHIFT(param) < 32, \ + XAI_ERR_NORM, "\nThe output shift = %hhu, value should be less than 32", \ + XAI_CNN_CONV_GET_OUTPUT_SHIFT(param)); + XAI_CHECK_CONV_RELU_LIMITS_IX(param, outTile); +#if DILATED_VQ_CONV == VQ_TRUE + XAI_CHECK_ARRAY_U16(outputScaleArray); + XAI_CHECK_ERROR(XAI_ARRAY_GET_WIDTH(outputScaleArray) >= XAI_TILE4D_GET_DIM4(coeffTile), \ + XAI_ERR_DATASIZE, "\nWidth of Output Scale Array = %d, Number of Kernels = %d\nWidth of Output Scale Array should be greater than or equal to Number of Kernels", \ + XAI_ARRAY_GET_WIDTH(outputScaleArray), XAI_TILE4D_GET_DIM4(coeffTile)); + XAI_CHECK_ERROR((((uintptr_t) (XAI_ARRAY_GET_DATA_PTR(outputScaleArray)) & \ + 0x1) == 0), XAI_ERR_NORM, "The output scale array is not aligned to 2 byte boundary"); +#endif + } +#if DILATED_VQ_CONV == VQ_FALSE + if (XAI_CNN_CONV_GET_OUTPUT_SCALE(param) == 0) + { + int32_t fillValue; + int32_t reluFlag = XAI_CNN_CONV_GET_FLAG_RELU(param); + fillValue = reluFlag ? 
(CLAMP(0, XAI_CNN_CONV_GET_RELU_MIN(param), XAI_CNN_CONV_GET_RELU_MAX(param))) : 0; + return(xaiFillTile3D(outTile, fillValue, 0)); + } +#endif + + /* check inDataPitch1, if it is less than or equal to 16, + * call FOLD32 variant otherwise continue + */ + + if (XAI_TILE3D_GET_DIM1_PITCH(inTile) <= 16) + { + MAKE_NAME(MAKE_NAME_VQ(convolved, 3D_S_4x4j1d1), S8IX_MOW_WHD_FOLD16) MAKE_PARAMS(inTile, coeffTile, biasArray, outTile, param); + return(XAI_ERROR_STATUS()); + } + + /* check inDataPitch1, if it is less than or equal to 32, + * call FOLD32 variant otherwise continue + */ + + if (XAI_TILE3D_GET_DIM1_PITCH(inTile) <= 32) + { + MAKE_NAME(MAKE_NAME_VQ(convolved, 3D_S_4x4j1d1), S8IX_MOW_WHD_FOLD32) MAKE_PARAMS(inTile, coeffTile, biasArray, outTile, param); + return(XAI_ERROR_STATUS()); + } + + /* Getting parameters from the tile structures */ + const int32_t outW = XAI_TILE3D_GET_DIM1(outTile); + const int32_t outH = XAI_TILE3D_GET_DIM2(outTile); + const int32_t numInCh = XAI_TILE3D_GET_DIM3(inTile); + const int32_t numOutCh = XAI_TILE3D_GET_DIM3(outTile); + + /* Pitches of Coefficient Data (WHDN) in dim1 and dim3 */ + const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile); + + /* Pitches of Input Data (WHD) in dim1 and dim2 */ + const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile); + const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile); + + /* Pitch of Output Data (WHD) in dim1 and dim2 */ + const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile); + const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile); + + /* Kernel Size (WHDN)*/ + const int32_t kSizeU = XAI_TILE4D_GET_DIM1(coeffTile); + + /* CNN convolution parameters */ + const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param); + const uint8_t outShiftU = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param); + const uint8_t enableReLu = XAI_CNN_CONV_GET_FLAG_RELU(param); + + /* Data Pointers of input, output, coefficient and bias data */ + MORPH_IDT_SCALAR* pInData = 
(MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(inTile); + int8_t* pOutData = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile); + int32_t* pBiasData = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray); + int8_t* pCoeffData = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile); +#if DILATED_VQ_CONV == VQ_TRUE + uint16_t* restrict pOutScaleData = (uint16_t *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray); +#elif DILATED_VQ_CONV == VQ_FALSE + const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param); +#endif + const uint8_t leftEdgeFlag = XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param); + const uint8_t topEdgeFlag = XAI_CNN_CONV_GET_FLAG_TOPEDGE(param); + int32_t leftEdge, topEdge; + + leftEdge = leftEdgeFlag ? (kSizeU / 2) : ((kSizeU / 2) - 1); + topEdge = topEdgeFlag ? (kSizeU / 2) : ((kSizeU / 2) - 1); + + /* Move pointer to the start of the data (including edge) */ + pInData = &pInData[-(topEdge * inDataPitch1 + leftEdge)]; + + /* Setting the limits for output data according to ReLu Flag and outTileType */ + int32_t minLim, maxLim; + if (enableReLu) + { + minLim = XAI_CNN_CONV_GET_RELU_MIN(param); + maxLim = XAI_CNN_CONV_GET_RELU_MAX(param); + } + else + { + minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \ + SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0); + maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \ + : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX); + } + const int8_t typeFlag = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 
1 : 0; + const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile); + + MORPH_IDT_2Nx8 *restrict pdvecIn; + xb_vec2Nx8* restrict pdvecOut; + xb_vec2Nx8* restrict pdvecCoeff1; + xb_vec2Nx8* restrict pdvecCoeff2; + + /* Variable Declarations */ + int32_t inCh, outCh, x, y; + /* In order to make the loop multiply-bound we are reducing the vectorization width + by extra values required for the kernel */ + const int32_t vectorizationWidth = ((2 * XCHAL_IVPN_SIMD_WIDTH) - kSizeU) + 1; + + /* loop across output channels and output height are unrolled twice + * to produce four output vectors in 1 iteration + */ + for (x = 0; x < outW; x += vectorizationWidth) /* Loop across output width */ + { + for (y = 0; y < outH; y++) /* Loop across output height */ + { + /* in order to handle odd output height */ + + /* initialize output data pointer */ + int8_t *pOutput = &pOutData[(y * outDataPitch1 + x) * bytesPerPixel]; + + /* initialize input data pointer */ + MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * y + x]; + + /* initialize coeff and Bias data pointer */ + int8_t *pCoeff = &pCoeffData[0]; + int32_t *pBias = &pBiasData[0]; + + for (outCh = 0; outCh < numOutCh; outCh += 2) /* Loop across Output depth */ + { + /* In order to handle odd output depths */ + int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1); + + /* Load the bias values corresponding to two output channels */ + xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4 * enable2ndCh); + xb_vecN_2x32v hvecBias2; IVP_LSRN_2X32_XP(hvecBias2, pBias, 4); + + /* wide vectors(accumulators) initialized with bias */ + xb_vec2Nx24 dacc11, dacc21; + + dacc11 = IVP_CVT24UNX32L(hvecBias1, hvecBias1); + IVP_CVT24UNX32H(dacc11, hvecBias1, hvecBias1); + + dacc21 = IVP_CVT24UNX32L(hvecBias2, hvecBias2); + IVP_CVT24UNX32H(dacc21, hvecBias2, hvecBias2); + + /* priming of coeff load is done outside the innermost loop*/ + pdvecCoeff1 = (xb_vec2Nx8 *) (pCoeff); + valign vaCoeffData1; vaCoeffData1 = 
IVP_LA2NX8_PP(pdvecCoeff1); + + pdvecCoeff2 = (xb_vec2Nx8 *) (pCoeff + coeffPitch3 * enable2ndCh); + valign vaCoeffData2; vaCoeffData2 = IVP_LA2NX8_PP(pdvecCoeff2); + + /* Input vector pointer initialization */ + pdvecIn = (MORPH_IDT_2Nx8 *) (pInput); + + for (inCh = 0; inCh < numInCh; inCh++) /* Loop across input channels */ + { + /* vectors for coeff and input loads */ + xb_vec2Nx8 dvecCoeffData1, dvecCoeffData2; + MORPH_IDT_2Nx8 dvecInData1, dvecInData2, dvecInData3, dvecInData4; + + /* load data from first input row */ + valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn); + MORPH_OP_LOAD_2Nx8(dvecInData1, vaInData, pdvecIn, inDataPitch1); + + /* load data from 2nd input row */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn); + MORPH_OP_LOAD_2Nx8(dvecInData2, vaInData, pdvecIn, inDataPitch1); + + /* load data from 3rd input row */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn); + MORPH_OP_LOAD_2Nx8(dvecInData3, vaInData, pdvecIn, inDataPitch1); + + /* load data from 4th input row */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn); + MORPH_OP_LOAD_2Nx8(dvecInData4, vaInData, pdvecIn, inDataPitch2 - (3 * inDataPitch1)); + + /* load all the 4x4 coefficients for 2 output depths*/ + IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, 16); + IVP_LAV2NX8_XP(dvecCoeffData2, vaCoeffData2, pdvecCoeff2, 16); + + /* Multiply and accumulate 1st set of 4 coefficients for all the outputs */ + MORPH_OP_MUL4TA(dacc11, dvecInData1, dvecInData1, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + + MORPH_OP_MUL4TA(dacc21, dvecInData1, dvecInData1, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 0)); + + /* Multiply and accumulate 2nd set of 4 coefficients for all the outputs */ + MORPH_OP_MUL4TA(dacc11, dvecInData2, dvecInData2, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + MORPH_OP_MUL4TA(dacc21, dvecInData2, dvecInData2, IVP_EXTRN_2X32( \ + 
IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 1)); + + /* Multiply and accumulate 3rd set of 4 coefficients for all the outputs */ + MORPH_OP_MUL4TA(dacc11, dvecInData3, dvecInData3, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 2)); + + MORPH_OP_MUL4TA(dacc21, dvecInData3, dvecInData3, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 2)); + + /* Multiply and accumulate 3rd set of 4 coefficients for all the outputs */ + MORPH_OP_MUL4TA(dacc11, dvecInData4, dvecInData4, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 3)); + + MORPH_OP_MUL4TA(dacc21, dvecInData4, dvecInData4, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 3)); + } /* end of for (inCh = 0; inCh < numInCh; inCh++)*/ + + + /* Pack, Output Scale, Output Shift and clamping */ + xb_vec2Nx8 dvecOut1L, dvecOut2L; + xb_vec2Nx8 dvecOut1H, dvecOut2H; +#if DILATED_VQ_CONV == VQ_TRUE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc11, packShiftAccU, \ + pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc21, packShiftAccU, \ + pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag); +#elif DILATED_VQ_CONV == VQ_FALSE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc11, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc21, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); +#endif + /* variable length for output stores */ + int32_t varLen = XT_MIN(vectorizationWidth, outW - x); + + /* Storing the first output depth, first row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput); + valign vaOutData = IVP_ZALIGN(); + IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * varLen); + IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * \ + 2 * (varLen - 
XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the second output depth, first row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch2 * enable2ndCh * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * enable2ndCh * varLen); + IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * enable2ndCh * \ + 2 * (varLen - XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + pOutput += 2 * outDataPitch2 * bytesPerPixel; + pCoeff += 2 * coeffPitch3; + } /* end of for (outCh = 0; outCh < numOutCh; outCh += 2)*/ + } /* end of for (y = 0; y < outH; y += 2)*/ + } /* end of for (x = 0; x < outW; x += vectorizationWidth)*/ + return(XAI_ERROR_STATUS()); +} + +/****************************************************************************************** +* 5x5 MOW WHD Stride 1 - DEPTH 3 * +* If number of input channels is equal to 3 * +* this function is called. * +******************************************************************************************/ + +static _XAI_INLINE_ void MAKE_NAME(MAKE_NAME_VQ(convolved, 3D_S_5x5j1d1), S8IX_MOW_WHD_DEPTH3) MAKE_ARGUMENTS(inTile, coeffTile, biasArray, outTile, param) +{ + /* Getting parameters from the tile structures */ + const int32_t outW = XAI_TILE3D_GET_DIM1(outTile); + const int32_t outH = XAI_TILE3D_GET_DIM2(outTile); + const int32_t numOutCh = XAI_TILE3D_GET_DIM3(outTile); + + /* Pitch of Coefficient Data (WHDN) in dim3 */ + const int32_t coeffPitch2 = XAI_TILE4D_GET_DIM2_PITCH(coeffTile); + const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile); + + /* Pitches of Input Data (WHD) in dim1 and dim2 */ + const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile); + const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile); + + /* Pitch of Output Data (WHD) in dim1 and dim2 */ + const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile); + const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile); + + /* Kernel Size 
(WHDN) */ + const int32_t kSizeU = XAI_TILE4D_GET_DIM1(coeffTile); + + /* CNN convolution parameters */ + const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param); + const uint8_t outShiftU = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param); + const uint8_t enableReLu = XAI_CNN_CONV_GET_FLAG_RELU(param); + + /* Data Pointers of input, output, coefficient and bias data */ + MORPH_IDT_SCALAR* pInData = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(inTile); + int8_t* pOutData = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile); + int32_t* pBiasData = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray); + int8_t* pCoeffData = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile); +#if DILATED_VQ_CONV == VQ_TRUE + uint16_t* restrict pOutScaleData = (uint16_t *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray); +#elif DILATED_VQ_CONV == VQ_FALSE + const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param); +#endif + /* Move pointer to the start of the data (including edge) */ + pInData = &pInData[-((kSizeU / 2) * inDataPitch1 + (kSizeU / 2))]; + + /* Setting the limits for output data according to ReLu Flag and outTileType */ + int32_t minLim, maxLim; + if (enableReLu) + { + minLim = XAI_CNN_CONV_GET_RELU_MIN(param); + maxLim = XAI_CNN_CONV_GET_RELU_MAX(param); + } + else + { + minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \ + SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0); + maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \ + : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX); + } + const int8_t typeFlag = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 
1 : 0; + const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile); + + MORPH_IDT_2Nx8* restrict pdvecIn; + xb_vec2Nx8* restrict pdvecOut; + xb_vec2Nx8* restrict pdvecCoeff; + + /* Variable Declarations */ + int32_t outCh, x, y; + int32_t varLen; + + /* Vectorization width is 124 */ + const int32_t vectorizationWidth = ((4 * XCHAL_IVPN_SIMD_WIDTH) - kSizeU) + 1; + + /* The number of inchannels is 3. In the implementation 3 channels of + * coefficient tile is loaded into two vectors and select operation + * the values are arranged so that quad muls can be used. + * First load(dveccoeff1) is used to load two channels of 5x5 coefficient + * Second load(dveccoeff2) to load the third channel. + * In dveccoeff1, 0 to 24 indices corresponds to channel 1 and 25 to 49 + * corresponds to channel 2. + * Select pattern : + * Pattern 1 : + * 0,1,2,3 ,5,6,7,8 ,10,11,12,13 ,15,16,17,18 ,20,21,22,23 , + * 25,26,27,28, 30,31,32,33, 35,36,37,38, 40,41,42,43, 45,46,47,48 + * + * Pattern 2 : + * 0,1,2,3 ,5,6,7,8 ,10,11,12,13 ,15,16,17,18, 20,21,22,23 , + * 4,9,14,19, (4,9,14,19)+64, (4,9,14,19)+64+25, 24, 24+64, 24+64+25 + */ + + + + xb_vec2Nx8 dvecPattern1, dvecPattern2, dvecTempPattern, dvecSelPattern; + /*Pattern1 : 0,1,2,3 ,5,6,7,8 ,10,11,12,13 ,15,16,17,18 ,20,21,22,23 , + 25,26,27,28, 30,31,32,33, 35,36,37,38, 40,41,42,43, 45,46,47,48 */ + dvecPattern1 = IVP_ADD2NX8(IVP_SEQ2NX8(), IVP_SRLI2NX8(IVP_SEQ2NX8(), 2)); + + /*Pattern2 : 0,1,2,3 ,5,6,7,8 ,10,11,12,13 ,15,16,17,18, 20,21,22,23 , + 4,9,14,19, (4,9,14,19)+64, (4,9,14,19)+64+25, 24, 24+64, 24+64+25 */ + + /* dvecTempPattern 4,9,14,19, (4,9,14,19)+64, (4,9,14,19)+64+25*/ + dvecTempPattern = IVP_SLLI2NX8(IVP_ADD2NX8(IVP_AND2NX8(IVP_SEQ2NX8(), 3), 1), 2); + dvecTempPattern = IVP_ADD2NX8(dvecTempPattern, IVP_AND2NX8(IVP_SEQ2NX8(), 3)); + IVP_ADD2NX8T(dvecTempPattern, dvecTempPattern, (2 * XCHAL_IVPN_SIMD_WIDTH), IVP_NOTB(IVP_LTR2N(4))); + IVP_ADD2NX8T(dvecTempPattern, dvecTempPattern, 25, IVP_NOTB(IVP_LTR2N(8))); + 
dvecSelPattern = IVP_SEQ2NX8(); + IVP_ADD2NX8T(dvecSelPattern, dvecSelPattern, ((xb_vec2Nx8U) (2 * XCHAL_IVPN_SIMD_WIDTH - 20)), IVP_NOTB(IVP_LTR2N(20))); + dvecPattern2 = IVP_SEL2NX8(dvecTempPattern, dvecPattern1, dvecSelPattern); + dvecSelPattern = IVP_SEQ2NX8(); + IVP_ADD2NX8T(dvecSelPattern, dvecSelPattern, ((xb_vec2Nx8U) (2 * XCHAL_IVPN_SIMD_WIDTH - 32)), IVP_NOTB(IVP_LTR2N(32))); + dvecPattern2 = IVP_SEL2NX8(IVP_MOV2NX8_FROMNX16(IVP_MOVNX16_FROMN_2X32 \ + (24 + (88 << 8) + (113 << 16))), \ + dvecPattern2, + dvecSelPattern); + + /* loop across output height is unrolled twice and loops across inchannels, + * kernel width and kernel height are completely unrolled + */ + for (x = 0; x < outW; x += vectorizationWidth) /* Loop across output width */ + { + varLen = XT_MIN(vectorizationWidth, outW - x); + /* In order to handle cases where input width <= 2*XCHAL_IVPN_SIMD_WIDTH, where + * the 2nd load from the same row needs to be avoided. */ + int32_t enable2ndCol = XT_SALT(2 * XCHAL_IVPN_SIMD_WIDTH, varLen + kSizeU - 1); + + for (y = 0; y < outH; y += 2) /* Loop across output height */ + { + /* In order to handle odd output height */ + int32_t enable2ndRow = XT_SALT(y, outH - 1); + + /* initialize output data pointer */ + int8_t *pOutput = &pOutData[(y * outDataPitch1 + x) * bytesPerPixel]; + + /* initialize input data pointer */ + MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * y + x]; + + /* initialize coeff and Bias data pointer*/ + int8_t *pCoeff = &pCoeffData[0]; + int32_t *pBias = &pBiasData[0]; + + for (outCh = 0; outCh < numOutCh; outCh++) /* Loop across Output depth */ + { + /* load and replicate bias data */ + xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4); + + /* wide vectors(accumulators) initialized with bias */ + xb_vec2Nx24 dacc1, dacc2, dacc3, dacc4; + dacc1 = dacc2 = dacc3 = dacc4 = IVP_CVT24UNX32L(hvecBias1, hvecBias1); + IVP_CVT24UNX32H(dacc1, hvecBias1, hvecBias1); + IVP_CVT24UNX32H(dacc2, hvecBias1, hvecBias1); + 
IVP_CVT24UNX32H(dacc3, hvecBias1, hvecBias1); + IVP_CVT24UNX32H(dacc4, hvecBias1, hvecBias1); + + /* Coefficient and Input data pointers */ + pdvecCoeff = (xb_vec2Nx8 *) (pCoeff); + pdvecIn = (MORPH_IDT_2Nx8 *) pInput; + + /* vectors for coeff and input loads */ + xb_vec2Nx8 dvecCoeffData1, dvecCoeffData2; + MORPH_IDT_2Nx8 dvecInData11, dvecInData21, dvecInData31, dvecInData41, dvecInData51; + MORPH_IDT_2Nx8 dvecInData61, dvecInData71, dvecInData81, dvecInData91, dvecInDataA1; + MORPH_IDT_2Nx8 dvecInData12, dvecInData22, dvecInData32, dvecInData42, dvecInData52; + MORPH_IDT_2Nx8 dvecInData62, dvecInData72, dvecInData82, dvecInData92, dvecInDataA2; + + /* load 5x5 coefficients from three channels*/ + valign vaCoeffData; vaCoeffData = IVP_LA2NX8_PP(pdvecCoeff); + IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData, pdvecCoeff, 2 * coeffPitch2); + IVP_LA2NX8_IP(dvecCoeffData2, vaCoeffData, pdvecCoeff); + + /* Rearrange them so that 3 x 4 MUL4T, 4 MULQ can be used to perform entire operation */ + dvecCoeffData2 = IVP_SEL2NX8(dvecCoeffData1, dvecCoeffData2, dvecPattern2); + dvecCoeffData1 = IVP_SHFL2NX8(dvecCoeffData1, dvecPattern1); + + /* Input Channel 1*/ + /* load data from first input row */ + valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn); + MORPH_OP_LOAD_2Nx8(dvecInData11, vaInData, pdvecIn, \ + enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH); + MORPH_OP_LOAD_2Nx8(dvecInData12, vaInData, pdvecIn, \ + inDataPitch1 - enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH); + + /* load data from 2nd input row */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn); + MORPH_OP_LOAD_2Nx8(dvecInData21, vaInData, pdvecIn, \ + enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH); + MORPH_OP_LOAD_2Nx8(dvecInData22, vaInData, pdvecIn, \ + inDataPitch1 - enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH); + + /* load data from 3rd input row */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn); + MORPH_OP_LOAD_2Nx8(dvecInData31, vaInData, pdvecIn, \ + enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH); + MORPH_OP_LOAD_2Nx8(dvecInData32, vaInData, 
pdvecIn, \ + inDataPitch1 - enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH); + + /* load data from 4th input row */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn); + MORPH_OP_LOAD_2Nx8(dvecInData41, vaInData, pdvecIn, \ + enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH); + MORPH_OP_LOAD_2Nx8(dvecInData42, vaInData, pdvecIn, \ + inDataPitch1 - enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH); + + /* load data from 5th input row */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn); + MORPH_OP_LOAD_2Nx8(dvecInData51, vaInData, pdvecIn, \ + enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH); + MORPH_OP_LOAD_2Nx8(dvecInData52, vaInData, pdvecIn, \ + inDataPitch1 * enable2ndRow \ + - enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH); + + /* load data from 6th input row */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn); + MORPH_OP_LOAD_2Nx8(dvecInData61, vaInData, pdvecIn, \ + enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH); + MORPH_OP_LOAD_2Nx8(dvecInData62, vaInData, pdvecIn, \ + inDataPitch2 - (4 + enable2ndRow) * inDataPitch1 \ + - enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH); + + + /* Multiply and accumulate 1st set of 4 coefficients for all the outputs */ + MORPH_OP_MUL4TA(dacc1, dvecInData12, dvecInData11, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc2, dvecInData12, dvecInData12, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc3, dvecInData22, dvecInData21, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc4, dvecInData22, dvecInData22, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + + /* Multiply and accumulate 2nd set of 4 coefficients for all the outputs */ + MORPH_OP_MUL4TA(dacc1, dvecInData22, dvecInData21, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + MORPH_OP_MUL4TA(dacc2, dvecInData22, dvecInData22, IVP_EXTRN_2X32( \ + 
IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + MORPH_OP_MUL4TA(dacc3, dvecInData32, dvecInData31, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + MORPH_OP_MUL4TA(dacc4, dvecInData32, dvecInData32, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + /* Multiply and accumulate 3rd set of 4 coefficients for all the outputs */ + MORPH_OP_MUL4TA(dacc1, dvecInData32, dvecInData31, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 2)); + MORPH_OP_MUL4TA(dacc2, dvecInData32, dvecInData32, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 2)); + MORPH_OP_MUL4TA(dacc3, dvecInData42, dvecInData41, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 2)); + MORPH_OP_MUL4TA(dacc4, dvecInData42, dvecInData42, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 2)); + + /* Multiply and accumulate 4th set of 4 coefficients for all the outputs */ + MORPH_OP_MUL4TA(dacc1, dvecInData42, dvecInData41, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 3)); + MORPH_OP_MUL4TA(dacc2, dvecInData42, dvecInData42, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 3)); + MORPH_OP_MUL4TA(dacc3, dvecInData52, dvecInData51, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 3)); + MORPH_OP_MUL4TA(dacc4, dvecInData52, dvecInData52, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 3)); + + /* Multiply and accumulate 5th set of 4 coefficients for all the outputs */ + MORPH_OP_MUL4TA(dacc1, dvecInData52, dvecInData51, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 4)); + MORPH_OP_MUL4TA(dacc2, dvecInData52, dvecInData52, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 4)); + 
MORPH_OP_MUL4TA(dacc3, dvecInData62, dvecInData61, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 4)); + MORPH_OP_MUL4TA(dacc4, dvecInData62, dvecInData62, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 4)); + + /* Multiply and accumulate 6th set of 4 coefficients for all the outputs */ + MORPH_OP_MULQA(dacc1, IVP_SEL2NX8I(dvecInData42, dvecInData41, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_SEL2NX8I(dvecInData32, dvecInData31, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_SEL2NX8I(dvecInData22, dvecInData21, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_SEL2NX8I(dvecInData12, dvecInData11, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 6)); + + MORPH_OP_MULQA(dacc2, IVP_SEL2NX8I(dvecInData42, dvecInData42, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_SEL2NX8I(dvecInData32, dvecInData32, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_SEL2NX8I(dvecInData22, dvecInData22, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_SEL2NX8I(dvecInData12, dvecInData12, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 6)); + + MORPH_OP_MULQA(dacc3, IVP_SEL2NX8I(dvecInData52, dvecInData51, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_SEL2NX8I(dvecInData42, dvecInData41, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_SEL2NX8I(dvecInData32, dvecInData31, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_SEL2NX8I(dvecInData22, dvecInData21, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 6)); + + MORPH_OP_MULQA(dacc4, IVP_SEL2NX8I(dvecInData52, dvecInData52, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_SEL2NX8I(dvecInData42, dvecInData42, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_SEL2NX8I(dvecInData32, dvecInData32, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_SEL2NX8I(dvecInData22, dvecInData22, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 6)); + + /* Input data in channel 1, corresponding to 24th 
coefficient */ + dvecInData71 = IVP_SEL2NX8I(dvecInData52, dvecInData51, IVP_SELI_8B_ROTATE_RIGHT_4); + dvecInData72 = IVP_SEL2NX8I(dvecInData52, dvecInData52, IVP_SELI_8B_ROTATE_RIGHT_4); + dvecInData81 = IVP_SEL2NX8I(dvecInData62, dvecInData61, IVP_SELI_8B_ROTATE_RIGHT_4); + dvecInData82 = IVP_SEL2NX8I(dvecInData62, dvecInData62, IVP_SELI_8B_ROTATE_RIGHT_4); + + /* Input Channel 2*/ + /* load data from first input row */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn); + MORPH_OP_LOAD_2Nx8(dvecInData11, vaInData, pdvecIn, \ + enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH); + MORPH_OP_LOAD_2Nx8(dvecInData12, vaInData, pdvecIn, \ + inDataPitch1 - enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH); + + /* load data from 2nd input row */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn); + MORPH_OP_LOAD_2Nx8(dvecInData21, vaInData, pdvecIn, \ + enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH); + MORPH_OP_LOAD_2Nx8(dvecInData22, vaInData, pdvecIn, \ + inDataPitch1 - enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH); + + /* load data from 3rd input row */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn); + MORPH_OP_LOAD_2Nx8(dvecInData31, vaInData, pdvecIn, \ + enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH); + MORPH_OP_LOAD_2Nx8(dvecInData32, vaInData, pdvecIn, \ + inDataPitch1 - enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH); + + /* load data from 4th input row */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn); + MORPH_OP_LOAD_2Nx8(dvecInData41, vaInData, pdvecIn, \ + enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH); + MORPH_OP_LOAD_2Nx8(dvecInData42, vaInData, pdvecIn, \ + inDataPitch1 - enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH); + + /* load data from 5th input row */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn); + MORPH_OP_LOAD_2Nx8(dvecInData51, vaInData, pdvecIn, \ + enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH); + MORPH_OP_LOAD_2Nx8(dvecInData52, vaInData, pdvecIn, \ + inDataPitch1 * enable2ndRow \ + - enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH); + + /* load data from 6th input row */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn); + 
MORPH_OP_LOAD_2Nx8(dvecInData61, vaInData, pdvecIn, \ + enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH); + MORPH_OP_LOAD_2Nx8(dvecInData62, vaInData, pdvecIn, \ + inDataPitch2 - (4 + enable2ndRow) * inDataPitch1 \ + - enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH); + + /* Multiply and accumulate 1st set of 4 coefficients for all the outputs */ + MORPH_OP_MUL4TA(dacc1, dvecInData12, dvecInData11, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 5)); + MORPH_OP_MUL4TA(dacc2, dvecInData12, dvecInData12, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 5)); + MORPH_OP_MUL4TA(dacc3, dvecInData22, dvecInData21, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 5)); + MORPH_OP_MUL4TA(dacc4, dvecInData22, dvecInData22, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 5)); + + /* Multiply and accumulate 2nd set of 4 coefficients for all the outputs */ + MORPH_OP_MUL4TA(dacc1, dvecInData22, dvecInData21, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 6)); + MORPH_OP_MUL4TA(dacc2, dvecInData22, dvecInData22, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 6)); + MORPH_OP_MUL4TA(dacc3, dvecInData32, dvecInData31, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 6)); + MORPH_OP_MUL4TA(dacc4, dvecInData32, dvecInData32, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 6)); + + /* Multiply and accumulate 3rd set of 4 coefficients for all the outputs */ + MORPH_OP_MUL4TA(dacc1, dvecInData32, dvecInData31, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 7)); + MORPH_OP_MUL4TA(dacc2, dvecInData32, dvecInData32, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 7)); + MORPH_OP_MUL4TA(dacc3, dvecInData42, dvecInData41, IVP_EXTRN_2X32( \ + 
IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 7)); + MORPH_OP_MUL4TA(dacc4, dvecInData42, dvecInData42, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 7)); + + /* Multiply and accumulate 4th set of 4 coefficients for all the outputs */ + MORPH_OP_MUL4TA(dacc1, dvecInData42, dvecInData41, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 8)); + MORPH_OP_MUL4TA(dacc2, dvecInData42, dvecInData42, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 8)); + MORPH_OP_MUL4TA(dacc3, dvecInData52, dvecInData51, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 8)); + MORPH_OP_MUL4TA(dacc4, dvecInData52, dvecInData52, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 8)); + + /* Multiply and accumulate 5th set of 4 coefficients for all the outputs */ + MORPH_OP_MUL4TA(dacc1, dvecInData52, dvecInData51, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 9)); + MORPH_OP_MUL4TA(dacc2, dvecInData52, dvecInData52, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 9)); + MORPH_OP_MUL4TA(dacc3, dvecInData62, dvecInData61, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 9)); + MORPH_OP_MUL4TA(dacc4, dvecInData62, dvecInData62, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 9)); + + /* Multiply and accumulate 6th set of 4 coefficients for all the outputs */ + MORPH_OP_MULQA(dacc1, IVP_SEL2NX8I(dvecInData42, dvecInData41, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_SEL2NX8I(dvecInData32, dvecInData31, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_SEL2NX8I(dvecInData22, dvecInData21, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_SEL2NX8I(dvecInData12, dvecInData11, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 7)); + + MORPH_OP_MULQA(dacc2, 
IVP_SEL2NX8I(dvecInData42, dvecInData42, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_SEL2NX8I(dvecInData32, dvecInData32, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_SEL2NX8I(dvecInData22, dvecInData22, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_SEL2NX8I(dvecInData12, dvecInData12, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 7)); + + MORPH_OP_MULQA(dacc3, IVP_SEL2NX8I(dvecInData52, dvecInData51, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_SEL2NX8I(dvecInData42, dvecInData41, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_SEL2NX8I(dvecInData32, dvecInData31, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_SEL2NX8I(dvecInData22, dvecInData21, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 7)); + + MORPH_OP_MULQA(dacc4, IVP_SEL2NX8I(dvecInData52, dvecInData52, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_SEL2NX8I(dvecInData42, dvecInData42, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_SEL2NX8I(dvecInData32, dvecInData32, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_SEL2NX8I(dvecInData22, dvecInData22, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 7)); + + /* Input data in channel 2, corresponding to 24th coefficient */ + dvecInData91 = IVP_SEL2NX8I(dvecInData52, dvecInData51, IVP_SELI_8B_ROTATE_RIGHT_4); + dvecInData92 = IVP_SEL2NX8I(dvecInData52, dvecInData52, IVP_SELI_8B_ROTATE_RIGHT_4); + dvecInDataA1 = IVP_SEL2NX8I(dvecInData62, dvecInData61, IVP_SELI_8B_ROTATE_RIGHT_4); + dvecInDataA2 = IVP_SEL2NX8I(dvecInData62, dvecInData62, IVP_SELI_8B_ROTATE_RIGHT_4); + + /* Input Channel 3*/ + /* load data from first input row */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn); + MORPH_OP_LOAD_2Nx8(dvecInData11, vaInData, pdvecIn, \ + enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH); + MORPH_OP_LOAD_2Nx8(dvecInData12, vaInData, pdvecIn, \ + inDataPitch1 - enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH); + + /* load data from 2nd input row */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn); + 
MORPH_OP_LOAD_2Nx8(dvecInData21, vaInData, pdvecIn, \ + enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH); + MORPH_OP_LOAD_2Nx8(dvecInData22, vaInData, pdvecIn, \ + inDataPitch1 - enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH); + + /* load data from 3rd input row */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn); + MORPH_OP_LOAD_2Nx8(dvecInData31, vaInData, pdvecIn, \ + enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH); + MORPH_OP_LOAD_2Nx8(dvecInData32, vaInData, pdvecIn, \ + inDataPitch1 - enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH); + + /* load data from 4th input row */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn); + MORPH_OP_LOAD_2Nx8(dvecInData41, vaInData, pdvecIn, \ + enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH); + MORPH_OP_LOAD_2Nx8(dvecInData42, vaInData, pdvecIn, \ + inDataPitch1 - enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH); + + /* load data from 5th input row */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn); + MORPH_OP_LOAD_2Nx8(dvecInData51, vaInData, pdvecIn, \ + enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH); + MORPH_OP_LOAD_2Nx8(dvecInData52, vaInData, pdvecIn, \ + inDataPitch1 * enable2ndRow \ + - enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH); + + /* load data from 6th input row */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn); + MORPH_OP_LOAD_2Nx8(dvecInData61, vaInData, pdvecIn, \ + enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH); + MORPH_OP_LOAD_2Nx8(dvecInData62, vaInData, pdvecIn, \ + inDataPitch2 - (4 + enable2ndRow) * inDataPitch1 \ + - enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH); + + /* Multiply and accumulate 1st set of 4 coefficients for all the outputs */ + MORPH_OP_MUL4TA(dacc1, dvecInData12, dvecInData11, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 0)); + MORPH_OP_MUL4TA(dacc2, dvecInData12, dvecInData12, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 0)); + MORPH_OP_MUL4TA(dacc3, dvecInData22, dvecInData21, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 0)); + MORPH_OP_MUL4TA(dacc4, 
dvecInData22, dvecInData22, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 0)); + + /* Multiply and accumulate 2nd set of 4 coefficients for all the outputs */ + MORPH_OP_MUL4TA(dacc1, dvecInData22, dvecInData21, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 1)); + MORPH_OP_MUL4TA(dacc2, dvecInData22, dvecInData22, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 1)); + MORPH_OP_MUL4TA(dacc3, dvecInData32, dvecInData31, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 1)); + MORPH_OP_MUL4TA(dacc4, dvecInData32, dvecInData32, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 1)); + + /* Multiply and accumulate 3rd set of 4 coefficients for all the outputs */ + MORPH_OP_MUL4TA(dacc1, dvecInData32, dvecInData31, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 2)); + MORPH_OP_MUL4TA(dacc2, dvecInData32, dvecInData32, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 2)); + MORPH_OP_MUL4TA(dacc3, dvecInData42, dvecInData41, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 2)); + MORPH_OP_MUL4TA(dacc4, dvecInData42, dvecInData42, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 2)); + + /* Multiply and accumulate 4th set of 4 coefficients for all the outputs */ + MORPH_OP_MUL4TA(dacc1, dvecInData42, dvecInData41, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 3)); + MORPH_OP_MUL4TA(dacc2, dvecInData42, dvecInData42, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 3)); + MORPH_OP_MUL4TA(dacc3, dvecInData52, dvecInData51, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 3)); + MORPH_OP_MUL4TA(dacc4, dvecInData52, dvecInData52, IVP_EXTRN_2X32( \ + 
IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 3)); + + /* Multiply and accumulate 5th set of 4 coefficients for all the outputs */ + MORPH_OP_MUL4TA(dacc1, dvecInData52, dvecInData51, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 4)); + MORPH_OP_MUL4TA(dacc2, dvecInData52, dvecInData52, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 4)); + MORPH_OP_MUL4TA(dacc3, dvecInData62, dvecInData61, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 4)); + MORPH_OP_MUL4TA(dacc4, dvecInData62, dvecInData62, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 4)); + + /* Multiply and accumulate 6th set of 4 coefficients for all the outputs */ + MORPH_OP_MULQA(dacc1, IVP_SEL2NX8I(dvecInData42, dvecInData41, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_SEL2NX8I(dvecInData32, dvecInData31, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_SEL2NX8I(dvecInData22, dvecInData21, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_SEL2NX8I(dvecInData12, dvecInData11, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 5)); + + MORPH_OP_MULQA(dacc2, IVP_SEL2NX8I(dvecInData42, dvecInData42, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_SEL2NX8I(dvecInData32, dvecInData32, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_SEL2NX8I(dvecInData22, dvecInData22, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_SEL2NX8I(dvecInData12, dvecInData12, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 5)); + + MORPH_OP_MULQA(dacc3, IVP_SEL2NX8I(dvecInData52, dvecInData51, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_SEL2NX8I(dvecInData42, dvecInData41, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_SEL2NX8I(dvecInData32, dvecInData31, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_SEL2NX8I(dvecInData22, dvecInData21, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 5)); + + MORPH_OP_MULQA(dacc4, 
IVP_SEL2NX8I(dvecInData52, dvecInData52, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_SEL2NX8I(dvecInData42, dvecInData42, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_SEL2NX8I(dvecInData32, dvecInData32, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_SEL2NX8I(dvecInData22, dvecInData22, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 5)); + + /* Multiply and accumulate the data corresponding to 24th coefficient */ + MORPH_OP_MULQA(dacc1, 0, dvecInData91, dvecInData71, + IVP_SEL2NX8I(dvecInData52, dvecInData51, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 8)); + + MORPH_OP_MULQA(dacc2, 0, dvecInData92, dvecInData72, + IVP_SEL2NX8I(dvecInData52, dvecInData52, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 8)); + + MORPH_OP_MULQA(dacc3, 0, dvecInDataA1, dvecInData81, + IVP_SEL2NX8I(dvecInData62, dvecInData61, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 8)); + + MORPH_OP_MULQA(dacc4, 0, dvecInDataA2, dvecInData82, + IVP_SEL2NX8I(dvecInData62, dvecInData62, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 8)); + + + /* Pack, Output Scale, Output Shift and clamping */ + xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L; + xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H; +#if DILATED_VQ_CONV == VQ_TRUE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \ + pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \ + pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc3, packShiftAccU, \ + pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc4, packShiftAccU, \ 
+ pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag); +#elif DILATED_VQ_CONV == VQ_FALSE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc3, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc4, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); +#endif + /* 1st row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput); + valign vaOutData = IVP_ZALIGN(); + IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * varLen); + IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * \ + 2 * (varLen - XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * \ + (varLen - 2 * XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * \ + 2 * (varLen - 3 * XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* 2nd row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + enable2ndRow * outDataPitch1 * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * enable2ndRow * varLen); + IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * enable2ndRow * \ + 2 * (varLen - XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * enable2ndRow * \ + (varLen - 2 * XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * enable2ndRow * \ + 2 * (varLen - 3 * XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + pOutput += outDataPitch2 * bytesPerPixel; + pCoeff += coeffPitch3; + } /* end of for (outCh = 0; outCh < numOutCh; outCh++)*/ + } /* end of for (y = 0; y < outH; y += 2)*/ + } /* end of for (x = 0; x < outW; x += vectorizationWidth)*/ +} + + 
+/****************************************************************************************** +* xaiConvolved(VQ)3D_S_5x5j1d1I8S8IX_MOW_WHD_FOLD16 +* ***************************************************************************************/ + +/******************************************************************************/ +/* Description : P6 optimized generic implementation for 5x5 3D convolution. */ +/* Based on MORPH pre-processor specifiers, code implementation */ +/* is generated during preprocessing stage. This method can be */ +/* used to generate 5x5 3D dilated convolution function and 5x5 */ +/* 3D VQ dilated convolution function for U8 bit and S8 bit */ +/* input data with input stride equal to 1. */ +/* If inDataPitch1 <= 16, this function variant is called; it produces four output rows per outer-loop iteration from a single accumulator vector. */ +/* Inputs : Input Data Tile, Coeff Data Tile, Bias Array, */ +/* Output scale array, CNN convolution params structure */ +/* InOuts : Output Tile */ +/* Assumptions : CoeffData is S8 */ +/* biasArray is signed 32b, value not exceeding signed 24b */ +/* Output scale array is U16 */ +/* OutData is S8 / U8 / S16 */ +/* Kernel Size is 5x5xDxN */ +/* Input and Output are in WHD format */ +/* Coeff is in WHDN format */ +/******************************************************************************/ + +static _XAI_INLINE_ void MAKE_NAME(MAKE_NAME_VQ(convolved, 3D_S_5x5j1d1), S8IX_MOW_WHD_FOLD16) MAKE_ARGUMENTS(inTile, coeffTile, biasArray, outTile, param) +{ + /* Getting parameters from the tile structures */ + const int32_t outW = XAI_TILE3D_GET_DIM1(outTile); + const int32_t outH = XAI_TILE3D_GET_DIM2(outTile); + const int32_t numInCh = XAI_TILE3D_GET_DIM3(inTile); + const int32_t numOutCh = XAI_TILE3D_GET_DIM3(outTile); + + /* Pitches of Coefficient Data (WHDN) in dim2 and dim3 */ + const int32_t coeffPitch2 = XAI_TILE4D_GET_DIM2_PITCH(coeffTile); + const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile); + + /* Pitches of Input Data (WHD) in dim1 and dim2 */ + const int32_t inDataPitch1 =
XAI_TILE3D_GET_DIM1_PITCH(inTile); + const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile); + + /* Pitch of Output Data (WHD) in dim1 and dim2 */ + const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile); + const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile); + + /* Kernel Size (WHDN) */ + const int32_t kSizeU = XAI_TILE4D_GET_DIM1(coeffTile); + + /* CNN convolution parameters */ + const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param); + const uint8_t outShiftU = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param); + const uint8_t enableReLu = XAI_CNN_CONV_GET_FLAG_RELU(param); + + /* Data Pointers of input, output, coefficient and bias data */ + MORPH_IDT_SCALAR* pInData = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(inTile); + int8_t* pOutData = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile); + int32_t* pBiasData = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray); + int8_t* pCoeffData = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile); +#if DILATED_VQ_CONV == VQ_TRUE + uint16_t* restrict pOutScaleData = (uint16_t *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray); +#elif DILATED_VQ_CONV == VQ_FALSE + const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param); +#endif + /* Move pointer to the start of the data (including edge): step back kSizeU / 2 rows and kSizeU / 2 columns from the tile data pointer */ + pInData = &pInData[-((kSizeU / 2) * inDataPitch1 + (kSizeU / 2))]; + + /* Setting the limits for output data according to ReLu Flag and outTileType: with ReLu the limits come from param, otherwise they are the full range of the output type (S16, S8, or 0..UCHAR_MAX for anything else) */ + int32_t minLim, maxLim; + if (enableReLu) + { + minLim = XAI_CNN_CONV_GET_RELU_MIN(param); + maxLim = XAI_CNN_CONV_GET_RELU_MAX(param); + } + else + { + minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \ + SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0); + maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \ + : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX); + } + const int8_t typeFlag = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ?
1 : 0; + const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile); + + MORPH_IDT_2Nx8 * restrict pdvecIn; + xb_vec2Nx8* restrict pdvecOut; + xb_vec2Nx8* restrict pdvecCoeff1; + + /* Variable Declarations */ + int32_t inCh, outCh, y; + + /* generates the sequence 0,1,2,3 ,5,6,7,8 ,10,11,12,13 ,15,16,17,18 + * , 20,21,22,23 ,4,9,14,19 ,24. To be used to shuffle the coeff data, + * So that last coeff from first 4 rows of coeffs can be used as one + * 32 byte element and make use of quad multiplier outside the inner- + * most loop. + * c11, c12, c13, c14, c15 + * c21, c22, c23, c24, c25 + * c31, c32, c33, c34, c35 + * c41, c42, c43, c44, c45 + * c51, c52, c53, c54, c55 + * + * c15, c25, c35, c45 and c55 are placed in contiguous fashion, so that + * c15, c25, c35, and c45 can be used as one 32 byte element + */ + xb_vec2Nx8 dvecIdx; + dvecIdx = IVP_SEL2NX8I(IVP_SEL2NX8I(IVP_ADD2NX8U(IVP_SEQ2NX8(), 15), + IVP_ADD2NX8U(IVP_SEQ2NX8(), 10), + IVP_SELI_INTERLEAVE_2_LO), + IVP_SEL2NX8I(IVP_ADD2NX8U(IVP_SEQ2NX8(), 5), IVP_SEQ2NX8(), + IVP_SELI_INTERLEAVE_2_LO), + IVP_SELI_INTERLEAVE_4_LO); + + dvecIdx = IVP_SEL2NX8I(IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16( + IVP_MOVNX16_FROMN_2X32(4 + (9 << 8) + (14 << 16) + \ + (19 << 24))), + IVP_ADD2NX8U(IVP_SEQ2NX8(), 20), IVP_SELI_INTERLEAVE_2_LO), + dvecIdx, IVP_SELI_8B_PACK_16); + + /* Select sequences to re-arrange input data. NOTE(review): dvecSeq1, dvecSeq2 and dvecSeq3 are built here but are not referenced again anywhere in this function - confirm whether they are leftovers that can be dropped. */ + xb_vec2Nx8 dvecSeq1 = 0; + IVP_ADD2NX8T(dvecSeq1, IVP_SEQ2NX8(), inDataPitch1, IVP_LT2NX8(IVP_SEQ2NX8(), 3 * inDataPitch1)); + IVP_ADD2NX8T(dvecSeq1, IVP_SUB2NX8(IVP_SEQ2NX8(), 3 * inDataPitch1), 2 * XCHAL_IVPN_SIMD_WIDTH, \ + IVP_NOTB2N(IVP_LT2NX8(IVP_SEQ2NX8(), 3 * inDataPitch1))); + + xb_vec2Nx8 dvecSeq2 = 0; + IVP_ADD2NX8T(dvecSeq2, IVP_SEQ2NX8(), 2 * inDataPitch1, \ + IVP_LT2NX8(IVP_SEQ2NX8(), 2 * inDataPitch1)); + IVP_ADD2NX8T(dvecSeq2, IVP_SUB2NX8(IVP_SEQ2NX8(), 2 * inDataPitch1), 2 * XCHAL_IVPN_SIMD_WIDTH, \ + IVP_NOTB2N(IVP_LT2NX8(IVP_SEQ2NX8(), 2 * inDataPitch1))); + + xb_vec2Nx8 dvecSeq3 = 0; +
IVP_ADD2NX8T(dvecSeq3, IVP_SEQ2NX8(), 3 * inDataPitch1, \ + IVP_LT2NX8(IVP_SEQ2NX8(), 3 * inDataPitch1)); + IVP_ADD2NX8T(dvecSeq3, IVP_SUB2NX8(IVP_SEQ2NX8(), inDataPitch1), 2 * XCHAL_IVPN_SIMD_WIDTH, \ + IVP_NOTB2N(IVP_LT2NX8(IVP_SEQ2NX8(), inDataPitch1))); + + + /* loop across output height is unrolled 4 times and + * loop across kernel width and height is completely unrolled + */ + for (y = 0; y < outH; y += 4) /* Loop across output height */ + { + /* Row-enable flags for output heights that are not a multiple of 4. Each flag is multiplied into the row offset and the store byte count further down, so a disabled row stores 0 bytes at the first row address. Presumably XT_SALT(a, b) yields 1 when a < b - confirm against the Xtensa ISA reference. */ + int32_t enable2ndRow = XT_SALT(y, outH - 1); + int32_t enable3rdRow = XT_SALT(y, outH - 2); + int32_t enable4thRow = XT_SALT(y, outH - 3); + + /* initialize output data pointer */ + int8_t *pOutput = &pOutData[(y * outDataPitch1) * bytesPerPixel]; + + /* initialize coeff and Bias data pointer*/ + int8_t *pCoeff = &pCoeffData[0]; + int32_t *pBias = &pBiasData[0]; + + for (outCh = 0; outCh < numOutCh; outCh++) /* Loop across Output depth */ + { + /* load and replicate bias data; pBias advances by 4 bytes, i.e. one int32 bias per output channel */ + xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4); + + /* wide vectors(accumulators) initialized with bias */ + xb_vec2Nx24 dacc1; + dacc1 = IVP_CVT24UNX32L(hvecBias1, hvecBias1); + IVP_CVT24UNX32H(dacc1, hvecBias1, hvecBias1); + + /* priming of coeff load is done outside the innermost loop*/ + pdvecCoeff1 = (xb_vec2Nx8 *) (pCoeff); + valign vaCoeffData1; vaCoeffData1 = IVP_LA2NX8_PP(pdvecCoeff1); + + /* initialize input data pointer */ + MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * y]; + pdvecIn = (MORPH_IDT_2Nx8 *) pInput; + + for (inCh = 0; inCh < numInCh; inCh++) /* Loop across input channels */ + { + /* vectors for coeff and input loads */ + xb_vec2Nx8 dvecCoeffData1; + xb_vec2Nx8 dvecInData1, dvecInData2, dvecInData3, dvecInData4, dvecInData5; + + /* load data from 5 rows; each load starts one input row (inDataPitch1) after the previous one */ + valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn); + MORPH_OP_LOAD_2Nx8(dvecInData1, vaInData, pdvecIn, inDataPitch1); + MORPH_OP_LOAD_2Nx8(dvecInData2, vaInData, pdvecIn, inDataPitch1); +
MORPH_OP_LOAD_2Nx8(dvecInData3, vaInData, pdvecIn, inDataPitch1); + MORPH_OP_LOAD_2Nx8(dvecInData4, vaInData, pdvecIn, inDataPitch1); + MORPH_OP_LOAD_2Nx8(dvecInData5, vaInData, pdvecIn, inDataPitch2 - (4 * inDataPitch1)); + + /* dvecInData1: row0 | row1 | row2 | row3 + * dvecInData2: row1 | row2 | row3 | row4 + * dvecInData3: row2 | row3 | row4 | row5 + * dvecInData4: row3 | row4 | row5 | row6 + * dvecInData5: row4 | row5 | row6 | row7 + * The last load advances pdvecIn by inDataPitch2 - 4 * inDataPitch1, + * i.e. to the same starting row of the next input channel. + */ + + /* load all the 5x5 coefficients */ + IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch2); + + /* Rearrange them so that 5 MUL4T, 1 MULQ & 1 MUL can be used to perform entire operation */ + dvecCoeffData1 = IVP_SHFL2NX8(dvecCoeffData1, dvecIdx); + + /* Multiply and accumulate using 1st set of 4 coefficients */ + MORPH_OP_MUL4TA(dacc1, dvecInData1, dvecInData1, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + + /* Multiply and accumulate using 2nd set of 4 coefficients */ + MORPH_OP_MUL4TA(dacc1, dvecInData2, dvecInData2, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + /* Multiply and accumulate using 3rd set of 4 coefficients */ + MORPH_OP_MUL4TA(dacc1, dvecInData3, dvecInData3, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 2)); + + /* Multiply and accumulate using 4th set of 4 coefficients */ + MORPH_OP_MUL4TA(dacc1, dvecInData4, dvecInData4, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 3)); + + /* Multiply and accumulate using 5th set of 4 coefficients */ + MORPH_OP_MUL4TA(dacc1, dvecInData5, dvecInData5, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 4)); + + /* Multiply and accumulate using 6th set of 4 coefficients: after the shuffle, 32-bit element 5 holds the last-column coefficients of the first four kernel rows (source bytes 4, 9, 14, 19) */ + MORPH_OP_MULQA(dacc1, IVP_SEL2NX8I(dvecInData4, dvecInData4, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_SEL2NX8I(dvecInData3, dvecInData3, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_SEL2NX8I(dvecInData2,
dvecInData2, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_SEL2NX8I(dvecInData1, dvecInData1, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 5)); + + /* Multiply and accumulate using the final coefficient (byte 24 of the shuffled coefficient vector) */ + MORPH_OP_MULA(dacc1, IVP_SEL2NX8I(dvecInData5, dvecInData5, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_EXTR2NX8(dvecCoeffData1, 24)); + } /* end of for (inCh = 0; inCh < numInCh; inCh++)*/ + + /* Pack, Output Scale, Output Shift and clamping; the VQ build takes the scale per output channel from pOutScaleData, the non-VQ build uses the single outScale */ + xb_vec2Nx8 dvecOut1L, dvecOut1H; +#if DILATED_VQ_CONV == VQ_TRUE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \ + pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag); +#elif DILATED_VQ_CONV == VQ_FALSE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); +#endif + /* Storing the first row output: outW pixels taken from the start of the packed result */ + pdvecOut = (xb_vec2Nx8 *) (pOutput); + valign vaOutData = IVP_ZALIGN(); + IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * outW); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the second row output: a byte select starting at offset inDataPitch1 * bytesPerPixel pulls this row out of the packed result; when enable2ndRow is 0 both the row offset and the store length become 0 */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + enable2ndRow * outDataPitch1 * bytesPerPixel); + IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut1H, dvecOut1L, \ + IVP_ADD2NX8(IVP_SEQ2NX8(), inDataPitch1 * bytesPerPixel)), \ + vaOutData, pdvecOut, bytesPerPixel * enable2ndRow * outW); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the third row output: same scheme with a select offset of 2 * inDataPitch1 * bytesPerPixel */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + enable3rdRow * 2 * outDataPitch1 * bytesPerPixel); + IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut1H, dvecOut1L, \ + IVP_ADD2NX8(IVP_SEQ2NX8(), 2 * inDataPitch1 * bytesPerPixel)), \ + vaOutData, pdvecOut, bytesPerPixel * enable3rdRow * outW); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the fourth row output: same scheme with a select offset of 3 * inDataPitch1 * bytesPerPixel */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + enable4thRow * 3 * outDataPitch1 * bytesPerPixel); + IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut1H, dvecOut1L, \ +
IVP_ADD2NX8(IVP_SEQ2NX8(), 3 * inDataPitch1 * bytesPerPixel)), \ + vaOutData, pdvecOut, bytesPerPixel * enable4thRow * outW); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* step to the next output channel plane and to the next kernel's coefficients */ pOutput += outDataPitch2 * bytesPerPixel; + pCoeff += coeffPitch3; + } /* end of for (outCh = 0; outCh < numOutCh; outCh ++)*/ + } /* end of for (y = 0; y < outH; y += 4)*/ +} + +/****************************************************************************************** +* xaiConvolved(VQ)3D_S_5x5j1d1I8S8IX_MOW_WHD_FOLD32 +* ***************************************************************************************/ + +/******************************************************************************/ +/* Description : P6 optimized generic implementation for 5x5 3D convolution. */ +/* Based on MORPH pre-processor specifiers, code implementation */ +/* is generated during preprocessing stage. This method can be */ +/* used to generate 5x5 3D dilated convolution function and 5x5 */ +/* 3D VQ dilated convolution function for U8 bit and S8 bit */ +/* input data with input stride equal to 1. */ +/* If inDataPitch1 <= 32, this function variant is called.
*/ +/* Inputs : Input Data Tile, Coeff Data Tile, Bias Array, */ +/* Output scale array, CNN convolution params structure */ +/* InOuts : Output Tile; two output rows are produced per outer-loop iteration from a single accumulator vector */ +/* Assumptions : CoeffData is S8 */ +/* biasArray is signed 32b, value not exceeding signed 24b */ +/* Output scale array is U16 */ +/* OutData is S8 / U8 / S16 */ +/* Kernel Size is 5x5xDxN */ +/* Input and Output are in WHD format */ +/* Coeff is in WHDN format */ +/******************************************************************************/ + +static _XAI_INLINE_ void MAKE_NAME(MAKE_NAME_VQ(convolved, 3D_S_5x5j1d1), S8IX_MOW_WHD_FOLD32) MAKE_ARGUMENTS(inTile, coeffTile, biasArray, outTile, param) +{ + /* Getting parameters from the tile structures */ + const int32_t outW = XAI_TILE3D_GET_DIM1(outTile); + const int32_t outH = XAI_TILE3D_GET_DIM2(outTile); + const int32_t numInCh = XAI_TILE3D_GET_DIM3(inTile); + const int32_t numOutCh = XAI_TILE3D_GET_DIM3(outTile); + + /* Pitches of Coefficient Data (WHDN) in dim2 and dim3 */ + const int32_t coeffPitch2 = XAI_TILE4D_GET_DIM2_PITCH(coeffTile); + const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile); + + /* Pitches of Input Data (WHD) in dim1 and dim2 */ + const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile); + const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile); + + /* Pitch of Output Data (WHD) in dim1 and dim2 */ + const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile); + const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile); + + /* Kernel Size (WHDN) */ + const int32_t kSizeU = XAI_TILE4D_GET_DIM1(coeffTile); + + /* CNN convolution parameters */ + const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param); + const uint8_t outShiftU = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param); + const uint8_t enableReLu = XAI_CNN_CONV_GET_FLAG_RELU(param); + + /* Data Pointers of input, output, coefficient and bias data */ + MORPH_IDT_SCALAR* pInData = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(inTile); + int8_t* pOutData =
(int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile); + int32_t* pBiasData = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray); + int8_t* pCoeffData = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile); +#if DILATED_VQ_CONV == VQ_TRUE + uint16_t* restrict pOutScaleData = (uint16_t *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray); +#elif DILATED_VQ_CONV == VQ_FALSE + const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param); +#endif + /* Move pointer to the start of the data (including edge): step back kSizeU / 2 rows and kSizeU / 2 columns from the tile data pointer */ + pInData = &pInData[-((kSizeU / 2) * inDataPitch1 + (kSizeU / 2))]; + + /* Setting the limits for output data according to ReLu Flag and outTileType: with ReLu the limits come from param, otherwise they are the full range of the output type (S16, S8, or 0..UCHAR_MAX for anything else) */ + int32_t minLim, maxLim; + if (enableReLu) + { + minLim = XAI_CNN_CONV_GET_RELU_MIN(param); + maxLim = XAI_CNN_CONV_GET_RELU_MAX(param); + } + else + { + minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \ + SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0); + maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \ + : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX); + } + const int8_t typeFlag = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0; + const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile); + + MORPH_IDT_2Nx8 * restrict pdvecIn; + xb_vec2Nx8* restrict pdvecOut; + xb_vec2Nx8* restrict pdvecCoeff1; + + /* Variable Declarations */ + int32_t inCh, outCh, y; + + /* generates the sequence 0,1,2,3 ,5,6,7,8 ,10,11,12,13 ,15,16,17,18 + * , 20,21,22,23 ,4,9,14,19 ,24. To be used to shuffle the coeff data, + * So that last coeff from first 4 rows of coeffs can be used as one + * 32 byte element and make use of quad multiplier outside the inner- + * most loop.
+ * c11, c12, c13, c14, c15 + * c21, c22, c23, c24, c25 + * c31, c32, c33, c34, c35 + * c41, c42, c43, c44, c45 + * c51, c52, c53, c54, c55 + * + * c15, c25, c35, c45 and c55 are placed in contiguous fashion, so that + * c15, c25, c35, and c45 can be used as one 32 byte element + */ + xb_vec2Nx8 dvecIdx; + dvecIdx = IVP_SEL2NX8I(IVP_SEL2NX8I(IVP_ADD2NX8U(IVP_SEQ2NX8(), 15), + IVP_ADD2NX8U(IVP_SEQ2NX8(), 10), + IVP_SELI_INTERLEAVE_2_LO), + IVP_SEL2NX8I(IVP_ADD2NX8U(IVP_SEQ2NX8(), 5), IVP_SEQ2NX8(), + IVP_SELI_INTERLEAVE_2_LO), + IVP_SELI_INTERLEAVE_4_LO); + + dvecIdx = IVP_SEL2NX8I(IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16( + IVP_MOVNX16_FROMN_2X32(4 + (9 << 8) + (14 << 16) + \ + (19 << 24))), + IVP_ADD2NX8U(IVP_SEQ2NX8(), 20), IVP_SELI_INTERLEAVE_2_LO), + dvecIdx, IVP_SELI_8B_PACK_16); + + + /* 3 Load operations are done to load the data */ + /* 1st vector load vec1 - row0 | row1 */ + /* 2nd vector load vec3 - row2 | row3 */ + /* 3rd vector load vec5 - row4 | row5 */ + /* Select operation is used to get vec2 from */ + /* vec1 and vec3; and vec4 from vec3 and vec5.
*/ + /* vec2 - row1 | row2 */ + /* vec4 - row3 | row4. NOTE(review): the loop below issues five loads (dvecInData1 to dvecInData5), one per inDataPitch1 step, and applies no select to build vec2 or vec4, so this 3-load description looks stale - confirm. */ + + /* byte-select indices offset by one input row (inDataPitch1 * bytesPerPixel); used below to pull the second output row out of the packed result */ xb_vec2Nx8 dvecSavSeq = IVP_ADD2NX8(IVP_SEQ2NX8(), inDataPitch1 * bytesPerPixel); + /* loop across output height is unrolled twice and loop across kernel width and height is + * completely unrolled + */ + for (y = 0; y < outH; y += 2) /* Loop across output height */ + { + /* Row-enable flag for odd output heights. It is multiplied into the second row offset and store byte count further down, so a disabled row stores 0 bytes at the first row address. Presumably XT_SALT(a, b) yields 1 when a < b - confirm against the Xtensa ISA reference. */ + int32_t enable2ndRow = XT_SALT(y, outH - 1); + /* initialize output data pointer */ + int8_t *pOutput = &pOutData[(y * outDataPitch1) * bytesPerPixel]; + + /* initialize coeff and Bias data pointer*/ + int8_t *pCoeff = &pCoeffData[0]; + int32_t *pBias = &pBiasData[0]; + + for (outCh = 0; outCh < numOutCh; outCh++) /* Loop across Output depth */ + { + /* load and replicate bias data; pBias advances by 4 bytes, i.e. one int32 bias per output channel */ + xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4); + + /* wide vectors(accumulators) initialized with bias */ + xb_vec2Nx24 dacc1; + dacc1 = IVP_CVT24UNX32L(hvecBias1, hvecBias1); + IVP_CVT24UNX32H(dacc1, hvecBias1, hvecBias1); + + /* priming of coeff load is done outside the innermost loop*/ + pdvecCoeff1 = (xb_vec2Nx8 *) (pCoeff); + valign vaCoeffData1; vaCoeffData1 = IVP_LA2NX8_PP(pdvecCoeff1); + + /* initialize input data pointer */ + MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * y]; + pdvecIn = (MORPH_IDT_2Nx8 *) pInput; + + for (inCh = 0; inCh < numInCh; inCh++) /* Loop across input channels */ + { + /* vectors for coeff and input loads */ + xb_vec2Nx8 dvecCoeffData1; + xb_vec2Nx8 dvecInData1, dvecInData2, dvecInData3, dvecInData4, dvecInData5; + + /* load data from five input rows; each load starts one input row (inDataPitch1) after the previous one, and the last load advances pdvecIn to the same starting row of the next input channel */ + valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn); + MORPH_OP_LOAD_2Nx8(dvecInData1, vaInData, pdvecIn, inDataPitch1); + MORPH_OP_LOAD_2Nx8(dvecInData2, vaInData, pdvecIn, inDataPitch1); + MORPH_OP_LOAD_2Nx8(dvecInData3, vaInData, pdvecIn, inDataPitch1); + MORPH_OP_LOAD_2Nx8(dvecInData4, vaInData, pdvecIn, inDataPitch1); + MORPH_OP_LOAD_2Nx8(dvecInData5, vaInData, pdvecIn, inDataPitch2 - (4 * inDataPitch1));
+ + /* load all the 5x5 coefficients */ + IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch2); + + /* Rearrange them so that 5 MUL4T, 1 MULQ & 1 MUL can be used to perform entire operation */ + dvecCoeffData1 = IVP_SHFL2NX8(dvecCoeffData1, dvecIdx); + + /* Multiply and accumulate using 1st set of 4 coefficients */ + MORPH_OP_MUL4TA(dacc1, dvecInData1, dvecInData1, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + + /* Multiply and accumulate using 2nd set of 4 coefficients */ + MORPH_OP_MUL4TA(dacc1, dvecInData2, dvecInData2, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + /* Multiply and accumulate using 3rd set of 4 coefficients */ + MORPH_OP_MUL4TA(dacc1, dvecInData3, dvecInData3, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 2)); + + /* Multiply and accumulate using 4th set of 4 coefficients */ + MORPH_OP_MUL4TA(dacc1, dvecInData4, dvecInData4, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 3)); + + /* Multiply and accumulate using 5th set of 4 coefficients */ + MORPH_OP_MUL4TA(dacc1, dvecInData5, dvecInData5, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 4)); + + /* Multiply and accumulate using 6th set of 4 coefficients: after the shuffle, 32-bit element 5 holds the last-column coefficients of the first four kernel rows (source bytes 4, 9, 14, 19) */ + MORPH_OP_MULQA(dacc1, IVP_SEL2NX8I(dvecInData4, dvecInData4, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_SEL2NX8I(dvecInData3, dvecInData3, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_SEL2NX8I(dvecInData2, dvecInData2, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_SEL2NX8I(dvecInData1, dvecInData1, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 5)); + + /* Multiply and accumulate using the final coefficient (byte 24 of the shuffled coefficient vector) */ + MORPH_OP_MULA(dacc1, IVP_SEL2NX8I(dvecInData5, dvecInData5, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_EXTR2NX8(dvecCoeffData1, 24)); + } /* end of for (inCh = 0; inCh < numInCh; inCh++)*/ + + /* Pack,
Output Scale, Output Shift and clamping; the VQ build takes the scale per output channel from pOutScaleData, the non-VQ build uses the single outScale */ + xb_vec2Nx8 dvecOut1L; + xb_vec2Nx8 dvecOut1H; +#if DILATED_VQ_CONV == VQ_TRUE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \ + pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag); +#elif DILATED_VQ_CONV == VQ_FALSE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); +#endif + /* Storing the first row output: outW pixels taken from the start of the packed result */ + pdvecOut = (xb_vec2Nx8 *) (pOutput); + valign vaOutData = IVP_ZALIGN(); + IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * outW); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the second row output: the dvecSavSeq select pulls this row out of the packed result; when enable2ndRow is 0 both the row offset and the store length become 0 */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + enable2ndRow * outDataPitch1 * bytesPerPixel); + IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut1H, dvecOut1L, dvecSavSeq), vaOutData, pdvecOut, \ + bytesPerPixel * enable2ndRow * outW); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* step to the next output channel plane and to the next kernel's coefficients */ pOutput += outDataPitch2 * bytesPerPixel; + pCoeff += coeffPitch3; + } /* end of for (outCh = 0; outCh < numOutCh; outCh++)*/ + } /* end of for (y = 0; y < outH; y += 2)*/ +} +/****************************************************************************************** +* xaiConvolved(VQ)3D_S_5x5j1d1I8S8IX_MOW_WHD +* ***************************************************************************************/ + +/******************************************************************************/ +/* Description : P6 optimized generic implementation for 5x5 3D convolution. */ +/* Based on MORPH pre-processor specifiers, code implementation */ +/* is generated during preprocessing stage.
This method can be */ +/* used to generate 5x5 3D dilated convolution function and 5x5 */ +/* 3D VQ dilated convolution function for U8 bit and S8 bit */ +/* input data with input stride equal to 1 */ +/* Inputs : Input Data Tile, Coeff Data Tile, Bias Array, */ +/* Output scale array, CNN convolution params structure */ +/* Outputs : XI Error Code */ +/* InOuts : Output Tile */ +/* Assumptions : CoeffData is S8 */ +/* biasArray is signed 32b, value not exceeding signed 24b */ +/* Output scale array is U16 */ +/* OutData is S8 / U8 / S16 */ +/* Kernel Size is 5x5xDxN */ +/* Input and Output are in WHD format */ +/* Coeff is in WHDN format */ +/******************************************************************************/ + +/****************** xaiConvolvedVQ3D_S_5x5j1d1_S8S8IX_MOW_WHD ******************/ +/****************** xaiConvolvedVQ3D_S_5x5j1d1_U8S8IX_MOW_WHD ******************/ +/******************* xaiConvolved3D_S_5x5j1d1_S8S8IX_MOW_WHD *******************/ +/******************* xaiConvolved3D_S_5x5j1d1_U8S8IX_MOW_WHD *******************/ + +XAI_ERR_TYPE MAKE_NAME(MAKE_NAME_VQ(xaiConvolved, 3D_S_5x5j1d1), S8IX_MOW_WHD) MAKE_ARGUMENTS(inTile, coeffTile, biasArray, outTile, param) +{ + /* Error Checks */ + XAI_ERROR_CHECKS() + { + MORPH_IDT_CHECK(inTile); + XAI_CHECK_CONV_OUTPUT_TILE3D(outTile); + XAI_CHECK_TILE4D_S8(coeffTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile); + XAI_CHECK_TILE4D_IN_DRAM_BOUNDARY(coeffTile); + XAI_CHECK_POINTER(param); + XAI_CHECK_ARRAY_S32(biasArray); + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTile); + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(coeffTile, outTile); + XAI_CHECK_KERNEL_SIZE(coeffTile, 5); + XAI_CHECK_TILE3D_DATA_ORDER(inTile, XAI_WHD); + XAI_CHECK_TILE3D_DATA_ORDER(outTile, XAI_WHD); + XAI_CHECK_TILE4D_DATA_ORDER(coeffTile, XAI_WHDN); + XAI_CHECK_DILATION(param, 1); + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_DILATIONX(param) == XAI_CNN_CONV_GET_DILATIONY(param), \ + 
XAI_ERR_BADARG, "Dilation along width = %hhu and height = %hhu\nDilation along width and height should be equal", \ + XAI_CNN_CONV_GET_DILATIONX(param), XAI_CNN_CONV_GET_DILATIONY(param)); + XAI_CHECK_TILE3D_EDGE(inTile, 2); + XAI_CHECK_STRIDE(param, 1); + XAI_CHECK_ERROR((XAI_CNN_CONV_GET_STRIDEX(param) == XAI_CNN_CONV_GET_STRIDEY(param)), \ + XAI_ERR_BADARG, "\nStride along width = %hhu and height = %hhu\nStride along width and height should be equal", \ + XAI_CNN_CONV_GET_STRIDEX(param), XAI_CNN_CONV_GET_STRIDEY(param)); + XAI_CHECK_CONSISTENCY_MOW_WHD(inTile, coeffTile, biasArray, outTile, param); + XAI_CHECK_COEFFTILE_CONTIGUOUS(coeffTile, param); + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_ACCUM_SHIFT(param) < 24, \ + XAI_ERR_NORM, "The accumulator shift = %hhu, value should be less than 24", \ + XAI_CNN_CONV_GET_ACCUM_SHIFT(param)); + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_OUTPUT_SHIFT(param) < 32, \ + XAI_ERR_NORM, "The output shift = %hhu, value should be less than 32", \ + XAI_CNN_CONV_GET_OUTPUT_SHIFT(param)); + XAI_CHECK_CONV_RELU_LIMITS_IX(param, outTile); +#if DILATED_VQ_CONV == VQ_TRUE + XAI_CHECK_ARRAY_U16(outputScaleArray); + XAI_CHECK_ERROR(XAI_ARRAY_GET_WIDTH(outputScaleArray) >= XAI_TILE4D_GET_DIM4(coeffTile), \ + XAI_ERR_DATASIZE, "\nWidth of Output Scale Array = %d, Number of Kernels = %d\nWidth of Output Scale Array should be greater than or equal to Number of Kernels", \ + XAI_ARRAY_GET_WIDTH(outputScaleArray), XAI_TILE4D_GET_DIM4(coeffTile)); + XAI_CHECK_ERROR((((uintptr_t) (XAI_ARRAY_GET_DATA_PTR(outputScaleArray)) & \ + 0x1) == 0), XAI_ERR_NORM, "The output scale array is not aligned to 2 byte boundary"); +#endif + } +#if DILATED_VQ_CONV == VQ_FALSE + if (XAI_CNN_CONV_GET_OUTPUT_SCALE(param) == 0) + { + int32_t fillValue; + int32_t reluFlag = XAI_CNN_CONV_GET_FLAG_RELU(param); + fillValue = reluFlag ? 
(CLAMP(0, XAI_CNN_CONV_GET_RELU_MIN(param), XAI_CNN_CONV_GET_RELU_MAX(param))) : 0; + return(xaiFillTile3D(outTile, fillValue, 0)); + } +#endif + if (XAI_TILE3D_GET_DIM3(inTile) == 3) + { + MAKE_NAME(MAKE_NAME_VQ(convolved, 3D_S_5x5j1d1), S8IX_MOW_WHD_DEPTH3) MAKE_PARAMS(inTile, coeffTile, biasArray, outTile, param); + return(XAI_ERROR_STATUS()); + } + if (XAI_TILE3D_GET_DIM1_PITCH(inTile) <= 16) + { + MAKE_NAME(MAKE_NAME_VQ(convolved, 3D_S_5x5j1d1), S8IX_MOW_WHD_FOLD16) MAKE_PARAMS(inTile, coeffTile, biasArray, outTile, param); + return(XAI_ERROR_STATUS()); + } + else if (XAI_TILE3D_GET_DIM1_PITCH(inTile) <= 32) + { + MAKE_NAME(MAKE_NAME_VQ(convolved, 3D_S_5x5j1d1), S8IX_MOW_WHD_FOLD32) MAKE_PARAMS(inTile, coeffTile, biasArray, outTile, param); + return(XAI_ERROR_STATUS()); + } + + /* Getting parameters from the tile structures */ + const int32_t outW = XAI_TILE3D_GET_DIM1(outTile); + const int32_t outH = XAI_TILE3D_GET_DIM2(outTile); + const int32_t numInCh = XAI_TILE3D_GET_DIM3(inTile); + const int32_t numOutCh = XAI_TILE3D_GET_DIM3(outTile); + + /* Pitche of Coefficient Data (WHDN) in dim3 */ + const int32_t coeffPitch2 = XAI_TILE4D_GET_DIM2_PITCH(coeffTile); + const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile); + + /* Pitches of Input Data (WHD) in dim1 and dim2 */ + const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile); + const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile); + + /* Pitch of Output Data (WHD) in dim1 and dim2 */ + const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile); + const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile); + + /* Kernel Size (WHDN) */ + const int32_t kSizeU = XAI_TILE4D_GET_DIM1(coeffTile); + + /* CNN convolution parameters */ + const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param); + + const uint8_t outShiftU = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param); + const uint8_t enableReLu = XAI_CNN_CONV_GET_FLAG_RELU(param); + + /* Data Pointers of input, output, 
coefficient and bias data */ + MORPH_IDT_SCALAR* pInData = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(inTile); + int8_t* pOutData = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile); + int32_t* pBiasData = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray); + int8_t* pCoeffData = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile); +#if DILATED_VQ_CONV == VQ_TRUE + uint16_t* restrict pOutScaleData = (uint16_t *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray); +#elif DILATED_VQ_CONV == VQ_FALSE + const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param); +#endif + /* Move pointer to the start of the data (including edge) */ + pInData = &pInData[-((kSizeU / 2) * inDataPitch1 + (kSizeU / 2))]; + + /* Setting the limits for output data according to ReLu Flag and outTileType */ + int32_t minLim, maxLim; + if (enableReLu) + { + minLim = XAI_CNN_CONV_GET_RELU_MIN(param); + maxLim = XAI_CNN_CONV_GET_RELU_MAX(param); + } + else + { + minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \ + SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0); + maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \ + : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX); + } + const int8_t typeFlag = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0; + const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile); + + MORPH_IDT_2Nx8 * restrict pdvecIn; + xb_vec2Nx8* restrict pdvecOut; + xb_vec2Nx8* restrict pdvecCoeff1; + + /* Variable Declarations */ + int32_t inCh, outCh, x, y; + int32_t varLen; + + /* In order to make the loop multiply-bound we are reducing the vectorization width + by extra values required for the kernel */ + const int32_t vectorizationWidth60 = ((2 * XCHAL_IVPN_SIMD_WIDTH) - kSizeU) + 1; + const int32_t vectorizationWidth124 = ((4 * XCHAL_IVPN_SIMD_WIDTH) - kSizeU) + 1; + + /* generates the sequence 0,1,2,3 ,5,6,7,8 ,10,11,12,13 ,15,16,17,18 + * , 20,21,22,23 ,4,9,14,19 ,24. 
To be used to shuffle the coeff data, + * So that last coeff from first 4 rows of coeffs can be used as one + * 32 byte element and make use of quad multiplier outside the inner- + * most loop. + * c11, c12, c13, c14, c15 + * c21, c22, c23, c24, c25 + * c31, c32, c33, c34, c35 + * c41, c42, c43, c44, c45 + * c51, c52, c53, c54, c55 + * + * c15, c25, c35, c45 and c55 are placed in contigous fashion, so that + * c15, c25, c35, and c45 can be used as one 32 byte element + */ + xb_vec2Nx8 dvecIdx; + dvecIdx = IVP_SEL2NX8I(IVP_SEL2NX8I(IVP_ADD2NX8U(IVP_SEQ2NX8(), 15), + IVP_ADD2NX8U(IVP_SEQ2NX8(), 10), + IVP_SELI_INTERLEAVE_2_LO), + IVP_SEL2NX8I(IVP_ADD2NX8U(IVP_SEQ2NX8(), 5), IVP_SEQ2NX8(), + IVP_SELI_INTERLEAVE_2_LO), + IVP_SELI_INTERLEAVE_4_LO); + + dvecIdx = IVP_SEL2NX8I(IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16( + IVP_MOVNX16_FROMN_2X32(4 + (9 << 8) + (14 << 16) + \ + (19 << 24))), + IVP_ADD2NX8U(IVP_SEQ2NX8(), 20), IVP_SELI_INTERLEAVE_2_LO), + dvecIdx, IVP_SELI_8B_PACK_16); + + /* loop across output height is unrolled twice. + * Loop across kernel width and height is + * completely unrolled. + * 128 bytes of input are loaded. + */ + for (x = 0; x < outW - vectorizationWidth60; x += vectorizationWidth124) /* Loop across output width */ + { + varLen = XT_MIN(vectorizationWidth124, outW - x); + /* In order to handle cases where input width <= 2*XCHAL_IVPN_SIMD_WIDTH, where + * the 2nd load from the same row needs to be avoided. 
*/ + int32_t enable2ndCol = XT_SALT(2 * XCHAL_IVPN_SIMD_WIDTH, varLen + kSizeU - 1); + + for (y = 0; y < outH; y += 2) /* Loop across output height */ + { + /* In order to handle odd output height */ + int32_t enable2ndRow = XT_SALT(y, outH - 1); + + /* initialize output data pointer */ + int8_t *pOutput = &pOutData[(y * outDataPitch1 + x) * bytesPerPixel]; + + /* initialize input data pointer */ + MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * y + x]; + + /* initialize coeff and Bias data pointer*/ + int8_t *pCoeff = &pCoeffData[0]; + int32_t *pBias = &pBiasData[0]; + + + for (outCh = 0; outCh < numOutCh; outCh++) /* Loop across Output depth */ + { + /* load and replicate bias data */ + xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4); + + /* wide vectors(accumulators) initialized with bias */ + xb_vec2Nx24 dacc1, dacc2, dacc3, dacc4; + dacc1 = dacc2 = dacc3 = dacc4 = IVP_CVT24UNX32L(hvecBias1, hvecBias1); + IVP_CVT24UNX32H(dacc1, hvecBias1, hvecBias1); + IVP_CVT24UNX32H(dacc2, hvecBias1, hvecBias1); + IVP_CVT24UNX32H(dacc3, hvecBias1, hvecBias1); + IVP_CVT24UNX32H(dacc4, hvecBias1, hvecBias1); + + /* priming of coeff load is done outside the innermost loop*/ + pdvecCoeff1 = (xb_vec2Nx8 *) (pCoeff); + valign vaCoeffData1; vaCoeffData1 = IVP_LA2NX8_PP(pdvecCoeff1); + + pdvecIn = (MORPH_IDT_2Nx8 *) pInput; + + for (inCh = 0; inCh < numInCh; inCh++) /* Loop across input channels */ + { + /* vectors for coeff and input loads */ + xb_vec2Nx8 dvecCoeffData1; + xb_vec2Nx8 dvecInData11, dvecInData21, dvecInData31, dvecInData41, dvecInData51, dvecInData61; + xb_vec2Nx8 dvecInData12, dvecInData22, dvecInData32, dvecInData42, dvecInData52, dvecInData62; + + /* load data from first input row */ + valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn); + MORPH_OP_LOAD_2Nx8(dvecInData11, vaInData, pdvecIn, \ + enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH); + MORPH_OP_LOAD_2Nx8(dvecInData12, vaInData, pdvecIn, \ + inDataPitch1 - enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH); + 
+ /* load data from 2nd input row */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn); + MORPH_OP_LOAD_2Nx8(dvecInData21, vaInData, pdvecIn, \ + enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH); + MORPH_OP_LOAD_2Nx8(dvecInData22, vaInData, pdvecIn, \ + inDataPitch1 - enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH); + + /* load data from 3rd input row */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn); + MORPH_OP_LOAD_2Nx8(dvecInData31, vaInData, pdvecIn, \ + enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH); + MORPH_OP_LOAD_2Nx8(dvecInData32, vaInData, pdvecIn, \ + inDataPitch1 - enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH); + + /* load data from 4th input row */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn); + MORPH_OP_LOAD_2Nx8(dvecInData41, vaInData, pdvecIn, \ + enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH); + MORPH_OP_LOAD_2Nx8(dvecInData42, vaInData, pdvecIn, \ + inDataPitch1 - enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH); + + /* load data from 5th input row */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn); + MORPH_OP_LOAD_2Nx8(dvecInData51, vaInData, pdvecIn, \ + enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH); + MORPH_OP_LOAD_2Nx8(dvecInData52, vaInData, pdvecIn, \ + inDataPitch1 * enable2ndRow \ + - enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH); + + /* load data from 6th input row */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn); + MORPH_OP_LOAD_2Nx8(dvecInData61, vaInData, pdvecIn, \ + enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH); + MORPH_OP_LOAD_2Nx8(dvecInData62, vaInData, pdvecIn, \ + inDataPitch2 - (4 + enable2ndRow) * inDataPitch1 \ + - enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH); + + /* load all the 5x5 coefficients */ + IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch2); + + /* Rearrange them so that 4 MUL4T, 1 MULQ and 1 MUL can be used to perform entire operation */ + dvecCoeffData1 = IVP_SHFL2NX8(dvecCoeffData1, dvecIdx); + + /* Multiply and accumulate 1st set of 4 coefficients for all the outputs */ + MORPH_OP_MUL4TA(dacc1, dvecInData12, dvecInData11, IVP_EXTRN_2X32( \ + 
IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc2, dvecInData12, dvecInData12, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc3, dvecInData22, dvecInData21, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc4, dvecInData22, dvecInData22, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + + /* Multiply and accumulate 2nd set of 4 coefficients for all the outputs */ + MORPH_OP_MUL4TA(dacc1, dvecInData22, dvecInData21, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + MORPH_OP_MUL4TA(dacc2, dvecInData22, dvecInData22, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + MORPH_OP_MUL4TA(dacc3, dvecInData32, dvecInData31, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + MORPH_OP_MUL4TA(dacc4, dvecInData32, dvecInData32, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + /* Multiply and accumulate 3rd set of 4 coefficients for all the outputs */ + MORPH_OP_MUL4TA(dacc1, dvecInData32, dvecInData31, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 2)); + MORPH_OP_MUL4TA(dacc2, dvecInData32, dvecInData32, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 2)); + MORPH_OP_MUL4TA(dacc3, dvecInData42, dvecInData41, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 2)); + MORPH_OP_MUL4TA(dacc4, dvecInData42, dvecInData42, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 2)); + + /* Multiply and accumulate 4th set of 4 coefficients for all the outputs */ + MORPH_OP_MUL4TA(dacc1, dvecInData42, dvecInData41, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 3)); + 
MORPH_OP_MUL4TA(dacc2, dvecInData42, dvecInData42, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 3)); + MORPH_OP_MUL4TA(dacc3, dvecInData52, dvecInData51, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 3)); + MORPH_OP_MUL4TA(dacc4, dvecInData52, dvecInData52, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 3)); + + /* Multiply and accumulate 5th set of 4 coefficients for all the outputs */ + MORPH_OP_MUL4TA(dacc1, dvecInData52, dvecInData51, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 4)); + MORPH_OP_MUL4TA(dacc2, dvecInData52, dvecInData52, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 4)); + MORPH_OP_MUL4TA(dacc3, dvecInData62, dvecInData61, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 4)); + MORPH_OP_MUL4TA(dacc4, dvecInData62, dvecInData62, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 4)); + + /* Multiply and accumulate 6th set of 4 coefficients for all the outputs */ + MORPH_OP_MULQA(dacc1, IVP_SEL2NX8I(dvecInData42, dvecInData41, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_SEL2NX8I(dvecInData32, dvecInData31, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_SEL2NX8I(dvecInData22, dvecInData21, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_SEL2NX8I(dvecInData12, dvecInData11, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 5)); + + MORPH_OP_MULQA(dacc2, IVP_SEL2NX8I(dvecInData42, dvecInData42, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_SEL2NX8I(dvecInData32, dvecInData32, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_SEL2NX8I(dvecInData22, dvecInData22, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_SEL2NX8I(dvecInData12, dvecInData12, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 5)); + + MORPH_OP_MULQA(dacc3, IVP_SEL2NX8I(dvecInData52, dvecInData51, 
IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_SEL2NX8I(dvecInData42, dvecInData41, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_SEL2NX8I(dvecInData32, dvecInData31, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_SEL2NX8I(dvecInData22, dvecInData21, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 5)); + + MORPH_OP_MULQA(dacc4, IVP_SEL2NX8I(dvecInData52, dvecInData52, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_SEL2NX8I(dvecInData42, dvecInData42, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_SEL2NX8I(dvecInData32, dvecInData32, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_SEL2NX8I(dvecInData22, dvecInData22, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 5)); + + /* Multiply and accumulate the final coefficient for all the outputs */ + MORPH_OP_MULA(dacc1, IVP_SEL2NX8I(dvecInData52, dvecInData51, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_EXTR2NX8(dvecCoeffData1, 24)); + MORPH_OP_MULA(dacc2, IVP_SEL2NX8I(dvecInData52, dvecInData52, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_EXTR2NX8(dvecCoeffData1, 24)); + MORPH_OP_MULA(dacc3, IVP_SEL2NX8I(dvecInData62, dvecInData61, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_EXTR2NX8(dvecCoeffData1, 24)); + MORPH_OP_MULA(dacc4, IVP_SEL2NX8I(dvecInData62, dvecInData62, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_EXTR2NX8(dvecCoeffData1, 24)); + } /* end of for (inCh = 0; inCh < numInCh; inCh++)*/ + + /* Pack, Output Scale, Output Shift and clamping */ + xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L; + xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H; +#if DILATED_VQ_CONV == VQ_TRUE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \ + pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \ + pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc3, packShiftAccU, \ + pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag); + 
PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc4, packShiftAccU, \ + pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag); +#elif DILATED_VQ_CONV == VQ_FALSE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc3, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc4, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); +#endif + + /* 1st row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput); + valign vaOutData = IVP_ZALIGN(); + IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * varLen); + IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * \ + 2 * (varLen - XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * (varLen - 2 * XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * \ + 2 * (varLen - 3 * XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* 2nd row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + enable2ndRow * outDataPitch1 * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * enable2ndRow * varLen); + IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * enable2ndRow * \ + 2 * (varLen - XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * enable2ndRow * \ + (varLen - 2 * XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * enable2ndRow * \ + 2 * (varLen - 3 * XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + pOutput += outDataPitch2 * bytesPerPixel; + pCoeff += coeffPitch3; + } /* end of for (outCh = 0; outCh < numOutCh; outCh++)*/ + } /* end of for (y = 0; y < outH; y += 2)*/ + } /* end 
of for (x = 0; x < outW; x += vectorizationWidth)*/ + + /* To handle cases where the remaining output width is less than or equal to 60. + * loop across output height is unrolled twice. Loop across kernel width and height is + * completely unrolled. 64 bytes of input are loaded. + */ + if (x < outW) + { + for (y = 0; y < outH; y += 2) /* Loop across output height */ + { + /* In order to handle odd output height */ + int32_t enable2ndRow = XT_SALT(y, outH - 1); + + /* initialize output data pointer */ + int8_t *pOutput = &pOutData[(y * outDataPitch1 + x) * bytesPerPixel]; + + /* initialize input data pointer */ + MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * y + x]; + + /* initialize coeff and Bias data pointer*/ + int8_t *pCoeff = &pCoeffData[0]; + int32_t *pBias = &pBiasData[0]; + + + for (outCh = 0; outCh < numOutCh; outCh++) /* Loop across Output depth */ + { + /* load and replicate bias data */ + xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4); + + /* wide vectors(accumulators) initialized with bias */ + xb_vec2Nx24 dacc1, dacc2; + dacc1 = dacc2 = IVP_CVT24UNX32L(hvecBias1, hvecBias1); + IVP_CVT24UNX32H(dacc1, hvecBias1, hvecBias1); + IVP_CVT24UNX32H(dacc2, hvecBias1, hvecBias1); + + /* priming of coeff load is done outside the innermost loop*/ + pdvecCoeff1 = (xb_vec2Nx8 *) (pCoeff); + valign vaCoeffData1; vaCoeffData1 = IVP_LA2NX8_PP(pdvecCoeff1); + + pdvecIn = (MORPH_IDT_2Nx8 *) pInput; + + for (inCh = 0; inCh < numInCh; inCh++) /* Loop across input channels */ + { + /* vectors for coeff and input loads */ + xb_vec2Nx8 dvecCoeffData1; + xb_vec2Nx8 dvecInData1, dvecInData2, dvecInData3, dvecInData4, dvecInData5, dvecInData6; + + /* load data from first input row */ + valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn); + MORPH_OP_LOAD_2Nx8(dvecInData1, vaInData, pdvecIn, inDataPitch1); + + /* load data from 2nd input row */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn); + MORPH_OP_LOAD_2Nx8(dvecInData2, vaInData, pdvecIn, inDataPitch1); + + /* load 
data from 3rd input row */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn); + MORPH_OP_LOAD_2Nx8(dvecInData3, vaInData, pdvecIn, inDataPitch1); + + /* load data from 4th input row */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn); + MORPH_OP_LOAD_2Nx8(dvecInData4, vaInData, pdvecIn, inDataPitch1); + + /* load data from 5th input row */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn); + MORPH_OP_LOAD_2Nx8(dvecInData5, vaInData, pdvecIn, inDataPitch1 * enable2ndRow); + + /* load data from 6th input row */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn); + MORPH_OP_LOAD_2Nx8(dvecInData6, vaInData, pdvecIn, inDataPitch2 - (4 + enable2ndRow) * inDataPitch1); + + /* load all the 5x5 coefficients */ + IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch2); + + /* Rearrange them so that 4 MUL4T, 1 MULQ and 1 MUL can be used to perform entire operation */ + dvecCoeffData1 = IVP_SHFL2NX8(dvecCoeffData1, dvecIdx); + + /* Multiply and accumulate 1st set of 4 coefficients for all the outputs */ + MORPH_OP_MUL4TA(dacc1, dvecInData1, dvecInData1, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc2, dvecInData2, dvecInData2, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + + /* Multiply and accumulate 2nd set of 4 coefficients for all the outputs */ + MORPH_OP_MUL4TA(dacc1, dvecInData2, dvecInData2, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + MORPH_OP_MUL4TA(dacc2, dvecInData3, dvecInData3, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + /* Multiply and accumulate 3rd set of 4 coefficients for all the outputs */ + MORPH_OP_MUL4TA(dacc1, dvecInData3, dvecInData3, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 2)); + MORPH_OP_MUL4TA(dacc2, dvecInData4, dvecInData4, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 2)); + + /* Multiply and 
accumulate 4th set of 4 coefficients for all the outputs */ + MORPH_OP_MUL4TA(dacc1, dvecInData4, dvecInData4, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 3)); + MORPH_OP_MUL4TA(dacc2, dvecInData5, dvecInData5, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 3)); + + /* Multiply and accumulate 5th set of 4 coefficients for all the outputs */ + MORPH_OP_MUL4TA(dacc1, dvecInData5, dvecInData5, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 4)); + MORPH_OP_MUL4TA(dacc2, dvecInData6, dvecInData6, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 4)); + + /* Multiply and accumulate 6th set of 4 coefficients for all the outputs */ + MORPH_OP_MULQA(dacc1, IVP_SEL2NX8I(dvecInData4, dvecInData4, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_SEL2NX8I(dvecInData3, dvecInData3, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_SEL2NX8I(dvecInData2, dvecInData2, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_SEL2NX8I(dvecInData1, dvecInData1, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 5)); + + MORPH_OP_MULQA(dacc2, IVP_SEL2NX8I(dvecInData5, dvecInData5, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_SEL2NX8I(dvecInData4, dvecInData4, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_SEL2NX8I(dvecInData3, dvecInData3, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_SEL2NX8I(dvecInData2, dvecInData2, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 5)); + + /* Multiply and accumulate the final coefficient for all the outputs */ + MORPH_OP_MULA(dacc1, IVP_SEL2NX8I(dvecInData5, dvecInData5, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_EXTR2NX8(dvecCoeffData1, 24)); + MORPH_OP_MULA(dacc2, IVP_SEL2NX8I(dvecInData6, dvecInData6, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_EXTR2NX8(dvecCoeffData1, 24)); + } /* end of for (inCh = 0; inCh < numInCh; inCh++)*/ + + /* Pack, Output Scale, Output Shift and clamping */ + 
xb_vec2Nx8 dvecOut1L, dvecOut2L; + xb_vec2Nx8 dvecOut1H, dvecOut2H; +#if DILATED_VQ_CONV == VQ_TRUE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \ + pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \ + pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag); +#elif DILATED_VQ_CONV == VQ_FALSE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); +#endif + /* variable length for output stores */ + varLen = XT_MIN(vectorizationWidth60, outW - x); + + /* Storing the first depth output */ + pdvecOut = (xb_vec2Nx8 *) (pOutput); + valign vaOutData = IVP_ZALIGN(); + IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * varLen); + IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * \ + 2 * (varLen - XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* first depth , 2nd row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + enable2ndRow * outDataPitch1 * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * enable2ndRow * varLen); + IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * enable2ndRow * \ + 2 * (varLen - XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + pOutput += outDataPitch2 * bytesPerPixel; + pCoeff += coeffPitch3; + } /* end of for (outCh = 0; outCh < numOutCh; outCh++)*/ + } /* end of for (y = 0; y < outH; y += 2)*/ + } /* end of if( x < outW)*/ + return(XAI_ERROR_STATUS()); +} + +/****************************************************************************************** +* 5x5 MOW WHD Stride 2 - DEPTH 3 * +* If number of input channels is equal to 3 * +* this function is called. 
* +******************************************************************************************/ + +static _XAI_INLINE_ void MAKE_NAME(MAKE_NAME_VQ(convolved, 3D_S_5x5j2d1), S8IX_MOW_WHD_DEPTH3) MAKE_ARGUMENTS(inTile, coeffTile, biasArray, outTile, param) +{ + /* Getting parameters from the tile structures */ + const int32_t outW = XAI_TILE3D_GET_DIM1(outTile); + const int32_t outH = XAI_TILE3D_GET_DIM2(outTile); + const int32_t numOutCh = XAI_TILE3D_GET_DIM3(outTile); + + /* Pitches of Coefficient Data (WHDN) in dim1 and dim3 */ + const int32_t coeffPitch2 = XAI_TILE4D_GET_DIM2_PITCH(coeffTile); + const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile); + + /* Pitches of Input Data (WHD) in dim1 and dim2 */ + const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile); + const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile); + + /* Pitch of Output Data (WHD) in dim1 and dim2 */ + const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile); + const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile); + + /* Kernel Size (WHDN)*/ + const int32_t kSizeU = XAI_TILE4D_GET_DIM1(coeffTile); + + /* CNN convolution parameters */ + const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param); + const uint8_t outShiftU = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param); + const uint8_t enableReLu = XAI_CNN_CONV_GET_FLAG_RELU(param); + const uint8_t stride = XAI_CNN_CONV_GET_STRIDE(param); + + /* Data Pointers of input, output, coefficient and bias data */ + MORPH_IDT_SCALAR* pInData = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(inTile); + int8_t* pOutData = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile); + int32_t* pBiasData = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray); + int8_t* pCoeffData = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile); +#if DILATED_VQ_CONV == VQ_TRUE + uint16_t* restrict pOutScaleData = (uint16_t *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray); +#elif DILATED_VQ_CONV == VQ_FALSE + const uint16_t outScale = 
XAI_CNN_CONV_GET_OUTPUT_SCALE(param); +#endif + /* Move pointer to the start of the data (including edge) */ + pInData = &pInData[-((kSizeU / 2) * inDataPitch1 + (kSizeU / 2))]; + + /* Setting the limits for output data according to ReLu Flag and outTileType */ + int32_t minLim, maxLim; + if (enableReLu) + { + minLim = XAI_CNN_CONV_GET_RELU_MIN(param); + maxLim = XAI_CNN_CONV_GET_RELU_MAX(param); + } + else + { + minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \ + SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0); + maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \ + : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX); + } + const int8_t typeFlag = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0; + const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile); + + MORPH_IDT_2Nx8* restrict pdvecIn1; + MORPH_IDT_2Nx8* restrict pdvecIn2; + xb_vec2Nx8* restrict pdvecOut; + xb_vec2Nx8* restrict pdvecCoeff1; + xb_vec2Nx8* restrict pdvecCoeff2; + xb_vec2Nx8* restrict pdvecCoeff3; + xb_vec2Nx8* restrict pdvecCoeff4; + + /* Variable Declarations */ + int32_t outCh, x, y; + /* In order to make the loop multiply-bound we are reducing the vectorization width + by extra values required for the kernel */ + const int32_t vectorizationWidth = (((2 * XCHAL_IVPN_SIMD_WIDTH) - kSizeU) / stride) + 1; + + /* Since there are 25 coefficients for 1 output channel, we can make use of 6 quad multipliers + * for generating 1 output. 
So we need to re-arrange the 25 coefficients in the pattern shown + * Pattern : 0 1 2 3 5 6 7 8 10 11 12 13 15 16 17 18 20 21 22 23 4 9 14 19 24 */ + xb_vec2Nx8 dvecIdx; + dvecIdx = IVP_SEL2NX8I(IVP_SEL2NX8I(IVP_ADD2NX8U(IVP_SEQ2NX8(), 15), + IVP_ADD2NX8U(IVP_SEQ2NX8(), 10), IVP_SELI_INTERLEAVE_2_LO), + IVP_SEL2NX8I(IVP_ADD2NX8U(IVP_SEQ2NX8(), 5), IVP_SEQ2NX8(), + IVP_SELI_INTERLEAVE_2_LO), IVP_SELI_INTERLEAVE_4_LO); + + dvecIdx = IVP_SEL2NX8I(IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16( + IVP_MOVNX16_FROMN_2X32(4 + (9 << 8) + (14 << 16) + (19 << 24))), + IVP_ADD2NX8U(IVP_SEQ2NX8(), 20), IVP_SELI_INTERLEAVE_2_LO), + dvecIdx, IVP_SELI_8B_PACK_16); + + /* loop across output depth is unrolled by 4 + * , producing lanes from 4 output channels + * in one iteration. Since vectorization width + * is just half the width of the accumulator, + * loop across output height is also unrolled by 2. + * Unrolling across output height makes it possible + * to utilize all the 64 MACs in the accumulator. + * + * Data loaded from the 2 input rows is concatenated + * in such a manner that lower half of the output + * vector gives the first output row and the upper + * half of the output vector gives the next output row. 
+ */ + for (x = 0; x < outW; x += vectorizationWidth) /* Loop across output width */ + { + for (y = 0; y < outH; y += 2) /* Loop across output height */ + { + /* In order to handle odd output heights */ + int32_t enable2Row = XT_SALT(y, outH - 1); + + /* initialize output data pointer */ + int8_t *pOutput = &pOutData[(y * outDataPitch1 + x) * bytesPerPixel]; + + /* initialize input data pointer */ + MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * stride * (y) + stride * (x)]; + + /* initialize coeff and Bias data pointer */ + int8_t *pCoeff = &pCoeffData[0]; + int32_t *pBias = &pBiasData[0]; + + for (outCh = 0; outCh < numOutCh; outCh += 4) /* Loop across Output depth */ + { + /* In order to handle odd output depths */ + int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1); + int32_t enable3rdCh = XT_SALT(outCh, numOutCh - 2); + int32_t enable4thCh = XT_SALT(outCh, numOutCh - 3); + + /* Load the bias values corresponding to two output channels */ + xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4 * enable2ndCh); + xb_vecN_2x32v hvecBias2; IVP_LSRN_2X32_XP(hvecBias2, pBias, 4 * enable3rdCh); + xb_vecN_2x32v hvecBias3; IVP_LSRN_2X32_XP(hvecBias3, pBias, 4 * enable4thCh); + xb_vecN_2x32v hvecBias4; IVP_LSRN_2X32_XP(hvecBias4, pBias, 4); + + /* wide vectors(accumulators) initialized with bias */ + xb_vec2Nx24 dacc1, dacc2, dacc3, dacc4; + dacc1 = IVP_CVT24UNX32L(hvecBias1, hvecBias1); + IVP_CVT24UNX32H(dacc1, hvecBias1, hvecBias1); + dacc2 = IVP_CVT24UNX32L(hvecBias2, hvecBias2); + IVP_CVT24UNX32H(dacc2, hvecBias2, hvecBias2); + dacc3 = IVP_CVT24UNX32L(hvecBias3, hvecBias3); + IVP_CVT24UNX32H(dacc3, hvecBias3, hvecBias3); + dacc4 = IVP_CVT24UNX32L(hvecBias4, hvecBias4); + IVP_CVT24UNX32H(dacc4, hvecBias4, hvecBias4); + + /* priming for coeff load */ + /* Coeff for 1st output channel */ + pdvecCoeff1 = (xb_vec2Nx8 *) (pCoeff); + valign vaCoeffData1; vaCoeffData1 = IVP_LA2NX8_PP(pdvecCoeff1); + /* Coeff for 2nd output channel */ + pdvecCoeff2 = (xb_vec2Nx8 
*) (pCoeff + coeffPitch3 * enable2ndCh); + valign vaCoeffData2; vaCoeffData2 = IVP_LA2NX8_PP(pdvecCoeff2); + /* Coeff for 3rd output channel */ + pdvecCoeff3 = (xb_vec2Nx8 *) (pCoeff + 2 * coeffPitch3 * enable3rdCh); + valign vaCoeffData3; vaCoeffData3 = IVP_LA2NX8_PP(pdvecCoeff3); + /* Coeff for 4th output channel */ + pdvecCoeff4 = (xb_vec2Nx8 *) (pCoeff + 3 * coeffPitch3 * enable4thCh); + valign vaCoeffData4; vaCoeffData4 = IVP_LA2NX8_PP(pdvecCoeff4); + + + /* Input vector pointer initialization */ + pdvecIn1 = (MORPH_IDT_2Nx8 *) (pInput); + pdvecIn2 = (MORPH_IDT_2Nx8 *) (pInput + 2 * inDataPitch2); + + /* vectors for coeff and input loads */ + xb_vec2Nx8 dvecCoeffData1, dvecCoeffData2, dvecCoeffData3, dvecCoeffData4; + xb_vec2Nx8 dvecInData1, dvecInData2, dvecInData3, dvecInData4, \ + dvecInData5, dvecInData6, dvecInData7; + MORPH_IDT_2Nx8 dvecData1, dvecData2, dvecData3, dvecData4; + MORPH_IDT_2Nx8 dvecData51, dvecData52, dvecData53, dvecData54, dvecData55; + +/**************************************** 1st inCh *********************************************/ + /* load data from 1st input row */ + valign vaInData1 = MORPH_OP_PRIME_2Nx8(pdvecIn1); + IVP_LA2NX8_XP(dvecInData1, vaInData1, pdvecIn1, inDataPitch1); + + /* load data from 2nd input row */ + vaInData1 = MORPH_OP_PRIME_2Nx8(pdvecIn1); + IVP_LA2NX8_XP(dvecInData2, vaInData1, pdvecIn1, inDataPitch1); + + /* load data from 3rd input row */ + vaInData1 = MORPH_OP_PRIME_2Nx8(pdvecIn1); + IVP_LA2NX8_XP(dvecInData3, vaInData1, pdvecIn1, inDataPitch1); + + /* load data from 4th input row */ + vaInData1 = MORPH_OP_PRIME_2Nx8(pdvecIn1); + IVP_LA2NX8_XP(dvecInData4, vaInData1, pdvecIn1, inDataPitch1); + + /* load data from 5th input row */ + vaInData1 = MORPH_OP_PRIME_2Nx8(pdvecIn1); + IVP_LA2NX8_XP(dvecInData5, vaInData1, pdvecIn1, inDataPitch1 * enable2Row); + + /* load data from 6th input row */ + vaInData1 = MORPH_OP_PRIME_2Nx8(pdvecIn1); + IVP_LA2NX8_XP(dvecInData6, vaInData1, pdvecIn1, inDataPitch1 * 
enable2Row); + + /* load data from 7th input row */ + vaInData1 = MORPH_OP_PRIME_2Nx8(pdvecIn1); + IVP_LA2NX8_XP(dvecInData7, vaInData1, pdvecIn1, \ + inDataPitch2 - (4 + 2 * enable2Row) * inDataPitch1); + + /* load all the 5x5 coefficients for 4 output depths*/ + IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch2); + IVP_LAV2NX8_XP(dvecCoeffData2, vaCoeffData2, pdvecCoeff2, coeffPitch2); + IVP_LAV2NX8_XP(dvecCoeffData3, vaCoeffData3, pdvecCoeff3, coeffPitch2); + IVP_LAV2NX8_XP(dvecCoeffData4, vaCoeffData4, pdvecCoeff4, coeffPitch2); + + /* Rearrange them so that max no. of qual multipliers can be used */ + dvecCoeffData1 = IVP_SHFL2NX8(dvecCoeffData1, dvecIdx); + dvecCoeffData2 = IVP_SHFL2NX8(dvecCoeffData2, dvecIdx); + dvecCoeffData3 = IVP_SHFL2NX8(dvecCoeffData3, dvecIdx); + dvecCoeffData4 = IVP_SHFL2NX8(dvecCoeffData4, dvecIdx); + + /* 32 elements from 1st row and 32 elements from 2nd row are concatenated here + * If 1st input row is 0,1,2,3,4,5,6,7,8,9,...63, and the 2nd input row is + * 64,65,66,67.........126,127, Data should be arranged as + * + * dvecData1 : 0, 2, 4,...58,60,62,64,66,68,...122,124,126 + * dvecData2 : 1, 3, 5,...59,61,63,65,67,69,...123,125,127 + * dvecData3 : 3, 4, 6,...60,62,0 ,66,68,70,...124,126,0 + * dvecData4 : 4, 6, 8,...61,63,0 ,67,69,71,...125,127,0 + * dvecData5 : 5, 7, 9,...62,0 ,0 ,68,70,72,...126,0 ,0 + * + * Lower half of the vectors contain data from 1st input row and + * upper half of the vectors contain data from 2nd output row. 
+ * + */ + IVP_DSEL2NX8I(dvecData2, dvecData1, dvecInData3, dvecInData1, IVP_DSELI_8B_DEINTERLEAVE_1); + dvecData3 = IVP_SEL2NX8I(dvecData1, dvecData1, IVP_SELI_8B_ROTATE_RIGHT_1); + dvecData4 = IVP_SEL2NX8I(dvecData2, dvecData2, IVP_SELI_8B_ROTATE_RIGHT_1); + + /* dvecData5 is kept separately and is used by quad multiplier finally */ + dvecData51 = IVP_SEL2NX8I(dvecData1, dvecData1, IVP_SELI_8B_ROTATE_RIGHT_2); + + /* Values corresponding to first and second row are packed in one register + so that same coefficient will get multiplied to them */ + /* Multiply and accumulate 4 coefficients from 1st set of 5 coeff for all the outputs */ + MORPH_OP_MULQA(dacc1, dvecData4, dvecData3, dvecData2, dvecData1, + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MULQA(dacc2, dvecData4, dvecData3, dvecData2, dvecData1, + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 0)); + MORPH_OP_MULQA(dacc3, dvecData4, dvecData3, dvecData2, dvecData1, + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 0)); + MORPH_OP_MULQA(dacc4, dvecData4, dvecData3, dvecData2, dvecData1, + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData4)), 0)); + + + IVP_DSEL2NX8I(dvecData2, dvecData1, dvecInData4, dvecInData2, IVP_DSELI_8B_DEINTERLEAVE_1); + IVP_DSEL2NX8I(dvecData52, dvecData3, dvecData1, dvecData1, IVP_DSELI_8B_ROTATE_RIGHT_2_1); + dvecData4 = IVP_SEL2NX8I(dvecData2, dvecData2, IVP_SELI_8B_ROTATE_RIGHT_1); + + /* Multiply and accumulate 4 coefficients from 2nd set of 5 coeff for all the outputs */ + MORPH_OP_MULQA(dacc1, dvecData4, dvecData3, dvecData2, dvecData1, + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + MORPH_OP_MULQA(dacc2, dvecData4, dvecData3, dvecData2, dvecData1, + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 1)); + MORPH_OP_MULQA(dacc3, dvecData4, dvecData3, 
dvecData2, dvecData1, + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 1)); + MORPH_OP_MULQA(dacc4, dvecData4, dvecData3, dvecData2, dvecData1, + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData4)), 1)); + + + IVP_DSEL2NX8I(dvecData2, dvecData1, dvecInData5, dvecInData3, IVP_DSELI_8B_DEINTERLEAVE_1); + IVP_DSEL2NX8I(dvecData53, dvecData3, dvecData1, dvecData1, IVP_DSELI_8B_ROTATE_RIGHT_2_1); + dvecData4 = IVP_SEL2NX8I(dvecData2, dvecData2, IVP_SELI_8B_ROTATE_RIGHT_1); + + /* Multiply and accumulate 4 coefficients from 3rd set of 5 coeff for all the outputs */ + MORPH_OP_MULQA(dacc1, dvecData4, dvecData3, dvecData2, dvecData1, + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 2)); + MORPH_OP_MULQA(dacc2, dvecData4, dvecData3, dvecData2, dvecData1, + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 2)); + MORPH_OP_MULQA(dacc3, dvecData4, dvecData3, dvecData2, dvecData1, + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 2)); + MORPH_OP_MULQA(dacc4, dvecData4, dvecData3, dvecData2, dvecData1, + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData4)), 2)); + + + IVP_DSEL2NX8I(dvecData2, dvecData1, dvecInData6, dvecInData4, IVP_DSELI_8B_DEINTERLEAVE_1); + IVP_DSEL2NX8I(dvecData54, dvecData3, dvecData1, dvecData1, IVP_DSELI_8B_ROTATE_RIGHT_2_1); + dvecData4 = IVP_SEL2NX8I(dvecData2, dvecData2, IVP_SELI_8B_ROTATE_RIGHT_1); + + /* Multiply and accumulate 4 coefficients from 4th set of 5 coeff for all the outputs */ + MORPH_OP_MULQA(dacc1, dvecData4, dvecData3, dvecData2, dvecData1, + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 3)); + MORPH_OP_MULQA(dacc2, dvecData4, dvecData3, dvecData2, dvecData1, + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 3)); + MORPH_OP_MULQA(dacc3, dvecData4, dvecData3, dvecData2, 
dvecData1, + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 3)); + MORPH_OP_MULQA(dacc4, dvecData4, dvecData3, dvecData2, dvecData1, + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData4)), 3)); + + + IVP_DSEL2NX8I(dvecData2, dvecData1, dvecInData7, dvecInData5, IVP_DSELI_8B_DEINTERLEAVE_1); + IVP_DSEL2NX8I(dvecData55, dvecData3, dvecData1, dvecData1, IVP_DSELI_8B_ROTATE_RIGHT_2_1); + dvecData4 = IVP_SEL2NX8I(dvecData2, dvecData2, IVP_SELI_8B_ROTATE_RIGHT_1); + + /* Multiply and accumulate 4 coefficients from 5th set of 5 coeff for all the outputs */ + MORPH_OP_MULQA(dacc1, dvecData4, dvecData3, dvecData2, dvecData1, + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 4)); + MORPH_OP_MULQA(dacc2, dvecData4, dvecData3, dvecData2, dvecData1, + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 4)); + MORPH_OP_MULQA(dacc3, dvecData4, dvecData3, dvecData2, dvecData1, + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 4)); + MORPH_OP_MULQA(dacc4, dvecData4, dvecData3, dvecData2, dvecData1, + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData4)), 4)); + + + /* Multiply and acc last coefficient from 1st 4 sets of coeffs for all the outputs*/ + MORPH_OP_MULQA(dacc1, dvecData54, dvecData53, dvecData52, dvecData51, + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 5)); + MORPH_OP_MULQA(dacc2, dvecData54, dvecData53, dvecData52, dvecData51, + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 5)); + MORPH_OP_MULQA(dacc3, dvecData54, dvecData53, dvecData52, dvecData51, + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 5)); + MORPH_OP_MULQA(dacc4, dvecData54, dvecData53, dvecData52, dvecData51, + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData4)), 5)); + + + /* Multiply 
and acc last coefficient(24) with the last row from 2 output channels */ + MORPH_OP_MULA(dacc1, dvecData55, IVP_EXTR2NX8(dvecCoeffData1, 24)); + MORPH_OP_MULA(dacc2, dvecData55, IVP_EXTR2NX8(dvecCoeffData2, 24)); + MORPH_OP_MULA(dacc3, dvecData55, IVP_EXTR2NX8(dvecCoeffData3, 24)); + MORPH_OP_MULA(dacc4, dvecData55, IVP_EXTR2NX8(dvecCoeffData4, 24)); + +/**************************************** 2nd inCh *********************************************/ + /* load data from 1st input row */ + vaInData1 = MORPH_OP_PRIME_2Nx8(pdvecIn1); + IVP_LA2NX8_XP(dvecInData1, vaInData1, pdvecIn1, inDataPitch1); + + /* load data from 2nd input row */ + vaInData1 = MORPH_OP_PRIME_2Nx8(pdvecIn1); + IVP_LA2NX8_XP(dvecInData2, vaInData1, pdvecIn1, inDataPitch1); + + /* load data from 3rd input row */ + vaInData1 = MORPH_OP_PRIME_2Nx8(pdvecIn1); + IVP_LA2NX8_XP(dvecInData3, vaInData1, pdvecIn1, inDataPitch1); + + /* load data from 4th input row */ + vaInData1 = MORPH_OP_PRIME_2Nx8(pdvecIn1); + IVP_LA2NX8_XP(dvecInData4, vaInData1, pdvecIn1, inDataPitch1); + + /* load data from 5th input row */ + vaInData1 = MORPH_OP_PRIME_2Nx8(pdvecIn1); + IVP_LA2NX8_XP(dvecInData5, vaInData1, pdvecIn1, inDataPitch1 * enable2Row); + + /* load data from 6th input row */ + vaInData1 = MORPH_OP_PRIME_2Nx8(pdvecIn1); + IVP_LA2NX8_XP(dvecInData6, vaInData1, pdvecIn1, inDataPitch1 * enable2Row); + + /* load data from 7th input row */ + vaInData1 = MORPH_OP_PRIME_2Nx8(pdvecIn1); + IVP_LA2NX8_XP(dvecInData7, vaInData1, pdvecIn1, \ + inDataPitch2 - (4 + 2 * enable2Row) * inDataPitch1); + + /* load all the 5x5 coefficients for 4 output depths*/ + IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch2); + IVP_LAV2NX8_XP(dvecCoeffData2, vaCoeffData2, pdvecCoeff2, coeffPitch2); + IVP_LAV2NX8_XP(dvecCoeffData3, vaCoeffData3, pdvecCoeff3, coeffPitch2); + IVP_LAV2NX8_XP(dvecCoeffData4, vaCoeffData4, pdvecCoeff4, coeffPitch2); + + /* Rearrange them so that max no. 
of qual multipliers can be used */ + dvecCoeffData1 = IVP_SHFL2NX8(dvecCoeffData1, dvecIdx); + dvecCoeffData2 = IVP_SHFL2NX8(dvecCoeffData2, dvecIdx); + dvecCoeffData3 = IVP_SHFL2NX8(dvecCoeffData3, dvecIdx); + dvecCoeffData4 = IVP_SHFL2NX8(dvecCoeffData4, dvecIdx); + + IVP_DSEL2NX8I(dvecData2, dvecData1, dvecInData3, dvecInData1, IVP_DSELI_8B_DEINTERLEAVE_1); + dvecData3 = IVP_SEL2NX8I(dvecData1, dvecData1, IVP_SELI_8B_ROTATE_RIGHT_1); + dvecData4 = IVP_SEL2NX8I(dvecData2, dvecData2, IVP_SELI_8B_ROTATE_RIGHT_1); + + /* dvecData5 is kept separately and is used by quad multiplier finally */ + dvecData51 = IVP_SEL2NX8I(dvecData1, dvecData1, IVP_SELI_8B_ROTATE_RIGHT_2); + + /* Values corresponding to first and second row are packed in one register + so that same coefficient will get multiplied to them */ + /* Multiply and accumulate 4 coefficients from 1st set of 5 coeff for all the outputs */ + MORPH_OP_MULQA(dacc1, dvecData4, dvecData3, dvecData2, dvecData1, + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MULQA(dacc2, dvecData4, dvecData3, dvecData2, dvecData1, + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 0)); + MORPH_OP_MULQA(dacc3, dvecData4, dvecData3, dvecData2, dvecData1, + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 0)); + MORPH_OP_MULQA(dacc4, dvecData4, dvecData3, dvecData2, dvecData1, + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData4)), 0)); + + + IVP_DSEL2NX8I(dvecData2, dvecData1, dvecInData4, dvecInData2, IVP_DSELI_8B_DEINTERLEAVE_1); + IVP_DSEL2NX8I(dvecData52, dvecData3, dvecData1, dvecData1, IVP_DSELI_8B_ROTATE_RIGHT_2_1); + dvecData4 = IVP_SEL2NX8I(dvecData2, dvecData2, IVP_SELI_8B_ROTATE_RIGHT_1); + + /* Multiply and accumulate 4 coefficients from 2nd set of 5 coeff for all the outputs */ + MORPH_OP_MULQA(dacc1, dvecData4, dvecData3, dvecData2, dvecData1, + 
IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + MORPH_OP_MULQA(dacc2, dvecData4, dvecData3, dvecData2, dvecData1, + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 1)); + MORPH_OP_MULQA(dacc3, dvecData4, dvecData3, dvecData2, dvecData1, + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 1)); + MORPH_OP_MULQA(dacc4, dvecData4, dvecData3, dvecData2, dvecData1, + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData4)), 1)); + + + IVP_DSEL2NX8I(dvecData2, dvecData1, dvecInData5, dvecInData3, IVP_DSELI_8B_DEINTERLEAVE_1); + IVP_DSEL2NX8I(dvecData53, dvecData3, dvecData1, dvecData1, IVP_DSELI_8B_ROTATE_RIGHT_2_1); + dvecData4 = IVP_SEL2NX8I(dvecData2, dvecData2, IVP_SELI_8B_ROTATE_RIGHT_1); + + /* Multiply and accumulate 4 coefficients from 3rd set of 5 coeff for all the outputs */ + MORPH_OP_MULQA(dacc1, dvecData4, dvecData3, dvecData2, dvecData1, + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 2)); + MORPH_OP_MULQA(dacc2, dvecData4, dvecData3, dvecData2, dvecData1, + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 2)); + MORPH_OP_MULQA(dacc3, dvecData4, dvecData3, dvecData2, dvecData1, + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 2)); + MORPH_OP_MULQA(dacc4, dvecData4, dvecData3, dvecData2, dvecData1, + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData4)), 2)); + + + IVP_DSEL2NX8I(dvecData2, dvecData1, dvecInData6, dvecInData4, IVP_DSELI_8B_DEINTERLEAVE_1); + IVP_DSEL2NX8I(dvecData54, dvecData3, dvecData1, dvecData1, IVP_DSELI_8B_ROTATE_RIGHT_2_1); + dvecData4 = IVP_SEL2NX8I(dvecData2, dvecData2, IVP_SELI_8B_ROTATE_RIGHT_1); + + /* Multiply and accumulate 4 coefficients from 4th set of 5 coeff for all the outputs */ + MORPH_OP_MULQA(dacc1, dvecData4, dvecData3, dvecData2, dvecData1, + 
IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 3)); + MORPH_OP_MULQA(dacc2, dvecData4, dvecData3, dvecData2, dvecData1, + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 3)); + MORPH_OP_MULQA(dacc3, dvecData4, dvecData3, dvecData2, dvecData1, + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 3)); + MORPH_OP_MULQA(dacc4, dvecData4, dvecData3, dvecData2, dvecData1, + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData4)), 3)); + + + IVP_DSEL2NX8I(dvecData2, dvecData1, dvecInData7, dvecInData5, IVP_DSELI_8B_DEINTERLEAVE_1); + IVP_DSEL2NX8I(dvecData55, dvecData3, dvecData1, dvecData1, IVP_DSELI_8B_ROTATE_RIGHT_2_1); + dvecData4 = IVP_SEL2NX8I(dvecData2, dvecData2, IVP_SELI_8B_ROTATE_RIGHT_1); + + /* Multiply and accumulate 4 coefficients from 5th set of 5 coeff for all the outputs */ + MORPH_OP_MULQA(dacc1, dvecData4, dvecData3, dvecData2, dvecData1, + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 4)); + MORPH_OP_MULQA(dacc2, dvecData4, dvecData3, dvecData2, dvecData1, + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 4)); + MORPH_OP_MULQA(dacc3, dvecData4, dvecData3, dvecData2, dvecData1, + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 4)); + MORPH_OP_MULQA(dacc4, dvecData4, dvecData3, dvecData2, dvecData1, + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData4)), 4)); + + + /* Multiply and acc last coefficient from 1st 4 sets of coeffs for all the outputs*/ + MORPH_OP_MULQA(dacc1, dvecData54, dvecData53, dvecData52, dvecData51, + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 5)); + MORPH_OP_MULQA(dacc2, dvecData54, dvecData53, dvecData52, dvecData51, + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 5)); + MORPH_OP_MULQA(dacc3, dvecData54, 
dvecData53, dvecData52, dvecData51, + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 5)); + MORPH_OP_MULQA(dacc4, dvecData54, dvecData53, dvecData52, dvecData51, + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData4)), 5)); + + + /* Multiply and acc last coefficient(24) with the last row from 2 output channels */ + MORPH_OP_MULA(dacc1, dvecData55, IVP_EXTR2NX8(dvecCoeffData1, 24)); + MORPH_OP_MULA(dacc2, dvecData55, IVP_EXTR2NX8(dvecCoeffData2, 24)); + MORPH_OP_MULA(dacc3, dvecData55, IVP_EXTR2NX8(dvecCoeffData3, 24)); + MORPH_OP_MULA(dacc4, dvecData55, IVP_EXTR2NX8(dvecCoeffData4, 24)); + +/**************************************** 3rd inCh *********************************************/ + /* load data from 1st input row */ + valign vaInData2 = MORPH_OP_PRIME_2Nx8(pdvecIn2); + IVP_LA2NX8_XP(dvecInData1, vaInData2, pdvecIn2, inDataPitch1); + + /* load data from 2nd input row */ + vaInData2 = MORPH_OP_PRIME_2Nx8(pdvecIn2); + IVP_LA2NX8_XP(dvecInData2, vaInData2, pdvecIn2, inDataPitch1); + + /* load data from 3rd input row */ + vaInData2 = MORPH_OP_PRIME_2Nx8(pdvecIn2); + IVP_LA2NX8_XP(dvecInData3, vaInData2, pdvecIn2, inDataPitch1); + + /* load data from 4th input row */ + vaInData2 = MORPH_OP_PRIME_2Nx8(pdvecIn2); + IVP_LA2NX8_XP(dvecInData4, vaInData2, pdvecIn2, inDataPitch1); + + /* load data from 5th input row */ + vaInData2 = MORPH_OP_PRIME_2Nx8(pdvecIn2); + IVP_LA2NX8_XP(dvecInData5, vaInData2, pdvecIn2, inDataPitch1 * enable2Row); + + /* load data from 6th input row */ + vaInData2 = MORPH_OP_PRIME_2Nx8(pdvecIn2); + IVP_LA2NX8_XP(dvecInData6, vaInData2, pdvecIn2, inDataPitch1 * enable2Row); + + /* load data from 7th input row */ + vaInData2 = MORPH_OP_PRIME_2Nx8(pdvecIn2); + IVP_LA2NX8_XP(dvecInData7, vaInData2, pdvecIn2, \ + inDataPitch2 - (4 + 2 * enable2Row) * inDataPitch1); + + /* load all the 5x5 coefficients for 4 output depths*/ + IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, 
coeffPitch2); + IVP_LAV2NX8_XP(dvecCoeffData2, vaCoeffData2, pdvecCoeff2, coeffPitch2); + IVP_LAV2NX8_XP(dvecCoeffData3, vaCoeffData3, pdvecCoeff3, coeffPitch2); + IVP_LAV2NX8_XP(dvecCoeffData4, vaCoeffData4, pdvecCoeff4, coeffPitch2); + + /* Rearrange them so that max no. of qual multipliers can be used */ + dvecCoeffData1 = IVP_SHFL2NX8(dvecCoeffData1, dvecIdx); + dvecCoeffData2 = IVP_SHFL2NX8(dvecCoeffData2, dvecIdx); + dvecCoeffData3 = IVP_SHFL2NX8(dvecCoeffData3, dvecIdx); + dvecCoeffData4 = IVP_SHFL2NX8(dvecCoeffData4, dvecIdx); + + IVP_DSEL2NX8I(dvecData2, dvecData1, dvecInData3, dvecInData1, IVP_DSELI_8B_DEINTERLEAVE_1); + dvecData3 = IVP_SEL2NX8I(dvecData1, dvecData1, IVP_SELI_8B_ROTATE_RIGHT_1); + dvecData4 = IVP_SEL2NX8I(dvecData2, dvecData2, IVP_SELI_8B_ROTATE_RIGHT_1); + + /* dvecData5 is kept separately and is used by quad multiplier finally */ + dvecData51 = IVP_SEL2NX8I(dvecData1, dvecData1, IVP_SELI_8B_ROTATE_RIGHT_2); + + /* Values corresponding to first and second row are packed in one register + so that same coefficient will get multiplied to them */ + /* Multiply and accumulate 4 coefficients from 1st set of 5 coeff for all the outputs */ + MORPH_OP_MULQA(dacc1, dvecData4, dvecData3, dvecData2, dvecData1, + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MULQA(dacc2, dvecData4, dvecData3, dvecData2, dvecData1, + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 0)); + MORPH_OP_MULQA(dacc3, dvecData4, dvecData3, dvecData2, dvecData1, + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 0)); + MORPH_OP_MULQA(dacc4, dvecData4, dvecData3, dvecData2, dvecData1, + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData4)), 0)); + + + IVP_DSEL2NX8I(dvecData2, dvecData1, dvecInData4, dvecInData2, IVP_DSELI_8B_DEINTERLEAVE_1); + IVP_DSEL2NX8I(dvecData52, dvecData3, dvecData1, dvecData1, IVP_DSELI_8B_ROTATE_RIGHT_2_1); + 
dvecData4 = IVP_SEL2NX8I(dvecData2, dvecData2, IVP_SELI_8B_ROTATE_RIGHT_1); + + /* Multiply and accumulate 4 coefficients from 2nd set of 5 coeff for all the outputs */ + MORPH_OP_MULQA(dacc1, dvecData4, dvecData3, dvecData2, dvecData1, + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + MORPH_OP_MULQA(dacc2, dvecData4, dvecData3, dvecData2, dvecData1, + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 1)); + MORPH_OP_MULQA(dacc3, dvecData4, dvecData3, dvecData2, dvecData1, + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 1)); + MORPH_OP_MULQA(dacc4, dvecData4, dvecData3, dvecData2, dvecData1, + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData4)), 1)); + + + IVP_DSEL2NX8I(dvecData2, dvecData1, dvecInData5, dvecInData3, IVP_DSELI_8B_DEINTERLEAVE_1); + IVP_DSEL2NX8I(dvecData53, dvecData3, dvecData1, dvecData1, IVP_DSELI_8B_ROTATE_RIGHT_2_1); + dvecData4 = IVP_SEL2NX8I(dvecData2, dvecData2, IVP_SELI_8B_ROTATE_RIGHT_1); + + /* Multiply and accumulate 4 coefficients from 3rd set of 5 coeff for all the outputs */ + MORPH_OP_MULQA(dacc1, dvecData4, dvecData3, dvecData2, dvecData1, + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 2)); + MORPH_OP_MULQA(dacc2, dvecData4, dvecData3, dvecData2, dvecData1, + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 2)); + MORPH_OP_MULQA(dacc3, dvecData4, dvecData3, dvecData2, dvecData1, + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 2)); + MORPH_OP_MULQA(dacc4, dvecData4, dvecData3, dvecData2, dvecData1, + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData4)), 2)); + + IVP_DSEL2NX8I(dvecData2, dvecData1, dvecInData6, dvecInData4, IVP_DSELI_8B_DEINTERLEAVE_1); + IVP_DSEL2NX8I(dvecData54, dvecData3, dvecData1, dvecData1, IVP_DSELI_8B_ROTATE_RIGHT_2_1); + dvecData4 = 
IVP_SEL2NX8I(dvecData2, dvecData2, IVP_SELI_8B_ROTATE_RIGHT_1); + + /* Multiply and accumulate 4 coefficients from 4th set of 5 coeff for all the outputs */ + MORPH_OP_MULQA(dacc1, dvecData4, dvecData3, dvecData2, dvecData1, + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 3)); + MORPH_OP_MULQA(dacc2, dvecData4, dvecData3, dvecData2, dvecData1, + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 3)); + MORPH_OP_MULQA(dacc3, dvecData4, dvecData3, dvecData2, dvecData1, + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 3)); + MORPH_OP_MULQA(dacc4, dvecData4, dvecData3, dvecData2, dvecData1, + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData4)), 3)); + + + IVP_DSEL2NX8I(dvecData2, dvecData1, dvecInData7, dvecInData5, IVP_DSELI_8B_DEINTERLEAVE_1); + IVP_DSEL2NX8I(dvecData55, dvecData3, dvecData1, dvecData1, IVP_DSELI_8B_ROTATE_RIGHT_2_1); + dvecData4 = IVP_SEL2NX8I(dvecData2, dvecData2, IVP_SELI_8B_ROTATE_RIGHT_1); + + /* Multiply and accumulate 4 coefficients from 5th set of 5 coeff for all the outputs */ + MORPH_OP_MULQA(dacc1, dvecData4, dvecData3, dvecData2, dvecData1, + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 4)); + MORPH_OP_MULQA(dacc2, dvecData4, dvecData3, dvecData2, dvecData1, + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 4)); + MORPH_OP_MULQA(dacc3, dvecData4, dvecData3, dvecData2, dvecData1, + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 4)); + MORPH_OP_MULQA(dacc4, dvecData4, dvecData3, dvecData2, dvecData1, + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData4)), 4)); + + + /* Multiply and acc last coefficient from 1st 4 sets of coeffs for all the outputs*/ + MORPH_OP_MULQA(dacc1, dvecData54, dvecData53, dvecData52, dvecData51, + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + 
IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 5)); + MORPH_OP_MULQA(dacc2, dvecData54, dvecData53, dvecData52, dvecData51, + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 5)); + MORPH_OP_MULQA(dacc3, dvecData54, dvecData53, dvecData52, dvecData51, + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 5)); + MORPH_OP_MULQA(dacc4, dvecData54, dvecData53, dvecData52, dvecData51, + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData4)), 5)); + + + /* Multiply and acc last coefficient(24) with the last row from 2 output channels */ + MORPH_OP_MULA(dacc1, dvecData55, IVP_EXTR2NX8(dvecCoeffData1, 24)); + MORPH_OP_MULA(dacc2, dvecData55, IVP_EXTR2NX8(dvecCoeffData2, 24)); + MORPH_OP_MULA(dacc3, dvecData55, IVP_EXTR2NX8(dvecCoeffData3, 24)); + MORPH_OP_MULA(dacc4, dvecData55, IVP_EXTR2NX8(dvecCoeffData4, 24)); + + /* Pack, Output Scale, Output Shift and clamping */ + xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L; + xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H; +#if DILATED_VQ_CONV == VQ_TRUE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \ + pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \ + pOutScaleData[outCh + enable2ndCh ], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc3, packShiftAccU, \ + pOutScaleData[outCh + 2 * enable3rdCh ], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc4, packShiftAccU, \ + pOutScaleData[outCh + 3 * enable4thCh ], outShiftU, minLim, maxLim, typeFlag); +#elif DILATED_VQ_CONV == VQ_FALSE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, 
typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc3, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc4, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); +#endif + /* variable length for output stores */ + int32_t varLen = XT_MIN(vectorizationWidth, outW - x); + + /* Storing the first output depth, first row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput); + valign vaOutData = IVP_ZALIGN(); + IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * varLen); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the second output depth, first row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch2 * enable2ndCh * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * enable2ndCh * varLen); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the third output depth, first row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + 2 * outDataPitch2 * enable3rdCh * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * enable3rdCh * varLen); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the fourth output depth, first row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + 3 * outDataPitch2 * enable4thCh * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * enable4thCh * varLen); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the first output depth, second row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch1 * enable2Row * bytesPerPixel); + IVP_SAV2NX8_XP(IVP_SEL2NX8I(dvecOut1L, dvecOut1L, IVP_SELI_EXTRACT_HI_HALVES), + vaOutData, pdvecOut, enable2Row * (-typeFlag + 1) * varLen); + IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, enable2Row * typeFlag * 2 * varLen); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the second output depth, second row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + (outDataPitch2 * enable2ndCh + \ + outDataPitch1 * enable2Row) * 
bytesPerPixel); + IVP_SAV2NX8_XP(IVP_SEL2NX8I(dvecOut2L, dvecOut2L, IVP_SELI_EXTRACT_HI_HALVES), + vaOutData, pdvecOut, enable2ndCh * enable2Row * (-typeFlag + 1) * varLen); + IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, enable2ndCh * \ + enable2Row * typeFlag * 2 * varLen); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the third output depth, second row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + (2 * outDataPitch2 * enable3rdCh + \ + outDataPitch1 * enable2Row) * bytesPerPixel); + IVP_SAV2NX8_XP(IVP_SEL2NX8I(dvecOut3L, dvecOut3L, IVP_SELI_EXTRACT_HI_HALVES), + vaOutData, pdvecOut, enable3rdCh * enable2Row * (-typeFlag + 1) * varLen); + IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, enable3rdCh * \ + enable2Row * typeFlag * 2 * varLen); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the fourth output depth, second row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + (3 * outDataPitch2 * enable4thCh + \ + outDataPitch1 * enable2Row) * bytesPerPixel); + IVP_SAV2NX8_XP(IVP_SEL2NX8I(dvecOut4L, dvecOut4L, IVP_SELI_EXTRACT_HI_HALVES), + vaOutData, pdvecOut, enable4thCh * enable2Row * (-typeFlag + 1) * varLen); + IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, enable4thCh * \ + enable2Row * typeFlag * 2 * varLen); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + pOutput += 4 * outDataPitch2 * bytesPerPixel; + pCoeff += 4 * coeffPitch3; + } /* end of for (outCh = 0; outCh < numOutCh; outCh += 4) */ + } /* end of for (y = 0; y < outH; y += 2) */ + } /* end of for (x = 0; x < outW; x += vectorizationWidth) */ +} +/****************************************************************************************** +* xaiConvolved(VQ)3D_S_5x5j2d1I8S8IX_MOW_WHD +* ***************************************************************************************/ + +/******************************************************************************/ +/* Description : P6 optimized generic implementation for 5x5 3D convolution. 
*/ +/* Based on MORPH pre-processor specifiers, code implementation */ +/* is generated during preprocessing stage. This method can be */ +/* used to generate 5x5 3D dilated convolution function and 5x5 */ +/* 3D VQ dilated convolution function for U8 and S8 */ +/* input data with input stride equal to 2 */ +/* Inputs : Input Data Tile, Coeff Data Tile, Bias Array, */ +/* Output scale array, CNN convolution params structure */ +/* Outputs : XAI Error Code */ +/* InOuts : Output Tile */ +/* Assumptions : CoeffData is S8 */ +/* biasArray is signed 32b, value not exceeding signed 24b */ +/* Output scale array is U16 */ +/* OutData is S8 / U8 / S16 */ +/* Kernel Size is 5x5xDxN */ +/* Input and Output are in WHD format */ +/* Coeff is in WHDN format */ +/******************************************************************************/ + +/****************** xaiConvolvedVQ3D_S_5x5j2d1_S8S8IX_MOW_WHD ******************/ +/****************** xaiConvolvedVQ3D_S_5x5j2d1_U8S8IX_MOW_WHD ******************/ +/******************* xaiConvolved3D_S_5x5j2d1_S8S8IX_MOW_WHD *******************/ +/******************* xaiConvolved3D_S_5x5j2d1_U8S8IX_MOW_WHD *******************/ + +XAI_ERR_TYPE MAKE_NAME(MAKE_NAME_VQ(xaiConvolved, 3D_S_5x5j2d1), S8IX_MOW_WHD) MAKE_ARGUMENTS(inTile, coeffTile, biasArray, outTile, param) +{ + /* Error Checks */ + XAI_ERROR_CHECKS() + { + MORPH_IDT_CHECK(inTile); + XAI_CHECK_CONV_OUTPUT_TILE3D(outTile); + XAI_CHECK_TILE4D_S8(coeffTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile); + XAI_CHECK_TILE4D_IN_DRAM_BOUNDARY(coeffTile); + XAI_CHECK_POINTER(param); + XAI_CHECK_ARRAY_S32(biasArray); + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTile); + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(coeffTile, outTile); + XAI_CHECK_KERNEL_SIZE(coeffTile, 5); + XAI_CHECK_TILE3D_DATA_ORDER(inTile, XAI_WHD); + XAI_CHECK_TILE3D_DATA_ORDER(outTile, XAI_WHD); + XAI_CHECK_TILE4D_DATA_ORDER(coeffTile, XAI_WHDN); +
XAI_CHECK_DILATION(param, 1); + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_DILATIONX(param) == XAI_CNN_CONV_GET_DILATIONY(param), \ + XAI_ERR_BADARG, "Dilation along width = %hhu and height = %hhu\nDilation along width and height should be equal", \ + XAI_CNN_CONV_GET_DILATIONX(param), XAI_CNN_CONV_GET_DILATIONY(param)); + XAI_CHECK_TILE3D_EDGE(inTile, 2); + XAI_CHECK_STRIDE(param, 2); + XAI_CHECK_ERROR((XAI_CNN_CONV_GET_STRIDEX(param) == XAI_CNN_CONV_GET_STRIDEY(param)), \ + XAI_ERR_BADARG, "\nStride along width = %hhu and height = %hhu\nStride along width and height should be equal", \ + XAI_CNN_CONV_GET_STRIDEX(param), XAI_CNN_CONV_GET_STRIDEY(param)); + XAI_CHECK_CONSISTENCY_MOW_WHD(inTile, coeffTile, biasArray, outTile, param); + XAI_CHECK_COEFFTILE_CONTIGUOUS(coeffTile, param); + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_ACCUM_SHIFT(param) < 24, \ + XAI_ERR_NORM, "\nThe accumulator shift = %hhu, value should be less than 24", \ + XAI_CNN_CONV_GET_ACCUM_SHIFT(param)); + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_OUTPUT_SHIFT(param) < 32, \ + XAI_ERR_NORM, "\nThe output shift = %hhu, value should be less than 32", \ + XAI_CNN_CONV_GET_OUTPUT_SHIFT(param)); + XAI_CHECK_CONV_RELU_LIMITS_IX(param, outTile); +#if DILATED_VQ_CONV == VQ_TRUE + XAI_CHECK_ARRAY_U16(outputScaleArray); + XAI_CHECK_ERROR(XAI_ARRAY_GET_WIDTH(outputScaleArray) >= XAI_TILE4D_GET_DIM4(coeffTile), \ + XAI_ERR_DATASIZE, "\nWidth of Output Scale Array = %d, Number of Kernels = %d\nWidth of Output Scale Array should be greater than or equal to Number of Kernels", \ + XAI_ARRAY_GET_WIDTH(outputScaleArray), XAI_TILE4D_GET_DIM4(coeffTile)); + XAI_CHECK_ERROR((((uintptr_t) (XAI_ARRAY_GET_DATA_PTR(outputScaleArray)) & \ + 0x1) == 0), XAI_ERR_NORM, "The output scale array is not aligned to 2 byte boundary"); +#endif + } +#if DILATED_VQ_CONV == VQ_FALSE + if (XAI_CNN_CONV_GET_OUTPUT_SCALE(param) == 0) + { + int32_t fillValue; + int32_t reluFlag = XAI_CNN_CONV_GET_FLAG_RELU(param); + fillValue = reluFlag ? 
(CLAMP(0, XAI_CNN_CONV_GET_RELU_MIN(param), XAI_CNN_CONV_GET_RELU_MAX(param))) : 0; + return(xaiFillTile3D(outTile, fillValue, 0)); + } +#endif + if (XAI_TILE3D_GET_DIM3(inTile) == 3) + { + MAKE_NAME(MAKE_NAME_VQ(convolved, 3D_S_5x5j2d1), S8IX_MOW_WHD_DEPTH3) MAKE_PARAMS(inTile, coeffTile, biasArray, outTile, param); + return(XAI_ERROR_STATUS()); + } + + /* Getting parameters from the tile structures */ + const int32_t outW = XAI_TILE3D_GET_DIM1(outTile); + const int32_t outH = XAI_TILE3D_GET_DIM2(outTile); + const int32_t numInCh = XAI_TILE3D_GET_DIM3(inTile); + const int32_t numOutCh = XAI_TILE3D_GET_DIM3(outTile); + + /* Pitches of Coefficient Data (WHDN) in dim2 and dim3 */ + const int32_t coeffPitch2 = XAI_TILE4D_GET_DIM2_PITCH(coeffTile); + const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile); + + /* Pitches of Input Data (WHD) in dim1 and dim2 */ + const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile); + const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile); + + /* Pitch of Output Data (WHD) in dim1 and dim2 */ + const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile); + const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile); + + /* Kernel Size (WHDN)*/ + const int32_t kSizeU = XAI_TILE4D_GET_DIM1(coeffTile); + + /* CNN convolution parameters */ + const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param); + const uint8_t outShiftU = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param); + const uint8_t enableReLu = XAI_CNN_CONV_GET_FLAG_RELU(param); + const uint8_t stride = XAI_CNN_CONV_GET_STRIDE(param); + + /* Data Pointers of input, output, coefficient and bias data */ + MORPH_IDT_SCALAR* pInData = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(inTile); + int8_t* pOutData = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile); + int32_t* pBiasData = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray); + int8_t* pCoeffData = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile); +#if DILATED_VQ_CONV == VQ_TRUE + uint16_t* restrict
pOutScaleData = (uint16_t *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray); +#elif DILATED_VQ_CONV == VQ_FALSE + const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param); +#endif + /* Move pointer to the start of the data (including edge) */ + pInData = &pInData[-((kSizeU / 2) * inDataPitch1 + (kSizeU / 2))]; + + /* Setting the limits for output data according to ReLu Flag and outTileType */ + int32_t minLim, maxLim; + if (enableReLu) + { + minLim = XAI_CNN_CONV_GET_RELU_MIN(param); + maxLim = XAI_CNN_CONV_GET_RELU_MAX(param); + } + else + { + minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \ + SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0); + maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \ + : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX); + } + const int8_t typeFlag = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0; + const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile); + + MORPH_IDT_2Nx8* restrict pdvecIn1; + xb_vec2Nx8* restrict pdvecOut; + xb_vec2Nx8* restrict pdvecCoeff1; + xb_vec2Nx8* restrict pdvecCoeff2; + xb_vec2Nx8* restrict pdvecCoeff3; + + /* Variable Declarations */ + int32_t inCh, outCh, x, y; + /* In order to make the loop multiply-bound we are reducing the vectorization width + by extra values required for the kernel */ + const int32_t vectorizationWidth = (((2 * XCHAL_IVPN_SIMD_WIDTH) - kSizeU) / stride) + 1; + + /* Since there are 25 coefficients for 1 output channel, we can make use of 6 quad multipliers + * for generating 1 output. 
So we need to re-arrange the 25 coefficients in the pattern shown + * Pattern : 0 1 2 3 5 6 7 8 10 11 12 13 15 16 17 18 20 21 22 23 4 9 14 19 24 */ + xb_vec2Nx8 dvecIdx; + dvecIdx = IVP_SEL2NX8I(IVP_SEL2NX8I(IVP_ADD2NX8U(IVP_SEQ2NX8(), 15), + IVP_ADD2NX8U(IVP_SEQ2NX8(), 10), IVP_SELI_INTERLEAVE_2_LO), + IVP_SEL2NX8I(IVP_ADD2NX8U(IVP_SEQ2NX8(), 5), IVP_SEQ2NX8(), + IVP_SELI_INTERLEAVE_2_LO), IVP_SELI_INTERLEAVE_4_LO); + + dvecIdx = IVP_SEL2NX8I(IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16( + IVP_MOVNX16_FROMN_2X32(4 + (9 << 8) + (14 << 16) + (19 << 24))), + IVP_ADD2NX8U(IVP_SEQ2NX8(), 20), IVP_SELI_INTERLEAVE_2_LO), + dvecIdx, IVP_SELI_8B_PACK_16); + + /* loop across output depth is unrolled by 3 + * , producing lanes from 3 output channels + * in one iteration. Since vectorization width + * is just half the width of the accumulator, + * loop across output height is also unrolled by 2. + * Unrolling across output height makes it possible + * to utilize all the 64 MACs in the accumulator. + * + * Data loaded from the 2 input rows is concatenated + * in such a manner that lower half of the output + * vector gives the first output row and the upper + * half of the output vector gives the next output row. 
+ */ + for (x = 0; x < outW; x += vectorizationWidth) /* Loop across output width */ + { + for (y = 0; y < outH; y += 2) /* Loop across output height */ + { + /* In order to handle odd output heights */ + int32_t enable2Row = XT_SALT(y, outH - 1); + + /* initialize output data pointer */ + int8_t *pOutput = &pOutData[(y * outDataPitch1 + x) * bytesPerPixel]; + + /* initialize input data pointer */ + MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * stride * (y) + stride * (x)]; + + /* initialize coeff and Bias data pointer */ + int8_t *pCoeff = &pCoeffData[0]; + int32_t *pBias = &pBiasData[0]; + + for (outCh = 0; outCh < numOutCh; outCh += 3) /* Loop across Output depth */ + { + /* In order to handle odd output depths */ + int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1); + int32_t enable3rdCh = XT_SALT(outCh, numOutCh - 2); + + /* Load the bias values corresponding to two output channels */ + xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4 * enable2ndCh); + xb_vecN_2x32v hvecBias2; IVP_LSRN_2X32_XP(hvecBias2, pBias, 4 * enable3rdCh); + xb_vecN_2x32v hvecBias3; IVP_LSRN_2X32_XP(hvecBias3, pBias, 4); + + /* wide vectors(accumulators) initialized with bias */ + xb_vec2Nx24 dacc1, dacc2, dacc3; + dacc1 = IVP_CVT24UNX32L(hvecBias1, hvecBias1); + IVP_CVT24UNX32H(dacc1, hvecBias1, hvecBias1); + dacc2 = IVP_CVT24UNX32L(hvecBias2, hvecBias2); + IVP_CVT24UNX32H(dacc2, hvecBias2, hvecBias2); + dacc3 = IVP_CVT24UNX32L(hvecBias3, hvecBias3); + IVP_CVT24UNX32H(dacc3, hvecBias3, hvecBias3); + + /* priming of coeff load is done outside the innermost loop*/ + /* Coeff for 1st output channel */ + pdvecCoeff1 = (xb_vec2Nx8 *) (pCoeff); + valign vaCoeffData1; vaCoeffData1 = IVP_LA2NX8_PP(pdvecCoeff1); + /* Coeff for 2nd output channel */ + pdvecCoeff2 = (xb_vec2Nx8 *) (pCoeff + coeffPitch3 * enable2ndCh); + valign vaCoeffData2; vaCoeffData2 = IVP_LA2NX8_PP(pdvecCoeff2); + /* Coeff for 3rd output channel */ + pdvecCoeff3 = (xb_vec2Nx8 *) (pCoeff + 2 * coeffPitch3 * 
enable3rdCh); + valign vaCoeffData3; vaCoeffData3 = IVP_LA2NX8_PP(pdvecCoeff3); + + /* Input vector pointer initialization */ + pdvecIn1 = (MORPH_IDT_2Nx8 *) (pInput); + + for (inCh = 0; inCh < numInCh; inCh++) /* Loop across input channels */ + { + /* vectors for coeff and input loads */ + xb_vec2Nx8 dvecCoeffData1, dvecCoeffData2, dvecCoeffData3; + xb_vec2Nx8 dvecInData1, dvecInData2, dvecInData3, dvecInData4, \ + dvecInData5, dvecInData6, dvecInData7; + MORPH_IDT_2Nx8 dvecData1, dvecData2, dvecData3, dvecData4; + MORPH_IDT_2Nx8 dvecData51, dvecData52, dvecData53, dvecData54, dvecData55; + + /* load data from 1st input row */ + valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1); + IVP_LA2NX8_XP(dvecInData1, vaInData, pdvecIn1, inDataPitch1); + + /* load data from 2nd input row */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1); + IVP_LA2NX8_XP(dvecInData2, vaInData, pdvecIn1, inDataPitch1); + + /* load data from 3rd input row */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1); + IVP_LA2NX8_XP(dvecInData3, vaInData, pdvecIn1, inDataPitch1); + + /* load data from 4th input row */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1); + IVP_LA2NX8_XP(dvecInData4, vaInData, pdvecIn1, inDataPitch1); + + /* load data from 5th input row */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1); + IVP_LA2NX8_XP(dvecInData5, vaInData, pdvecIn1, inDataPitch1 * enable2Row); + + /* load data from 6th input row */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1); + IVP_LA2NX8_XP(dvecInData6, vaInData, pdvecIn1, inDataPitch1 * enable2Row); + + /* load data from 7th input row */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1); + IVP_LA2NX8_XP(dvecInData7, vaInData, pdvecIn1, \ + inDataPitch2 - (4 + 2 * enable2Row) * inDataPitch1); + + /* load all the 5x5 coefficients for 2 output depths*/ + IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch2); + IVP_LAV2NX8_XP(dvecCoeffData2, vaCoeffData2, pdvecCoeff2, coeffPitch2); + IVP_LAV2NX8_XP(dvecCoeffData3, vaCoeffData3, pdvecCoeff3, coeffPitch2); + + /* Rearrange 
them so that max no. of qual multipliers can be used */ + dvecCoeffData1 = IVP_SHFL2NX8(dvecCoeffData1, dvecIdx); + dvecCoeffData2 = IVP_SHFL2NX8(dvecCoeffData2, dvecIdx); + dvecCoeffData3 = IVP_SHFL2NX8(dvecCoeffData3, dvecIdx); + + /* 32 elements from 1st row and 32 elements from 2nd row are concatenated here + * If 1st input row is 0,1,2,3,4,5,6,7,8,9,...63, and the 2nd input row is + * 64,65,66,67.........126,127, Data should be arranged as + * + * dvecData1 : 0, 2, 4,...58,60,62,64,66,68,...122,124,126 + * dvecData2 : 1, 3, 5,...59,61,63,65,67,69,...123,125,127 + * dvecData3 : 3, 4, 6,...60,62,0 ,66,68,70,...124,126,0 + * dvecData4 : 4, 6, 8,...61,63,0 ,67,69,71,...125,127,0 + * dvecData5 : 5, 7, 9,...62,0 ,0 ,68,70,72,...126,0 ,0 + * + * Lower half of the vectors contain data from 1st input row and + * upper half of the vectors contain data from 2nd output row. + * + */ + + IVP_DSEL2NX8I(dvecData2, dvecData1, dvecInData3, dvecInData1, IVP_DSELI_8B_DEINTERLEAVE_1); + dvecData3 = IVP_SEL2NX8I(dvecData1, dvecData1, IVP_SELI_8B_ROTATE_RIGHT_1); + dvecData4 = IVP_SEL2NX8I(dvecData2, dvecData2, IVP_SELI_8B_ROTATE_RIGHT_1); + + /* dvecData5 is kept separately and is used by quad multiplier finally */ + dvecData51 = IVP_SEL2NX8I(dvecData1, dvecData1, IVP_SELI_8B_ROTATE_RIGHT_2); + + /* Values corresponding to first and second row are packed in one register + so that same coefficient will get multiplied to them */ + /* Multiply and accumulate 4 coefficients from 1st set of 5 coeff for all the outputs */ + MORPH_OP_MULQA(dacc1, dvecData4, dvecData3, dvecData2, dvecData1, + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MULQA(dacc2, dvecData4, dvecData3, dvecData2, dvecData1, + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 0)); + MORPH_OP_MULQA(dacc3, dvecData4, dvecData3, dvecData2, dvecData1, + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 0)); + 
+ + IVP_DSEL2NX8I(dvecData2, dvecData1, dvecInData4, dvecInData2, IVP_DSELI_8B_DEINTERLEAVE_1); + IVP_DSEL2NX8I(dvecData52, dvecData3, dvecData1, dvecData1, IVP_DSELI_8B_ROTATE_RIGHT_2_1); + dvecData4 = IVP_SEL2NX8I(dvecData2, dvecData2, IVP_SELI_8B_ROTATE_RIGHT_1); + + /* Multiply and accumulate 4 coefficients from 2nd set of 5 coeff for all the outputs */ + MORPH_OP_MULQA(dacc1, dvecData4, dvecData3, dvecData2, dvecData1, + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + MORPH_OP_MULQA(dacc2, dvecData4, dvecData3, dvecData2, dvecData1, + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 1)); + MORPH_OP_MULQA(dacc3, dvecData4, dvecData3, dvecData2, dvecData1, + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 1)); + + + IVP_DSEL2NX8I(dvecData2, dvecData1, dvecInData5, dvecInData3, IVP_DSELI_8B_DEINTERLEAVE_1); + IVP_DSEL2NX8I(dvecData53, dvecData3, dvecData1, dvecData1, IVP_DSELI_8B_ROTATE_RIGHT_2_1); + dvecData4 = IVP_SEL2NX8I(dvecData2, dvecData2, IVP_SELI_8B_ROTATE_RIGHT_1); + + /* Multiply and accumulate 4 coefficients from 3rd set of 5 coeff for all the outputs */ + MORPH_OP_MULQA(dacc1, dvecData4, dvecData3, dvecData2, dvecData1, + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 2)); + MORPH_OP_MULQA(dacc2, dvecData4, dvecData3, dvecData2, dvecData1, + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 2)); + MORPH_OP_MULQA(dacc3, dvecData4, dvecData3, dvecData2, dvecData1, + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 2)); + + + IVP_DSEL2NX8I(dvecData2, dvecData1, dvecInData6, dvecInData4, IVP_DSELI_8B_DEINTERLEAVE_1); + IVP_DSEL2NX8I(dvecData54, dvecData3, dvecData1, dvecData1, IVP_DSELI_8B_ROTATE_RIGHT_2_1); + dvecData4 = IVP_SEL2NX8I(dvecData2, dvecData2, IVP_SELI_8B_ROTATE_RIGHT_1); + + /* Multiply and accumulate 4 coefficients from 4th set of 5 
coeff for all the outputs */ + MORPH_OP_MULQA(dacc1, dvecData4, dvecData3, dvecData2, dvecData1, + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 3)); + MORPH_OP_MULQA(dacc2, dvecData4, dvecData3, dvecData2, dvecData1, + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 3)); + MORPH_OP_MULQA(dacc3, dvecData4, dvecData3, dvecData2, dvecData1, + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 3)); + + + IVP_DSEL2NX8I(dvecData2, dvecData1, dvecInData7, dvecInData5, IVP_DSELI_8B_DEINTERLEAVE_1); + IVP_DSEL2NX8I(dvecData55, dvecData3, dvecData1, dvecData1, IVP_DSELI_8B_ROTATE_RIGHT_2_1); + dvecData4 = IVP_SEL2NX8I(dvecData2, dvecData2, IVP_SELI_8B_ROTATE_RIGHT_1); + + /* Multiply and accumulate 4 coefficients from 5th set of 5 coeff for all the outputs */ + MORPH_OP_MULQA(dacc1, dvecData4, dvecData3, dvecData2, dvecData1, + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 4)); + MORPH_OP_MULQA(dacc2, dvecData4, dvecData3, dvecData2, dvecData1, + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 4)); + MORPH_OP_MULQA(dacc3, dvecData4, dvecData3, dvecData2, dvecData1, + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 4)); + + + /* Multiply and acc last coefficient from 1st 4 sets of coeffs for all the outputs*/ + MORPH_OP_MULQA(dacc1, dvecData54, dvecData53, dvecData52, dvecData51, + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 5)); + MORPH_OP_MULQA(dacc2, dvecData54, dvecData53, dvecData52, dvecData51, + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 5)); + MORPH_OP_MULQA(dacc3, dvecData54, dvecData53, dvecData52, dvecData51, + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16( \ + IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 5)); + + + /* Multiply and acc last coefficient(24) with the last row from 2 output channels */ + 
MORPH_OP_MULA(dacc1, dvecData55, IVP_EXTR2NX8(dvecCoeffData1, 24)); + MORPH_OP_MULA(dacc2, dvecData55, IVP_EXTR2NX8(dvecCoeffData2, 24)); + MORPH_OP_MULA(dacc3, dvecData55, IVP_EXTR2NX8(dvecCoeffData3, 24)); + } /* end of for (inCh = 0; inCh < numInCh; inCh++) */ + + /* Pack, Output Scale, Output Shift and clamping */ + xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L; + xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H; +#if DILATED_VQ_CONV == VQ_TRUE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \ + pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \ + pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc3, packShiftAccU, \ + pOutScaleData[outCh + 2 * enable3rdCh ], outShiftU, minLim, maxLim, typeFlag); +#elif DILATED_VQ_CONV == VQ_FALSE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc3, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); +#endif + /* variable length for output stores */ + int32_t varLen = XT_MIN(vectorizationWidth, outW - x); + + /* Storing the first output depth, first row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput); + valign vaOutData = IVP_ZALIGN(); + IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * varLen); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the second output depth, first row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch2 * enable2ndCh * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * enable2ndCh * varLen); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the third output depth, first row */ + pdvecOut = 
(xb_vec2Nx8 *) (pOutput + 2 * outDataPitch2 * enable3rdCh * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * enable3rdCh * varLen); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + + /* Storing the first output depth, second row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch1 * enable2Row * bytesPerPixel); + IVP_SAV2NX8_XP(IVP_SEL2NX8I(dvecOut1L, dvecOut1L, IVP_SELI_EXTRACT_HI_HALVES), + vaOutData, pdvecOut, enable2Row * (-typeFlag + 1) * varLen); + IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, enable2Row * typeFlag * 2 * varLen); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the second output depth, second row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + (outDataPitch2 * enable2ndCh + \ + outDataPitch1 * enable2Row) * bytesPerPixel); + IVP_SAV2NX8_XP(IVP_SEL2NX8I(dvecOut2L, dvecOut2L, IVP_SELI_EXTRACT_HI_HALVES), + vaOutData, pdvecOut, enable2ndCh * enable2Row * (-typeFlag + 1) * varLen); + IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, enable2ndCh * \ + enable2Row * typeFlag * 2 * varLen); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the third output depth, second row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + (2 * outDataPitch2 * enable3rdCh + \ + outDataPitch1 * enable2Row) * bytesPerPixel); + IVP_SAV2NX8_XP(IVP_SEL2NX8I(dvecOut3L, dvecOut3L, IVP_SELI_EXTRACT_HI_HALVES), + vaOutData, pdvecOut, enable3rdCh * enable2Row * (-typeFlag + 1) * varLen); + IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, enable3rdCh * \ + enable2Row * typeFlag * 2 * varLen); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + pOutput += 3 * outDataPitch2 * bytesPerPixel; + pCoeff += 3 * coeffPitch3; + } /* end of for (outCh = 0; outCh < numOutCh; outCh += 3) */ + } /* end of for (y = 0; y < outH; y += 2) */ + } /* end of for (x = 0; x < outW; x += vectorizationWidth) */ + return(XAI_ERROR_STATUS()); +} + +/****************************************************************************************** +* xaiConvolved(VQ)3D_S_5x5j4d1I8S8IX_MOW_WHD 
+* ***************************************************************************************/ + +/******************************************************************************/ +/* Description : P6 optimized generic implementation for 5x5 3D convolution. */ +/* Based on MORPH pre-processor specifiers, code implementation */ +/* is generated during preprocessing stage. This method can be */ +/* used to generate 5x5 3D dilated convolution function and 5x5 */ +/* 3D VQ dilated convolution function for U8 bit and S8 bit */ +/* input data with input stride equal to 4 */ +/* Inputs : Input Data Tile, Coeff Data Tile, Bias Array, */ +/* Output scale array, CNN convolution params structure */ +/* Outputs : XI Error Code */ +/* InOuts : Output Tile */ +/* Assumptions : CoeffData is S8 */ +/* biasArray is signed 32b, value not exceeding signed 24b */ +/* Output scale array is U16 */ +/* OutData is S8 / U8 / S16 */ +/* Kernel Size is 5x5xDxN */ +/* Input and Output are in WHD format */ +/* Coeff is in WHDN format */ +/******************************************************************************/ + +/****************** xaiConvolvedVQ3D_S_5x5j4d1_S8S8IX_MOW_WHD ******************/ +/****************** xaiConvolvedVQ3D_S_5x5j4d1_U8S8IX_MOW_WHD ******************/ +/******************* xaiConvolved3D_S_5x5j4d1_S8S8IX_MOW_WHD *******************/ +/******************* xaiConvolved3D_S_5x5j4d1_U8S8IX_MOW_WHD *******************/ + +XAI_ERR_TYPE MAKE_NAME(MAKE_NAME_VQ(xaiConvolved, 3D_S_5x5j4d1), S8IX_MOW_WHD) MAKE_ARGUMENTS(inTile, coeffTile, biasArray, outTile, param) +{ + /* Error Checks */ + XAI_ERROR_CHECKS() + { + MORPH_IDT_CHECK(inTile); + XAI_CHECK_CONV_OUTPUT_TILE3D(outTile); + XAI_CHECK_TILE4D_S8(coeffTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile); + XAI_CHECK_TILE4D_IN_DRAM_BOUNDARY(coeffTile); + XAI_CHECK_POINTER(param); + XAI_CHECK_ARRAY_S32(biasArray); + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTile); + 
XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(coeffTile, outTile); + XAI_CHECK_KERNEL_SIZE(coeffTile, 5); + XAI_CHECK_TILE3D_DATA_ORDER(inTile, XAI_WHD); + XAI_CHECK_TILE3D_DATA_ORDER(outTile, XAI_WHD); + XAI_CHECK_TILE4D_DATA_ORDER(coeffTile, XAI_WHDN); + XAI_CHECK_DILATION(param, 1); + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_DILATIONX(param) == XAI_CNN_CONV_GET_DILATIONY(param), \ + XAI_ERR_BADARG, "Dilation along width = %hhu and height = %hhu\nDilation along width and height should be equal", \ + XAI_CNN_CONV_GET_DILATIONX(param), XAI_CNN_CONV_GET_DILATIONY(param)); + XAI_CHECK_TILE3D_EDGE(inTile, 2); + XAI_CHECK_STRIDE(param, 4); + XAI_CHECK_ERROR((XAI_CNN_CONV_GET_STRIDEX(param) == XAI_CNN_CONV_GET_STRIDEY(param)), \ + XAI_ERR_BADARG, "\nStride along width = %hhu and height = %hhu\nStride along width and height should be equal", \ + XAI_CNN_CONV_GET_STRIDEX(param), XAI_CNN_CONV_GET_STRIDEY(param)); + XAI_CHECK_CONSISTENCY_MOW_WHD(inTile, coeffTile, biasArray, outTile, param); + XAI_CHECK_COEFFTILE_CONTIGUOUS(coeffTile, param); + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_ACCUM_SHIFT(param) < 24, \ + XAI_ERR_NORM, "\nThe accumulator shift = %hhu, value should be less than 24", \ + XAI_CNN_CONV_GET_ACCUM_SHIFT(param)); + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_OUTPUT_SHIFT(param) < 32, \ + XAI_ERR_NORM, "\nThe output shift = %hhu, value should be less than 32", \ + XAI_CNN_CONV_GET_OUTPUT_SHIFT(param)); + XAI_CHECK_CONV_RELU_LIMITS_IX(param, outTile); +#if DILATED_VQ_CONV == VQ_TRUE + XAI_CHECK_ARRAY_U16(outputScaleArray); + XAI_CHECK_ERROR(XAI_ARRAY_GET_WIDTH(outputScaleArray) >= XAI_TILE4D_GET_DIM4(coeffTile), \ + XAI_ERR_DATASIZE, "\nWidth of Output Scale Array = %d, Number of Kernels = %d\nWidth of Output Scale Array should be greater than or equal to Number of Kernels", \ + XAI_ARRAY_GET_WIDTH(outputScaleArray), XAI_TILE4D_GET_DIM4(coeffTile)); + XAI_CHECK_ERROR((((uintptr_t) (XAI_ARRAY_GET_DATA_PTR(outputScaleArray)) & \ + 0x1) == 0), XAI_ERR_NORM, "The output scale array is not aligned to 
2 byte boundary"); +#endif + } +#if DILATED_VQ_CONV == VQ_FALSE + if (XAI_CNN_CONV_GET_OUTPUT_SCALE(param) == 0) + { + int32_t fillValue; + int32_t reluFlag = XAI_CNN_CONV_GET_FLAG_RELU(param); + fillValue = reluFlag ? (CLAMP(0, XAI_CNN_CONV_GET_RELU_MIN(param), XAI_CNN_CONV_GET_RELU_MAX(param))) : 0; + return(xaiFillTile3D(outTile, fillValue, 0)); + } +#endif + + /* Getting parameters from the tile structures */ + const int32_t inW = XAI_TILE3D_GET_DIM1(inTile) + \ + XAI_TILE3D_GET_DIM1_EDGE1(inTile) + XAI_TILE3D_GET_DIM1_EDGE2(inTile); + const int32_t outW = XAI_TILE3D_GET_DIM1(outTile); + const int32_t outH = XAI_TILE3D_GET_DIM2(outTile); + const int32_t numInCh = XAI_TILE3D_GET_DIM3(inTile); + const int32_t numOutCh = XAI_TILE3D_GET_DIM3(outTile); + + /* Kernel Size (WHDN)*/ + const int32_t kSizeU = XAI_TILE4D_GET_DIM1(coeffTile); + + /* CNN convolution parameters */ + const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param); + const uint8_t outShiftU = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param); + const uint8_t enableReLu = XAI_CNN_CONV_GET_FLAG_RELU(param); + const uint8_t stride = XAI_CNN_CONV_GET_STRIDE(param); + + /* Pitches of Coefficient Data (WHDN) dim3 */ + const int32_t coeffPitch2 = XAI_TILE4D_GET_DIM2_PITCH(coeffTile); + const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile); + + /* Pitches of Input Data (WHD) in dim1 and dim2 */ + const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile); + const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile); + + /* Pitch of Output Data (WHD) in dim1 and dim2 */ + const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile); + const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile); + + /* Data Pointers of input, output, coefficient and bias data */ + MORPH_IDT_SCALAR* pInData = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(inTile); + int8_t* pOutData = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile); + int32_t* pBiasData = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray); + 
int8_t* pCoeffData = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile); +#if DILATED_VQ_CONV == VQ_TRUE + uint16_t* restrict pOutScaleData = (uint16_t *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray); +#elif DILATED_VQ_CONV == VQ_FALSE + const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param); +#endif + /* Move pointer to the start of the data (including edge) */ + pInData = &pInData[-((kSizeU / 2) * inDataPitch1 + (kSizeU / 2))]; + + /* Setting the limits for output data according to ReLu Flag and outTileType */ + int32_t minLim, maxLim; + if (enableReLu) + { + minLim = XAI_CNN_CONV_GET_RELU_MIN(param); + maxLim = XAI_CNN_CONV_GET_RELU_MAX(param); + } + else + { + minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \ + SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0); + maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \ + : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX); + } + const int8_t typeFlag = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0; + const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile); + + MORPH_IDT_2Nx8 *restrict pdvecIn; + xb_vec2Nx8* restrict pdvecOut; + xb_vec2Nx8* restrict pdvecCoeff1, * restrict pdvecCoeff2, * restrict pdvecCoeff3; + + /* Variable Declarations */ + int32_t inCh, outCh, x, y; + /* Number of output elements that can be generated + * with 2 input vector loads(64 way).*/ + const int32_t vectorizationWidth = (((4 * XCHAL_IVPN_SIMD_WIDTH) - kSizeU) / stride) + 1; + + /* generates the sequence 0,1,2,3 ,5,6,7,8 ,10,11,12,13 ,15,16,17,18 + * , 20,21,22,23 ,4,9,14,19 ,24. To be used to shuffle the coeff data, + * So that last coeff from first 4 rows of coeffs can be used as one + * 32 byte element and make use of quad multiplier outside the inner- + * most loop. 
+ * c11, c12, c13, c14, c15 + * c21, c22, c23, c24, c25 + * c31, c32, c33, c34, c35 + * c41, c42, c43, c44, c45 + * c51, c52, c53, c54, c55 + * + * c15, c25, c35, c45 and c55 are placed in contigous fashion, so that + * c15, c25, c35, and c45 can be used as one 32 byte element + */ + xb_vec2Nx8 dvecIdx; + dvecIdx = IVP_SEL2NX8I(IVP_SEL2NX8I(IVP_ADD2NX8U(IVP_SEQ2NX8(), 15), + IVP_ADD2NX8U(IVP_SEQ2NX8(), 10), + IVP_SELI_INTERLEAVE_2_LO), + IVP_SEL2NX8I(IVP_ADD2NX8U(IVP_SEQ2NX8(), 5), IVP_SEQ2NX8(), + IVP_SELI_INTERLEAVE_2_LO), + IVP_SELI_INTERLEAVE_4_LO); + + dvecIdx = IVP_SEL2NX8I(IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16( + IVP_MOVNX16_FROMN_2X32(4 + (9 << 8) + (14 << 16) + \ + (19 << 24))), + IVP_ADD2NX8U(IVP_SEQ2NX8(), 20), IVP_SELI_INTERLEAVE_2_LO), + dvecIdx, IVP_SELI_8B_PACK_16); + + /* loop across output depth is unrolled by 3 + * , producing lanes from 3 output channels + * in one iteration. Since vectorization width + * is just half the width of the accumulator, + * loop across output height is also unrolled by 2. + * Unrolling across output height makes it possible + * to utilize all the 64 MACs in the accumulator. + * + * Data loaded from the 2 input rows is concatenated + * in such a manner that lower half of the output + * vector gives the first output row and the upper + * half of the output vector gives the next output row. 
+ */ + for (x = 0; x < outW; x += vectorizationWidth) /* Loop across Output width */ + { + /* out of bound flag */ + int32_t flag = XT_SALT(2 * XCHAL_IVPN_SIMD_WIDTH, inW - stride * x); + + for (y = 0; y < outH - 1; y += 2) /* Loop across Output height */ + { + /* initialize output data pointer */ + int8_t *pOutput = &pOutData[(y * outDataPitch1 + x) * bytesPerPixel]; + + /* initialize input data pointer */ + MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * stride * y + stride * x]; + + /* initialize coeff & bias data pointer to outCh kernel */ + int8_t *pCoeff = &pCoeffData[0]; + int32_t *pBias = &pBiasData[0]; + + for (outCh = 0; outCh < numOutCh; outCh += 3) /* Loop across Output depth */ + { + /* In order to handle odd output depths */ + int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1); + int32_t enable3rdCh = XT_SALT(outCh, numOutCh - 2); + + /* loads and replicate bias data */ + xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4 * enable2ndCh); + xb_vecN_2x32v hvecBias2; IVP_LSRN_2X32_XP(hvecBias2, pBias, 4 * enable3rdCh); + xb_vecN_2x32v hvecBias3; IVP_LSRN_2X32_XP(hvecBias3, pBias, 4); + + /* wide vectors(accumulators) initialized with bias */ + xb_vec2Nx24 dacc1, dacc2, dacc3; + dacc1 = IVP_CVT24UNX32L(hvecBias1, hvecBias1); + IVP_CVT24UNX32H(dacc1, hvecBias1, hvecBias1); + dacc2 = IVP_CVT24UNX32L(hvecBias2, hvecBias2); + IVP_CVT24UNX32H(dacc2, hvecBias2, hvecBias2); + dacc3 = IVP_CVT24UNX32L(hvecBias3, hvecBias3); + IVP_CVT24UNX32H(dacc3, hvecBias3, hvecBias3); + + /* priming of coeff load is done outside the innermost loop*/ + pdvecCoeff1 = (xb_vec2Nx8 *) (pCoeff); + valign vaCoeffData1; vaCoeffData1 = IVP_LA2NX8_PP(pdvecCoeff1); + + pdvecCoeff2 = (xb_vec2Nx8 *) (pCoeff + coeffPitch3 * enable2ndCh); + valign vaCoeffData2; vaCoeffData2 = IVP_LA2NX8_PP(pdvecCoeff2); + + pdvecCoeff3 = (xb_vec2Nx8 *) (pCoeff + 2 * coeffPitch3 * enable3rdCh); + valign vaCoeffData3; vaCoeffData3 = IVP_LA2NX8_PP(pdvecCoeff3); + + pdvecIn = (MORPH_IDT_2Nx8 *) 
(pInput); + + for (inCh = 0; inCh < numInCh; inCh++) /* Loop across input channels */ + { + /* variable declarations for input and coeff vectors */ + xb_vec2Nx8 dvecCoeffData1, dvecCoeffData2, dvecCoeffData3; + + /* load coeff for all the 4 outptu channels*/ + IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch2); + IVP_LAV2NX8_XP(dvecCoeffData2, vaCoeffData2, pdvecCoeff2, coeffPitch2); + IVP_LAV2NX8_XP(dvecCoeffData3, vaCoeffData3, pdvecCoeff3, coeffPitch2); + + /* shuffles the loaded coeff put them in proper order */ + dvecCoeffData1 = IVP_SHFL2NX8(dvecCoeffData1, dvecIdx); + dvecCoeffData2 = IVP_SHFL2NX8(dvecCoeffData2, dvecIdx); + dvecCoeffData3 = IVP_SHFL2NX8(dvecCoeffData3, dvecIdx); + + /* loads 1st input row */ + MORPH_IDT_2Nx8 dvecInData11, dvecInData12; + valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn); + MORPH_OP_LOAD_2Nx8(dvecInData11, vaInData, pdvecIn, 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + MORPH_OP_LOAD_2Nx8(dvecInData12, vaInData, pdvecIn, \ + inDataPitch1 - 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + + /* loads 2nd input row */ + MORPH_IDT_2Nx8 dvecInData21, dvecInData22; + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn); + MORPH_OP_LOAD_2Nx8(dvecInData21, vaInData, pdvecIn, 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + MORPH_OP_LOAD_2Nx8(dvecInData22, vaInData, pdvecIn, \ + inDataPitch1 - 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + + /* loads 3rd input row */ + MORPH_IDT_2Nx8 dvecInData31, dvecInData32; + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn); + MORPH_OP_LOAD_2Nx8(dvecInData31, vaInData, pdvecIn, 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + MORPH_OP_LOAD_2Nx8(dvecInData32, vaInData, pdvecIn, \ + inDataPitch1 - 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + + /* loads 4th input row */ + MORPH_IDT_2Nx8 dvecInData41, dvecInData42; + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn); + MORPH_OP_LOAD_2Nx8(dvecInData41, vaInData, pdvecIn, 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + MORPH_OP_LOAD_2Nx8(dvecInData42, vaInData, pdvecIn, \ + inDataPitch1 - 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + + /* loads 5th 
input row */ + MORPH_IDT_2Nx8 dvecInData51, dvecInData52; + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn); + MORPH_OP_LOAD_2Nx8(dvecInData51, vaInData, pdvecIn, 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + MORPH_OP_LOAD_2Nx8(dvecInData52, vaInData, pdvecIn, \ + inDataPitch1 - 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + + /* loads 6th input row */ + MORPH_IDT_2Nx8 dvecInData61, dvecInData62; + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn); + MORPH_OP_LOAD_2Nx8(dvecInData61, vaInData, pdvecIn, 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + MORPH_OP_LOAD_2Nx8(dvecInData62, vaInData, pdvecIn, \ + inDataPitch1 - 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + + /* loads 7th input row */ + MORPH_IDT_2Nx8 dvecInData71, dvecInData72; + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn); + MORPH_OP_LOAD_2Nx8(dvecInData71, vaInData, pdvecIn, 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + MORPH_OP_LOAD_2Nx8(dvecInData72, vaInData, pdvecIn, \ + inDataPitch1 - 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + + /* loads 8th input row */ + MORPH_IDT_2Nx8 dvecInData81, dvecInData82; + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn); + MORPH_OP_LOAD_2Nx8(dvecInData81, vaInData, pdvecIn, 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + MORPH_OP_LOAD_2Nx8(dvecInData82, vaInData, pdvecIn, \ + inDataPitch1 - 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + + /* loads 9th input row */ + MORPH_IDT_2Nx8 dvecInData91, dvecInData92; + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn); + MORPH_OP_LOAD_2Nx8(dvecInData91, vaInData, pdvecIn, 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + MORPH_OP_LOAD_2Nx8(dvecInData92, vaInData, pdvecIn, \ + inDataPitch2 - 8 * inDataPitch1 - 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + + /* 32 elements from 1st row and 32 elements from 2nd row are concatenated here + * If 1st input row is 0,1,2,3,4,5,6,7,8,9,...127, and the 2nd input row is + * 128,129,130,131.........252,253,254,255, Data should be arranged as + * + * dvecData1 : 0, 4, 8,...120,124,128,132,136,...248,252 + * dvecData2 : 1, 5, 9,...121,125,129,133,137,...249,253 + * dvecData3 : 2, 6,10,...122,126,130,134,138,...250,254 + * 
dvecData4 : 3, 7,11,...123,127,131,135,139,...251,255 + * dvecData5 : 4, 8,11,...124,0 ,132,136,140,...252,0 + * + * Lower half of the vectors contain data from 1st input row and + * upper half of the vectors contain data from 2nd output row. + * + */ + MORPH_IDT_2Nx8 dvecData1, dvecData2, dvecData3, dvecData4; + MORPH_IDT_2Nx8 dvecData51, dvecData52, dvecData53, dvecData54, dvecData55; + + IVP_DSEL2NX8I(dvecData2, dvecData1, + IVP_SEL2NX8I(dvecInData52, dvecInData51, IVP_SELI_8B_EXTRACT_2_OF_4_OFF_0), + IVP_SEL2NX8I(dvecInData12, dvecInData11, IVP_SELI_8B_EXTRACT_2_OF_4_OFF_0), + IVP_DSELI_8B_DEINTERLEAVE_1); + IVP_DSEL2NX8I(dvecData4, dvecData3, + IVP_SEL2NX8I(dvecInData52, dvecInData51, IVP_SELI_8B_EXTRACT_2_OF_4_OFF_2), + IVP_SEL2NX8I(dvecInData12, dvecInData11, IVP_SELI_8B_EXTRACT_2_OF_4_OFF_2), + IVP_DSELI_8B_DEINTERLEAVE_1); + + /* dvecData5 is kept separately and is used by quad multiplier finally */ + dvecData51 = IVP_SEL2NX8I(0, dvecData1, IVP_SELI_8B_ROTATE_RIGHT_1); + + /* multiplies data from two rows(Lower and upper half of dvecData) + * with coeff from all three output channels and accumulate. 
Lower + * half of the accumulators contain data corresponding to the first + * output row and upper half contains next output row */ + + MORPH_OP_MULQA(dacc1, dvecData4, dvecData3, dvecData2, dvecData1, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MULQA(dacc2, dvecData4, dvecData3, dvecData2, dvecData1, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 0)); + MORPH_OP_MULQA(dacc3, dvecData4, dvecData3, dvecData2, dvecData1, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 0)); + + /* Calculations for second row */ + IVP_DSEL2NX8I(dvecData2, dvecData1, + IVP_SEL2NX8I(dvecInData62, dvecInData61, IVP_SELI_8B_EXTRACT_2_OF_4_OFF_0), + IVP_SEL2NX8I(dvecInData22, dvecInData21, IVP_SELI_8B_EXTRACT_2_OF_4_OFF_0), + IVP_DSELI_8B_DEINTERLEAVE_1); + IVP_DSEL2NX8I(dvecData4, dvecData3, + IVP_SEL2NX8I(dvecInData62, dvecInData61, IVP_SELI_8B_EXTRACT_2_OF_4_OFF_2), + IVP_SEL2NX8I(dvecInData22, dvecInData21, IVP_SELI_8B_EXTRACT_2_OF_4_OFF_2), + IVP_DSELI_8B_DEINTERLEAVE_1); + + dvecData52 = IVP_SEL2NX8I(0, dvecData1, IVP_SELI_8B_ROTATE_RIGHT_1); + + MORPH_OP_MULQA(dacc1, dvecData4, dvecData3, dvecData2, dvecData1, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + MORPH_OP_MULQA(dacc2, dvecData4, dvecData3, dvecData2, dvecData1, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 1)); + MORPH_OP_MULQA(dacc3, dvecData4, dvecData3, dvecData2, dvecData1, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 1)); + + /* Calculations for third row */ + IVP_DSEL2NX8I(dvecData2, dvecData1, + IVP_SEL2NX8I(dvecInData72, dvecInData71, IVP_SELI_8B_EXTRACT_2_OF_4_OFF_0), + IVP_SEL2NX8I(dvecInData32, dvecInData31, IVP_SELI_8B_EXTRACT_2_OF_4_OFF_0), + IVP_DSELI_8B_DEINTERLEAVE_1); + IVP_DSEL2NX8I(dvecData4, dvecData3, + IVP_SEL2NX8I(dvecInData72, dvecInData71, 
IVP_SELI_8B_EXTRACT_2_OF_4_OFF_2), + IVP_SEL2NX8I(dvecInData32, dvecInData31, IVP_SELI_8B_EXTRACT_2_OF_4_OFF_2), + IVP_DSELI_8B_DEINTERLEAVE_1); + + dvecData53 = IVP_SEL2NX8I(0, dvecData1, IVP_SELI_8B_ROTATE_RIGHT_1); + + MORPH_OP_MULQA(dacc1, dvecData4, dvecData3, dvecData2, dvecData1, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 2)); + MORPH_OP_MULQA(dacc2, dvecData4, dvecData3, dvecData2, dvecData1, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 2)); + MORPH_OP_MULQA(dacc3, dvecData4, dvecData3, dvecData2, dvecData1, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 2)); + + /* Calculations for fourth row */ + IVP_DSEL2NX8I(dvecData2, dvecData1, + IVP_SEL2NX8I(dvecInData82, dvecInData81, IVP_SELI_8B_EXTRACT_2_OF_4_OFF_0), + IVP_SEL2NX8I(dvecInData42, dvecInData41, IVP_SELI_8B_EXTRACT_2_OF_4_OFF_0), + IVP_DSELI_8B_DEINTERLEAVE_1); + IVP_DSEL2NX8I(dvecData4, dvecData3, + IVP_SEL2NX8I(dvecInData82, dvecInData81, IVP_SELI_8B_EXTRACT_2_OF_4_OFF_2), + IVP_SEL2NX8I(dvecInData42, dvecInData41, IVP_SELI_8B_EXTRACT_2_OF_4_OFF_2), + IVP_DSELI_8B_DEINTERLEAVE_1); + + dvecData54 = IVP_SEL2NX8I(0, dvecData1, IVP_SELI_8B_ROTATE_RIGHT_1); + + MORPH_OP_MULQA(dacc1, dvecData4, dvecData3, dvecData2, dvecData1, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 3)); + MORPH_OP_MULQA(dacc2, dvecData4, dvecData3, dvecData2, dvecData1, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 3)); + MORPH_OP_MULQA(dacc3, dvecData4, dvecData3, dvecData2, dvecData1, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 3)); + + /* Calculations for fifth row */ + IVP_DSEL2NX8I(dvecData2, dvecData1, + IVP_SEL2NX8I(dvecInData92, dvecInData91, IVP_SELI_8B_EXTRACT_2_OF_4_OFF_0), + IVP_SEL2NX8I(dvecInData52, dvecInData51, IVP_SELI_8B_EXTRACT_2_OF_4_OFF_0), + IVP_DSELI_8B_DEINTERLEAVE_1); + 
IVP_DSEL2NX8I(dvecData4, dvecData3, + IVP_SEL2NX8I(dvecInData92, dvecInData91, IVP_SELI_8B_EXTRACT_2_OF_4_OFF_2), + IVP_SEL2NX8I(dvecInData52, dvecInData51, IVP_SELI_8B_EXTRACT_2_OF_4_OFF_2), + IVP_DSELI_8B_DEINTERLEAVE_1); + + dvecData55 = IVP_SEL2NX8I(0, dvecData1, IVP_SELI_8B_ROTATE_RIGHT_1); + + MORPH_OP_MULQA(dacc1, dvecData4, dvecData3, dvecData2, dvecData1, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 4)); + MORPH_OP_MULQA(dacc2, dvecData4, dvecData3, dvecData2, dvecData1, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 4)); + MORPH_OP_MULQA(dacc3, dvecData4, dvecData3, dvecData2, dvecData1, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 4)); + + /* multiplies last coeffs of 1st four rows with the input data */ + MORPH_OP_MULQA(dacc1, dvecData54, dvecData53, dvecData52, dvecData51, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 5)); + MORPH_OP_MULQA(dacc2, dvecData54, dvecData53, dvecData52, dvecData51, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 5)); + MORPH_OP_MULQA(dacc3, dvecData54, dvecData53, dvecData52, dvecData51, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 5)); + + /* multiplies last coeff(24th) with the input data */ + MORPH_OP_MULA(dacc1, dvecData55, IVP_EXTR2NX8(dvecCoeffData1, 24)); + MORPH_OP_MULA(dacc2, dvecData55, IVP_EXTR2NX8(dvecCoeffData2, 24)); + MORPH_OP_MULA(dacc3, dvecData55, IVP_EXTR2NX8(dvecCoeffData3, 24)); + } /* end of for (inCh = 0; inCh < numInCh; inCh++)*/ + + /* Pack, Output Scale, Output Shift and clamping */ + xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L; + xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H; +#if DILATED_VQ_CONV == VQ_TRUE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \ + pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, 
dvecOut2H, dacc2, packShiftAccU, \ + pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc3, packShiftAccU, \ + pOutScaleData[outCh + 2 * enable3rdCh], outShiftU, minLim, maxLim, typeFlag); +#elif DILATED_VQ_CONV == VQ_FALSE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc3, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); +#endif + /* variable store count */ + int32_t varLen = XT_MIN(outW - x, vectorizationWidth); + + /* Storing the first row outputs, first channel */ + pdvecOut = (xb_vec2Nx8 *) (pOutput); + valign vaOutData = IVP_ZALIGN(); + IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * varLen); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the first row outputs, 2nd channel */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch2 * enable2ndCh * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * varLen * enable2ndCh); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the first row outputs, 3rd channel */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + 2 * outDataPitch2 * enable3rdCh * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * varLen * enable3rdCh); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* extract the upper half of the output vectors + * and store in the next row + */ + + /* Storing the 2nd row outputs, first channel */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch1 * bytesPerPixel); + IVP_SAV2NX8_XP(IVP_SHFL2NX8I(dvecOut1L, IVP_SHFLI_8B_SWAP_32), vaOutData, pdvecOut, \ + (-typeFlag + 1) * varLen); + IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * 2 * varLen); + IVP_SAPOS2NX8_FP(vaOutData, 
pdvecOut); + + /* Storing the 2nd row outputs, 2nd channel */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + (outDataPitch2 * enable2ndCh + \ + outDataPitch1) * bytesPerPixel); + IVP_SAV2NX8_XP(IVP_SHFL2NX8I(dvecOut2L, IVP_SHFLI_8B_SWAP_32), vaOutData, pdvecOut, \ + (-typeFlag + 1) * varLen * enable2ndCh); + IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * 2 * \ + varLen * enable2ndCh); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the 2nd row outputs, 3rd channel */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + (2 * outDataPitch2 * enable3rdCh + \ + outDataPitch1) * bytesPerPixel); + IVP_SAV2NX8_XP(IVP_SHFL2NX8I(dvecOut3L, IVP_SHFLI_8B_SWAP_32), vaOutData, pdvecOut, \ + (-typeFlag + 1) * varLen * enable3rdCh); + IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * 2 * \ + varLen * enable3rdCh); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + pOutput += 3 * outDataPitch2 * bytesPerPixel; + pCoeff += 3 * coeffPitch3; + } /* end of (outCh = 0; outCh < numOutCh; outCh += 3)*/ + } /* end of for (y = 0; y < outH - 1; y += 2)*/ + if (y < outH) + { + /* initialize output data pointer */ + int8_t *pOutput = &pOutData[(y * outDataPitch1 + x) * bytesPerPixel]; + + /* initialize input data pointer */ + MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * stride * y + stride * x]; + + /* initialize coeff & bias data pointer to outCh kernel */ + int8_t *pCoeff = &pCoeffData[0]; + int32_t *pBias = &pBiasData[0]; + + for (outCh = 0; outCh < numOutCh; outCh += 3) /* Loop across Output depth */ + { + /* In order to handle odd output depths */ + int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1); + int32_t enable3rdCh = XT_SALT(outCh, numOutCh - 2); + + /* loads and replicate bias data */ + xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4 * enable2ndCh); + xb_vecN_2x32v hvecBias2; IVP_LSRN_2X32_XP(hvecBias2, pBias, 4 * enable3rdCh); + xb_vecN_2x32v hvecBias3; IVP_LSRN_2X32_XP(hvecBias3, pBias, 4); + + /* wide vectors(accumulators) initialized with bias */ + 
xb_vec2Nx24 dacc1, dacc2, dacc3; + dacc1 = IVP_CVT24UNX32L(hvecBias1, hvecBias1); + IVP_CVT24UNX32H(dacc1, hvecBias1, hvecBias1); + dacc2 = IVP_CVT24UNX32L(hvecBias2, hvecBias2); + IVP_CVT24UNX32H(dacc2, hvecBias2, hvecBias2); + dacc3 = IVP_CVT24UNX32L(hvecBias3, hvecBias3); + IVP_CVT24UNX32H(dacc3, hvecBias3, hvecBias3); + + /* priming of coeff load is done outside the innermost loop*/ + pdvecCoeff1 = (xb_vec2Nx8 *) (pCoeff); + valign vaCoeffData1; vaCoeffData1 = IVP_LA2NX8_PP(pdvecCoeff1); + + pdvecCoeff2 = (xb_vec2Nx8 *) (pCoeff + coeffPitch3 * enable2ndCh); + valign vaCoeffData2; vaCoeffData2 = IVP_LA2NX8_PP(pdvecCoeff2); + + pdvecCoeff3 = (xb_vec2Nx8 *) (pCoeff + 2 * coeffPitch3 * enable3rdCh); + valign vaCoeffData3; vaCoeffData3 = IVP_LA2NX8_PP(pdvecCoeff3); + + pdvecIn = (MORPH_IDT_2Nx8 *) (pInput); + + for (inCh = 0; inCh < numInCh; inCh++) /* Loop across input channels */ + { + /* variable declarations for input and coeff vectors */ + xb_vec2Nx8 dvecCoeffData1, dvecCoeffData2, dvecCoeffData3; + + /* load coeff for all the 4 outptu channels*/ + IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch2); + IVP_LAV2NX8_XP(dvecCoeffData2, vaCoeffData2, pdvecCoeff2, coeffPitch2); + IVP_LAV2NX8_XP(dvecCoeffData3, vaCoeffData3, pdvecCoeff3, coeffPitch2); + + /* shuffles the loaded coeff put them in proper order */ + dvecCoeffData1 = IVP_SHFL2NX8(dvecCoeffData1, dvecIdx); + dvecCoeffData2 = IVP_SHFL2NX8(dvecCoeffData2, dvecIdx); + dvecCoeffData3 = IVP_SHFL2NX8(dvecCoeffData3, dvecIdx); + + /* loads 1st input row */ + MORPH_IDT_2Nx8 dvecInData11, dvecInData12; + valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn); + MORPH_OP_LOAD_2Nx8(dvecInData11, vaInData, pdvecIn, 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + MORPH_OP_LOAD_2Nx8(dvecInData12, vaInData, pdvecIn, \ + inDataPitch1 - 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + + /* loads 2nd input row */ + MORPH_IDT_2Nx8 dvecInData21, dvecInData22; + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn); + MORPH_OP_LOAD_2Nx8(dvecInData21, 
vaInData, pdvecIn, 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + MORPH_OP_LOAD_2Nx8(dvecInData22, vaInData, pdvecIn, \ + inDataPitch1 - 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + + /* loads 3rd input row */ + MORPH_IDT_2Nx8 dvecInData31, dvecInData32; + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn); + MORPH_OP_LOAD_2Nx8(dvecInData31, vaInData, pdvecIn, 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + MORPH_OP_LOAD_2Nx8(dvecInData32, vaInData, pdvecIn, \ + inDataPitch1 - 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + + /* loads 4th input row */ + MORPH_IDT_2Nx8 dvecInData41, dvecInData42; + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn); + MORPH_OP_LOAD_2Nx8(dvecInData41, vaInData, pdvecIn, 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + MORPH_OP_LOAD_2Nx8(dvecInData42, vaInData, pdvecIn, \ + inDataPitch1 - 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + + /* loads 5th input row */ + MORPH_IDT_2Nx8 dvecInData51, dvecInData52; + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn); + MORPH_OP_LOAD_2Nx8(dvecInData51, vaInData, pdvecIn, 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + MORPH_OP_LOAD_2Nx8(dvecInData52, vaInData, pdvecIn, inDataPitch2 - 4 * inDataPitch1 - \ + 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + + /* + * If 1st input row is 0,1,2,3,4,5,6,7,8,9,...127, + * + * dvecData1 : 0, 4, 8,...120,124 + * dvecData2 : 1, 5, 9,...121,125 + * dvecData3 : 2, 6,10,...122,126 + * dvecData4 : 3, 7,11,...123,127 + * dvecData5 : 4, 8,11,...124,0 + * + */ + MORPH_IDT_2Nx8 dvecData1, dvecData2, dvecData3, dvecData4; + MORPH_IDT_2Nx8 dvecData51, dvecData52, dvecData53, dvecData54, dvecData55; + + IVP_DSEL2NX8I(dvecData2, dvecData1, + 0, + IVP_SEL2NX8I(dvecInData12, dvecInData11, IVP_SELI_8B_EXTRACT_2_OF_4_OFF_0), + IVP_DSELI_8B_DEINTERLEAVE_1); + IVP_DSEL2NX8I(dvecData4, dvecData3, + 0, + IVP_SEL2NX8I(dvecInData12, dvecInData11, IVP_SELI_8B_EXTRACT_2_OF_4_OFF_2), + IVP_DSELI_8B_DEINTERLEAVE_1); + + /* dvecData5 is kept separately and is used by quad multiplier finally */ + dvecData51 = IVP_SEL2NX8I(0, dvecData1, IVP_SELI_8B_ROTATE_RIGHT_1); + + /* multiplies data from input 
row with coeff from + * all three output channels and accumulate. */ + + MORPH_OP_MULQA(dacc1, dvecData4, dvecData3, dvecData2, dvecData1, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MULQA(dacc2, dvecData4, dvecData3, dvecData2, dvecData1, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 0)); + MORPH_OP_MULQA(dacc3, dvecData4, dvecData3, dvecData2, dvecData1, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 0)); + + /* Calculations for second row */ + IVP_DSEL2NX8I(dvecData2, dvecData1, + 0, + IVP_SEL2NX8I(dvecInData22, dvecInData21, IVP_SELI_8B_EXTRACT_2_OF_4_OFF_0), + IVP_DSELI_8B_DEINTERLEAVE_1); + IVP_DSEL2NX8I(dvecData4, dvecData3, + 0, + IVP_SEL2NX8I(dvecInData22, dvecInData21, IVP_SELI_8B_EXTRACT_2_OF_4_OFF_2), + IVP_DSELI_8B_DEINTERLEAVE_1); + + dvecData52 = IVP_SEL2NX8I(0, dvecData1, IVP_SELI_8B_ROTATE_RIGHT_1); + + MORPH_OP_MULQA(dacc1, dvecData4, dvecData3, dvecData2, dvecData1, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + MORPH_OP_MULQA(dacc2, dvecData4, dvecData3, dvecData2, dvecData1, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 1)); + MORPH_OP_MULQA(dacc3, dvecData4, dvecData3, dvecData2, dvecData1, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 1)); + + /* Calculations for third row */ + IVP_DSEL2NX8I(dvecData2, dvecData1, + 0, + IVP_SEL2NX8I(dvecInData32, dvecInData31, IVP_SELI_8B_EXTRACT_2_OF_4_OFF_0), + IVP_DSELI_8B_DEINTERLEAVE_1); + IVP_DSEL2NX8I(dvecData4, dvecData3, + 0, + IVP_SEL2NX8I(dvecInData32, dvecInData31, IVP_SELI_8B_EXTRACT_2_OF_4_OFF_2), + IVP_DSELI_8B_DEINTERLEAVE_1); + + dvecData53 = IVP_SEL2NX8I(0, dvecData1, IVP_SELI_8B_ROTATE_RIGHT_1); + + MORPH_OP_MULQA(dacc1, dvecData4, dvecData3, dvecData2, dvecData1, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 2)); + 
MORPH_OP_MULQA(dacc2, dvecData4, dvecData3, dvecData2, dvecData1, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 2)); + MORPH_OP_MULQA(dacc3, dvecData4, dvecData3, dvecData2, dvecData1, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 2)); + + /* Calculations for fourth row */ + IVP_DSEL2NX8I(dvecData2, dvecData1, + 0, + IVP_SEL2NX8I(dvecInData42, dvecInData41, IVP_SELI_8B_EXTRACT_2_OF_4_OFF_0), + IVP_DSELI_8B_DEINTERLEAVE_1); + IVP_DSEL2NX8I(dvecData4, dvecData3, + 0, + IVP_SEL2NX8I(dvecInData42, dvecInData41, IVP_SELI_8B_EXTRACT_2_OF_4_OFF_2), + IVP_DSELI_8B_DEINTERLEAVE_1); + + dvecData54 = IVP_SEL2NX8I(0, dvecData1, IVP_SELI_8B_ROTATE_RIGHT_1); + + MORPH_OP_MULQA(dacc1, dvecData4, dvecData3, dvecData2, dvecData1, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 3)); + MORPH_OP_MULQA(dacc2, dvecData4, dvecData3, dvecData2, dvecData1, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 3)); + MORPH_OP_MULQA(dacc3, dvecData4, dvecData3, dvecData2, dvecData1, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 3)); + + /* Calculations for fifth row */ + IVP_DSEL2NX8I(dvecData2, dvecData1, + 0, + IVP_SEL2NX8I(dvecInData52, dvecInData51, IVP_SELI_8B_EXTRACT_2_OF_4_OFF_0), + IVP_DSELI_8B_DEINTERLEAVE_1); + IVP_DSEL2NX8I(dvecData4, dvecData3, + 0, + IVP_SEL2NX8I(dvecInData52, dvecInData51, IVP_SELI_8B_EXTRACT_2_OF_4_OFF_2), + IVP_DSELI_8B_DEINTERLEAVE_1); + + dvecData55 = IVP_SEL2NX8I(0, dvecData1, IVP_SELI_8B_ROTATE_RIGHT_1); + + MORPH_OP_MULQA(dacc1, dvecData4, dvecData3, dvecData2, dvecData1, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 4)); + MORPH_OP_MULQA(dacc2, dvecData4, dvecData3, dvecData2, dvecData1, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 4)); + MORPH_OP_MULQA(dacc3, dvecData4, dvecData3, dvecData2, dvecData1, 
IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 4)); + + /* multiplies last coeffs of 1st four rows with the input data */ + MORPH_OP_MULQA(dacc1, dvecData54, dvecData53, dvecData52, dvecData51, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 5)); + MORPH_OP_MULQA(dacc2, dvecData54, dvecData53, dvecData52, dvecData51, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 5)); + MORPH_OP_MULQA(dacc3, dvecData54, dvecData53, dvecData52, dvecData51, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 5)); + + /* multiplies last coeff(24th) with the input data */ + MORPH_OP_MULA(dacc1, dvecData55, IVP_EXTR2NX8(dvecCoeffData1, 24)); + MORPH_OP_MULA(dacc2, dvecData55, IVP_EXTR2NX8(dvecCoeffData2, 24)); + MORPH_OP_MULA(dacc3, dvecData55, IVP_EXTR2NX8(dvecCoeffData3, 24)); + } /* end of for (inCh = 0; inCh < numInCh; inCh++)*/ + + /* Pack, Output Scale, Output Shift and clamping */ + xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L; + xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H; +#if DILATED_VQ_CONV == VQ_TRUE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \ + pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \ + pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc3, packShiftAccU, \ + pOutScaleData[outCh + 2 * enable3rdCh], outShiftU, minLim, maxLim, typeFlag); +#elif DILATED_VQ_CONV == VQ_FALSE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc3, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, 
typeFlag); +#endif + /* variable store count */ + int32_t varLen = XT_MIN(outW - x, vectorizationWidth); + + /* Storing the first row outputs, first channel */ + pdvecOut = (xb_vec2Nx8 *) (pOutput); + valign vaOutData = IVP_ZALIGN(); + IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * varLen); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the first row outputs, 2nd channel */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch2 * enable2ndCh * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * varLen * enable2ndCh); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the first row outputs, 3rd channel */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + 2 * outDataPitch2 * enable3rdCh * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * varLen * enable3rdCh); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + pOutput += 3 * outDataPitch2 * bytesPerPixel; + pCoeff += 3 * coeffPitch3; + } /* end of (outCh = 0; outCh < numOutCh; outCh += 3)*/ + } /* end of if(y < outH)*/ + } /* end of for (x = 0; x < outW; x += vectorizationWidth)*/ + return(XAI_ERROR_STATUS()); +} + + +/****************************************************************************************** +* xaiConvolved(VQ)3D_S_5x5j1d2I8S8IX_MOW_WHD +* ***************************************************************************************/ + +/******************************************************************************/ +/* Description : P6 optimized generic implementation for 5x5 3D convolution */ +/* with dilation = 2 */ +/* Based on MORPH pre-processor specifiers, code implementation */ +/* is generated during preprocessing stage. 
This method can be */ +/* used to generate 5x5 3D dilated convolution function and 5x5 */ +/* 3D VQ dilated convolution function for U8 bit and S8 bit */ +/* input data with input stride equal to 1 */ +/* Inputs : Input Data Tile, Coeff Data Tile, Bias Array, */ +/* Output scale array, CNN convolution params structure */ +/* Outputs : XI Error Code */ +/* InOuts : Output Tile */ +/* Assumptions : CoeffData is S8 */ +/* biasArray is signed 32b, value not exceeding signed 24b */ +/* Output scale array is U16 */ +/* OutData is S8 / U8 / S16 */ +/* Kernel Size is 5x5xDxN */ +/* Input and Output are in WHD format */ +/* Coeff is in WHDN format */ +/******************************************************************************/ + +/****************** xaiConvolvedVQ3D_S_5x5j1d2_S8S8IX_MOW_WHD ******************/ +/****************** xaiConvolvedVQ3D_S_5x5j1d2_U8S8IX_MOW_WHD ******************/ +/******************* xaiConvolved3D_S_5x5j1d2_S8S8IX_MOW_WHD *******************/ +/******************* xaiConvolved3D_S_5x5j1d2_U8S8IX_MOW_WHD *******************/ + +XAI_ERR_TYPE MAKE_NAME(MAKE_NAME_VQ(xaiConvolved, 3D_S_5x5j1d2), S8IX_MOW_WHD) MAKE_ARGUMENTS(inTile, coeffTile, biasArray, outTile, param) +{ + /* Error Checks */ + XAI_ERROR_CHECKS() + { + MORPH_IDT_CHECK(inTile); + XAI_CHECK_CONV_OUTPUT_TILE3D(outTile); + XAI_CHECK_TILE4D_S8(coeffTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile); + XAI_CHECK_TILE4D_IN_DRAM_BOUNDARY(coeffTile); + XAI_CHECK_POINTER(param); + XAI_CHECK_ARRAY_S32(biasArray); + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTile); + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(coeffTile, outTile); + XAI_CHECK_KERNEL_SIZE(coeffTile, 5); + XAI_CHECK_TILE3D_DATA_ORDER(inTile, XAI_WHD); + XAI_CHECK_TILE3D_DATA_ORDER(outTile, XAI_WHD); + XAI_CHECK_TILE4D_DATA_ORDER(coeffTile, XAI_WHDN); + XAI_CHECK_TILE3D_EDGE(inTile, 4); + XAI_CHECK_STRIDE(param, 1); + XAI_CHECK_ERROR((XAI_CNN_CONV_GET_STRIDEX(param) == 
XAI_CNN_CONV_GET_STRIDEY(param)), \ + XAI_ERR_BADARG, "Stride along width = %hhu and height = %hhu\nStride along width and height should be equal", \ + XAI_CNN_CONV_GET_STRIDEX(param), XAI_CNN_CONV_GET_STRIDEY(param)); + XAI_CHECK_DILATION(param, 2); + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_DILATIONX(param) == XAI_CNN_CONV_GET_DILATIONY(param), \ + XAI_ERR_BADARG, "\nDilation along width = %hhu and height = %hhu\nDilation along width and height should be equal", \ + XAI_CNN_CONV_GET_DILATIONX(param), XAI_CNN_CONV_GET_DILATIONY(param)); + XAI_CHECK_CONSISTENCY_MOW_WHD(inTile, coeffTile, biasArray, outTile, param); + XAI_CHECK_COEFFTILE_CONTIGUOUS(coeffTile, param); + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_ACCUM_SHIFT(param) < 24, \ + XAI_ERR_NORM, "\nThe accumulator shift = %hhu, value should be less than 24", \ + XAI_CNN_CONV_GET_ACCUM_SHIFT(param)); + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_OUTPUT_SHIFT(param) < 32, \ + XAI_ERR_NORM, "\nThe output shift = %hhu, value should be less than 32", \ + XAI_CNN_CONV_GET_OUTPUT_SHIFT(param)); + XAI_CHECK_CONV_RELU_LIMITS_IX(param, outTile); +#if DILATED_VQ_CONV == VQ_TRUE + XAI_CHECK_ARRAY_U16(outputScaleArray); + XAI_CHECK_ERROR(XAI_ARRAY_GET_WIDTH(outputScaleArray) >= XAI_TILE4D_GET_DIM4(coeffTile), \ + XAI_ERR_DATASIZE, "\nWidth of Output Scale Array = %d, Number of Kernels = %d\nWidth of Output Scale Array should be greater than or equal to Number of Kernels", \ + XAI_ARRAY_GET_WIDTH(outputScaleArray), XAI_TILE4D_GET_DIM4(coeffTile)); + XAI_CHECK_ERROR((((uintptr_t) (XAI_ARRAY_GET_DATA_PTR(outputScaleArray)) & \ + 0x1) == 0), XAI_ERR_NORM, "The output scale array is not aligned to 2 byte boundary"); +#endif + } +#if DILATED_VQ_CONV == VQ_FALSE + if (XAI_CNN_CONV_GET_OUTPUT_SCALE(param) == 0) + { + int32_t fillValue; + int32_t reluFlag = XAI_CNN_CONV_GET_FLAG_RELU(param); + fillValue = reluFlag ? 
(CLAMP(0, XAI_CNN_CONV_GET_RELU_MIN(param), XAI_CNN_CONV_GET_RELU_MAX(param))) : 0; + return(xaiFillTile3D(outTile, fillValue, 0)); + } +#endif + /* Getting parameters from the tile structures */ + const int32_t outW = XAI_TILE3D_GET_DIM1(outTile); + const int32_t outH = XAI_TILE3D_GET_DIM2(outTile); + const int32_t numInCh = XAI_TILE3D_GET_DIM3(inTile); + const int32_t numOutCh = XAI_TILE3D_GET_DIM3(outTile); + + /* Pitches of Coefficient Data (WHDN) in dim3 */ + const int32_t coeffPitch2 = XAI_TILE4D_GET_DIM2_PITCH(coeffTile); + const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile); + + /* Pitches of Input Data (WHD) in dim1 and dim2 */ + const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile); + const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile); + + /* Pitch of Output Data (WHD) in dim1 and dim2 */ + const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile); + const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile); + + /* Kernel Size (WHDN) */ + const int32_t kSizeU = XAI_TILE4D_GET_DIM1(coeffTile); + + /* CNN convolution parameters */ + const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param); + const uint8_t outShiftU = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param); + const uint8_t enableReLu = XAI_CNN_CONV_GET_FLAG_RELU(param); + const uint8_t dilationU = XAI_CNN_CONV_GET_DILATION(param); + + int32_t dilatedkSizeU = dilationU * (kSizeU - 1) + 1; + + /* Data Pointers of input, output, coefficient and bias data */ + MORPH_IDT_SCALAR* pInData = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(inTile); + int8_t* pOutData = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile); + int32_t* pBiasData = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray); + int8_t* pCoeffData = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile); +#if DILATED_VQ_CONV == VQ_TRUE + uint16_t* restrict pOutScaleData = (uint16_t *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray); +#elif DILATED_VQ_CONV == VQ_FALSE + const uint16_t outScale = 
XAI_CNN_CONV_GET_OUTPUT_SCALE(param); +#endif + /* Move pointer to the start of the data (including edge) */ + pInData = &pInData[-((dilatedkSizeU / 2) * inDataPitch1 + (dilatedkSizeU / 2))]; + + /* Setting the limits for output data according to ReLu Flag and outTileType */ + int32_t minLim, maxLim; + if (enableReLu) + { + minLim = XAI_CNN_CONV_GET_RELU_MIN(param); + maxLim = XAI_CNN_CONV_GET_RELU_MAX(param); + } + else + { + minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \ + SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0); + maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \ + : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX); + } + const int8_t typeFlag = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0; + const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile); + + MORPH_IDT_2Nx8 * restrict pdvecIn; + xb_vec2Nx8* restrict pdvecOut; + xb_vec2Nx8* restrict pdvecCoeff1; + xb_vec2Nx8* restrict pdvecCoeff2; + + /* Variable Declarations */ + int32_t inCh, outCh, x, y; + + /* In order to make the loop multiply-bound we are reducing the vectorization width + by extra values required for the kernel */ + const int32_t vectorizationWidth = ((4 * XCHAL_IVPN_SIMD_WIDTH) - dilatedkSizeU) + 1; + + /* generates the sequence 0,1,2,3 ,5,6,7,8 ,10,11,12,13 ,15,16,17,18 + * , 20,21,22,23 ,4,9,14,19 ,24. To be used to shuffle the coeff data, + * So that last coeff from first 4 rows of coeffs can be used as one + * 32 bit element and make use of quad multiplier outside the inner- + * most loop. 
+ * c11, c12, c13, c14, c15 + * c21, c22, c23, c24, c25 + * c31, c32, c33, c34, c35 + * c41, c42, c43, c44, c45 + * c51, c52, c53, c54, c55 + * + * c15, c25, c35, c45 and c55 are placed in contigous fashion, so that + * c15, c25, c35, and c45 can be used as one 32 bit element + */ + xb_vec2Nx8 dvecIdx; + dvecIdx = IVP_SEL2NX8I(IVP_SEL2NX8I(IVP_ADD2NX8U(IVP_SEQ2NX8(), 15), + IVP_ADD2NX8U(IVP_SEQ2NX8(), 10), + IVP_SELI_INTERLEAVE_2_LO), + IVP_SEL2NX8I(IVP_ADD2NX8U(IVP_SEQ2NX8(), 5), IVP_SEQ2NX8(), + IVP_SELI_INTERLEAVE_2_LO), + IVP_SELI_INTERLEAVE_4_LO); + + dvecIdx = IVP_SEL2NX8I(IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16( + IVP_MOVNX16_FROMN_2X32(4 + (9 << 8) + (14 << 16) + \ + (19 << 24))), + IVP_ADD2NX8U(IVP_SEQ2NX8(), 20), IVP_SELI_INTERLEAVE_2_LO), + dvecIdx, IVP_SELI_8B_PACK_16); + + /* loop across output height is unrolled twice and loop across kernel width and height is + completely unrolled*/ + + /* 0 1 2 3 .. 62 63*/ + xb_vec2Nx8 dvecPattern1 = IVP_SEQ2NX8(); + /* 64 65 66 ...126 127*/ + xb_vec2Nx8 dvecPattern2 = IVP_ADD2NX8(dvecPattern1, 64); + + if (!typeFlag) + { + MORPH_OP_DSELI(dvecPattern2, dvecPattern1, \ + dvecPattern2, dvecPattern1, \ + IVP_DSELI_8B_INTERLEAVE_1); + } + else + { + MORPH_OP_DSELI(dvecPattern2, dvecPattern1, \ + dvecPattern2, dvecPattern1, \ + IVP_DSELI_INTERLEAVE_1); + } + for (x = 0; x < outW; x += vectorizationWidth) /* Loop across output width */ + { + int32_t remX = XT_MIN(vectorizationWidth, outW - x); + + /* If (remX + kSizeEffU - 1) <= 2 * XCHAL_IVPN_SIMD_WIDTH, + * i.e. if the number of input data bytes corresponding to remX number of outputs + * is less than or equal to 2 * XCHAL_IVPN_SIMD_WIDTH, there is no need to load + * the next 64 input bytes*/ + int32_t remXLoad = ((remX + dilatedkSizeU - 1) > 2 * XCHAL_IVPN_SIMD_WIDTH) ? 
1 : 0; + + for (y = 0; y < outH; y++) /* Loop across output height */ + { + /* initialize output data pointer */ + int8_t *pOutput = &pOutData[(y * outDataPitch1 + x) * bytesPerPixel]; + + /* initialize input data pointer */ + MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * y + x]; + + /* initialize coeff and Bias data pointer*/ + int8_t *pCoeff = &pCoeffData[0]; + int32_t *pBias = &pBiasData[0]; + + for (outCh = 0; outCh < numOutCh; outCh += 2) /* Loop across Output depth */ + { + /* In order to handle odd output depths */ + int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1); + + /* Load the bias values corresponding to two output channels */ + xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4 * enable2ndCh); + xb_vecN_2x32v hvecBias2; IVP_LSRN_2X32_XP(hvecBias2, pBias, 4); + + /* wide vectors(accumulators) initialized with bias */ + xb_vec2Nx24 dacc11, dacc12, dacc21, dacc22; + + dacc12 = dacc11 = IVP_CVT24UNX32L(hvecBias1, hvecBias1); + IVP_CVT24UNX32H(dacc11, hvecBias1, hvecBias1); + IVP_CVT24UNX32H(dacc12, hvecBias1, hvecBias1); + + dacc22 = dacc21 = IVP_CVT24UNX32L(hvecBias2, hvecBias2); + IVP_CVT24UNX32H(dacc21, hvecBias2, hvecBias2); + IVP_CVT24UNX32H(dacc22, hvecBias2, hvecBias2); + + /* priming of coeff load is done outside the innermost loop*/ + pdvecCoeff1 = (xb_vec2Nx8 *) (pCoeff); + valign vaCoeffData1; vaCoeffData1 = IVP_LA2NX8_PP(pdvecCoeff1); + + pdvecCoeff2 = (xb_vec2Nx8 *) (pCoeff + coeffPitch3 * enable2ndCh); + valign vaCoeffData2; vaCoeffData2 = IVP_LA2NX8_PP(pdvecCoeff2); + + pdvecIn = (MORPH_IDT_2Nx8 *) pInput; + + for (inCh = 0; inCh < numInCh; inCh++) /* Loop across input channels */ + { + /* vectors for coeff and input loads */ + xb_vec2Nx8 dvecCoeffData1, dvecCoeffData2; + xb_vec2Nx8 dvecInData11, dvecInData21, dvecInData31, dvecInData41, dvecInData51; + xb_vec2Nx8 dvecInData12, dvecInData22, dvecInData32, dvecInData42, dvecInData52; + + /* load data from first input row */ + valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn); 
+ MORPH_OP_LOAD_2Nx8(dvecInData11, vaInData, pdvecIn, remXLoad * 2 * XCHAL_IVPN_SIMD_WIDTH); + MORPH_OP_LOAD_2Nx8(dvecInData12, vaInData, pdvecIn, dilationU * inDataPitch1 - \ + remXLoad * 2 * XCHAL_IVPN_SIMD_WIDTH); + + /* load data from 2nd input row */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn); + MORPH_OP_LOAD_2Nx8(dvecInData21, vaInData, pdvecIn, remXLoad * 2 * XCHAL_IVPN_SIMD_WIDTH); + MORPH_OP_LOAD_2Nx8(dvecInData22, vaInData, pdvecIn, dilationU * inDataPitch1 - \ + remXLoad * 2 * XCHAL_IVPN_SIMD_WIDTH); + + /*Separate odd and even indices */ + IVP_DSEL2NX8I(dvecInData22, dvecInData21, dvecInData22, dvecInData21, \ + IVP_DSELI_8B_DEINTERLEAVE_1); + + /* load data from 3rd input row */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn); + MORPH_OP_LOAD_2Nx8(dvecInData31, vaInData, pdvecIn, remXLoad * 2 * XCHAL_IVPN_SIMD_WIDTH); + MORPH_OP_LOAD_2Nx8(dvecInData32, vaInData, pdvecIn, dilationU * inDataPitch1 - \ + remXLoad * 2 * XCHAL_IVPN_SIMD_WIDTH); + + /* load data from 4th input row */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn); + MORPH_OP_LOAD_2Nx8(dvecInData41, vaInData, pdvecIn, remXLoad * 2 * XCHAL_IVPN_SIMD_WIDTH); + MORPH_OP_LOAD_2Nx8(dvecInData42, vaInData, pdvecIn, dilationU * inDataPitch1 - \ + remXLoad * 2 * XCHAL_IVPN_SIMD_WIDTH); + + /* load data from 5th input row */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn); + MORPH_OP_LOAD_2Nx8(dvecInData51, vaInData, pdvecIn, remXLoad * 2 * XCHAL_IVPN_SIMD_WIDTH); + MORPH_OP_LOAD_2Nx8(dvecInData52, vaInData, pdvecIn, inDataPitch2 - \ + remXLoad * 2 * XCHAL_IVPN_SIMD_WIDTH - dilationU * 4 * inDataPitch1); + + /* load all the 5x5 coefficients */ + IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch2); + IVP_LAV2NX8_XP(dvecCoeffData2, vaCoeffData2, pdvecCoeff2, coeffPitch2); + + /* Rearrange them so that 5 MUL4T,1 MULQ,1 MUL can be used to perform entire operation */ + dvecCoeffData1 = IVP_SHFL2NX8(dvecCoeffData1, dvecIdx); + dvecCoeffData2 = IVP_SHFL2NX8(dvecCoeffData2, dvecIdx); + + /* Multiply and 
accumulate 1st set of 4 coefficients for all the outputs */ + MORPH_OP_MUL4TA(dacc11, 0, IVP_SEL2NX8I(dvecInData12, dvecInData11, \ + IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0), IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc12, 0, IVP_SEL2NX8I(dvecInData12, dvecInData11, \ + IVP_SELI_8B_EXTRACT_1_OF_2_OFF_1), IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + + MORPH_OP_MUL4TA(dacc21, 0, IVP_SEL2NX8I(dvecInData12, dvecInData11, \ + IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0), IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 0)); + MORPH_OP_MUL4TA(dacc22, 0, IVP_SEL2NX8I(dvecInData12, dvecInData11, \ + IVP_SELI_8B_EXTRACT_1_OF_2_OFF_1), IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 0)); + + /* Multiply and accumulate 2nd set of 4 coefficients for all the outputs */ + MORPH_OP_MUL4TA(dacc11, 0, dvecInData21, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + MORPH_OP_MUL4TA(dacc12, 0, dvecInData22, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + MORPH_OP_MUL4TA(dacc21, 0, dvecInData21, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 1)); + MORPH_OP_MUL4TA(dacc22, 0, dvecInData22, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 1)); + + /* Multiply and accumulate 3rd set of 4 coefficients for all the outputs */ + MORPH_OP_MUL4TA(dacc11, 0, IVP_SEL2NX8I(dvecInData32, dvecInData31, \ + IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0), IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 2)); + MORPH_OP_MUL4TA(dacc12, 0, IVP_SEL2NX8I(dvecInData32, dvecInData31, \ + IVP_SELI_8B_EXTRACT_1_OF_2_OFF_1), IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 2)); + + MORPH_OP_MUL4TA(dacc21, 0, IVP_SEL2NX8I(dvecInData32, dvecInData31, \ + 
IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0), IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 2)); + MORPH_OP_MUL4TA(dacc22, 0, IVP_SEL2NX8I(dvecInData32, dvecInData31, \ + IVP_SELI_8B_EXTRACT_1_OF_2_OFF_1), IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 2)); + + /* Multiply and accumulate 4th set of 4 coefficients for all the outputs */ + MORPH_OP_MUL4TA(dacc11, 0, IVP_SEL2NX8I(dvecInData42, dvecInData41, \ + IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0), IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 3)); + MORPH_OP_MUL4TA(dacc12, 0, IVP_SEL2NX8I(dvecInData42, dvecInData41, \ + IVP_SELI_8B_EXTRACT_1_OF_2_OFF_1), IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 3)); + + MORPH_OP_MUL4TA(dacc21, 0, IVP_SEL2NX8I(dvecInData42, dvecInData41, \ + IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0), IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 3)); + MORPH_OP_MUL4TA(dacc22, 0, IVP_SEL2NX8I(dvecInData42, dvecInData41, \ + IVP_SELI_8B_EXTRACT_1_OF_2_OFF_1), IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 3)); + + /* Multiply and accumulate 5th set of 4 coefficients for all the outputs */ + MORPH_OP_MUL4TA(dacc11, 0, IVP_SEL2NX8I(dvecInData52, dvecInData51, \ + IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0), IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 4)); + MORPH_OP_MUL4TA(dacc12, 0, IVP_SEL2NX8I(dvecInData52, dvecInData51, \ + IVP_SELI_8B_EXTRACT_1_OF_2_OFF_1), IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 4)); + + MORPH_OP_MUL4TA(dacc21, 0, IVP_SEL2NX8I(dvecInData52, dvecInData51, \ + IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0), IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 4)); + MORPH_OP_MUL4TA(dacc22, 0, IVP_SEL2NX8I(dvecInData52, dvecInData51, \ + IVP_SELI_8B_EXTRACT_1_OF_2_OFF_1), IVP_EXTRN_2X32( \ + 
IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 4)); + + /* Multiply and accumulate 6th set of 4 coefficients for all the outputs */ + MORPH_OP_MULQA(dacc11, \ + IVP_SEL2NX8I(dvecInData41, IVP_SEL2NX8I(dvecInData42, dvecInData41, \ + IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0), IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_SEL2NX8I(dvecInData31, IVP_SEL2NX8I(dvecInData32, dvecInData31, \ + IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0), IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_SEL2NX8I(dvecInData21, dvecInData21, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_SEL2NX8I(dvecInData11, IVP_SEL2NX8I(dvecInData12, dvecInData11, \ + IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0), IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 5)); + + MORPH_OP_MULQA(dacc12, \ + IVP_SEL2NX8I(dvecInData42, IVP_SEL2NX8I(dvecInData42, dvecInData41, \ + IVP_SELI_8B_EXTRACT_1_OF_2_OFF_1), IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_SEL2NX8I(dvecInData32, IVP_SEL2NX8I(dvecInData32, dvecInData31, \ + IVP_SELI_8B_EXTRACT_1_OF_2_OFF_1), IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_SEL2NX8I(dvecInData22, dvecInData22, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_SEL2NX8I(dvecInData12, IVP_SEL2NX8I(dvecInData12, dvecInData11, \ + IVP_SELI_8B_EXTRACT_1_OF_2_OFF_1), IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 5)); + + MORPH_OP_MULQA(dacc21, \ + IVP_SEL2NX8I(dvecInData41, IVP_SEL2NX8I(dvecInData42, dvecInData41, \ + IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0), IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_SEL2NX8I(dvecInData31, IVP_SEL2NX8I(dvecInData32, dvecInData31, \ + IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0), IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_SEL2NX8I(dvecInData21, dvecInData21, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_SEL2NX8I(dvecInData11, IVP_SEL2NX8I(dvecInData12, dvecInData11, \ + IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0), IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 5)); + + MORPH_OP_MULQA(dacc22, \ + IVP_SEL2NX8I(dvecInData42, 
IVP_SEL2NX8I(dvecInData42, dvecInData41, \ + IVP_SELI_8B_EXTRACT_1_OF_2_OFF_1), IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_SEL2NX8I(dvecInData32, IVP_SEL2NX8I(dvecInData32, dvecInData31, \ + IVP_SELI_8B_EXTRACT_1_OF_2_OFF_1), IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_SEL2NX8I(dvecInData22, dvecInData22, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_SEL2NX8I(dvecInData12, IVP_SEL2NX8I(dvecInData12, dvecInData11, \ + IVP_SELI_8B_EXTRACT_1_OF_2_OFF_1), IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 5)); + + /* Multiply and accumulate the final coefficient for all the outputs */ + MORPH_OP_MULA(dacc11, IVP_SEL2NX8I(dvecInData51, \ + IVP_SEL2NX8I(dvecInData52, dvecInData51, \ + IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0), IVP_SELI_8B_ROTATE_RIGHT_4), \ + IVP_EXTR2NX8(dvecCoeffData1, 24)); + MORPH_OP_MULA(dacc12, IVP_SEL2NX8I(dvecInData52, \ + IVP_SEL2NX8I(dvecInData52, dvecInData51, \ + IVP_SELI_8B_EXTRACT_1_OF_2_OFF_1), IVP_SELI_8B_ROTATE_RIGHT_4), \ + IVP_EXTR2NX8(dvecCoeffData1, 24)); + + MORPH_OP_MULA(dacc21, IVP_SEL2NX8I(dvecInData51, \ + IVP_SEL2NX8I(dvecInData52, dvecInData51, \ + IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0), IVP_SELI_8B_ROTATE_RIGHT_4), \ + IVP_EXTR2NX8(dvecCoeffData2, 24)); + MORPH_OP_MULA(dacc22, IVP_SEL2NX8I(dvecInData52, \ + IVP_SEL2NX8I(dvecInData52, dvecInData51, \ + IVP_SELI_8B_EXTRACT_1_OF_2_OFF_1), IVP_SELI_8B_ROTATE_RIGHT_4), \ + IVP_EXTR2NX8(dvecCoeffData2, 24)); + } /* end of for (inCh = 0; inCh < numInCh; inCh++)*/ + + /* Pack, Output Scale, Output Shift and clamping */ + xb_vec2Nx8 dvec1L, dvec2L, dvec3L, dvec4L; + xb_vec2Nx8 dvec1H, dvec2H, dvec3H, dvec4H; +#if DILATED_VQ_CONV == VQ_TRUE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvec1L, dvec1H, dacc11, packShiftAccU, \ + pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvec2L, dvec2H, dacc12, packShiftAccU, \ + pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvec3L, dvec3H, dacc21, 
packShiftAccU, \ + pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvec4L, dvec4H, dacc22, packShiftAccU, \ + pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag); +#elif DILATED_VQ_CONV == VQ_FALSE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvec1L, dvec1H, dacc11, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvec2L, dvec2H, dacc12, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvec3L, dvec3H, dacc21, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvec4L, dvec4H, dacc22, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); +#endif + /* Interleave odd and even indices */ + xb_vec2Nx8 dvecOut1L = MORPH_OP_SEL(dvec2L, dvec1L, dvecPattern1); + xb_vec2Nx8 dvecOut2L = MORPH_OP_SEL(dvec2L, dvec1L, dvecPattern2); + xb_vec2Nx8 dvecOut1H = MORPH_OP_SEL(dvec2H, dvec1H, dvecPattern1); + xb_vec2Nx8 dvecOut2H = MORPH_OP_SEL(dvec2H, dvec1H, dvecPattern2); + xb_vec2Nx8 dvecOut3L = MORPH_OP_SEL(dvec4L, dvec3L, dvecPattern1); + xb_vec2Nx8 dvecOut4L = MORPH_OP_SEL(dvec4L, dvec3L, dvecPattern2); + xb_vec2Nx8 dvecOut3H = MORPH_OP_SEL(dvec4H, dvec3H, dvecPattern1); + xb_vec2Nx8 dvecOut4H = MORPH_OP_SEL(dvec4H, dvec3H, dvecPattern2); + + /* Storing the first output depth, first row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput); + valign vaOutData = IVP_ZALIGN(); + IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * remX); + IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * remX - \ + 2 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * \ + (2 * remX - 4 * XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * \ + (2 * remX - 6 * XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the second output depth, first row */ + pdvecOut = (xb_vec2Nx8 
*) (pOutput + outDataPitch2 * enable2ndCh * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * remX * enable2ndCh); + IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, ((bytesPerPixel * remX) - \ + 2 * XCHAL_IVPN_SIMD_WIDTH) * enable2ndCh); + IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * \ + (2 * remX - 4 * XCHAL_IVPN_SIMD_WIDTH) * enable2ndCh); + IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * \ + (2 * remX - 6 * XCHAL_IVPN_SIMD_WIDTH) * enable2ndCh); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + pOutput += 2 * outDataPitch2 * bytesPerPixel; + pCoeff += 2 * coeffPitch3; + } /* end of for (outCh = 0; outCh < numOutCh; outCh++)*/ + } /* end of for (y = 0; y < outH; y++)*/ + } /* end of for (x = 0; x < outW; x += 2 * vectorizationWidth)*/ + return(XAI_ERROR_STATUS()); +} + +/****************************************************************************************** +* xaiConvolved(VQ)3D_S_5x5j1d4I8S8IX_MOW_WHD +* ***************************************************************************************/ + +/******************************************************************************/ +/* Description : P6 optimized generic implementation for 5x5 3D convolution */ +/* with dilation = 4 */ +/* Based on MORPH pre-processor specifiers, code implementation */ +/* is generated during preprocessing stage. 
This method can be */ +/* used to generate 5x5 3D dilated convolution function and 5x5 */ +/* 3D VQ dilated convolution function for U8 bit and S8 bit */ +/* input data with input stride equal to 1 */ +/* Inputs : Input Data Tile, Coeff Data Tile, Bias Array, */ +/* Output scale array, CNN convolution params structure */ +/* Outputs : XI Error Code */ +/* InOuts : Output Tile */ +/* Assumptions : CoeffData is S8 */ +/* biasArray is signed 32b, value not exceeding signed 24b */ +/* Output scale array is U16 */ +/* OutData is S8 / U8 / S16 */ +/* Kernel Size is 5x5xDxN */ +/* Input and Output are in WHD format */ +/* Coeff is in WHDN format */ +/******************************************************************************/ + +/****************** xaiConvolvedVQ3D_S_5x5j1d4_S8S8IX_MOW_WHD ******************/ +/****************** xaiConvolvedVQ3D_S_5x5j1d4_U8S8IX_MOW_WHD ******************/ +/******************* xaiConvolved3D_S_5x5j1d4_S8S8IX_MOW_WHD *******************/ +/******************* xaiConvolved3D_S_5x5j1d4_U8S8IX_MOW_WHD *******************/ + +XAI_ERR_TYPE MAKE_NAME(MAKE_NAME_VQ(xaiConvolved, 3D_S_5x5j1d4), S8IX_MOW_WHD) MAKE_ARGUMENTS(inTile, coeffTile, biasArray, outTile, param) +{ + /* Error Checks */ + XAI_ERROR_CHECKS() + { + MORPH_IDT_CHECK(inTile); + XAI_CHECK_CONV_OUTPUT_TILE3D(outTile); + XAI_CHECK_TILE4D_S8(coeffTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile); + XAI_CHECK_TILE4D_IN_DRAM_BOUNDARY(coeffTile); + XAI_CHECK_POINTER(param); + XAI_CHECK_ARRAY_S32(biasArray); + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTile); + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(coeffTile, outTile); + XAI_CHECK_KERNEL_SIZE(coeffTile, 5); + XAI_CHECK_TILE3D_DATA_ORDER(inTile, XAI_WHD); + XAI_CHECK_TILE3D_DATA_ORDER(outTile, XAI_WHD); + XAI_CHECK_TILE4D_DATA_ORDER(coeffTile, XAI_WHDN); + XAI_CHECK_TILE3D_EDGE(inTile, 8); + XAI_CHECK_STRIDE(param, 1); + XAI_CHECK_ERROR((XAI_CNN_CONV_GET_STRIDEX(param) == 
XAI_CNN_CONV_GET_STRIDEY(param)), \ + XAI_ERR_BADARG, "Stride along width = %hhu and height = %hhu\nStride along width and height should be equal", \ + XAI_CNN_CONV_GET_STRIDEX(param), XAI_CNN_CONV_GET_STRIDEY(param)); + XAI_CHECK_DILATION(param, 4); + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_DILATIONX(param) == XAI_CNN_CONV_GET_DILATIONY(param), \ + XAI_ERR_BADARG, "\nDilation along width = %hhu and height = %hhu\nDilation along width and height should be equal", \ + XAI_CNN_CONV_GET_DILATIONX(param), XAI_CNN_CONV_GET_DILATIONY(param)); + XAI_CHECK_CONSISTENCY_MOW_WHD(inTile, coeffTile, biasArray, outTile, param); + XAI_CHECK_COEFFTILE_CONTIGUOUS(coeffTile, param); + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_ACCUM_SHIFT(param) < 24, \ + XAI_ERR_NORM, "\nThe accumulator shift = %hhu, value should be less than 24", \ + XAI_CNN_CONV_GET_ACCUM_SHIFT(param)); + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_OUTPUT_SHIFT(param) < 32, \ + XAI_ERR_NORM, "\nThe output shift = %hhu, value should be less than 32", \ + XAI_CNN_CONV_GET_OUTPUT_SHIFT(param)); + XAI_CHECK_CONV_RELU_LIMITS_IX(param, outTile); +#if DILATED_VQ_CONV == VQ_TRUE + XAI_CHECK_ARRAY_U16(outputScaleArray); + XAI_CHECK_ERROR(XAI_ARRAY_GET_WIDTH(outputScaleArray) >= XAI_TILE4D_GET_DIM4(coeffTile), \ + XAI_ERR_DATASIZE, "\nWidth of Output Scale Array = %d, Number of Kernels = %d\nWidth of Output Scale Array should be greater than or equal to Number of Kernels", \ + XAI_ARRAY_GET_WIDTH(outputScaleArray), XAI_TILE4D_GET_DIM4(coeffTile)); + XAI_CHECK_ERROR((((uintptr_t) (XAI_ARRAY_GET_DATA_PTR(outputScaleArray)) & \ + 0x1) == 0), XAI_ERR_NORM, "The output scale array is not aligned to 2 byte boundary"); +#endif + } +#if DILATED_VQ_CONV == VQ_FALSE + if (XAI_CNN_CONV_GET_OUTPUT_SCALE(param) == 0) + { + int32_t fillValue; + int32_t reluFlag = XAI_CNN_CONV_GET_FLAG_RELU(param); + fillValue = reluFlag ? 
(CLAMP(0, XAI_CNN_CONV_GET_RELU_MIN(param), XAI_CNN_CONV_GET_RELU_MAX(param))) : 0; + return(xaiFillTile3D(outTile, fillValue, 0)); + } +#endif + /* Getting parameters from the tile structures */ + const int32_t outW = XAI_TILE3D_GET_DIM1(outTile); + const int32_t outH = XAI_TILE3D_GET_DIM2(outTile); + const int32_t numInCh = XAI_TILE3D_GET_DIM3(inTile); + const int32_t numOutCh = XAI_TILE3D_GET_DIM3(outTile); + + /* Pitches of Coefficient Data (WHDN) in dim3 */ + const int32_t coeffPitch2 = XAI_TILE4D_GET_DIM2_PITCH(coeffTile); + const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile); + + /* Pitches of Input Data (WHD) in dim1 and dim2 */ + const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile); + const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile); + + /* Pitch of Output Data (WHD) in dim1 and dim2 */ + const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile); + const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile); + + /* Kernel Size (WHDN) */ + const int32_t kSizeU = XAI_TILE4D_GET_DIM1(coeffTile); + + /* CNN convolution parameters */ + const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param); + const uint8_t outShiftU = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param); + const uint8_t enableReLu = XAI_CNN_CONV_GET_FLAG_RELU(param); + const uint8_t dilationU = XAI_CNN_CONV_GET_DILATION(param); + + int32_t dilatedkSizeU = dilationU * (kSizeU - 1) + 1; + + /* Data Pointers of input, output, coefficient and bias data */ + MORPH_IDT_SCALAR* pInData = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(inTile); + int8_t* pOutData = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile); + int32_t* pBiasData = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray); + int8_t* pCoeffData = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile); +#if DILATED_VQ_CONV == VQ_TRUE + uint16_t* restrict pOutScaleData = (uint16_t *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray); +#elif DILATED_VQ_CONV == VQ_FALSE + const uint16_t outScale = 
XAI_CNN_CONV_GET_OUTPUT_SCALE(param); +#endif + /* Move pointer to the start of the data (including edge) */ + pInData = &pInData[-((dilatedkSizeU / 2) * inDataPitch1 + (dilatedkSizeU / 2))]; + + /* Setting the limits for output data according to ReLu Flag and outTileType */ + int32_t minLim, maxLim; + if (enableReLu) + { + minLim = XAI_CNN_CONV_GET_RELU_MIN(param); + maxLim = XAI_CNN_CONV_GET_RELU_MAX(param); + } + else + { + minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \ + SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0); + maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \ + : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX); + } + const int8_t typeFlag = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0; + const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile); + + MORPH_IDT_2Nx8 * restrict pdvecIn; + xb_vec2Nx8* restrict pdvecOut; + xb_vec2Nx8* restrict pdvecCoeff1; + xb_vec2Nx8* restrict pdvecCoeff2; + + /* Variable Declarations */ + int32_t inCh, outCh, x, y; + + + /* In order to make the loop multiply-bound we are reducing the vectorization width + by extra values required for the kernel */ + const int32_t vectorizationWidth = ((4 * XCHAL_IVPN_SIMD_WIDTH) - dilatedkSizeU) + 1; + + /* generates the sequence 0,1,2,3 ,5,6,7,8 ,10,11,12,13 ,15,16,17,18 + * , 20,21,22,23 ,4,9,14,19 ,24. To be used to shuffle the coeff data, + * So that last coeff from first 4 rows of coeffs can be used as one + * 32 bit element and make use of quad multiplier outside the inner- + * most loop. 
+ * c11, c12, c13, c14, c15 + * c21, c22, c23, c24, c25 + * c31, c32, c33, c34, c35 + * c41, c42, c43, c44, c45 + * c51, c52, c53, c54, c55 + * + * c15, c25, c35, c45 and c55 are placed in contiguous fashion, so that + * c15, c25, c35, and c45 can be used as one 32 bit element + */ + xb_vec2Nx8 dvecIdx; + dvecIdx = IVP_SEL2NX8I(IVP_SEL2NX8I(IVP_ADD2NX8U(IVP_SEQ2NX8(), 15), + IVP_ADD2NX8U(IVP_SEQ2NX8(), 10), + IVP_SELI_INTERLEAVE_2_LO), + IVP_SEL2NX8I(IVP_ADD2NX8U(IVP_SEQ2NX8(), 5), IVP_SEQ2NX8(), + IVP_SELI_INTERLEAVE_2_LO), + IVP_SELI_INTERLEAVE_4_LO); + + dvecIdx = IVP_SEL2NX8I(IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16( + IVP_MOVNX16_FROMN_2X32(4 + (9 << 8) + (14 << 16) + \ + (19 << 24))), + IVP_ADD2NX8U(IVP_SEQ2NX8(), 20), IVP_SELI_INTERLEAVE_2_LO), + dvecIdx, IVP_SELI_8B_PACK_16); + + /* loop across output depth is unrolled twice and loop across kernel width and height is + completely unrolled*/ + + for (x = 0; x < outW; x += vectorizationWidth) /* Loop across output width */ + { + int32_t remX = XT_MIN(vectorizationWidth, outW - x); + + /* If (remX + dilatedkSizeU - 1) <= 2 * XCHAL_IVPN_SIMD_WIDTH, + * i.e. if the number of input data bytes corresponding to remX number of outputs + * is less than or equal to 2 * XCHAL_IVPN_SIMD_WIDTH, there is no need to load + * the next 64 input bytes*/ + int32_t remXLoad = ((remX + dilatedkSizeU - 1) > 2 * XCHAL_IVPN_SIMD_WIDTH) ? 
1 : 0; + + for (y = 0; y < outH; y++) /* Loop across output height */ + { + /* initialize output data pointer */ + int8_t *pOutput = &pOutData[(y * outDataPitch1 + x) * bytesPerPixel]; + + /* initialize input data pointer */ + MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * y + x]; + + /* initialize coeff and Bias data pointer*/ + int8_t *pCoeff = &pCoeffData[0]; + int32_t *pBias = &pBiasData[0]; + + for (outCh = 0; outCh < numOutCh; outCh += 2) /* Loop across Output depth */ + { + /* In order to handle odd output depths */ + int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1); + + /* Load the bias values corresponding to two output channels */ + xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4 * enable2ndCh); + xb_vecN_2x32v hvecBias2; IVP_LSRN_2X32_XP(hvecBias2, pBias, 4); + + /* wide vectors(accumulators) initialized with bias */ + xb_vec2Nx24 dacc11, dacc12, dacc21, dacc22; + + dacc12 = dacc11 = IVP_CVT24UNX32L(hvecBias1, hvecBias1); + IVP_CVT24UNX32H(dacc11, hvecBias1, hvecBias1); + IVP_CVT24UNX32H(dacc12, hvecBias1, hvecBias1); + + dacc22 = dacc21 = IVP_CVT24UNX32L(hvecBias2, hvecBias2); + IVP_CVT24UNX32H(dacc21, hvecBias2, hvecBias2); + IVP_CVT24UNX32H(dacc22, hvecBias2, hvecBias2); + + /* priming of coeff load is done outside the innermost loop*/ + pdvecCoeff1 = (xb_vec2Nx8 *) (pCoeff); + valign vaCoeffData1; vaCoeffData1 = IVP_LA2NX8_PP(pdvecCoeff1); + + pdvecCoeff2 = (xb_vec2Nx8 *) (pCoeff + coeffPitch3 * enable2ndCh); + valign vaCoeffData2; vaCoeffData2 = IVP_LA2NX8_PP(pdvecCoeff2); + + pdvecIn = (MORPH_IDT_2Nx8 *) pInput; + + for (inCh = 0; inCh < numInCh; inCh++) /* Loop across input channels */ + { + /* vectors for coeff and input loads */ + xb_vec2Nx8 dvecCoeffData1, dvecCoeffData2; + xb_vec2Nx8 dvecInData11, dvecInData21, dvecInData31, dvecInData41, dvecInData51; + xb_vec2Nx8 dvecInData12, dvecInData22, dvecInData32, dvecInData42, dvecInData52; + + /* load data from first input row */ + valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn); 
+ MORPH_OP_LOAD_2Nx8(dvecInData11, vaInData, pdvecIn, remXLoad * 2 * XCHAL_IVPN_SIMD_WIDTH); + MORPH_OP_LOAD_2Nx8(dvecInData12, vaInData, pdvecIn, dilationU * inDataPitch1 - \ + remXLoad * 2 * XCHAL_IVPN_SIMD_WIDTH); + + /*Separate odd and even indices */ + IVP_DSEL2NX8I(dvecInData12, dvecInData11, dvecInData12, dvecInData11, \ + IVP_DSELI_8B_DEINTERLEAVE_1); + IVP_DSEL2NX8I(dvecInData12, dvecInData11, dvecInData12, dvecInData11, \ + IVP_DSELI_8B_DEINTERLEAVE_1); + + /* load data from 2nd input row */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn); + MORPH_OP_LOAD_2Nx8(dvecInData21, vaInData, pdvecIn, remXLoad * 2 * XCHAL_IVPN_SIMD_WIDTH); + MORPH_OP_LOAD_2Nx8(dvecInData22, vaInData, pdvecIn, dilationU * inDataPitch1 - \ + remXLoad * 2 * XCHAL_IVPN_SIMD_WIDTH); + + /*Separate odd and even indices */ + IVP_DSEL2NX8I(dvecInData22, dvecInData21, dvecInData22, dvecInData21, \ + IVP_DSELI_8B_DEINTERLEAVE_1); + IVP_DSEL2NX8I(dvecInData22, dvecInData21, dvecInData22, dvecInData21, \ + IVP_DSELI_8B_DEINTERLEAVE_1); + + /* load data from 3rd input row */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn); + MORPH_OP_LOAD_2Nx8(dvecInData31, vaInData, pdvecIn, remXLoad * 2 * XCHAL_IVPN_SIMD_WIDTH); + MORPH_OP_LOAD_2Nx8(dvecInData32, vaInData, pdvecIn, dilationU * inDataPitch1 - \ + remXLoad * 2 * XCHAL_IVPN_SIMD_WIDTH); + + /*Separate odd and even indices */ + IVP_DSEL2NX8I(dvecInData32, dvecInData31, dvecInData32, dvecInData31, \ + IVP_DSELI_8B_DEINTERLEAVE_1); + IVP_DSEL2NX8I(dvecInData32, dvecInData31, dvecInData32, dvecInData31, \ + IVP_DSELI_8B_DEINTERLEAVE_1); + + /* load data from 4th input row */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn); + MORPH_OP_LOAD_2Nx8(dvecInData41, vaInData, pdvecIn, remXLoad * 2 * XCHAL_IVPN_SIMD_WIDTH); + MORPH_OP_LOAD_2Nx8(dvecInData42, vaInData, pdvecIn, dilationU * inDataPitch1 - \ + remXLoad * 2 * XCHAL_IVPN_SIMD_WIDTH); + + /*Separate odd and even indices */ + IVP_DSEL2NX8I(dvecInData42, dvecInData41, dvecInData42, dvecInData41, \ + 
IVP_DSELI_8B_DEINTERLEAVE_1); + IVP_DSEL2NX8I(dvecInData42, dvecInData41, dvecInData42, dvecInData41, \ + IVP_DSELI_8B_DEINTERLEAVE_1); + + /* load data from 5th input row */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn); + MORPH_OP_LOAD_2Nx8(dvecInData51, vaInData, pdvecIn, remXLoad * 2 * XCHAL_IVPN_SIMD_WIDTH); + MORPH_OP_LOAD_2Nx8(dvecInData52, vaInData, pdvecIn, inDataPitch2 - \ + remXLoad * 2 * XCHAL_IVPN_SIMD_WIDTH - dilationU * 4 * inDataPitch1); + + /*Separate odd and even indices */ + IVP_DSEL2NX8I(dvecInData52, dvecInData51, dvecInData52, dvecInData51, \ + IVP_DSELI_8B_DEINTERLEAVE_1); + IVP_DSEL2NX8I(dvecInData52, dvecInData51, dvecInData52, dvecInData51, \ + IVP_DSELI_8B_DEINTERLEAVE_1); + + /* load all the 5x5 coefficients */ + IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch2); + IVP_LAV2NX8_XP(dvecCoeffData2, vaCoeffData2, pdvecCoeff2, coeffPitch2); + + /* Rearrange them so that 5 MUL4T,1 MULQ,1 MUL can be used to perform entire operation */ + dvecCoeffData1 = IVP_SHFL2NX8(dvecCoeffData1, dvecIdx); + dvecCoeffData2 = IVP_SHFL2NX8(dvecCoeffData2, dvecIdx); + + /* Multiply and accumulate 1st set of 4 coefficients for all the outputs */ + MORPH_OP_MUL4TA(dacc11, 0, dvecInData11, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc12, 0, dvecInData12, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + + MORPH_OP_MUL4TA(dacc21, 0, dvecInData11, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 0)); + MORPH_OP_MUL4TA(dacc22, 0, dvecInData12, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 0)); + + /* Multiply and accumulate 2nd set of 4 coefficients for all the outputs */ + MORPH_OP_MUL4TA(dacc11, 0, dvecInData21, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + MORPH_OP_MUL4TA(dacc12, 0, dvecInData22, IVP_EXTRN_2X32( \ + 
IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + MORPH_OP_MUL4TA(dacc21, 0, dvecInData21, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 1)); + MORPH_OP_MUL4TA(dacc22, 0, dvecInData22, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 1)); + + /* Multiply and accumulate 3rd set of 4 coefficients for all the outputs */ + MORPH_OP_MUL4TA(dacc11, 0, dvecInData31, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 2)); + MORPH_OP_MUL4TA(dacc12, 0, dvecInData32, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 2)); + + MORPH_OP_MUL4TA(dacc21, 0, dvecInData31, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 2)); + MORPH_OP_MUL4TA(dacc22, 0, dvecInData32, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 2)); + + /* Multiply and accumulate 4th set of 4 coefficients for all the outputs */ + MORPH_OP_MUL4TA(dacc11, 0, dvecInData41, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 3)); + MORPH_OP_MUL4TA(dacc12, 0, dvecInData42, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 3)); + + MORPH_OP_MUL4TA(dacc21, 0, dvecInData41, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 3)); + MORPH_OP_MUL4TA(dacc22, 0, dvecInData42, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 3)); + + /* Multiply and accumulate 5th set of 4 coefficients for all the outputs */ + MORPH_OP_MUL4TA(dacc11, 0, dvecInData51, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 4)); + MORPH_OP_MUL4TA(dacc12, 0, dvecInData52, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 4)); + + MORPH_OP_MUL4TA(dacc21, 0, dvecInData51, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 
4)); + MORPH_OP_MUL4TA(dacc22, 0, dvecInData52, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 4)); + + /* Multiply and accumulate 6th set of 4 coefficients for all the outputs */ + MORPH_OP_MULQA(dacc11, \ + IVP_SEL2NX8I(dvecInData41, dvecInData41, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_SEL2NX8I(dvecInData31, dvecInData31, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_SEL2NX8I(dvecInData21, dvecInData21, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_SEL2NX8I(dvecInData11, dvecInData11, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 5)); + + MORPH_OP_MULQA(dacc12, \ + IVP_SEL2NX8I(dvecInData42, dvecInData42, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_SEL2NX8I(dvecInData32, dvecInData32, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_SEL2NX8I(dvecInData22, dvecInData22, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_SEL2NX8I(dvecInData12, dvecInData12, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 5)); + + MORPH_OP_MULQA(dacc21, \ + IVP_SEL2NX8I(dvecInData41, dvecInData41, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_SEL2NX8I(dvecInData31, dvecInData31, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_SEL2NX8I(dvecInData21, dvecInData21, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_SEL2NX8I(dvecInData11, dvecInData11, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 5)); + + MORPH_OP_MULQA(dacc22, \ + IVP_SEL2NX8I(dvecInData42, dvecInData42, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_SEL2NX8I(dvecInData32, dvecInData32, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_SEL2NX8I(dvecInData22, dvecInData22, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_SEL2NX8I(dvecInData12, dvecInData12, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 5)); + + /* Multiply and accumulate the final coefficient for all the outputs */ + MORPH_OP_MULA(dacc11, IVP_SEL2NX8I(dvecInData51, dvecInData51, \ + IVP_SELI_8B_ROTATE_RIGHT_4), 
IVP_EXTR2NX8(dvecCoeffData1, 24)); + MORPH_OP_MULA(dacc12, IVP_SEL2NX8I(dvecInData52, dvecInData52, \ + IVP_SELI_8B_ROTATE_RIGHT_4), IVP_EXTR2NX8(dvecCoeffData1, 24)); + + MORPH_OP_MULA(dacc21, IVP_SEL2NX8I(dvecInData51, dvecInData51, \ + IVP_SELI_8B_ROTATE_RIGHT_4), IVP_EXTR2NX8(dvecCoeffData2, 24)); + MORPH_OP_MULA(dacc22, IVP_SEL2NX8I(dvecInData52, dvecInData52, \ + IVP_SELI_8B_ROTATE_RIGHT_4), IVP_EXTR2NX8(dvecCoeffData2, 24)); + } /* end of for (inCh = 0; inCh < numInCh; inCh++)*/ + + /* Pack, Output Scale, Output Shift and clamping */ + xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L; + xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H; +#if DILATED_VQ_CONV == VQ_TRUE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc11, packShiftAccU, \ + pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc12, packShiftAccU, \ + pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc21, packShiftAccU, \ + pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc22, packShiftAccU, \ + pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag); +#elif DILATED_VQ_CONV == VQ_FALSE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc11, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc12, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc21, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc22, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); +#endif + if (!typeFlag) + { + MORPH_OP_DSELI(dvecOut2L, dvecOut1L, dvecOut2L, dvecOut1L, \ + IVP_DSELI_8B_INTERLEAVE_1); + MORPH_OP_DSELI(dvecOut2L, dvecOut1L, 
dvecOut2L, dvecOut1L, \ + IVP_DSELI_8B_INTERLEAVE_1); + MORPH_OP_DSELI(dvecOut4L, dvecOut3L, dvecOut4L, dvecOut3L, \ + IVP_DSELI_8B_INTERLEAVE_1); + MORPH_OP_DSELI(dvecOut4L, dvecOut3L, dvecOut4L, dvecOut3L, \ + IVP_DSELI_8B_INTERLEAVE_1); + } + else + { + MORPH_OP_DSELI(dvecOut1H, dvecOut1L, dvecOut1H, dvecOut1L, \ + IVP_DSELI_INTERLEAVE_1); + MORPH_OP_DSELI(dvecOut2H, dvecOut2L, dvecOut2H, dvecOut2L, \ + IVP_DSELI_INTERLEAVE_1); + MORPH_OP_DSELI(dvecOut2L, dvecOut1L, dvecOut2L, dvecOut1L, \ + IVP_DSELI_INTERLEAVE_2); + MORPH_OP_DSELI(dvecOut2H, dvecOut1H, dvecOut2H, dvecOut1H, \ + IVP_DSELI_INTERLEAVE_2); + MORPH_OP_DSELI(dvecOut3H, dvecOut3L, dvecOut3H, dvecOut3L, \ + IVP_DSELI_INTERLEAVE_1); + MORPH_OP_DSELI(dvecOut4H, dvecOut4L, dvecOut4H, dvecOut4L, \ + IVP_DSELI_INTERLEAVE_1); + MORPH_OP_DSELI(dvecOut4L, dvecOut3L, dvecOut4L, dvecOut3L, \ + IVP_DSELI_INTERLEAVE_2); + MORPH_OP_DSELI(dvecOut4H, dvecOut3H, dvecOut4H, dvecOut3H, \ + IVP_DSELI_INTERLEAVE_2); + } + + /* Storing the first output depth, first row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput); + valign vaOutData = IVP_ZALIGN(); + IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * remX); + IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * remX - \ + 2 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * \ + (2 * remX - 4 * XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * \ + (2 * remX - 6 * XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the second output depth, first row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch2 * enable2ndCh * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * remX * enable2ndCh); + IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, ((bytesPerPixel * remX) - \ + 2 * XCHAL_IVPN_SIMD_WIDTH) * enable2ndCh); + IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * \ + (2 * remX - 4 * XCHAL_IVPN_SIMD_WIDTH) * enable2ndCh); + 
IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * \ + (2 * remX - 6 * XCHAL_IVPN_SIMD_WIDTH) * enable2ndCh); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + pOutput += 2 * outDataPitch2 * bytesPerPixel; + pCoeff += 2 * coeffPitch3; + } /* end of for (outCh = 0; outCh < numOutCh; outCh++)*/ + } /* end of for (y = 0; y < outH; y++)*/ + } /* end of for (x = 0; x < outW; x += 2 * vectorizationWidth)*/ + return(XAI_ERROR_STATUS()); +} + +/****************************************************************************************** +* xaiConvolved(VQ)3D_S_7x7j1d1I8S8IX_MOW_WHD +* ***************************************************************************************/ + +/******************************************************************************/ +/* Description : P6 optimized generic implementation for 7x7 3D convolution. */ +/* Based on MORPH pre-processor specifiers, code implementation */ +/* is generated during preprocessing stage. This method can be */ +/* used to generate 7x7 3D dilated convolution function and 7x7 */ +/* 3D VQ dilated convolution function for U8 bit and S8 bit */ +/* input data with input stride equal to 1 */ +/* Inputs : Input Data Tile, Coeff Data Tile, Bias Array, */ +/* Output scale array, CNN convolution params structure */ +/* Outputs : XI Error Code */ +/* InOuts : Output Tile */ +/* Assumptions : CoeffData is S8 */ +/* biasArray is signed 32b, value not exceeding signed 24b */ +/* Output scale array is U16 */ +/* OutData is S8 / U8 / S16 */ +/* Kernel Size is 7x7xDxN */ +/* Input and Output are in WHD format */ +/* Coeff is in WHDN format */ +/******************************************************************************/ + +/****************************************************************************************** +* 7x7 MOW WHD Stride 1 - DEPTH 3 * +* If number of input channels is equal to 3 * +* this function is called. 
* +******************************************************************************************/ + +static _XAI_INLINE_ void MAKE_NAME(MAKE_NAME_VQ(convolved, 3D_S_7x7j1d1), S8IX_MOW_WHD_DEPTH3) MAKE_ARGUMENTS(inTile, coeffTile, biasArray, outTile, param) +{ + /* Getting parameters from the tile structures */ + const int32_t outW = XAI_TILE3D_GET_DIM1(outTile); + const int32_t outH = XAI_TILE3D_GET_DIM2(outTile); + const int32_t numOutCh = XAI_TILE3D_GET_DIM3(outTile); + + /* Kernel Size (WHDN)*/ + const int32_t kSizeU = XAI_TILE4D_GET_DIM1(coeffTile); + + /* CNN convolution parameters */ + const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param); + const uint8_t outShiftU = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param); + const uint8_t enableReLu = XAI_CNN_CONV_GET_FLAG_RELU(param); + + /* Pitches of Coefficient Data (WHDN) in dim1 and dim3 */ + const int32_t coeffPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTile); + const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile); + + /* Pitches of Input Data (WHD) in dim1 and dim2 */ + const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile); + const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile); + + /* Pitch of Output Data (WHD) in dim1 and dim2 */ + const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile); + const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile); + + /* Data Pointers of input, output, coefficient and bias data */ + MORPH_IDT_SCALAR* pInData = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(inTile); + int8_t* pOutData = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile); + int32_t* pBiasData = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray); + int8_t* pCoeffData = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile); +#if DILATED_VQ_CONV == VQ_TRUE + uint16_t* restrict pOutScaleData = (uint16_t *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray); +#elif DILATED_VQ_CONV == VQ_FALSE + const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param); +#endif + /* Move pointer to the start of the 
data (including edge) */ + pInData = &pInData[-((kSizeU / 2) * inDataPitch1 + (kSizeU / 2))]; + + /* Setting the limits for output data according to ReLu Flag and outTileType */ + int32_t minLim, maxLim; + if (enableReLu) + { + minLim = XAI_CNN_CONV_GET_RELU_MIN(param); + maxLim = XAI_CNN_CONV_GET_RELU_MAX(param); + } + else + { + minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \ + SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0); + maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \ + : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX); + } + const int8_t typeFlag = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0; + const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile); + + MORPH_IDT_2Nx8 * restrict pdvecIn; + xb_vec2Nx8* restrict pdvecOut; + xb_vec2Nx8* restrict pdvecCoeff1; + + /* Variable Declarations */ + int32_t outCh, x, y; + int32_t varLen; + + /* In order to make the loop multiply-bound we are reducing the vectorization width + by extra values required for the kernel */ + const int32_t vectorizationWidth = ((4 * XCHAL_IVPN_SIMD_WIDTH) - kSizeU) + 1; + + /* Vectorization width taken is 122. Two loads from same row + * Loop across output height is unrolled twice. + * Thus a single iteration produces 4 output vector. + * Input channels , kernel width and kernel height + * are completely unrolled. + */ + for (x = 0; x < outW; x += vectorizationWidth) /* Loop across Output width */ + { + /* In order to handle cases where input width <= 64, where + * the 2nd load from the same row needs to be avoided. 
*/ + varLen = XT_MIN(vectorizationWidth, outW - x); + int32_t enable2ndCol = XT_SALT(2 * XCHAL_IVPN_SIMD_WIDTH, varLen + kSizeU - 1); + + for (y = 0; y < outH; y += 2) /* Loop across Output height */ + { + /* In order to handle odd output height */ + int32_t enable2ndRow = XT_SALT(y, outH - 1); + /* initialize output data pointer */ + int8_t *pOutput = &pOutData[(y * outDataPitch1 + x) * bytesPerPixel]; + + /* initialize input data pointer */ + MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * y + x]; + + /* initialize coeff and Bias data pointer*/ + int8_t *pCoeff = &pCoeffData[0]; + int32_t *pBias = &pBiasData[0]; + + for (outCh = 0; outCh < numOutCh; outCh++) /* Loop across Output depth */ + { + /* load and replicate bias data */ + xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4); + + /* wide vectors(accumulators) initialized with bias */ + xb_vec2Nx24 dacc11, dacc12, dacc21, dacc22; + dacc11 = dacc12 = dacc21 = dacc22 = IVP_CVT24UNX32L(hvecBias1, hvecBias1); + IVP_CVT24UNX32H(dacc11, hvecBias1, hvecBias1); + IVP_CVT24UNX32H(dacc12, hvecBias1, hvecBias1); + IVP_CVT24UNX32H(dacc21, hvecBias1, hvecBias1); + IVP_CVT24UNX32H(dacc22, hvecBias1, hvecBias1); + + /* priming of coeff load is done outside the innermost loop*/ + pdvecCoeff1 = (xb_vec2Nx8 *) (pCoeff); + valign vaCoeffData1; vaCoeffData1 = IVP_LA2NX8_PP(pdvecCoeff1); + + pdvecIn = (MORPH_IDT_2Nx8 *) pInput; + + /* 2 4-tap multipliers are used to accumulate 1 wide vector + * first 4-tap multiplier makes use of first 4 coeff across + * the kernel width. 
next 4 tap mulplier makes use last 3 + * coeff across the kernel width, and 4th byte is zero + */ + + MORPH_IDT_2Nx8 dvecInData11, dvecInData21, dvecInData31, dvecInData41, \ + dvecInData51, dvecInData61, dvecInData71, dvecInData81; + + MORPH_IDT_2Nx8 dvecInData12, dvecInData22, dvecInData32, dvecInData42, \ + dvecInData52, dvecInData62, dvecInData72, dvecInData82; + + /** Input Channel 1 **/ + /* load data from first input row */ + valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn); + MORPH_OP_LOAD_2Nx8(dvecInData11, vaInData, pdvecIn, \ + enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH); + MORPH_OP_LOAD_2Nx8(dvecInData12, vaInData, pdvecIn, \ + inDataPitch1 - enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH); + + /* load data from 2nd input row */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn); + MORPH_OP_LOAD_2Nx8(dvecInData21, vaInData, pdvecIn, \ + enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH); + MORPH_OP_LOAD_2Nx8(dvecInData22, vaInData, pdvecIn, \ + inDataPitch1 - enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH); + + /* load data from 3rd input row */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn); + MORPH_OP_LOAD_2Nx8(dvecInData31, vaInData, pdvecIn, \ + enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH); + MORPH_OP_LOAD_2Nx8(dvecInData32, vaInData, pdvecIn, \ + inDataPitch1 - enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH); + + /* load data from 4th input row */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn); + MORPH_OP_LOAD_2Nx8(dvecInData41, vaInData, pdvecIn, \ + enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH); + MORPH_OP_LOAD_2Nx8(dvecInData42, vaInData, pdvecIn, \ + inDataPitch1 - enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH); + + /* load data from 5th input row */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn); + MORPH_OP_LOAD_2Nx8(dvecInData51, vaInData, pdvecIn, \ + enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH); + MORPH_OP_LOAD_2Nx8(dvecInData52, vaInData, pdvecIn, \ + inDataPitch1 - enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH); + + /* load data from 6th input row */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn); + 
MORPH_OP_LOAD_2Nx8(dvecInData61, vaInData, pdvecIn, \ + enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH); + MORPH_OP_LOAD_2Nx8(dvecInData62, vaInData, pdvecIn, \ + inDataPitch1 - enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH); + + /* load data from 7th input row */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn); + MORPH_OP_LOAD_2Nx8(dvecInData71, vaInData, pdvecIn, \ + enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH); + MORPH_OP_LOAD_2Nx8(dvecInData72, vaInData, pdvecIn, \ + inDataPitch1 * enable2ndRow - enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH); + + /* load data from 8th input row */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn); + MORPH_OP_LOAD_2Nx8(dvecInData81, vaInData, pdvecIn, \ + enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH); + MORPH_OP_LOAD_2Nx8(dvecInData82, vaInData, pdvecIn, \ + inDataPitch2 - (6 + enable2ndRow) * inDataPitch1 \ + - enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH); + + /* load first row of the coeff */ + xb_vec2Nx8 dvecCoeffData1; + IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1); + + /* Multiply input vectors with first coeff row */ + MORPH_OP_MUL4TA(dacc11, dvecInData12, dvecInData11, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc11, IVP_SEL2NX8I(0, dvecInData12, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_SEL2NX8I(dvecInData12, dvecInData11, IVP_SELI_8B_ROTATE_RIGHT_4), \ + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + MORPH_OP_MUL4TA(dacc12, 0, dvecInData12, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc12, 0, IVP_SEL2NX8I(0, dvecInData12, IVP_SELI_8B_ROTATE_RIGHT_4), \ + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + MORPH_OP_MUL4TA(dacc21, dvecInData22, dvecInData21, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc21, IVP_SEL2NX8I(0, dvecInData22, IVP_SELI_8B_ROTATE_RIGHT_4), + 
IVP_SEL2NX8I(dvecInData22, dvecInData21, IVP_SELI_8B_ROTATE_RIGHT_4), \ + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + MORPH_OP_MUL4TA(dacc22, 0, dvecInData22, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc22, 0, IVP_SEL2NX8I(0, dvecInData22, IVP_SELI_8B_ROTATE_RIGHT_4), \ + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + /* load 2nd row of the coeff */ + IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1); + + /* Multiply input vectors with 2nd coeff row */ + MORPH_OP_MUL4TA(dacc11, dvecInData22, dvecInData21, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc11, IVP_SEL2NX8I(0, dvecInData22, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_SEL2NX8I(dvecInData22, dvecInData21, IVP_SELI_8B_ROTATE_RIGHT_4), \ + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + MORPH_OP_MUL4TA(dacc12, 0, dvecInData22, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc12, 0, IVP_SEL2NX8I(0, dvecInData22, IVP_SELI_8B_ROTATE_RIGHT_4), \ + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + MORPH_OP_MUL4TA(dacc21, dvecInData32, dvecInData31, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc21, IVP_SEL2NX8I(0, dvecInData32, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_SEL2NX8I(dvecInData32, dvecInData31, IVP_SELI_8B_ROTATE_RIGHT_4), \ + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + MORPH_OP_MUL4TA(dacc22, 0, dvecInData32, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc22, 0, IVP_SEL2NX8I(0, dvecInData32, IVP_SELI_8B_ROTATE_RIGHT_4), \ + 
IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + /* load 3rd row of the coeff */ + IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1); + + /* Multiply input vectors with 3rd coeff row */ + MORPH_OP_MUL4TA(dacc11, dvecInData32, dvecInData31, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc11, IVP_SEL2NX8I(0, dvecInData32, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_SEL2NX8I(dvecInData32, dvecInData31, IVP_SELI_8B_ROTATE_RIGHT_4), \ + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + MORPH_OP_MUL4TA(dacc12, 0, dvecInData32, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc12, 0, IVP_SEL2NX8I(0, dvecInData32, IVP_SELI_8B_ROTATE_RIGHT_4), \ + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + MORPH_OP_MUL4TA(dacc21, dvecInData42, dvecInData41, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc21, IVP_SEL2NX8I(0, dvecInData42, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_SEL2NX8I(dvecInData42, dvecInData41, IVP_SELI_8B_ROTATE_RIGHT_4), \ + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + MORPH_OP_MUL4TA(dacc22, 0, dvecInData42, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc22, 0, IVP_SEL2NX8I(0, dvecInData42, IVP_SELI_8B_ROTATE_RIGHT_4), \ + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + /* load 4th row of the coeff */ + IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1); + + /* Multiply input vectors with 4th coeff row */ + MORPH_OP_MUL4TA(dacc11, dvecInData42, dvecInData41, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc11, IVP_SEL2NX8I(0, dvecInData42, 
IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_SEL2NX8I(dvecInData42, dvecInData41, IVP_SELI_8B_ROTATE_RIGHT_4), \ + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + MORPH_OP_MUL4TA(dacc12, 0, dvecInData42, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc12, 0, IVP_SEL2NX8I(0, dvecInData42, IVP_SELI_8B_ROTATE_RIGHT_4), \ + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + MORPH_OP_MUL4TA(dacc21, dvecInData52, dvecInData51, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc21, IVP_SEL2NX8I(0, dvecInData52, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_SEL2NX8I(dvecInData52, dvecInData51, IVP_SELI_8B_ROTATE_RIGHT_4), \ + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + MORPH_OP_MUL4TA(dacc22, 0, dvecInData52, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc22, 0, IVP_SEL2NX8I(0, dvecInData52, IVP_SELI_8B_ROTATE_RIGHT_4), \ + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + /* load 5th row of the coeff */ + IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1); + + /* Multiply input vectors with 5th coeff row */ + MORPH_OP_MUL4TA(dacc11, dvecInData52, dvecInData51, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc11, IVP_SEL2NX8I(0, dvecInData52, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_SEL2NX8I(dvecInData52, dvecInData51, IVP_SELI_8B_ROTATE_RIGHT_4), \ + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + MORPH_OP_MUL4TA(dacc12, 0, dvecInData52, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc12, 0, IVP_SEL2NX8I(0, dvecInData52, IVP_SELI_8B_ROTATE_RIGHT_4), \ + 
IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + MORPH_OP_MUL4TA(dacc21, dvecInData62, dvecInData61, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc21, IVP_SEL2NX8I(0, dvecInData62, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_SEL2NX8I(dvecInData62, dvecInData61, IVP_SELI_8B_ROTATE_RIGHT_4), \ + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + MORPH_OP_MUL4TA(dacc22, 0, dvecInData62, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc22, 0, IVP_SEL2NX8I(0, dvecInData62, IVP_SELI_8B_ROTATE_RIGHT_4), \ + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + /* load 6th row of the coeff */ + IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1); + + /* Multiply input vectors with 6th coeff row */ + MORPH_OP_MUL4TA(dacc11, dvecInData62, dvecInData61, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc11, IVP_SEL2NX8I(0, dvecInData62, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_SEL2NX8I(dvecInData62, dvecInData61, IVP_SELI_8B_ROTATE_RIGHT_4), \ + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + MORPH_OP_MUL4TA(dacc12, 0, dvecInData62, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc12, 0, IVP_SEL2NX8I(0, dvecInData62, IVP_SELI_8B_ROTATE_RIGHT_4), \ + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + MORPH_OP_MUL4TA(dacc21, dvecInData72, dvecInData71, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc21, IVP_SEL2NX8I(0, dvecInData72, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_SEL2NX8I(dvecInData72, dvecInData71, IVP_SELI_8B_ROTATE_RIGHT_4), \ + 
IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + MORPH_OP_MUL4TA(dacc22, 0, dvecInData72, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc22, 0, IVP_SEL2NX8I(0, dvecInData72, IVP_SELI_8B_ROTATE_RIGHT_4), \ + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + /* load 7th row of the coeff */ + IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1); + /* Multiply input vectors with 7th coeff row */ + MORPH_OP_MUL4TA(dacc11, dvecInData72, dvecInData71, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc11, IVP_SEL2NX8I(0, dvecInData72, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_SEL2NX8I(dvecInData72, dvecInData71, IVP_SELI_8B_ROTATE_RIGHT_4), \ + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + MORPH_OP_MUL4TA(dacc12, 0, dvecInData72, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc12, 0, IVP_SEL2NX8I(0, dvecInData72, IVP_SELI_8B_ROTATE_RIGHT_4), \ + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + MORPH_OP_MUL4TA(dacc21, dvecInData82, dvecInData81, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc21, IVP_SEL2NX8I(0, dvecInData82, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_SEL2NX8I(dvecInData82, dvecInData81, IVP_SELI_8B_ROTATE_RIGHT_4), \ + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + MORPH_OP_MUL4TA(dacc22, 0, dvecInData82, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc22, 0, IVP_SEL2NX8I(0, dvecInData82, IVP_SELI_8B_ROTATE_RIGHT_4), \ + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + /** Input Channel 2 **/ + /* load data from 1st input row */ + 
vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn); + MORPH_OP_LOAD_2Nx8(dvecInData11, vaInData, pdvecIn, \ + enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH); + MORPH_OP_LOAD_2Nx8(dvecInData12, vaInData, pdvecIn, \ + inDataPitch1 - enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH); + + /* load data from 2nd input row */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn); + MORPH_OP_LOAD_2Nx8(dvecInData21, vaInData, pdvecIn, \ + enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH); + MORPH_OP_LOAD_2Nx8(dvecInData22, vaInData, pdvecIn, \ + inDataPitch1 - enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH); + + /* load data from 3rd input row */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn); + MORPH_OP_LOAD_2Nx8(dvecInData31, vaInData, pdvecIn, \ + enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH); + MORPH_OP_LOAD_2Nx8(dvecInData32, vaInData, pdvecIn, \ + inDataPitch1 - enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH); + + /* load data from 4th input row */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn); + MORPH_OP_LOAD_2Nx8(dvecInData41, vaInData, pdvecIn, \ + enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH); + MORPH_OP_LOAD_2Nx8(dvecInData42, vaInData, pdvecIn, \ + inDataPitch1 - enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH); + + /* load data from 5th input row */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn); + MORPH_OP_LOAD_2Nx8(dvecInData51, vaInData, pdvecIn, \ + enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH); + MORPH_OP_LOAD_2Nx8(dvecInData52, vaInData, pdvecIn, \ + inDataPitch1 - enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH); + + + /* load data from 6th input row */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn); + MORPH_OP_LOAD_2Nx8(dvecInData61, vaInData, pdvecIn, \ + enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH); + MORPH_OP_LOAD_2Nx8(dvecInData62, vaInData, pdvecIn, \ + inDataPitch1 - enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH); + + /* load data from 7th input row */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn); + MORPH_OP_LOAD_2Nx8(dvecInData71, vaInData, pdvecIn, \ + enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH); + MORPH_OP_LOAD_2Nx8(dvecInData72, vaInData, pdvecIn, \ + 
inDataPitch1 * enable2ndRow - enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH); + + /* load data from 8th input row */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn); + MORPH_OP_LOAD_2Nx8(dvecInData81, vaInData, pdvecIn, \ + enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH); + MORPH_OP_LOAD_2Nx8(dvecInData82, vaInData, pdvecIn, \ + inDataPitch2 - (6 + enable2ndRow) * inDataPitch1 \ + - enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH); + + /* load first row of the coeff */ + IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1); + + /* Multiply input vectors with first coeff row */ + MORPH_OP_MUL4TA(dacc11, dvecInData12, dvecInData11, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc11, IVP_SEL2NX8I(0, dvecInData12, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_SEL2NX8I(dvecInData12, dvecInData11, IVP_SELI_8B_ROTATE_RIGHT_4), \ + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + MORPH_OP_MUL4TA(dacc12, 0, dvecInData12, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc12, 0, IVP_SEL2NX8I(0, dvecInData12, IVP_SELI_8B_ROTATE_RIGHT_4), \ + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + MORPH_OP_MUL4TA(dacc21, dvecInData22, dvecInData21, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc21, IVP_SEL2NX8I(0, dvecInData22, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_SEL2NX8I(dvecInData22, dvecInData21, IVP_SELI_8B_ROTATE_RIGHT_4), \ + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + MORPH_OP_MUL4TA(dacc22, 0, dvecInData22, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc22, 0, IVP_SEL2NX8I(0, dvecInData22, IVP_SELI_8B_ROTATE_RIGHT_4), \ + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + /* load 2nd row of the coeff */ + 
IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1); + + /* Multiply input vectors with 2nd coeff row */ + MORPH_OP_MUL4TA(dacc11, dvecInData22, dvecInData21, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc11, IVP_SEL2NX8I(0, dvecInData22, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_SEL2NX8I(dvecInData22, dvecInData21, IVP_SELI_8B_ROTATE_RIGHT_4), \ + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + MORPH_OP_MUL4TA(dacc12, 0, dvecInData22, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc12, 0, IVP_SEL2NX8I(0, dvecInData22, IVP_SELI_8B_ROTATE_RIGHT_4), \ + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + MORPH_OP_MUL4TA(dacc21, dvecInData32, dvecInData31, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc21, IVP_SEL2NX8I(0, dvecInData32, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_SEL2NX8I(dvecInData32, dvecInData31, IVP_SELI_8B_ROTATE_RIGHT_4), \ + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + MORPH_OP_MUL4TA(dacc22, 0, dvecInData32, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc22, 0, IVP_SEL2NX8I(0, dvecInData32, IVP_SELI_8B_ROTATE_RIGHT_4), \ + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + /* load 3rd row of the coeff */ + IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1); + + /* Multiply input vectors with 3rd coeff row */ + MORPH_OP_MUL4TA(dacc11, dvecInData32, dvecInData31, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc11, IVP_SEL2NX8I(0, dvecInData32, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_SEL2NX8I(dvecInData32, dvecInData31, IVP_SELI_8B_ROTATE_RIGHT_4), \ + 
IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + MORPH_OP_MUL4TA(dacc12, 0, dvecInData32, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc12, 0, IVP_SEL2NX8I(0, dvecInData32, IVP_SELI_8B_ROTATE_RIGHT_4), \ + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + MORPH_OP_MUL4TA(dacc21, dvecInData42, dvecInData41, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc21, IVP_SEL2NX8I(0, dvecInData42, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_SEL2NX8I(dvecInData42, dvecInData41, IVP_SELI_8B_ROTATE_RIGHT_4), \ + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + MORPH_OP_MUL4TA(dacc22, 0, dvecInData42, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc22, 0, IVP_SEL2NX8I(0, dvecInData42, IVP_SELI_8B_ROTATE_RIGHT_4), \ + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + /* load 4th row of the coeff */ + IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1); + + /* Multiply input vectors with 4th coeff row */ + MORPH_OP_MUL4TA(dacc11, dvecInData42, dvecInData41, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc11, IVP_SEL2NX8I(0, dvecInData42, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_SEL2NX8I(dvecInData42, dvecInData41, IVP_SELI_8B_ROTATE_RIGHT_4), \ + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + MORPH_OP_MUL4TA(dacc12, 0, dvecInData42, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc12, 0, IVP_SEL2NX8I(0, dvecInData42, IVP_SELI_8B_ROTATE_RIGHT_4), \ + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + MORPH_OP_MUL4TA(dacc21, dvecInData52, dvecInData51, 
IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc21, IVP_SEL2NX8I(0, dvecInData52, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_SEL2NX8I(dvecInData52, dvecInData51, IVP_SELI_8B_ROTATE_RIGHT_4), \ + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + MORPH_OP_MUL4TA(dacc22, 0, dvecInData52, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc22, 0, IVP_SEL2NX8I(0, dvecInData52, IVP_SELI_8B_ROTATE_RIGHT_4), \ + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + /* load 5th row of the coeff */ + IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1); + + /* Multiply input vectors with 5th coeff row */ + MORPH_OP_MUL4TA(dacc11, dvecInData52, dvecInData51, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc11, IVP_SEL2NX8I(0, dvecInData52, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_SEL2NX8I(dvecInData52, dvecInData51, IVP_SELI_8B_ROTATE_RIGHT_4), \ + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + MORPH_OP_MUL4TA(dacc12, 0, dvecInData52, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc12, 0, IVP_SEL2NX8I(0, dvecInData52, IVP_SELI_8B_ROTATE_RIGHT_4), \ + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + MORPH_OP_MUL4TA(dacc21, dvecInData62, dvecInData61, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc21, IVP_SEL2NX8I(0, dvecInData62, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_SEL2NX8I(dvecInData62, dvecInData61, IVP_SELI_8B_ROTATE_RIGHT_4), \ + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + MORPH_OP_MUL4TA(dacc22, 0, dvecInData62, IVP_EXTRN_2X32 \ + 
(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc22, 0, IVP_SEL2NX8I(0, dvecInData62, IVP_SELI_8B_ROTATE_RIGHT_4), \ + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + /* load 6th row of the coeff */ + IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1); + + /* Multiply input vectors with 6th coeff row */ + MORPH_OP_MUL4TA(dacc11, dvecInData62, dvecInData61, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc11, IVP_SEL2NX8I(0, dvecInData62, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_SEL2NX8I(dvecInData62, dvecInData61, IVP_SELI_8B_ROTATE_RIGHT_4), \ + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + MORPH_OP_MUL4TA(dacc12, 0, dvecInData62, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc12, 0, IVP_SEL2NX8I(0, dvecInData62, IVP_SELI_8B_ROTATE_RIGHT_4), \ + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + MORPH_OP_MUL4TA(dacc21, dvecInData72, dvecInData71, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc21, IVP_SEL2NX8I(0, dvecInData72, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_SEL2NX8I(dvecInData72, dvecInData71, IVP_SELI_8B_ROTATE_RIGHT_4), \ + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + MORPH_OP_MUL4TA(dacc22, 0, dvecInData72, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc22, 0, IVP_SEL2NX8I(0, dvecInData72, IVP_SELI_8B_ROTATE_RIGHT_4), \ + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + /* load 7th row of the coeff */ + IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1); + /* Multiply input vectors with 7th coeff row */ + MORPH_OP_MUL4TA(dacc11, dvecInData72, dvecInData71, 
IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc11, IVP_SEL2NX8I(0, dvecInData72, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_SEL2NX8I(dvecInData72, dvecInData71, IVP_SELI_8B_ROTATE_RIGHT_4), \ + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + MORPH_OP_MUL4TA(dacc12, 0, dvecInData72, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc12, 0, IVP_SEL2NX8I(0, dvecInData72, IVP_SELI_8B_ROTATE_RIGHT_4), \ + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + MORPH_OP_MUL4TA(dacc21, dvecInData82, dvecInData81, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc21, IVP_SEL2NX8I(0, dvecInData82, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_SEL2NX8I(dvecInData82, dvecInData81, IVP_SELI_8B_ROTATE_RIGHT_4), \ + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + MORPH_OP_MUL4TA(dacc22, 0, dvecInData82, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc22, 0, IVP_SEL2NX8I(0, dvecInData82, IVP_SELI_8B_ROTATE_RIGHT_4), \ + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + /** Input Channel 3 **/ + /*load data from 1st input row */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn); + MORPH_OP_LOAD_2Nx8(dvecInData11, vaInData, pdvecIn, \ + enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH); + MORPH_OP_LOAD_2Nx8(dvecInData12, vaInData, pdvecIn, \ + inDataPitch1 - enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH); + + /* load data from 2nd input row */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn); + MORPH_OP_LOAD_2Nx8(dvecInData21, vaInData, pdvecIn, \ + enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH); + MORPH_OP_LOAD_2Nx8(dvecInData22, vaInData, pdvecIn, \ + inDataPitch1 - enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH); + + /* load data from 3rd input row */ + 
vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn); + MORPH_OP_LOAD_2Nx8(dvecInData31, vaInData, pdvecIn, \ + enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH); + MORPH_OP_LOAD_2Nx8(dvecInData32, vaInData, pdvecIn, \ + inDataPitch1 - enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH); + + /* load data from 4th input row */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn); + MORPH_OP_LOAD_2Nx8(dvecInData41, vaInData, pdvecIn, \ + enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH); + MORPH_OP_LOAD_2Nx8(dvecInData42, vaInData, pdvecIn, \ + inDataPitch1 - enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH); + + /* load data from 5th input row */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn); + MORPH_OP_LOAD_2Nx8(dvecInData51, vaInData, pdvecIn, \ + enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH); + MORPH_OP_LOAD_2Nx8(dvecInData52, vaInData, pdvecIn, \ + inDataPitch1 - enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH); + + + /* load data from 6th input row */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn); + MORPH_OP_LOAD_2Nx8(dvecInData61, vaInData, pdvecIn, \ + enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH); + MORPH_OP_LOAD_2Nx8(dvecInData62, vaInData, pdvecIn, \ + inDataPitch1 - enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH); + + /* load data from 7th input row */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn); + MORPH_OP_LOAD_2Nx8(dvecInData71, vaInData, pdvecIn, \ + enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH); + MORPH_OP_LOAD_2Nx8(dvecInData72, vaInData, pdvecIn, \ + inDataPitch1 * enable2ndRow - enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH); + + /* load data from 8th input row */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn); + MORPH_OP_LOAD_2Nx8(dvecInData81, vaInData, pdvecIn, \ + enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH); + MORPH_OP_LOAD_2Nx8(dvecInData82, vaInData, pdvecIn, \ + inDataPitch2 - (6 + enable2ndRow) * inDataPitch1 \ + - enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH); + + /* load first row of the coeff */ + IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1); + + /* Multiply input vectors with first coeff row */ + 
MORPH_OP_MUL4TA(dacc11, dvecInData12, dvecInData11, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc11, IVP_SEL2NX8I(0, dvecInData12, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_SEL2NX8I(dvecInData12, dvecInData11, IVP_SELI_8B_ROTATE_RIGHT_4), \ + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + MORPH_OP_MUL4TA(dacc12, 0, dvecInData12, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc12, 0, IVP_SEL2NX8I(0, dvecInData12, IVP_SELI_8B_ROTATE_RIGHT_4), \ + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + MORPH_OP_MUL4TA(dacc21, dvecInData22, dvecInData21, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc21, IVP_SEL2NX8I(0, dvecInData22, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_SEL2NX8I(dvecInData22, dvecInData21, IVP_SELI_8B_ROTATE_RIGHT_4), \ + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + MORPH_OP_MUL4TA(dacc22, 0, dvecInData22, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc22, 0, IVP_SEL2NX8I(0, dvecInData22, IVP_SELI_8B_ROTATE_RIGHT_4), \ + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + /* load 2nd row of the coeff */ + IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1); + + /* Multiply input vectors with 2nd coeff row */ + MORPH_OP_MUL4TA(dacc11, dvecInData22, dvecInData21, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc11, IVP_SEL2NX8I(0, dvecInData22, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_SEL2NX8I(dvecInData22, dvecInData21, IVP_SELI_8B_ROTATE_RIGHT_4), \ + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + MORPH_OP_MUL4TA(dacc12, 0, dvecInData22, IVP_EXTRN_2X32 \ + 
(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc12, 0, IVP_SEL2NX8I(0, dvecInData22, IVP_SELI_8B_ROTATE_RIGHT_4), \ + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + MORPH_OP_MUL4TA(dacc21, dvecInData32, dvecInData31, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc21, IVP_SEL2NX8I(0, dvecInData32, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_SEL2NX8I(dvecInData32, dvecInData31, IVP_SELI_8B_ROTATE_RIGHT_4), \ + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + MORPH_OP_MUL4TA(dacc22, 0, dvecInData32, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc22, 0, IVP_SEL2NX8I(0, dvecInData32, IVP_SELI_8B_ROTATE_RIGHT_4), \ + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + /* load 3rd row of the coeff */ + IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1); + + /* Multiply input vectors with 3rd coeff row */ + MORPH_OP_MUL4TA(dacc11, dvecInData32, dvecInData31, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc11, IVP_SEL2NX8I(0, dvecInData32, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_SEL2NX8I(dvecInData32, dvecInData31, IVP_SELI_8B_ROTATE_RIGHT_4), \ + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + MORPH_OP_MUL4TA(dacc12, 0, dvecInData32, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc12, 0, IVP_SEL2NX8I(0, dvecInData32, IVP_SELI_8B_ROTATE_RIGHT_4), \ + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + MORPH_OP_MUL4TA(dacc21, dvecInData42, dvecInData41, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc21, IVP_SEL2NX8I(0, dvecInData42, 
IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_SEL2NX8I(dvecInData42, dvecInData41, IVP_SELI_8B_ROTATE_RIGHT_4), \ + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + MORPH_OP_MUL4TA(dacc22, 0, dvecInData42, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc22, 0, IVP_SEL2NX8I(0, dvecInData42, IVP_SELI_8B_ROTATE_RIGHT_4), \ + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + /* load 4th row of the coeff */ + IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1); + + /* Multiply input vectors with 4th coeff row */ + MORPH_OP_MUL4TA(dacc11, dvecInData42, dvecInData41, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc11, IVP_SEL2NX8I(0, dvecInData42, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_SEL2NX8I(dvecInData42, dvecInData41, IVP_SELI_8B_ROTATE_RIGHT_4), \ + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + MORPH_OP_MUL4TA(dacc12, 0, dvecInData42, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc12, 0, IVP_SEL2NX8I(0, dvecInData42, IVP_SELI_8B_ROTATE_RIGHT_4), \ + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + MORPH_OP_MUL4TA(dacc21, dvecInData52, dvecInData51, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc21, IVP_SEL2NX8I(0, dvecInData52, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_SEL2NX8I(dvecInData52, dvecInData51, IVP_SELI_8B_ROTATE_RIGHT_4), \ + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + MORPH_OP_MUL4TA(dacc22, 0, dvecInData52, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc22, 0, IVP_SEL2NX8I(0, dvecInData52, IVP_SELI_8B_ROTATE_RIGHT_4), \ + 
IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + /* load 5th row of the coeff */ + IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1); + + /* Multiply input vectors with 5th coeff row */ + MORPH_OP_MUL4TA(dacc11, dvecInData52, dvecInData51, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc11, IVP_SEL2NX8I(0, dvecInData52, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_SEL2NX8I(dvecInData52, dvecInData51, IVP_SELI_8B_ROTATE_RIGHT_4), \ + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + MORPH_OP_MUL4TA(dacc12, 0, dvecInData52, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc12, 0, IVP_SEL2NX8I(0, dvecInData52, IVP_SELI_8B_ROTATE_RIGHT_4), \ + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + MORPH_OP_MUL4TA(dacc21, dvecInData62, dvecInData61, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc21, IVP_SEL2NX8I(0, dvecInData62, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_SEL2NX8I(dvecInData62, dvecInData61, IVP_SELI_8B_ROTATE_RIGHT_4), \ + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + MORPH_OP_MUL4TA(dacc22, 0, dvecInData62, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc22, 0, IVP_SEL2NX8I(0, dvecInData62, IVP_SELI_8B_ROTATE_RIGHT_4), \ + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + /* load 6th row of the coeff */ + IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1); + + /* Multiply input vectors with 6th coeff row */ + MORPH_OP_MUL4TA(dacc11, dvecInData62, dvecInData61, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc11, IVP_SEL2NX8I(0, dvecInData62, 
IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_SEL2NX8I(dvecInData62, dvecInData61, IVP_SELI_8B_ROTATE_RIGHT_4), \ + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + MORPH_OP_MUL4TA(dacc12, 0, dvecInData62, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc12, 0, IVP_SEL2NX8I(0, dvecInData62, IVP_SELI_8B_ROTATE_RIGHT_4), \ + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + MORPH_OP_MUL4TA(dacc21, dvecInData72, dvecInData71, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc21, IVP_SEL2NX8I(0, dvecInData72, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_SEL2NX8I(dvecInData72, dvecInData71, IVP_SELI_8B_ROTATE_RIGHT_4), \ + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + MORPH_OP_MUL4TA(dacc22, 0, dvecInData72, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc22, 0, IVP_SEL2NX8I(0, dvecInData72, IVP_SELI_8B_ROTATE_RIGHT_4), \ + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + /* load 7th row of the coeff */ + IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1); + /* Multiply input vectors with 7th coeff row */ + MORPH_OP_MUL4TA(dacc11, dvecInData72, dvecInData71, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc11, IVP_SEL2NX8I(0, dvecInData72, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_SEL2NX8I(dvecInData72, dvecInData71, IVP_SELI_8B_ROTATE_RIGHT_4), \ + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + MORPH_OP_MUL4TA(dacc12, 0, dvecInData72, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc12, 0, IVP_SEL2NX8I(0, dvecInData72, IVP_SELI_8B_ROTATE_RIGHT_4), \ + 
IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + MORPH_OP_MUL4TA(dacc21, dvecInData82, dvecInData81, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc21, IVP_SEL2NX8I(0, dvecInData82, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_SEL2NX8I(dvecInData82, dvecInData81, IVP_SELI_8B_ROTATE_RIGHT_4), \ + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + MORPH_OP_MUL4TA(dacc22, 0, dvecInData82, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc22, 0, IVP_SEL2NX8I(0, dvecInData82, IVP_SELI_8B_ROTATE_RIGHT_4), \ + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + /* Pack, Output Scale, Output Shift and clamping */ + xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L; + xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H; +#if DILATED_VQ_CONV == VQ_TRUE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc11, packShiftAccU, \ + pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc12, packShiftAccU, \ + pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc21, packShiftAccU, \ + pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc22, packShiftAccU, \ + pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag); +#elif DILATED_VQ_CONV == VQ_FALSE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc11, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc12, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc21, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + 
PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc22, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); +#endif + /* 1st row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput); + valign vaOutData = IVP_ZALIGN(); + IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * varLen); + IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * \ + 2 * (varLen - XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * (varLen - 2 * XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * \ + 2 * (varLen - 3 * XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* 2nd row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + enable2ndRow * outDataPitch1 * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * enable2ndRow * varLen); + IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * enable2ndRow * \ + 2 * (varLen - XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * enable2ndRow * \ + (varLen - 2 * XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * enable2ndRow * \ + 2 * (varLen - 3 * XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + pOutput += outDataPitch2 * bytesPerPixel; + pCoeff += coeffPitch3; + } /* end of for (outCh = 0; outCh < numOutCh; outCh++)*/ + } /* end of for (y = 0; y < outH; y += 2)*/ + } /* end of for (x = 0; x < outW; x += vectorizationWidth)*/ +} + +/****************************************************************************************** +* 7x7 MOW fold 32 Stride 1 * +* If inDataPitch1 is lesser than or equal to * +* 32 this function is called. 
* +******************************************************************************************/ + /* FOLD32 helper for the 7x7 stride-1 MOW_WHD convolution. Every full input load in the inCh loop advances by 2 * inDataPitch1, so one 2Nx8 vector carries two consecutive input rows and the single accumulator dacc1 yields two output rows per (y, outCh) iteration. Inputs: inTile (WHD input), coeffTile (WHDN coefficients), biasArray (bias read once per output channel), param (shifts, scale, ReLU limits); output: outTile, written in place, no return value. outputScaleArray is only referenced when DILATED_VQ_CONV == VQ_TRUE and is presumably supplied by MAKE_ARGUMENTS. NOTE(review): the select index built below hard-codes 64, so this presumably assumes a 64-lane 2Nx8 vector with 2 * inDataPitch1 <= 64 - confirm. */ +static _XAI_INLINE_ void MAKE_NAME(MAKE_NAME_VQ(convolved, 3D_S_7x7j1d1), S8IX_MOW_WHD_FOLD32) MAKE_ARGUMENTS(inTile, coeffTile, biasArray, outTile, param) +{ + /* Getting parameters from the tile structures */ + const int32_t outW = XAI_TILE3D_GET_DIM1(outTile); + const int32_t outH = XAI_TILE3D_GET_DIM2(outTile); + const int32_t numInCh = XAI_TILE3D_GET_DIM3(inTile); + const int32_t numOutCh = XAI_TILE3D_GET_DIM3(outTile); + + /* Kernel Size (WHDN)*/ + const int32_t kSizeU = XAI_TILE4D_GET_DIM1(coeffTile); + + /* CNN convolution parameters */ + const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param); + const uint8_t outShiftU = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param); + const uint8_t enableReLu = XAI_CNN_CONV_GET_FLAG_RELU(param); + + /* Pitches of Coefficient Data (WHDN) in dim1 and dim3 */ + const int32_t coeffPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTile); + const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile); + + /* Pitches of Input Data (WHD) in dim1 and dim2 */ + const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile); + const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile); + + /* Pitch of Output Data (WHD) in dim1 and dim2 */ + const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile); + const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile); + + /* Data Pointers of input, output, coefficient and bias data */ + MORPH_IDT_SCALAR* pInData = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(inTile); + int8_t* pOutData = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile); + int32_t* pBiasData = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray); + int8_t* pCoeffData = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile); +#if DILATED_VQ_CONV == VQ_TRUE + uint16_t* restrict pOutScaleData = (uint16_t *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray); +#elif DILATED_VQ_CONV == VQ_FALSE + const uint16_t outScale =
XAI_CNN_CONV_GET_OUTPUT_SCALE(param); +#endif + /* Move pointer to the start of the data (including edge): steps back kSizeU / 2 rows and kSizeU / 2 columns from the tile data pointer */ + pInData = &pInData[-((kSizeU / 2) * inDataPitch1 + (kSizeU / 2))]; + + /* Setting the limits for output data according to ReLu Flag and outTileType: with ReLU the limits come from param, otherwise they are SHRT_MIN..SHRT_MAX for XAI_S16, SCHAR_MIN..SCHAR_MAX for XAI_S8 and 0..UCHAR_MAX for any other output type */ + int32_t minLim, maxLim; + if (enableReLu) + { + minLim = XAI_CNN_CONV_GET_RELU_MIN(param); + maxLim = XAI_CNN_CONV_GET_RELU_MAX(param); + } + else + { + minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \ + SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0); + maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \ + : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX); + } + const int8_t typeFlag = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0; + const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile); + + MORPH_IDT_2Nx8 * restrict pdvecIn; + xb_vec2Nx8* restrict pdvecOut; + xb_vec2Nx8* restrict pdvecCoeff1; + + + /* Variable Declarations */ + int32_t inCh, outCh, y; + + /* Select sequence to re-arrange input data: lanes below inDataPitch1 get index lane + inDataPitch1, all other lanes get index (lane - inDataPitch1) + 64 (presumably IVP_ADD2NX8T only writes lanes whose predicate is true - confirm) */ + xb_vec2Nx8 dvecSeq = 0; + IVP_ADD2NX8T(dvecSeq, IVP_SEQ2NX8(), inDataPitch1, IVP_LT2NX8(IVP_SEQ2NX8(), inDataPitch1)); + IVP_ADD2NX8T(dvecSeq, IVP_SUB2NX8(IVP_SEQ2NX8(), inDataPitch1), 64, \ + IVP_NOTB2N(IVP_LT2NX8(IVP_SEQ2NX8(), inDataPitch1))); + + /* loop across output height is unrolled twice + * to produce two output vectors in 1 iteration + */ + for (y = 0; y < outH; y += 2) /* Loop across Output height */ + { + /* In order to handle odd output height: enable2ndRow is presumably 1 while y < outH - 1 and 0 on the last row of an odd outH (XT_SALT looks like set-if-less-than - confirm) */ + int32_t enable2ndRow = XT_SALT(y, outH - 1); + + /* initialize output data pointer */ + int8_t *pOutput = &pOutData[(y * outDataPitch1) * bytesPerPixel]; + + /* initialize coeff and Bias data pointer*/ + int8_t *pCoeff = &pCoeffData[0]; + int32_t *pBias = &pBiasData[0]; + + for (outCh = 0; outCh < numOutCh; outCh++) /* Loop across Output depth */ + { + /* load and replicate bias data */ + xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4); + + /* wide
vectors(accumulators) initialized with bias. NOTE(review): dacc2 is initialized here but never accumulated into or packed anywhere below in this function - looks unused, confirm before removing. */ + xb_vec2Nx24 dacc1, dacc2; + dacc1 = dacc2 = IVP_CVT24UNX32L(hvecBias1, hvecBias1); + IVP_CVT24UNX32H(dacc1, hvecBias1, hvecBias1); + IVP_CVT24UNX32H(dacc2, hvecBias1, hvecBias1); + + /* priming of coeff load is done outside the innermost loop*/ + pdvecCoeff1 = (xb_vec2Nx8 *) (pCoeff); + valign vaCoeffData1; vaCoeffData1 = IVP_LA2NX8_PP(pdvecCoeff1); + + /* initialize input data pointer */ + MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * y]; + + for (inCh = 0; inCh < numInCh; inCh++) /* Loop across input depth */ + { + /* 2 4-tap multipliers are used to accumulate 1 wide vector + * first 4-tap multiplier makes use of first 4 coeff across + * the kernel width. next 4-tap multiplier makes use of the last 3 + * coeff across the kernel width, and 4th byte is zero + */ + + xb_vec2Nx8 dvecInData1, dvecInData2, dvecInData3, dvecInData4; + + /* load data from first 2 input rows */ + pdvecIn = (MORPH_IDT_2Nx8 *) pInput; + valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn); + MORPH_OP_LOAD_2Nx8(dvecInData1, vaInData, pdvecIn, 2 * inDataPitch1); + + /* load data from next 2 input rows */ + MORPH_OP_LOAD_2Nx8(dvecInData2, vaInData, pdvecIn, 2 * inDataPitch1); + + /* load data from next 2 input rows */ + MORPH_OP_LOAD_2Nx8(dvecInData3, vaInData, pdvecIn, 2 * inDataPitch1); + + /* load data from the 7th input row, plus the 8th row only when enable2ndRow is set */ + MORPH_OP_LOAD_2Nx8_VARIABLE(dvecInData4, vaInData, pdvecIn, (1 + enable2ndRow) * inDataPitch1); + pInput += inDataPitch2; + + /* load first row of the coeff */ + xb_vec2Nx8 dvecCoeffData1; + IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1); + /* Row pairs shifted by one row via dvecSeq: presumably dvecTemp1/2/3 hold input rows (2,3), (4,5), (6,7) while dvecInData1..4 hold rows (1,2), (3,4), (5,6), (7,8); the 2nd, 4th and 6th coeff rows below use the Temp vectors - TODO confirm lane order of IVP_SEL2NX8 */ + xb_vec2Nx8 dvecTemp1, dvecTemp2, dvecTemp3; + dvecTemp1 = IVP_SEL2NX8(dvecInData2, dvecInData1, dvecSeq); + dvecTemp2 = IVP_SEL2NX8(dvecInData3, dvecInData2, dvecSeq); + dvecTemp3 = IVP_SEL2NX8(dvecInData4, dvecInData3, dvecSeq); + + /* Multiply input vectors with first coeff row */ + MORPH_OP_MUL4TA(dacc1, 0, dvecInData1, IVP_EXTRN_2X32 \ +
(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc1, 0, IVP_SEL2NX8I(0, dvecInData1, IVP_SELI_8B_ROTATE_RIGHT_4), \ + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + + /* load 2nd row of the coeff */ + IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1); + + /* Multiply input vectors with 2nd coeff row */ + MORPH_OP_MUL4TA(dacc1, 0, dvecTemp1, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc1, 0, IVP_SEL2NX8I(0, dvecTemp1, IVP_SELI_8B_ROTATE_RIGHT_4), \ + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + + /* load 3rd row of the coeff */ + IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1); + + /* Multiply input vectors with 3rd coeff row */ + MORPH_OP_MUL4TA(dacc1, 0, dvecInData2, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc1, 0, IVP_SEL2NX8I(0, dvecInData2, IVP_SELI_8B_ROTATE_RIGHT_4), \ + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + + /* load 4th row of the coeff */ + IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1); + + /* Multiply input vectors with 4th coeff row */ + MORPH_OP_MUL4TA(dacc1, 0, dvecTemp2, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc1, 0, IVP_SEL2NX8I(0, dvecTemp2, IVP_SELI_8B_ROTATE_RIGHT_4), \ + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + /* load 5th row of the coeff */ + IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1); + + /* Multiply input vectors with 5th coeff row */ + MORPH_OP_MUL4TA(dacc1, 0, dvecInData3, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc1, 0, IVP_SEL2NX8I(0, dvecInData3, IVP_SELI_8B_ROTATE_RIGHT_4),
\ + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + + /* load 6th row of the coeff */ + IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1); + + /* Multiply input vectors with 6th coeff row */ + MORPH_OP_MUL4TA(dacc1, 0, dvecTemp3, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc1, 0, IVP_SEL2NX8I(0, dvecTemp3, IVP_SELI_8B_ROTATE_RIGHT_4), \ + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + + + /* load 7th row of the coeff */ + IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1); + + /* Multiply input vectors with 7th coeff row */ + MORPH_OP_MUL4TA(dacc1, 0, dvecInData4, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc1, 0, IVP_SEL2NX8I(0, dvecInData4, IVP_SELI_8B_ROTATE_RIGHT_4), \ + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + } /* end of for (inCh = 0; inCh < numInCh; inCh++)*/ + + /* Pack, Output Scale, Output Shift and clamping */ + xb_vec2Nx8 dvecOut1L; + xb_vec2Nx8 dvecOut1H; +#if DILATED_VQ_CONV == VQ_TRUE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \ + pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag); +#elif DILATED_VQ_CONV == VQ_FALSE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); +#endif + /* Storing the first output depth, first row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput); + valign vaOutData = IVP_ZALIGN(); + IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * outW); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the first output depth, second row: the select index SEQ + inDataPitch1 * bytesPerPixel presumably moves the second folded row down to lane 0; with enable2ndRow == 0 both the pointer offset and the byte count are 0 */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch1 * enable2ndRow * bytesPerPixel); + IVP_SAV2NX8_XP(IVP_SEL2NX8(dvecOut1H, dvecOut1L, IVP_ADD2NX8(IVP_SEQ2NX8(), inDataPitch1 * bytesPerPixel)),
\ + vaOutData, pdvecOut, bytesPerPixel * enable2ndRow * outW); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + pOutput += outDataPitch2 * bytesPerPixel; + pCoeff += coeffPitch3; + } /* end of for (outCh = 0; outCh < numOutCh; outCh++)*/ + } /* end of for (y = 0; y < outH; y += 2)*/ +} + +/****************** xaiConvolvedVQ3D_S_7x7j1d1_S8S8IX_MOW_WHD ******************/ +/****************** xaiConvolvedVQ3D_S_7x7j1d1_U8S8IX_MOW_WHD ******************/ +/******************* xaiConvolved3D_S_7x7j1d1_S8S8IX_MOW_WHD *******************/ +/******************* xaiConvolved3D_S_7x7j1d1_U8S8IX_MOW_WHD *******************/ + +XAI_ERR_TYPE MAKE_NAME(MAKE_NAME_VQ(xaiConvolved, 3D_S_7x7j1d1), S8IX_MOW_WHD) MAKE_ARGUMENTS(inTile, coeffTile, biasArray, outTile, param) +{ + /* Error Checks */ + XAI_ERROR_CHECKS() + { + MORPH_IDT_CHECK(inTile); + XAI_CHECK_CONV_OUTPUT_TILE3D(outTile); + XAI_CHECK_TILE4D_S8(coeffTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile); + XAI_CHECK_TILE4D_IN_DRAM_BOUNDARY(coeffTile); + XAI_CHECK_POINTER(param); + XAI_CHECK_ARRAY_S32(biasArray); + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTile); + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(coeffTile, outTile); + XAI_CHECK_KERNEL_SIZE(coeffTile, 7); + XAI_CHECK_TILE3D_DATA_ORDER(inTile, XAI_WHD); + XAI_CHECK_TILE3D_DATA_ORDER(outTile, XAI_WHD); + XAI_CHECK_TILE4D_DATA_ORDER(coeffTile, XAI_WHDN); + XAI_CHECK_DILATION(param, 1); + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_DILATIONX(param) == XAI_CNN_CONV_GET_DILATIONY(param), \ + XAI_ERR_BADARG, "Dilation along width = %hhu and height = %hhu\nDilation along width and height should be equal", \ + XAI_CNN_CONV_GET_DILATIONX(param), XAI_CNN_CONV_GET_DILATIONY(param)); + XAI_CHECK_TILE3D_EDGE(inTile, 3); + XAI_CHECK_STRIDE(param, 1); + XAI_CHECK_ERROR((XAI_CNN_CONV_GET_STRIDEX(param) == XAI_CNN_CONV_GET_STRIDEY(param)), \ + XAI_ERR_BADARG, "\nStride along width = %hhu and height = %hhu\nStride along width and height should be equal", \ +
XAI_CNN_CONV_GET_STRIDEX(param), XAI_CNN_CONV_GET_STRIDEY(param)); + XAI_CHECK_CONSISTENCY_MOW_WHD(inTile, coeffTile, biasArray, outTile, param); + XAI_CHECK_COEFFTILE_CONTIGUOUS(coeffTile, param); + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_ACCUM_SHIFT(param) < 24, \ + XAI_ERR_NORM, "\nThe accumulator shift = %hhu, value should be less than 24", \ + XAI_CNN_CONV_GET_ACCUM_SHIFT(param)); + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_OUTPUT_SHIFT(param) < 32, \ + XAI_ERR_NORM, "\nThe output shift = %hhu, value should be less than 32", \ + XAI_CNN_CONV_GET_OUTPUT_SHIFT(param)); + XAI_CHECK_CONV_RELU_LIMITS_IX(param, outTile); +#if DILATED_VQ_CONV == VQ_TRUE + XAI_CHECK_ARRAY_U16(outputScaleArray); + XAI_CHECK_ERROR(XAI_ARRAY_GET_WIDTH(outputScaleArray) >= XAI_TILE4D_GET_DIM4(coeffTile), \ + XAI_ERR_DATASIZE, "\nWidth of Output Scale Array = %d, Number of Kernels = %d\nWidth of Output Scale Array should be greater than or equal to Number of Kernels", \ + XAI_ARRAY_GET_WIDTH(outputScaleArray), XAI_TILE4D_GET_DIM4(coeffTile)); + XAI_CHECK_ERROR((((uintptr_t) (XAI_ARRAY_GET_DATA_PTR(outputScaleArray)) & \ + 0x1) == 0), XAI_ERR_NORM, "The output scale array is not aligned to 2 byte boundary"); +#endif + } +#if DILATED_VQ_CONV == VQ_FALSE + if (XAI_CNN_CONV_GET_OUTPUT_SCALE(param) == 0) + { + int32_t fillValue; + int32_t reluFlag = XAI_CNN_CONV_GET_FLAG_RELU(param); + fillValue = reluFlag ? 
(CLAMP(0, XAI_CNN_CONV_GET_RELU_MIN(param), XAI_CNN_CONV_GET_RELU_MAX(param))) : 0; + return(xaiFillTile3D(outTile, fillValue, 0)); + } +#endif + /* Call DEPTH3 varinat if input depth =3 */ + if (XAI_TILE3D_GET_DIM3(inTile) == 3) + { + MAKE_NAME(MAKE_NAME_VQ(convolved, 3D_S_7x7j1d1), S8IX_MOW_WHD_DEPTH3) MAKE_PARAMS(inTile, coeffTile, biasArray, outTile, param); + return(XAI_ERROR_STATUS()); + } + /* check inDataPitch1, if it is less than or equal to 32, + * call FOLD32 variant + */ + if (XAI_TILE3D_GET_DIM1_PITCH(inTile) <= 32) + { + MAKE_NAME(MAKE_NAME_VQ(convolved, 3D_S_7x7j1d1), S8IX_MOW_WHD_FOLD32) MAKE_PARAMS(inTile, coeffTile, biasArray, outTile, param); + return(XAI_ERROR_STATUS()); + } + + /* Getting parameters from the tile structures */ + const int32_t outW = XAI_TILE3D_GET_DIM1(outTile); + const int32_t outH = XAI_TILE3D_GET_DIM2(outTile); + const int32_t numInCh = XAI_TILE3D_GET_DIM3(inTile); + const int32_t numOutCh = XAI_TILE3D_GET_DIM3(outTile); + + /* Kernel Size (WHDN)*/ + const int32_t kSizeU = XAI_TILE4D_GET_DIM1(coeffTile); + + /* CNN convolution parameters */ + const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param); + const uint8_t outShiftU = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param); + const uint8_t enableReLu = XAI_CNN_CONV_GET_FLAG_RELU(param); + + /* Pitches of Coefficient Data (WHDN) in dim1 and dim3 */ + const int32_t coeffPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTile); + const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile); + + /* Pitches of Input Data (WHD) in dim1 and dim2 */ + const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile); + const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile); + + /* Pitch of Output Data (WHD) in dim1 and dim2 */ + const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile); + const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile); + + /* Data Pointers of input, output, coefficient and bias data */ + MORPH_IDT_SCALAR* pInData = (MORPH_IDT_SCALAR *) 
XAI_TILE3D_GET_DATA_PTR(inTile); + int8_t* pOutData = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile); + int32_t* pBiasData = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray); + int8_t* pCoeffData = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile); +#if DILATED_VQ_CONV == VQ_TRUE + uint16_t* restrict pOutScaleData = (uint16_t *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray); +#elif DILATED_VQ_CONV == VQ_FALSE + const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param); +#endif + /* Move pointer to the start of the data (including edge) */ + pInData = &pInData[-((kSizeU / 2) * inDataPitch1 + (kSizeU / 2))]; + + /* Setting the limits for output data according to ReLu Flag and outTileType */ + int32_t minLim, maxLim; + if (enableReLu) + { + minLim = XAI_CNN_CONV_GET_RELU_MIN(param); + maxLim = XAI_CNN_CONV_GET_RELU_MAX(param); + } + else + { + minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \ + SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0); + maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \ + : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX); + } + const int8_t typeFlag = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0; + const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile); + + MORPH_IDT_2Nx8 * restrict pdvecIn; + xb_vec2Nx8* restrict pdvecOut; + xb_vec2Nx8* restrict pdvecCoeff1; + + /* Variable Declarations */ + int32_t inCh, outCh, x, y; + int32_t varLen; + + /* In order to make the loop multiply-bound we are reducing the vectorization width + by extra values required for the kernel */ + const int32_t vectorizationWidth58 = ((2 * XCHAL_IVPN_SIMD_WIDTH) - kSizeU) + 1; + const int32_t vectorizationWidth122 = ((4 * XCHAL_IVPN_SIMD_WIDTH) - kSizeU) + 1; + + /* loop across output height is unrolled twice. + * Loop across kernel width and height is + * completely unrolled. + * 128 bytes of input are loaded. 
+ */ + for (x = 0; x < outW - vectorizationWidth58; x += vectorizationWidth122) /* Loop across Output width */ + { + varLen = XT_MIN(vectorizationWidth122, outW - x); + int32_t enable2ndCol = XT_SALT(2 * XCHAL_IVPN_SIMD_WIDTH, varLen + kSizeU - 1); + for (y = 0; y < outH; y += 2) /* Loop across Output height */ + { + /* In order to handle odd output height */ + int32_t enable2ndRow = XT_SALT(y, outH - 1); + /* initialize output data pointer */ + int8_t *pOutput = &pOutData[(y * outDataPitch1 + x) * bytesPerPixel]; + + /* initialize input data pointer */ + MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * y + x]; + + /* initialize coeff and Bias data pointer*/ + int8_t *pCoeff = &pCoeffData[0]; + int32_t *pBias = &pBiasData[0]; + + for (outCh = 0; outCh < numOutCh; outCh++) /* Loop across Output depth */ + { + /* load and replicate bias data */ + xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4); + + /* wide vectors(accumulators) initialized with bias */ + xb_vec2Nx24 dacc11, dacc12, dacc21, dacc22; + dacc11 = dacc12 = dacc21 = dacc22 = IVP_CVT24UNX32L(hvecBias1, hvecBias1); + IVP_CVT24UNX32H(dacc11, hvecBias1, hvecBias1); + IVP_CVT24UNX32H(dacc12, hvecBias1, hvecBias1); + IVP_CVT24UNX32H(dacc21, hvecBias1, hvecBias1); + IVP_CVT24UNX32H(dacc22, hvecBias1, hvecBias1); + + /* priming of coeff load is done outside the innermost loop*/ + pdvecCoeff1 = (xb_vec2Nx8 *) (pCoeff); + valign vaCoeffData1; vaCoeffData1 = IVP_LA2NX8_PP(pdvecCoeff1); + + pdvecIn = (MORPH_IDT_2Nx8 *) pInput; + + for (inCh = 0; inCh < numInCh; inCh++) /* Loop across input depth */ + { + /* 2 4-tap multipliers are used to accumulate 1 wide vector + * first 4-tap multiplier makes use of first 4 coeff across + * the kernel width. 
next 4 tap mulplier makes use last 3 + * coeff across the kernel width, and 4th byte is zero + */ + + MORPH_IDT_2Nx8 dvecInData11, dvecInData21, dvecInData31, dvecInData41, dvecInData51, \ + dvecInData61, dvecInData71, dvecInData81; + + MORPH_IDT_2Nx8 dvecInData12, dvecInData22, dvecInData32, dvecInData42, dvecInData52, \ + dvecInData62, dvecInData72, dvecInData82; + + /* load data from first input row */ + valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn); + MORPH_OP_LOAD_2Nx8(dvecInData11, vaInData, pdvecIn, \ + enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH); + MORPH_OP_LOAD_2Nx8(dvecInData12, vaInData, pdvecIn, \ + inDataPitch1 - enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH); + + /* load data from 2nd input row */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn); + MORPH_OP_LOAD_2Nx8(dvecInData21, vaInData, pdvecIn, \ + enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH); + MORPH_OP_LOAD_2Nx8(dvecInData22, vaInData, pdvecIn, \ + inDataPitch1 - enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH); + + /* load data from 3rd input row */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn); + MORPH_OP_LOAD_2Nx8(dvecInData31, vaInData, pdvecIn, \ + enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH); + MORPH_OP_LOAD_2Nx8(dvecInData32, vaInData, pdvecIn, \ + inDataPitch1 - enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH); + + /* load data from 4th input row */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn); + MORPH_OP_LOAD_2Nx8(dvecInData41, vaInData, pdvecIn, \ + enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH); + MORPH_OP_LOAD_2Nx8(dvecInData42, vaInData, pdvecIn, \ + inDataPitch1 - enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH); + + /* load data from 5th input row */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn); + MORPH_OP_LOAD_2Nx8(dvecInData51, vaInData, pdvecIn, \ + enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH); + MORPH_OP_LOAD_2Nx8(dvecInData52, vaInData, pdvecIn, \ + inDataPitch1 - enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH); + + /* load data from 6th input row */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn); + MORPH_OP_LOAD_2Nx8(dvecInData61, vaInData, 
pdvecIn, \ + enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH); + MORPH_OP_LOAD_2Nx8(dvecInData62, vaInData, pdvecIn, \ + inDataPitch1 - enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH); + + /* load data from 7th input row */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn); + MORPH_OP_LOAD_2Nx8(dvecInData71, vaInData, pdvecIn, \ + enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH); + MORPH_OP_LOAD_2Nx8(dvecInData72, vaInData, pdvecIn, \ + inDataPitch1 * enable2ndRow \ + - enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH); + + /* load data from 8th input row */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn); + MORPH_OP_LOAD_2Nx8(dvecInData81, vaInData, pdvecIn, \ + enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH); + MORPH_OP_LOAD_2Nx8(dvecInData82, vaInData, pdvecIn, \ + inDataPitch2 - (6 + enable2ndRow) * inDataPitch1 \ + - enable2ndCol * 2 * XCHAL_IVPN_SIMD_WIDTH); + + /* load first row of the coeff */ + xb_vec2Nx8 dvecCoeffData1; + IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1); + + /* Multiply input vectors with first coeff row */ + MORPH_OP_MUL4TA(dacc11, dvecInData12, dvecInData11, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc11, IVP_SEL2NX8I(0, dvecInData12, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_SEL2NX8I(dvecInData12, dvecInData11, IVP_SELI_8B_ROTATE_RIGHT_4), \ + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + MORPH_OP_MUL4TA(dacc12, 0, dvecInData12, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc12, 0, IVP_SEL2NX8I(0, dvecInData12, IVP_SELI_8B_ROTATE_RIGHT_4), \ + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + MORPH_OP_MUL4TA(dacc21, dvecInData22, dvecInData21, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc21, IVP_SEL2NX8I(0, dvecInData22, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_SEL2NX8I(dvecInData22, dvecInData21, 
IVP_SELI_8B_ROTATE_RIGHT_4), \ + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + MORPH_OP_MUL4TA(dacc22, 0, dvecInData22, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc22, 0, IVP_SEL2NX8I(0, dvecInData22, IVP_SELI_8B_ROTATE_RIGHT_4), \ + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + /* load 2nd row of the coeff */ + IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1); + + /* Multiply input vectors with 2nd coeff row */ + MORPH_OP_MUL4TA(dacc11, dvecInData22, dvecInData21, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc11, IVP_SEL2NX8I(0, dvecInData22, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_SEL2NX8I(dvecInData22, dvecInData21, IVP_SELI_8B_ROTATE_RIGHT_4), \ + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + MORPH_OP_MUL4TA(dacc12, 0, dvecInData22, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc12, 0, IVP_SEL2NX8I(0, dvecInData22, IVP_SELI_8B_ROTATE_RIGHT_4), \ + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + MORPH_OP_MUL4TA(dacc21, dvecInData32, dvecInData31, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc21, IVP_SEL2NX8I(0, dvecInData32, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_SEL2NX8I(dvecInData32, dvecInData31, IVP_SELI_8B_ROTATE_RIGHT_4), \ + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + MORPH_OP_MUL4TA(dacc22, 0, dvecInData32, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc22, 0, IVP_SEL2NX8I(0, dvecInData32, IVP_SELI_8B_ROTATE_RIGHT_4), \ + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + /* load 3rd row of the coeff 
*/ + IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1); + + /* Multiply input vectors with 3rd coeff row */ + MORPH_OP_MUL4TA(dacc11, dvecInData32, dvecInData31, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc11, IVP_SEL2NX8I(0, dvecInData32, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_SEL2NX8I(dvecInData32, dvecInData31, IVP_SELI_8B_ROTATE_RIGHT_4), \ + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + MORPH_OP_MUL4TA(dacc12, 0, dvecInData32, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc12, 0, IVP_SEL2NX8I(0, dvecInData32, IVP_SELI_8B_ROTATE_RIGHT_4), \ + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + MORPH_OP_MUL4TA(dacc21, dvecInData42, dvecInData41, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc21, IVP_SEL2NX8I(0, dvecInData42, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_SEL2NX8I(dvecInData42, dvecInData41, IVP_SELI_8B_ROTATE_RIGHT_4), \ + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + MORPH_OP_MUL4TA(dacc22, 0, dvecInData42, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc22, 0, IVP_SEL2NX8I(0, dvecInData42, IVP_SELI_8B_ROTATE_RIGHT_4), \ + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + /* load 4th row of the coeff */ + IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1); + + /* Multiply input vectors with 4th coeff row */ + MORPH_OP_MUL4TA(dacc11, dvecInData42, dvecInData41, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc11, IVP_SEL2NX8I(0, dvecInData42, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_SEL2NX8I(dvecInData42, dvecInData41, IVP_SELI_8B_ROTATE_RIGHT_4), \ + 
IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + MORPH_OP_MUL4TA(dacc12, 0, dvecInData42, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc12, 0, IVP_SEL2NX8I(0, dvecInData42, IVP_SELI_8B_ROTATE_RIGHT_4), \ + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + MORPH_OP_MUL4TA(dacc21, dvecInData52, dvecInData51, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc21, IVP_SEL2NX8I(0, dvecInData52, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_SEL2NX8I(dvecInData52, dvecInData51, IVP_SELI_8B_ROTATE_RIGHT_4), \ + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + MORPH_OP_MUL4TA(dacc22, 0, dvecInData52, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc22, 0, IVP_SEL2NX8I(0, dvecInData52, IVP_SELI_8B_ROTATE_RIGHT_4), \ + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + /* load 5th row of the coeff */ + IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1); + + /* Multiply input vectors with 5th coeff row */ + MORPH_OP_MUL4TA(dacc11, dvecInData52, dvecInData51, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc11, IVP_SEL2NX8I(0, dvecInData52, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_SEL2NX8I(dvecInData52, dvecInData51, IVP_SELI_8B_ROTATE_RIGHT_4), \ + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + MORPH_OP_MUL4TA(dacc12, 0, dvecInData52, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc12, 0, IVP_SEL2NX8I(0, dvecInData52, IVP_SELI_8B_ROTATE_RIGHT_4), \ + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + + MORPH_OP_MUL4TA(dacc21, dvecInData62, dvecInData61, 
IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc21, IVP_SEL2NX8I(0, dvecInData62, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_SEL2NX8I(dvecInData62, dvecInData61, IVP_SELI_8B_ROTATE_RIGHT_4), \ + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + MORPH_OP_MUL4TA(dacc22, 0, dvecInData62, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc22, 0, IVP_SEL2NX8I(0, dvecInData62, IVP_SELI_8B_ROTATE_RIGHT_4), \ + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + /* load 6th row of the coeff */ + IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1); + + /* Multiply input vectors with 6th coeff row */ + MORPH_OP_MUL4TA(dacc11, dvecInData62, dvecInData61, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc11, IVP_SEL2NX8I(0, dvecInData62, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_SEL2NX8I(dvecInData62, dvecInData61, IVP_SELI_8B_ROTATE_RIGHT_4), \ + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + MORPH_OP_MUL4TA(dacc12, 0, dvecInData62, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc12, 0, IVP_SEL2NX8I(0, dvecInData62, IVP_SELI_8B_ROTATE_RIGHT_4), \ + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + MORPH_OP_MUL4TA(dacc21, dvecInData72, dvecInData71, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc21, IVP_SEL2NX8I(0, dvecInData72, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_SEL2NX8I(dvecInData72, dvecInData71, IVP_SELI_8B_ROTATE_RIGHT_4), \ + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + MORPH_OP_MUL4TA(dacc22, 0, dvecInData72, IVP_EXTRN_2X32 \ + 
(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc22, 0, IVP_SEL2NX8I(0, dvecInData72, IVP_SELI_8B_ROTATE_RIGHT_4), \ + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + /* load 7th row of the coeff */ + IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1); + /* Multiply input vectors with 7th coeff row */ + MORPH_OP_MUL4TA(dacc11, dvecInData72, dvecInData71, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc11, IVP_SEL2NX8I(0, dvecInData72, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_SEL2NX8I(dvecInData72, dvecInData71, IVP_SELI_8B_ROTATE_RIGHT_4), \ + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + MORPH_OP_MUL4TA(dacc12, 0, dvecInData72, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc12, 0, IVP_SEL2NX8I(0, dvecInData72, IVP_SELI_8B_ROTATE_RIGHT_4), \ + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + MORPH_OP_MUL4TA(dacc21, dvecInData82, dvecInData81, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc21, IVP_SEL2NX8I(0, dvecInData82, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_SEL2NX8I(dvecInData82, dvecInData81, IVP_SELI_8B_ROTATE_RIGHT_4), \ + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + MORPH_OP_MUL4TA(dacc22, 0, dvecInData82, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc22, 0, IVP_SEL2NX8I(0, dvecInData82, IVP_SELI_8B_ROTATE_RIGHT_4), \ + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + } /* end of for (inCh = 0; inCh < numInCh; inCh++)*/ + + /* Pack, Output Scale, Output Shift and clamping */ + xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L; + xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, 
dvecOut4H; +#if DILATED_VQ_CONV == VQ_TRUE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc11, packShiftAccU, \ + pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc12, packShiftAccU, \ + pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc21, packShiftAccU, \ + pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc22, packShiftAccU, \ + pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag); +#elif DILATED_VQ_CONV == VQ_FALSE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc11, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc12, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc21, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc22, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); +#endif + /* 1st row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput); + valign vaOutData = IVP_ZALIGN(); + IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * varLen); + IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * \ + 2 * (varLen - XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * (varLen - 2 * XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * \ + 2 * (varLen - 3 * XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* 2nd row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + enable2ndRow * outDataPitch1 * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * enable2ndRow * varLen); + IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * enable2ndRow * \ + 2 * (varLen - XCHAL_IVPN_SIMD_WIDTH)); + 
IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * enable2ndRow * \ + (varLen - 2 * XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * enable2ndRow * \ + 2 * (varLen - 3 * XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + pOutput += outDataPitch2 * bytesPerPixel; + pCoeff += coeffPitch3; + } /* end of for (outCh = 0; outCh < numOutCh; outCh++)*/ + } /* end of for (y = 0; y < outH; y += 2)*/ + } /* end of for (x = 0; x < outW; x += vectorizationWidth)*/ + + + /* To handle cases where the remaining output width is less than or equal to 58. + * loop across output height is unrolled twice. Loop across kernel width and height is + * completely unrolled. 64 bytes of input are loaded. + */ + if (x < outW) + { + for (y = 0; y < outH; y += 2) /* Loop across Output height */ + { + /* In order to handle odd output height */ + int32_t enable2ndRow = XT_SALT(y, outH - 1); + /* initialize output data pointer */ + int8_t *pOutput = &pOutData[(y * outDataPitch1 + x) * bytesPerPixel]; + + /* initialize input data pointer */ + MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * y + x]; + + /* initialize coeff and Bias data pointer*/ + int8_t *pCoeff = &pCoeffData[0]; + int32_t *pBias = &pBiasData[0]; + + for (outCh = 0; outCh < numOutCh; outCh++) /* Loop across Output depth */ + { + /* load and replicate bias data */ + xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4); + + /* wide vectors(accumulators) initialized with bias */ + xb_vec2Nx24 dacc1, dacc2; + dacc1 = dacc2 = IVP_CVT24UNX32L(hvecBias1, hvecBias1); + IVP_CVT24UNX32H(dacc1, hvecBias1, hvecBias1); + IVP_CVT24UNX32H(dacc2, hvecBias1, hvecBias1); + + /* priming of coeff load is done outside the innermost loop*/ + pdvecCoeff1 = (xb_vec2Nx8 *) (pCoeff); + valign vaCoeffData1; vaCoeffData1 = IVP_LA2NX8_PP(pdvecCoeff1); + + pdvecIn = (MORPH_IDT_2Nx8 *) pInput; + + for (inCh = 0; inCh < numInCh; inCh++) /* Loop across input depth */ + { + /* 2 4-tap 
multipliers are used to accumulate 1 wide vector + * first 4-tap multiplier makes use of first 4 coeff across + * the kernel width. next 4 tap mulplier makes use last 3 + * coeff across the kernel width, and 4th byte is zero + */ + + xb_vec2Nx8 dvecInData1, dvecInData2, dvecInData3, dvecInData4, dvecInData5, \ + dvecInData6, dvecInData7, dvecInData8; + + /* load data from first input row */ + valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn); + MORPH_OP_LOAD_2Nx8(dvecInData1, vaInData, pdvecIn, inDataPitch1); + + /* load data from 2nd input row */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn); + MORPH_OP_LOAD_2Nx8(dvecInData2, vaInData, pdvecIn, inDataPitch1); + + /* load data from 3rd input row */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn); + MORPH_OP_LOAD_2Nx8(dvecInData3, vaInData, pdvecIn, inDataPitch1); + + /* load data from 4th input row */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn); + MORPH_OP_LOAD_2Nx8(dvecInData4, vaInData, pdvecIn, inDataPitch1); + + /* load data from 5th input row */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn); + MORPH_OP_LOAD_2Nx8(dvecInData5, vaInData, pdvecIn, inDataPitch1); + + /* load data from 6th input row */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn); + MORPH_OP_LOAD_2Nx8(dvecInData6, vaInData, pdvecIn, inDataPitch1); + + /* load data from 7th input row */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn); + MORPH_OP_LOAD_2Nx8(dvecInData7, vaInData, pdvecIn, inDataPitch1 * enable2ndRow); + + /* load data from 8th input row */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn); + MORPH_OP_LOAD_2Nx8(dvecInData8, vaInData, pdvecIn, inDataPitch2 - (6 + enable2ndRow) * inDataPitch1); + + /* load first row of the coeff */ + xb_vec2Nx8 dvecCoeffData1; + IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1); + + /* Multiply input vectors with first coeff row */ + MORPH_OP_MUL4TA(dacc1, 0, dvecInData1, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc1, 0, IVP_SEL2NX8I(0, dvecInData1, 
IVP_SELI_8B_ROTATE_RIGHT_4), \ + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + MORPH_OP_MUL4TA(dacc2, 0, dvecInData2, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc2, 0, IVP_SEL2NX8I(0, dvecInData2, IVP_SELI_8B_ROTATE_RIGHT_4), \ + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + /* load 2nd row of the coeff */ + IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1); + + /* Multiply input vectors with 2nd coeff row */ + MORPH_OP_MUL4TA(dacc1, 0, dvecInData2, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc1, 0, IVP_SEL2NX8I(0, dvecInData2, IVP_SELI_8B_ROTATE_RIGHT_4), \ + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + MORPH_OP_MUL4TA(dacc2, 0, dvecInData3, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc2, 0, IVP_SEL2NX8I(0, dvecInData3, IVP_SELI_8B_ROTATE_RIGHT_4), \ + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + /* load 3rdrow of the coeff */ + IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1); + + /* Multiply input vectors with 3rd coeff row */ + MORPH_OP_MUL4TA(dacc1, 0, dvecInData3, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc1, 0, IVP_SEL2NX8I(0, dvecInData3, IVP_SELI_8B_ROTATE_RIGHT_4), \ + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + MORPH_OP_MUL4TA(dacc2, 0, dvecInData4, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc2, 0, IVP_SEL2NX8I(0, dvecInData4, IVP_SELI_8B_ROTATE_RIGHT_4), \ + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + /* load 4th row of the coeff */ + 
IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1); + + /* Multiply input vectors with 4th coeff row */ + MORPH_OP_MUL4TA(dacc1, 0, dvecInData4, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc1, 0, IVP_SEL2NX8I(0, dvecInData4, IVP_SELI_8B_ROTATE_RIGHT_4), \ + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + MORPH_OP_MUL4TA(dacc2, 0, dvecInData5, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc2, 0, IVP_SEL2NX8I(0, dvecInData5, IVP_SELI_8B_ROTATE_RIGHT_4), \ + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + /* load 5th row of the coeff */ + IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1); + + /* Multiply input vectors with 5th coeff row */ + MORPH_OP_MUL4TA(dacc1, 0, dvecInData5, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc1, 0, IVP_SEL2NX8I(0, dvecInData5, IVP_SELI_8B_ROTATE_RIGHT_4), \ + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + MORPH_OP_MUL4TA(dacc2, 0, dvecInData6, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc2, 0, IVP_SEL2NX8I(0, dvecInData6, IVP_SELI_8B_ROTATE_RIGHT_4), + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + /* load 6th row of the coeff */ + IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1); + + /* Multiply input vectors with 6th coeff row */ + MORPH_OP_MUL4TA(dacc1, 0, dvecInData6, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc1, 0, IVP_SEL2NX8I(0, dvecInData6, IVP_SELI_8B_ROTATE_RIGHT_4), \ + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + MORPH_OP_MUL4TA(dacc2, 0, dvecInData7, 
IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc2, 0, IVP_SEL2NX8I(0, dvecInData7, IVP_SELI_8B_ROTATE_RIGHT_4), \ + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + /* load 7th row of the coeff */ + IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1); + + /* Multiply input vectors with 7th coeff row */ + MORPH_OP_MUL4TA(dacc1, 0, dvecInData7, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc1, 0, IVP_SEL2NX8I(0, dvecInData7, IVP_SELI_8B_ROTATE_RIGHT_4), \ + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + MORPH_OP_MUL4TA(dacc2, 0, dvecInData8, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc2, 0, IVP_SEL2NX8I(0, dvecInData8, IVP_SELI_8B_ROTATE_RIGHT_4), \ + IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + } /* end of for (inCh = 0; inCh < numInCh; inCh++)*/ + + /* Pack, Output Scale, Output Shift and clamping */ + xb_vec2Nx8 dvecOut1L, dvecOut2L; + xb_vec2Nx8 dvecOut1H, dvecOut2H; +#if DILATED_VQ_CONV == VQ_TRUE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \ + pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \ + pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag); +#elif DILATED_VQ_CONV == VQ_FALSE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); +#endif + /* variable length for output stores */ + varLen = XT_MIN(vectorizationWidth58, outW - x); + + /* Storing the first depth output */ + pdvecOut = (xb_vec2Nx8 *) (pOutput); + valign 
vaOutData = IVP_ZALIGN(); + IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * varLen); + IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * \ + 2 * (varLen - XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* first depth , 2nd row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + enable2ndRow * outDataPitch1 * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * enable2ndRow * varLen); + IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * enable2ndRow * \ + 2 * (varLen - XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + pOutput += outDataPitch2 * bytesPerPixel; + pCoeff += coeffPitch3; + } /* end of for (outCh = 0; outCh < numOutCh; outCh++)*/ + } /* end of for (y = 0; y < outH; y += 2)*/ + } /* end of if(x < outW)*/ + return(XAI_ERROR_STATUS()); +} + +/****************************************************************************************** +* 7x7 MOW WHD Stride 2 - DEPTH 3 * +* If number of input channels is equal to 3 * +* this function is called. 
* +******************************************************************************************/ + +static _XAI_INLINE_ void MAKE_NAME(MAKE_NAME_VQ(convolved, 3D_S_7x7j2d1), S8IX_MOW_WHD_DEPTH3) MAKE_ARGUMENTS(inTile, coeffTile, biasArray, outTile, param) +{ + /* Getting parameters from the tile structures */ + const int32_t outW = XAI_TILE3D_GET_DIM1(outTile); + const int32_t outH = XAI_TILE3D_GET_DIM2(outTile); + const int32_t numOutCh = XAI_TILE3D_GET_DIM3(outTile); + + /* Kernel Size (WHDN)*/ + const int32_t kSizeU = XAI_TILE4D_GET_DIM1(coeffTile); + + /* CNN convolution parameters */ + const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param); + const uint8_t outShiftU = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param); + const uint8_t enableReLu = XAI_CNN_CONV_GET_FLAG_RELU(param); + const uint8_t stride = XAI_CNN_CONV_GET_STRIDE(param); + + /* Pitches of Coefficient Data (WHDN) dim3 */ + const int32_t coeffPitch2 = XAI_TILE4D_GET_DIM2_PITCH(coeffTile); + const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile); + + /* Pitches of Input Data (WHD) in dim1 and dim2 */ + const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile); + const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile); + + /* Pitch of Output Data (WHD) in dim1 and dim2 */ + const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile); + const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile); + + /* Data Pointers of input, output, coefficient and bias data */ + MORPH_IDT_SCALAR* pInData = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(inTile); + int8_t* pOutData = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile); + int32_t* pBiasData = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray); + int8_t* pCoeffData = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile); +#if DILATED_VQ_CONV == VQ_TRUE + uint16_t* restrict pOutScaleData = (uint16_t *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray); +#elif DILATED_VQ_CONV == VQ_FALSE + const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param); 
+#endif + /* Move pointer to the start of the data (including edge) */ + pInData = &pInData[-((kSizeU / 2) * inDataPitch1 + (kSizeU / 2))]; + + /* Setting the limits for output data according to ReLu Flag and outTileType */ + int32_t minLim, maxLim; + if (enableReLu) + { + minLim = XAI_CNN_CONV_GET_RELU_MIN(param); + maxLim = XAI_CNN_CONV_GET_RELU_MAX(param); + } + else + { + minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \ + SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0); + maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \ + : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX); + } + const int8_t typeFlag = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0; + const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile); + + MORPH_IDT_2Nx8* restrict pdvecIn1; + MORPH_IDT_2Nx8* restrict pdvecIn2; + xb_vec2Nx8* restrict pdvecOut; + xb_vec2Nx8* restrict pdvecCoeff1, * restrict pdvecCoeff2; + + /* Variable Declarations */ + int32_t outCh, x, y; + int32_t varLen; + /* No. of output elements that can be processed from 2 input loads */ + const int32_t vectorizationWidth = (((2 * XCHAL_IVPN_SIMD_WIDTH) - kSizeU) / stride) + 1; + + /* generates the sequence to shuffle the coeff */ + /* 0 1 2 3 4 5 6 7 7 8 9 10 11 12 13 14 8 9 10 11 12 13 14 15 .. */ + xb_vec2Nx8 dvecShflIdx = IVP_SEL2NX8I(IVP_ADD2NX8U(IVP_SEQ2NX8(), 7), \ + IVP_SEQ2NX8(), IVP_SELI_8B_INTERLEAVE_8_LO); + /* 0 2 4 6 8 10 12 14 14 16 18 20 22 24 26 28 28 30 32 34 36 38 40 42 42 44 46 48 50 52 ... 
*/ + dvecShflIdx = IVP_SEL2NX8I(IVP_SLLI2NX8(IVP_ADD2NX8U(dvecShflIdx, 14), 1), \ + IVP_SLLI2NX8(dvecShflIdx, 1), IVP_SELI_8B_PACK_16); + /* Assuming that 50th index will have zero values */ + /* Final shuffle index pattern will be + 0 2 4 6 1 3 5 50 8 10 12 50 7 9 11 13 + 14 16 18 20 15 17 19 50 22 24 26 50 21 23 25 27 + 28 30 32 34 29 31 33 50 36 38 40 50 35 37 39 41 + 42 44 46 48 43 45 47 50 + */ + dvecShflIdx = IVP_SEL2NX8I( + IVP_MOV2NX8T(50, IVP_ADD2NX8(dvecShflIdx, IVP_SEL2NX8I(-1, 1, IVP_SELI_8B_INTERLEAVE_4_LO)), + IVP_EQ2NX8(IVP_AND2NX8(IVP_SEQ2NX8(), 7), 3)), + IVP_MOV2NX8T(50, dvecShflIdx, IVP_EQ2NX8(IVP_AND2NX8(IVP_SEQ2NX8(), 7), 7)), + IVP_SELI_8B_INTERLEAVE_4_LO); + + /* The inner most loop runs across the kernel height and produces + * 4 output vectors - 2 output rows from 2 output channels. Unrolling across the output + * channels by 2 helps in re-using the already loaded input data. Unrolling across the + * output height by 2 helps in re-using the already loaded coeff data. + * The coefficients are arranged in such a way that MORPH_OP_MUL4TA can be used. 
+ */ + for (x = 0; x < outW; x += vectorizationWidth) /* Loop across output width */ + { + for (y = 0; y < outH; y += 2) /* Loop across output height */ + { + /* In order to handle odd output heights */ + int32_t enable2Row = XT_SALT(y, outH - 1); + + /* initialize output data pointer */ + int8_t *pOutput = &pOutData[(y * outDataPitch1 + x) * bytesPerPixel]; + + /* initialize input data pointer */ + MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * stride * (y) + stride * (x)]; + + /* initialize coeff data pointer and bias data pointer to outCh kernel */ + int8_t *pCoeff = &pCoeffData[0]; + int32_t *pBias = &pBiasData[0]; + + for (outCh = 0; outCh < numOutCh; outCh += 2) /* Loop across output channels */ + { + /* In order to handle odd output depths */ + int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1); + + xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4 * enable2ndCh); + xb_vecN_2x32v hvecBias2; IVP_LSRN_2X32_XP(hvecBias2, pBias, 4); + + /* Initialize the acc for 1st and 2nd output row of 1st channel with bias data */ + xb_vec2Nx24 dacc1; + dacc1 = IVP_CVT24UNX32L(hvecBias1, hvecBias1); + IVP_CVT24UNX32H(dacc1, hvecBias1, hvecBias1); + /* Initialize the acc for 1st and 2nd output row of 2nd channel with bias data */ + xb_vec2Nx24 dacc2; + dacc2 = IVP_CVT24UNX32L(hvecBias2, hvecBias2); + IVP_CVT24UNX32H(dacc2, hvecBias2, hvecBias2); + + /* priming for coeff loads */ + pdvecCoeff1 = (xb_vec2Nx8 *) (pCoeff); + valign vaCoeffData1; vaCoeffData1 = IVP_LA2NX8_PP(pdvecCoeff1); + + pdvecCoeff2 = (xb_vec2Nx8 *) (pCoeff + coeffPitch3 * enable2ndCh); + valign vaCoeffData2; vaCoeffData2 = IVP_LA2NX8_PP(pdvecCoeff2); + + pdvecIn1 = (MORPH_IDT_2Nx8 *) (pInput); + pdvecIn2 = (MORPH_IDT_2Nx8 *) (pInput + 2 * inDataPitch2); + + xb_vec2Nx8 dvecCoeffData1, dvecCoeffData2; + xb_vec2Nx8 dvecI0, dvecI1; + /******************************** 1st inCh **************************************/ + /* Load vectors from first row */ + valign vaInData1; vaInData1 = 
MORPH_OP_PRIME_2Nx8(pdvecIn1); + xb_vec2Nx8 dvecInData1; IVP_LA2NX8_XP(dvecInData1, vaInData1, pdvecIn1, inDataPitch1); + + /* Load vectors from 2nd row */ + vaInData1 = MORPH_OP_PRIME_2Nx8(pdvecIn1); + xb_vec2Nx8 dvecInData2; IVP_LA2NX8_XP(dvecInData2, vaInData1, pdvecIn1, inDataPitch1); + + /* Load vectors from 3rd row */ + vaInData1 = MORPH_OP_PRIME_2Nx8(pdvecIn1); + xb_vec2Nx8 dvecInData3; IVP_LA2NX8_XP(dvecInData3, vaInData1, pdvecIn1, inDataPitch1); + + /* Load vectors from 4th row */ + vaInData1 = MORPH_OP_PRIME_2Nx8(pdvecIn1); + xb_vec2Nx8 dvecInData4; IVP_LA2NX8_XP(dvecInData4, vaInData1, pdvecIn1, inDataPitch1); + + /* Load vectors from 5th row */ + vaInData1 = MORPH_OP_PRIME_2Nx8(pdvecIn1); + xb_vec2Nx8 dvecInData5; IVP_LA2NX8_XP(dvecInData5, vaInData1, pdvecIn1, inDataPitch1); + + /* Load vectors from 6th row */ + vaInData1 = MORPH_OP_PRIME_2Nx8(pdvecIn1); + xb_vec2Nx8 dvecInData6; IVP_LA2NX8_XP(dvecInData6, vaInData1, pdvecIn1, inDataPitch1); + + /* Load vectors from 7th row */ + vaInData1 = MORPH_OP_PRIME_2Nx8(pdvecIn1); + xb_vec2Nx8 dvecInData7; IVP_LA2NX8_XP(dvecInData7, vaInData1, pdvecIn1, \ + inDataPitch1 * enable2Row); + + /* Load vectors from 8th row, to be used by 2nd output height */ + vaInData1 = MORPH_OP_PRIME_2Nx8(pdvecIn1); + xb_vec2Nx8 dvecInData8; IVP_LA2NX8_XP(dvecInData8, vaInData1, pdvecIn1, \ + inDataPitch1 * enable2Row); + + /* Load vectors from 9th row, to be used by 2nd output height */ + vaInData1 = MORPH_OP_PRIME_2Nx8(pdvecIn1); + xb_vec2Nx8 dvecInData9; IVP_LA2NX8_XP(dvecInData9, vaInData1, pdvecIn1, \ + inDataPitch2 - (6 + 2 * enable2Row) * inDataPitch1); + + /* Load the 7x7 coefficients for 2 output channels */ + IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch2); + IVP_LAV2NX8_XP(dvecCoeffData2, vaCoeffData2, pdvecCoeff2, coeffPitch2); + + /* rearrange the coeff in desired format, so that MUL4T can be used */ + dvecCoeffData1 = IVP_SHFL2NX8(dvecCoeffData1, dvecShflIdx); + dvecCoeffData2 = 
IVP_SHFL2NX8(dvecCoeffData2, dvecShflIdx); + + /* Re-arrange the data in the desired format */ + /* Assume input as 0,1,2, .. 63 for two rows */ + /* After re-arrangement using DSEL operation, updated vectors would be */ + /* dvecI0 : 0, 2, 4, ... */ + /* dvecI1 : 1, 3, 5, ... */ + IVP_DSEL2NX8I(dvecI1, dvecI0, dvecInData3, dvecInData1, IVP_DSELI_8B_DEINTERLEAVE_1); + + /* Mulitply 1st row with coeff from 1st output channel */ + MORPH_OP_MUL4TA(dacc1, 0, dvecI0, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc1, 0, dvecI1, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + /* Mulitply 1st row with coeff from 2nd output channel */ + MORPH_OP_MUL4TA(dacc2, 0, dvecI0, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 0)); + MORPH_OP_MUL4TA(dacc2, 0, dvecI1, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 1)); + + + /* rearrange input vectors */ + IVP_DSEL2NX8I(dvecI1, dvecI0, dvecInData4, dvecInData2, IVP_DSELI_8B_DEINTERLEAVE_1); + + + /* Mulitply 2nd row with coeff from 1st output channel */ + MORPH_OP_MUL4TA(dacc1, 0, dvecI0, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 3)); + MORPH_OP_MUL4TA(dacc1, 0, dvecI1, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 2)); + + /* Mulitply 2nd row with coeff from 2nd output channel */ + MORPH_OP_MUL4TA(dacc2, 0, dvecI0, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 3)); + MORPH_OP_MUL4TA(dacc2, 0, dvecI1, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 2)); + + /* rearrange input vectors */ + IVP_DSEL2NX8I(dvecI1, dvecI0, dvecInData5, dvecInData3, IVP_DSELI_8B_DEINTERLEAVE_1); + + /* Mulitply 3rd row with coeff from 1st output channel */ + MORPH_OP_MUL4TA(dacc1, 0, dvecI0, IVP_EXTRN_2X32 \ + 
(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 4)); + MORPH_OP_MUL4TA(dacc1, 0, dvecI1, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 5)); + + /* Mulitply 3rd row with coeff from 2nd output channel */ + MORPH_OP_MUL4TA(dacc2, 0, dvecI0, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 4)); + MORPH_OP_MUL4TA(dacc2, 0, dvecI1, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 5)); + + /* rearrange input vectors */ + IVP_DSEL2NX8I(dvecI1, dvecI0, dvecInData6, dvecInData4, IVP_DSELI_8B_DEINTERLEAVE_1); + + /* Mulitply 4th row with coeff from 1st output channel */ + MORPH_OP_MUL4TA(dacc1, 0, dvecI0, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 7)); + MORPH_OP_MUL4TA(dacc1, 0, dvecI1, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 6)); + + /* Mulitply 4th row with coeff from 2nd output channel */ + MORPH_OP_MUL4TA(dacc2, 0, dvecI0, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 7)); + MORPH_OP_MUL4TA(dacc2, 0, dvecI1, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 6)); + + + /* rearrange input vectors */ + IVP_DSEL2NX8I(dvecI1, dvecI0, dvecInData7, dvecInData5, IVP_DSELI_8B_DEINTERLEAVE_1); + + + /* Mulitply 5th row with coeff from 1st output channel */ + MORPH_OP_MUL4TA(dacc1, 0, dvecI0, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 8)); + MORPH_OP_MUL4TA(dacc1, 0, dvecI1, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 9)); + + /* Mulitply 5th row with coeff from 2nd output channel */ + MORPH_OP_MUL4TA(dacc2, 0, dvecI0, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 8)); + MORPH_OP_MUL4TA(dacc2, 0, dvecI1, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 9)); + + + /* rearrange 
input vectors */ + IVP_DSEL2NX8I(dvecI1, dvecI0, dvecInData8, dvecInData6, IVP_DSELI_8B_DEINTERLEAVE_1); + + /* Mulitply 6th row with coeff from 1st output channel */ + MORPH_OP_MUL4TA(dacc1, 0, dvecI0, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 11)); + MORPH_OP_MUL4TA(dacc1, 0, dvecI1, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 10)); + + /* Mulitply 6th row with coeff from 2nd output channel */ + MORPH_OP_MUL4TA(dacc2, 0, dvecI0, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 11)); + MORPH_OP_MUL4TA(dacc2, 0, dvecI1, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 10)); + + /* rearrange input vectors */ + IVP_DSEL2NX8I(dvecI1, dvecI0, dvecInData9, dvecInData7, IVP_DSELI_8B_DEINTERLEAVE_1); + + /* Mulitply 7th row with coeff from 1st output channel */ + MORPH_OP_MUL4TA(dacc1, 0, dvecI0, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 12)); + MORPH_OP_MUL4TA(dacc1, 0, dvecI1, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 13)); + + /* Mulitply 7th row with coeff from 2nd output channel */ + MORPH_OP_MUL4TA(dacc2, 0, dvecI0, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 12)); + MORPH_OP_MUL4TA(dacc2, 0, dvecI1, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 13)); + + /***************************** 2nd inCh **********************************************/ + /* Load vectors from first row */ + + vaInData1 = MORPH_OP_PRIME_2Nx8(pdvecIn1); + IVP_LA2NX8_XP(dvecInData1, vaInData1, pdvecIn1, inDataPitch1); + + /* Load vectors from 2nd row */ + vaInData1 = MORPH_OP_PRIME_2Nx8(pdvecIn1); + IVP_LA2NX8_XP(dvecInData2, vaInData1, pdvecIn1, inDataPitch1); + + /* Load vectors from 3rd row */ + vaInData1 = MORPH_OP_PRIME_2Nx8(pdvecIn1); + IVP_LA2NX8_XP(dvecInData3, vaInData1, pdvecIn1, inDataPitch1); 
+ + /* Load vectors from 4th row */ + vaInData1 = MORPH_OP_PRIME_2Nx8(pdvecIn1); + IVP_LA2NX8_XP(dvecInData4, vaInData1, pdvecIn1, inDataPitch1); + + /* Load vectors from 5th row */ + vaInData1 = MORPH_OP_PRIME_2Nx8(pdvecIn1); + IVP_LA2NX8_XP(dvecInData5, vaInData1, pdvecIn1, inDataPitch1); + + /* Load vectors from 6th row */ + vaInData1 = MORPH_OP_PRIME_2Nx8(pdvecIn1); + IVP_LA2NX8_XP(dvecInData6, vaInData1, pdvecIn1, inDataPitch1); + + /* Load vectors from 7th row */ + vaInData1 = MORPH_OP_PRIME_2Nx8(pdvecIn1); + IVP_LA2NX8_XP(dvecInData7, vaInData1, pdvecIn1, inDataPitch1 * enable2Row); + + /* Load vectors from 8th row, to be used by 2nd output height */ + vaInData1 = MORPH_OP_PRIME_2Nx8(pdvecIn1); + IVP_LA2NX8_XP(dvecInData8, vaInData1, pdvecIn1, inDataPitch1 * enable2Row); + + /* Load vectors from 9th row, to be used by 2nd output height */ + vaInData1 = MORPH_OP_PRIME_2Nx8(pdvecIn1); + IVP_LA2NX8_IP(dvecInData9, vaInData1, pdvecIn1); + + /* Load the 7x7 coefficients for 2 output channels */ + IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch2); + IVP_LAV2NX8_XP(dvecCoeffData2, vaCoeffData2, pdvecCoeff2, coeffPitch2); + + /* rearrange the coeff in desired format, so that MUL4T can be used */ + dvecCoeffData1 = IVP_SHFL2NX8(dvecCoeffData1, dvecShflIdx); + dvecCoeffData2 = IVP_SHFL2NX8(dvecCoeffData2, dvecShflIdx); + + /* Re-arrange the data in the desired format */ + /* Assume input as 0,1,2, .. 63 for two rows */ + /* After re-arrangement using DSEL operation, updated vectors would be */ + /* dvecI0 : 0, 2, 4, ... */ + /* dvecI1 : 1, 3, 5, ... 
*/ + IVP_DSEL2NX8I(dvecI1, dvecI0, dvecInData3, dvecInData1, IVP_DSELI_8B_DEINTERLEAVE_1); + + /* Mulitply 1st row with coeff from 1st output channel */ + MORPH_OP_MUL4TA(dacc1, 0, dvecI0, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc1, 0, dvecI1, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + /* Mulitply 1st row with coeff from 2nd output channel */ + MORPH_OP_MUL4TA(dacc2, 0, dvecI0, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 0)); + MORPH_OP_MUL4TA(dacc2, 0, dvecI1, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 1)); + + + /* rearrange input vectors */ + IVP_DSEL2NX8I(dvecI1, dvecI0, dvecInData4, dvecInData2, IVP_DSELI_8B_DEINTERLEAVE_1); + + + /* Mulitply 2nd row with coeff from 1st output channel */ + MORPH_OP_MUL4TA(dacc1, 0, dvecI0, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 3)); + MORPH_OP_MUL4TA(dacc1, 0, dvecI1, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 2)); + + /* Mulitply 2nd row with coeff from 2nd output channel */ + MORPH_OP_MUL4TA(dacc2, 0, dvecI0, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 3)); + MORPH_OP_MUL4TA(dacc2, 0, dvecI1, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 2)); + + /* rearrange input vectors */ + IVP_DSEL2NX8I(dvecI1, dvecI0, dvecInData5, dvecInData3, IVP_DSELI_8B_DEINTERLEAVE_1); + + /* Mulitply 3rd row with coeff from 1st output channel */ + MORPH_OP_MUL4TA(dacc1, 0, dvecI0, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 4)); + MORPH_OP_MUL4TA(dacc1, 0, dvecI1, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 5)); + + /* Mulitply 3rd row with coeff from 2nd output channel */ + MORPH_OP_MUL4TA(dacc2, 0, dvecI0, IVP_EXTRN_2X32 \ + 
(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 4)); + MORPH_OP_MUL4TA(dacc2, 0, dvecI1, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 5)); + + /* rearrange input vectors */ + IVP_DSEL2NX8I(dvecI1, dvecI0, dvecInData6, dvecInData4, IVP_DSELI_8B_DEINTERLEAVE_1); + + /* Mulitply 4th row with coeff from 1st output channel */ + MORPH_OP_MUL4TA(dacc1, 0, dvecI0, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 7)); + MORPH_OP_MUL4TA(dacc1, 0, dvecI1, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 6)); + + /* Mulitply 4th row with coeff from 2nd output channel */ + MORPH_OP_MUL4TA(dacc2, 0, dvecI0, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 7)); + MORPH_OP_MUL4TA(dacc2, 0, dvecI1, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 6)); + + + /* rearrange input vectors */ + IVP_DSEL2NX8I(dvecI1, dvecI0, dvecInData7, dvecInData5, IVP_DSELI_8B_DEINTERLEAVE_1); + + + /* Mulitply 5th row with coeff from 1st output channel */ + MORPH_OP_MUL4TA(dacc1, 0, dvecI0, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 8)); + MORPH_OP_MUL4TA(dacc1, 0, dvecI1, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 9)); + + /* Mulitply 5th row with coeff from 2nd output channel */ + MORPH_OP_MUL4TA(dacc2, 0, dvecI0, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 8)); + MORPH_OP_MUL4TA(dacc2, 0, dvecI1, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 9)); + + + /* rearrange input vectors */ + IVP_DSEL2NX8I(dvecI1, dvecI0, dvecInData8, dvecInData6, IVP_DSELI_8B_DEINTERLEAVE_1); + + /* Mulitply 6th row with coeff from 1st output channel */ + MORPH_OP_MUL4TA(dacc1, 0, dvecI0, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 11)); + 
MORPH_OP_MUL4TA(dacc1, 0, dvecI1, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 10)); + + /* Mulitply 6th row with coeff from 2nd output channel */ + MORPH_OP_MUL4TA(dacc2, 0, dvecI0, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 11)); + MORPH_OP_MUL4TA(dacc2, 0, dvecI1, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 10)); + + /* rearrange input vectors */ + IVP_DSEL2NX8I(dvecI1, dvecI0, dvecInData9, dvecInData7, IVP_DSELI_8B_DEINTERLEAVE_1); + + /* Mulitply 7th row with coeff from 1st output channel */ + MORPH_OP_MUL4TA(dacc1, 0, dvecI0, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 12)); + MORPH_OP_MUL4TA(dacc1, 0, dvecI1, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 13)); + + /* Mulitply 7th row with coeff from 2nd output channel */ + MORPH_OP_MUL4TA(dacc2, 0, dvecI0, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 12)); + MORPH_OP_MUL4TA(dacc2, 0, dvecI1, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 13)); + + /******************************* 3rd inCh *********************************************/ + /* Load vectors from first row */ + valign vaInData2 = MORPH_OP_PRIME_2Nx8(pdvecIn2); + IVP_LA2NX8_XP(dvecInData1, vaInData2, pdvecIn2, inDataPitch1); + + /* Load vectors from 2nd row */ + vaInData2 = MORPH_OP_PRIME_2Nx8(pdvecIn2); + IVP_LA2NX8_XP(dvecInData2, vaInData2, pdvecIn2, inDataPitch1); + + /* Load vectors from 3rd row */ + vaInData2 = MORPH_OP_PRIME_2Nx8(pdvecIn2); + IVP_LA2NX8_XP(dvecInData3, vaInData2, pdvecIn2, inDataPitch1); + + /* Load vectors from 4th row */ + vaInData2 = MORPH_OP_PRIME_2Nx8(pdvecIn2); + IVP_LA2NX8_XP(dvecInData4, vaInData2, pdvecIn2, inDataPitch1); + + /* Load vectors from 5th row */ + vaInData2 = MORPH_OP_PRIME_2Nx8(pdvecIn2); + IVP_LA2NX8_XP(dvecInData5, vaInData2, pdvecIn2, 
inDataPitch1); + + /* Load vectors from 6th row */ + vaInData2 = MORPH_OP_PRIME_2Nx8(pdvecIn2); + IVP_LA2NX8_XP(dvecInData6, vaInData2, pdvecIn2, inDataPitch1); + + /* Load vectors from 7th row */ + vaInData2 = MORPH_OP_PRIME_2Nx8(pdvecIn2); + IVP_LA2NX8_XP(dvecInData7, vaInData2, pdvecIn2, inDataPitch1 * enable2Row); + + /* Load vectors from 8th row, to be used by 2nd output height */ + vaInData2 = MORPH_OP_PRIME_2Nx8(pdvecIn2); + IVP_LA2NX8_XP(dvecInData8, vaInData2, pdvecIn2, inDataPitch1 * enable2Row); + + /* Load vectors from 9th row, to be used by 2nd output height */ + vaInData2 = MORPH_OP_PRIME_2Nx8(pdvecIn2); + IVP_LA2NX8_IP(dvecInData9, vaInData2, pdvecIn2); + + /* Load the 7x7 coefficients for 2 output channels */ + IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch2); + IVP_LAV2NX8_XP(dvecCoeffData2, vaCoeffData2, pdvecCoeff2, coeffPitch2); + + /* rearrange the coeff in desired format, so that MUL4T can be used */ + dvecCoeffData1 = IVP_SHFL2NX8(dvecCoeffData1, dvecShflIdx); + dvecCoeffData2 = IVP_SHFL2NX8(dvecCoeffData2, dvecShflIdx); + + /* Re-arrange the data in the desired format */ + /* Assume input as 0,1,2, .. 63 for two rows */ + /* After re-arrangement using DSEL operation, updated vectors would be */ + /* dvecI0 : 0, 2, 4, ... */ + /* dvecI1 : 1, 3, 5, ... 
*/ + IVP_DSEL2NX8I(dvecI1, dvecI0, dvecInData3, dvecInData1, IVP_DSELI_8B_DEINTERLEAVE_1); + + /* Mulitply 1st row with coeff from 1st output channel */ + MORPH_OP_MUL4TA(dacc1, 0, dvecI0, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc1, 0, dvecI1, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + /* Mulitply 1st row with coeff from 2nd output channel */ + MORPH_OP_MUL4TA(dacc2, 0, dvecI0, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 0)); + MORPH_OP_MUL4TA(dacc2, 0, dvecI1, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 1)); + + + /* rearrange input vectors */ + IVP_DSEL2NX8I(dvecI1, dvecI0, dvecInData4, dvecInData2, IVP_DSELI_8B_DEINTERLEAVE_1); + + + /* Mulitply 2nd row with coeff from 1st output channel */ + MORPH_OP_MUL4TA(dacc1, 0, dvecI0, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 3)); + MORPH_OP_MUL4TA(dacc1, 0, dvecI1, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 2)); + + /* Mulitply 2nd row with coeff from 2nd output channel */ + MORPH_OP_MUL4TA(dacc2, 0, dvecI0, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 3)); + MORPH_OP_MUL4TA(dacc2, 0, dvecI1, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 2)); + + /* rearrange input vectors */ + IVP_DSEL2NX8I(dvecI1, dvecI0, dvecInData5, dvecInData3, IVP_DSELI_8B_DEINTERLEAVE_1); + + /* Mulitply 3rd row with coeff from 1st output channel */ + MORPH_OP_MUL4TA(dacc1, 0, dvecI0, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 4)); + MORPH_OP_MUL4TA(dacc1, 0, dvecI1, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 5)); + + /* Mulitply 3rd row with coeff from 2nd output channel */ + MORPH_OP_MUL4TA(dacc2, 0, dvecI0, IVP_EXTRN_2X32 \ + 
(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 4)); + MORPH_OP_MUL4TA(dacc2, 0, dvecI1, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 5)); + + /* rearrange input vectors */ + IVP_DSEL2NX8I(dvecI1, dvecI0, dvecInData6, dvecInData4, IVP_DSELI_8B_DEINTERLEAVE_1); + + /* Mulitply 4th row with coeff from 1st output channel */ + MORPH_OP_MUL4TA(dacc1, 0, dvecI0, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 7)); + MORPH_OP_MUL4TA(dacc1, 0, dvecI1, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 6)); + + /* Mulitply 4th row with coeff from 2nd output channel */ + MORPH_OP_MUL4TA(dacc2, 0, dvecI0, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 7)); + MORPH_OP_MUL4TA(dacc2, 0, dvecI1, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 6)); + + + /* rearrange input vectors */ + IVP_DSEL2NX8I(dvecI1, dvecI0, dvecInData7, dvecInData5, IVP_DSELI_8B_DEINTERLEAVE_1); + + + /* Mulitply 5th row with coeff from 1st output channel */ + MORPH_OP_MUL4TA(dacc1, 0, dvecI0, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 8)); + MORPH_OP_MUL4TA(dacc1, 0, dvecI1, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 9)); + + /* Mulitply 5th row with coeff from 2nd output channel */ + MORPH_OP_MUL4TA(dacc2, 0, dvecI0, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 8)); + MORPH_OP_MUL4TA(dacc2, 0, dvecI1, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 9)); + + + /* rearrange input vectors */ + IVP_DSEL2NX8I(dvecI1, dvecI0, dvecInData8, dvecInData6, IVP_DSELI_8B_DEINTERLEAVE_1); + + /* Mulitply 6th row with coeff from 1st output channel */ + MORPH_OP_MUL4TA(dacc1, 0, dvecI0, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 11)); + 
MORPH_OP_MUL4TA(dacc1, 0, dvecI1, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 10)); + + /* Mulitply 6th row with coeff from 2nd output channel */ + MORPH_OP_MUL4TA(dacc2, 0, dvecI0, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 11)); + MORPH_OP_MUL4TA(dacc2, 0, dvecI1, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 10)); + + /* rearrange input vectors */ + IVP_DSEL2NX8I(dvecI1, dvecI0, dvecInData9, dvecInData7, IVP_DSELI_8B_DEINTERLEAVE_1); + + /* Mulitply 7th row with coeff from 1st output channel */ + MORPH_OP_MUL4TA(dacc1, 0, dvecI0, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 12)); + MORPH_OP_MUL4TA(dacc1, 0, dvecI1, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 13)); + + /* Mulitply 7th row with coeff from 2nd output channel */ + MORPH_OP_MUL4TA(dacc2, 0, dvecI0, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 12)); + MORPH_OP_MUL4TA(dacc2, 0, dvecI1, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 13)); + + + /* Pack, Output Scale, Output Shift and clamping */ + xb_vec2Nx8 dvecOut1L, dvecOut2L; + xb_vec2Nx8 dvecOut1H, dvecOut2H; +#if DILATED_VQ_CONV == VQ_TRUE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \ + pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \ + pOutScaleData[outCh + enable2ndCh ], outShiftU, minLim, maxLim, typeFlag); +#elif DILATED_VQ_CONV == VQ_FALSE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); +#endif + /* variable length for output stores */ + varLen = 
XT_MIN(vectorizationWidth, outW - x); + + /* Storing the 1st row output from 1st channel */ + pdvecOut = (xb_vec2Nx8 *) (pOutput); + valign vaOutData = IVP_ZALIGN(); + IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * varLen); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the 2nd row output from 1st channel */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch1 * enable2Row * bytesPerPixel); + IVP_SAV2NX8_XP(IVP_SEL2NX8I(dvecOut1L, dvecOut1L, IVP_SELI_EXTRACT_HI_HALVES), vaOutData, \ + pdvecOut, (-typeFlag + 1) * varLen * enable2Row); + IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * 2 * varLen * enable2Row); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the 1st row output from 2nd channel */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch2 * enable2ndCh * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, enable2ndCh * bytesPerPixel * varLen); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the 2nd row output from 2nd channel */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + (outDataPitch1 * enable2Row + \ + outDataPitch2 * enable2ndCh) * bytesPerPixel); + IVP_SAV2NX8_XP(IVP_SEL2NX8I(dvecOut2L, dvecOut2L, IVP_SELI_EXTRACT_HI_HALVES), vaOutData, \ + pdvecOut, enable2ndCh * (-typeFlag + 1) * varLen * enable2Row); + IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, enable2ndCh * typeFlag * 2 * varLen * \ + enable2Row); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + pOutput += 2 * outDataPitch2 * bytesPerPixel; + pCoeff += 2 * coeffPitch3; + } /* END for (outCh = 0; outCh < numOutCh; outCh += 2)*/ + } /* END for (y = 0; y < outH ; y += 2)*/ + } /* END for (x = 0; x < outW; x += vectorizationWidth)*/ +} + + +/****************************************************************************************** +* xaiConvolved(VQ)3D_S_7x7j2d1I8S8IX_MOW_WHD +* ***************************************************************************************/ + 
+/******************************************************************************/
+/* Description : P6 optimized generic implementation for 7x7 3D convolution.  */
+/*               Based on MORPH pre-processor specifiers, code implementation */
+/*               is generated during preprocessing stage. This method can be  */
+/*               used to generate 7x7 3D dilated convolution function and 7x7 */
+/*               3D VQ dilated convolution function for U8 bit and S8 bit     */
+/*               input data with input stride equal to 2 (dilation is checked */
+/*               against 1 in the error checks below)                         */
+/* Inputs      : Input Data Tile, Coeff Data Tile, Bias Array,                */
+/*               Output scale array (VQ build only), CNN convolution params   */
+/* Outputs     : XAI error code (XAI_ERR_TYPE)                                */
+/* InOuts      : Output Tile                                                  */
+/* Assumptions : CoeffData is S8                                              */
+/*               biasArray is signed 32b, value not exceeding signed 24b      */
+/*               Output scale array is U16                                    */
+/*               OutData is S8 / U8 / S16                                     */
+/*               Kernel Size is 7x7xDxN                                       */
+/*               Input and Output are in WHD format; Coeff is in WHDN format  */
+/******************************************************************************/
+/* Flow: error checks, constant fill when the non-VQ output scale is 0, DEPTH3 variant when input depth is 3, else the generic kernel below. */
+/****************** xaiConvolvedVQ3D_S_7x7j2d1_S8S8IX_MOW_WHD ******************/
+/****************** xaiConvolvedVQ3D_S_7x7j2d1_U8S8IX_MOW_WHD ******************/
+/******************* xaiConvolved3D_S_7x7j2d1_S8S8IX_MOW_WHD *******************/
+/******************* xaiConvolved3D_S_7x7j2d1_U8S8IX_MOW_WHD *******************/
+
+XAI_ERR_TYPE MAKE_NAME(MAKE_NAME_VQ(xaiConvolved, 3D_S_7x7j2d1), S8IX_MOW_WHD) MAKE_ARGUMENTS(inTile, coeffTile, biasArray, outTile, param)
+{
+  /* Error Checks: argument validation for tiles, kernel size, data order, dilation, stride, edges and shifts */
+  XAI_ERROR_CHECKS()
+  {
+//    MORPH_IDT_CHECK(inTile);  NOTE(review): the input tile data type check is commented out here -- confirm this is intentional
+    XAI_CHECK_CONV_OUTPUT_TILE3D(outTile);
+    XAI_CHECK_TILE4D_S8(coeffTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile);
+    XAI_CHECK_TILE4D_IN_DRAM_BOUNDARY(coeffTile);
+    XAI_CHECK_POINTER(param);
+    XAI_CHECK_ARRAY_S32(biasArray);
+    XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTile);
+    XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(coeffTile, outTile);
+    XAI_CHECK_KERNEL_SIZE(coeffTile, 7);
+    XAI_CHECK_TILE3D_DATA_ORDER(inTile, XAI_WHD);
+    XAI_CHECK_TILE3D_DATA_ORDER(outTile, XAI_WHD);
+    XAI_CHECK_TILE4D_DATA_ORDER(coeffTile, XAI_WHDN);
+    XAI_CHECK_DILATION(param, 1);
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_DILATIONX(param) == XAI_CNN_CONV_GET_DILATIONY(param), \
+                    XAI_ERR_BADARG, "Dilation along width = %hhu and height = %hhu\nDilation along width and height should be equal", \
+                    XAI_CNN_CONV_GET_DILATIONX(param), XAI_CNN_CONV_GET_DILATIONY(param));
+    XAI_CHECK_TILE3D_EDGE(inTile, 3);
+    XAI_CHECK_STRIDE(param, 2);
+    XAI_CHECK_ERROR((XAI_CNN_CONV_GET_STRIDEX(param) == XAI_CNN_CONV_GET_STRIDEY(param)), \
+                    XAI_ERR_BADARG, "\nStride along width = %hhu and height = %hhu\nStride along width and height should be equal", \
+                    XAI_CNN_CONV_GET_STRIDEX(param), XAI_CNN_CONV_GET_STRIDEY(param));
+    XAI_CHECK_CONSISTENCY_MOW_WHD(inTile, coeffTile, biasArray, outTile, param);
+    XAI_CHECK_COEFFTILE_CONTIGUOUS(coeffTile, param);
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_ACCUM_SHIFT(param) < 24, \
+                    XAI_ERR_NORM, "\nThe accumulator shift = %hhu, value should be less than 24", \
+                    XAI_CNN_CONV_GET_ACCUM_SHIFT(param));
+    XAI_CHECK_ERROR(XAI_CNN_CONV_GET_OUTPUT_SHIFT(param) < 32, \
+                    XAI_ERR_NORM, "\nThe output shift = %hhu, value should be less than 32", \
+                    XAI_CNN_CONV_GET_OUTPUT_SHIFT(param));
+    XAI_CHECK_CONV_RELU_LIMITS_IX(param, outTile);
+#if DILATED_VQ_CONV == VQ_TRUE
+    XAI_CHECK_ARRAY_U16(outputScaleArray);
+    XAI_CHECK_ERROR(XAI_ARRAY_GET_WIDTH(outputScaleArray) >= XAI_TILE4D_GET_DIM4(coeffTile), \
+                    XAI_ERR_DATASIZE, "\nWidth of Output Scale Array = %d, Number of Kernels = %d\nWidth of Output Scale Array should be greater than or equal to Number of Kernels", \
+                    XAI_ARRAY_GET_WIDTH(outputScaleArray), XAI_TILE4D_GET_DIM4(coeffTile));
+    XAI_CHECK_ERROR((((uintptr_t) (XAI_ARRAY_GET_DATA_PTR(outputScaleArray)) & \
+                      0x1) == 0), XAI_ERR_NORM, "The output scale array is not aligned to 2 byte boundary");
+#endif
+  }
+#if DILATED_VQ_CONV == VQ_FALSE
+  if (XAI_CNN_CONV_GET_OUTPUT_SCALE(param) == 0) /* zero output scale: fill the output tile with a constant and return without convolving */
+  {
+    int32_t fillValue;
+    int32_t reluFlag = XAI_CNN_CONV_GET_FLAG_RELU(param);
+    fillValue = reluFlag ? (CLAMP(0, XAI_CNN_CONV_GET_RELU_MIN(param), XAI_CNN_CONV_GET_RELU_MAX(param))) : 0; /* 0 clamped to the ReLU range when ReLU is enabled */
+    return(xaiFillTile3D(outTile, fillValue, 0));
+  }
+#endif
+  if (XAI_TILE3D_GET_DIM3(inTile) == 3) /* input depth of 3 is delegated to the DEPTH3 variant */
+  {
+    MAKE_NAME(MAKE_NAME_VQ(convolved, 3D_S_7x7j2d1), S8IX_MOW_WHD_DEPTH3) MAKE_PARAMS(inTile, coeffTile, biasArray, outTile, param);
+    return(XAI_ERROR_STATUS());
+  }
+
+  /* Getting parameters from the tile structures */
+  const int32_t outW = XAI_TILE3D_GET_DIM1(outTile);
+  const int32_t outH = XAI_TILE3D_GET_DIM2(outTile);
+  const int32_t numInCh = XAI_TILE3D_GET_DIM3(inTile);
+  const int32_t numOutCh = XAI_TILE3D_GET_DIM3(outTile);
+
+  /* Kernel Size (WHDN): taken from dim1 of the coefficient tile */
+  const int32_t kSizeU = XAI_TILE4D_GET_DIM1(coeffTile);
+
+  /* CNN convolution parameters */
+  const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param);
+  const uint8_t outShiftU = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param);
+  const uint8_t enableReLu = XAI_CNN_CONV_GET_FLAG_RELU(param);
+  const uint8_t stride = XAI_CNN_CONV_GET_STRIDE(param);
+
+  /* Pitches of Coefficient Data (WHDN) in dim2 and dim3 */
+  const int32_t coeffPitch2 = XAI_TILE4D_GET_DIM2_PITCH(coeffTile);
+  const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile);
+
+  /* Pitches of Input Data (WHD) in dim1 and dim2 */
+  const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile);
+  const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile);
+
+  /* Pitches of Output Data (WHD) in dim1 and dim2 */
+  const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+
+  /* Data Pointers of input, output, coefficient and bias data */
+  MORPH_IDT_SCALAR* pInData = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  int8_t* pOutData = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  int32_t* pBiasData = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray);
+  int8_t* pCoeffData = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile);
+#if DILATED_VQ_CONV == VQ_TRUE
+  uint16_t* restrict pOutScaleData = (uint16_t *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray);
+#elif DILATED_VQ_CONV == VQ_FALSE
+  const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param);
+#endif
+  /* Move pointer to the start of the data (including edge): back by kSizeU / 2 rows and kSizeU / 2 columns */
+  pInData = &pInData[-((kSizeU / 2) * inDataPitch1 + (kSizeU / 2))];
+
+  /* Setting the limits for output data according to ReLu Flag and outTileType */
+  int32_t minLim, maxLim;
+  if (enableReLu)
+  {
+    minLim = XAI_CNN_CONV_GET_RELU_MIN(param);
+    maxLim = XAI_CNN_CONV_GET_RELU_MAX(param);
+  }
+  else
+  {
+    minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \
+             SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0);
+    maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \
+             : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX);
+  }
+  const int8_t typeFlag = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0; /* 1 for S16 output, 0 otherwise; used to scale the store lengths below */
+  const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile);
+
+  MORPH_IDT_2Nx8* restrict pdvecIn;
+  xb_vec2Nx8* restrict pdvecOut;
+  xb_vec2Nx8* restrict pdvecCoeff1, * restrict pdvecCoeff2;
+
+  /* Variable Declarations */
+  int32_t inCh, outCh, x, y;
+  int32_t varLen;
+  /* No. of output elements produced per output row in one pass: ((2 * SIMD width - kernel size) / stride) + 1 */
+  const int32_t vectorizationWidth = (((2 * XCHAL_IVPN_SIMD_WIDTH) - kSizeU) / stride) + 1;
+
+  /* generates the sequence to shuffle the coeff */
+  /* 0 1 2 3 4 5 6 7 7 8 9 10 11 12 13 14 8 9 10 11 12 13 14 15 .. */
+  xb_vec2Nx8 dvecShflIdx = IVP_SEL2NX8I(IVP_ADD2NX8U(IVP_SEQ2NX8(), 7), \
+                                        IVP_SEQ2NX8(), IVP_SELI_8B_INTERLEAVE_8_LO);
+  /* 0 2 4 6 8 10 12 14 14 16 18 20 22 24 26 28 28 30 32 34 36 38 40 42 42 44 46 48 50 52 ... */
+  dvecShflIdx = IVP_SEL2NX8I(IVP_SLLI2NX8(IVP_ADD2NX8U(dvecShflIdx, 14), 1), \
+                             IVP_SLLI2NX8(dvecShflIdx, 1), IVP_SELI_8B_PACK_16);
+  /* Assuming that 50th index will have zero values (index 50 is used as the padding tap) */
+  /* Final shuffle index pattern will be (4 indices per 32-bit word; odd kernel rows: even taps then odd taps, even kernel rows: odd taps then even taps)
+     0 2 4 6 1 3 5 50 8 10 12 50 7 9 11 13
+     14 16 18 20 15 17 19 50 22 24 26 50 21 23 25 27
+     28 30 32 34 29 31 33 50 36 38 40 50 35 37 39 41
+     42 44 46 48 43 45 47 50
+   */
+  dvecShflIdx = IVP_SEL2NX8I(
+    IVP_MOV2NX8T(50, IVP_ADD2NX8(dvecShflIdx, IVP_SEL2NX8I(-1, 1, IVP_SELI_8B_INTERLEAVE_4_LO)),
+                 IVP_EQ2NX8(IVP_AND2NX8(IVP_SEQ2NX8(), 7), 3)),
+    IVP_MOV2NX8T(50, dvecShflIdx, IVP_EQ2NX8(IVP_AND2NX8(IVP_SEQ2NX8(), 7), 7)),
+    IVP_SELI_8B_INTERLEAVE_4_LO);
+
+  /* The innermost loop runs across the input channels, with the 7 kernel rows fully unrolled, and
+   * produces 4 output vectors - 2 output rows from 2 output channels. Unrolling across the output
+   * channels by 2 helps in re-using the already loaded input data. Unrolling across the
+   * output height by 2 helps in re-using the already loaded coeff data.
+   * The coefficients are arranged in such a way that MORPH_OP_MUL4TA can be used.
+   */
+  for (x = 0; x < outW; x += vectorizationWidth) /* Loop across output width */
+  {
+    for (y = 0; y < outH - 1; y += 2) /* Loop across output height, two rows at a time; an odd last row is handled after this loop */
+    {
+      /* initialize output data pointer */
+      int8_t *pOutput = &pOutData[(y * outDataPitch1 + x) * bytesPerPixel];
+
+      /* initialize input data pointer */
+      MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * stride * (y) + stride * (x)];
+
+      /* initialize coeff data pointer and bias data pointer to outCh kernel */
+      int8_t *pCoeff = &pCoeffData[0];
+      int32_t *pBias = &pBiasData[0];
+
+      for (outCh = 0; outCh < numOutCh; outCh += 2) /* Loop across output channels */
+      {
+        /* In order to handle odd output depths: used as a 0/1 multiplier, presumably 1 when outCh < numOutCh - 1 */
+        int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1);
+
+        xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4 * enable2ndCh); /* pBias does not advance when there is no 2nd channel */
+        xb_vecN_2x32v hvecBias2; IVP_LSRN_2X32_XP(hvecBias2, pBias, 4);
+
+        /* Initialize the acc for 1st and 2nd output row of 1st channel with bias data */
+        xb_vec2Nx24 dacc1;
+        dacc1 = IVP_CVT24UNX32L(hvecBias1, hvecBias1);
+        IVP_CVT24UNX32H(dacc1, hvecBias1, hvecBias1);
+        /* Initialize the acc for 1st and 2nd output row of 2nd channel with bias data */
+        xb_vec2Nx24 dacc2;
+        dacc2 = IVP_CVT24UNX32L(hvecBias2, hvecBias2);
+        IVP_CVT24UNX32H(dacc2, hvecBias2, hvecBias2);
+
+        /* priming of coeff load is done outside the innermost loop*/
+        pdvecCoeff1 = (xb_vec2Nx8 *) (pCoeff);
+        valign vaCoeffData1; vaCoeffData1 = IVP_LA2NX8_PP(pdvecCoeff1);
+
+        pdvecCoeff2 = (xb_vec2Nx8 *) (pCoeff + coeffPitch3 * enable2ndCh); /* same kernel as channel 1 when there is no 2nd channel */
+        valign vaCoeffData2; vaCoeffData2 = IVP_LA2NX8_PP(pdvecCoeff2);
+
+        pdvecIn = (MORPH_IDT_2Nx8 *) (pInput);
+
+        for (inCh = 0; inCh < numInCh; inCh++) /* Loop across input channels */
+        {
+          xb_vec2Nx8 dvecCoeffData1, dvecCoeffData2;
+          xb_vec2Nx8 dvecI0, dvecI1;
+
+          /* Load vectors from first row; each load below steps pdvecIn by one input row (inDataPitch1) */
+          valign vaInData; vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+          xb_vec2Nx8 dvecInData1; IVP_LA2NX8_XP(dvecInData1, vaInData, pdvecIn, inDataPitch1);
+
+          /* Load vectors from 2nd row */
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+          xb_vec2Nx8 dvecInData2; IVP_LA2NX8_XP(dvecInData2, vaInData, pdvecIn, inDataPitch1);
+
+          /* Load vectors from 3rd row */
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+          xb_vec2Nx8 dvecInData3; IVP_LA2NX8_XP(dvecInData3, vaInData, pdvecIn, inDataPitch1);
+
+          /* Load vectors from 4th row */
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+          xb_vec2Nx8 dvecInData4; IVP_LA2NX8_XP(dvecInData4, vaInData, pdvecIn, inDataPitch1);
+
+          /* Load vectors from 5th row */
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+          xb_vec2Nx8 dvecInData5; IVP_LA2NX8_XP(dvecInData5, vaInData, pdvecIn, inDataPitch1);
+
+          /* Load vectors from 6th row */
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+          xb_vec2Nx8 dvecInData6; IVP_LA2NX8_XP(dvecInData6, vaInData, pdvecIn, inDataPitch1);
+
+          /* Load vectors from 7th row */
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+          xb_vec2Nx8 dvecInData7; IVP_LA2NX8_XP(dvecInData7, vaInData, pdvecIn, inDataPitch1);
+
+          /* Load vectors from 8th row, to be used by 2nd output height */
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+          xb_vec2Nx8 dvecInData8; IVP_LA2NX8_XP(dvecInData8, vaInData, pdvecIn, inDataPitch1);
+
+          /* Load vectors from 9th row, to be used by 2nd output height; the update leaves a net advance of inDataPitch2 (next input channel) */
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+          xb_vec2Nx8 dvecInData9; IVP_LA2NX8_XP(dvecInData9, vaInData, pdvecIn, \
+                                                inDataPitch2 - 8 * inDataPitch1);
+
+          /* Load the 7x7 coefficients for 2 output channels; pointers step by coeffPitch2 per input channel */
+          IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch2);
+          IVP_LAV2NX8_XP(dvecCoeffData2, vaCoeffData2, pdvecCoeff2, coeffPitch2);
+
+          /* rearrange the coeff in desired format, so that MUL4T can be used */
+          dvecCoeffData1 = IVP_SHFL2NX8(dvecCoeffData1, dvecShflIdx);
+          dvecCoeffData2 = IVP_SHFL2NX8(dvecCoeffData2, dvecShflIdx);
+
+          /* Re-arrange the data in the desired format */
+          /* Assume input as 0,1,2, .. 63 for two rows */
+          /* After re-arrangement using DSEL operation, updated vectors would be */
+          /* dvecI0 : 0, 2, 4, ... */
+          /* dvecI1 : 1, 3, 5, ... (each DSEL pairs input row r with row r + 2, i.e. the same kernel row for the two output rows) */
+          IVP_DSEL2NX8I(dvecI1, dvecI0, dvecInData3, dvecInData1, IVP_DSELI_8B_DEINTERLEAVE_1);
+
+          /* Multiply 1st row with coeff from 1st output channel (coeff word 0 with dvecI0, word 1 with dvecI1) */
+          MORPH_OP_MUL4TA(dacc1, 0, dvecI0, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+          MORPH_OP_MUL4TA(dacc1, 0, dvecI1, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+          /* Multiply 1st row with coeff from 2nd output channel */
+          MORPH_OP_MUL4TA(dacc2, 0, dvecI0, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 0));
+          MORPH_OP_MUL4TA(dacc2, 0, dvecI1, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 1));
+
+
+          /* rearrange input vectors */
+          IVP_DSEL2NX8I(dvecI1, dvecI0, dvecInData4, dvecInData2, IVP_DSELI_8B_DEINTERLEAVE_1);
+
+
+          /* Multiply 2nd row with coeff from 1st output channel (word order is swapped on even kernel rows, matching the dvecShflIdx layout) */
+          MORPH_OP_MUL4TA(dacc1, 0, dvecI0, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 3));
+          MORPH_OP_MUL4TA(dacc1, 0, dvecI1, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 2));
+
+          /* Multiply 2nd row with coeff from 2nd output channel */
+          MORPH_OP_MUL4TA(dacc2, 0, dvecI0, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 3));
+          MORPH_OP_MUL4TA(dacc2, 0, dvecI1, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 2));
+
+          /* rearrange input vectors */
+          IVP_DSEL2NX8I(dvecI1, dvecI0, dvecInData5, dvecInData3, IVP_DSELI_8B_DEINTERLEAVE_1);
+
+          /* Multiply 3rd row with coeff from 1st output channel */
+          MORPH_OP_MUL4TA(dacc1, 0, dvecI0, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 4));
+          MORPH_OP_MUL4TA(dacc1, 0, dvecI1, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 5));
+
+          /* Multiply 3rd row with coeff from 2nd output channel */
+          MORPH_OP_MUL4TA(dacc2, 0, dvecI0, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 4));
+          MORPH_OP_MUL4TA(dacc2, 0, dvecI1, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 5));
+
+          /* rearrange input vectors */
+          IVP_DSEL2NX8I(dvecI1, dvecI0, dvecInData6, dvecInData4, IVP_DSELI_8B_DEINTERLEAVE_1);
+
+          /* Multiply 4th row with coeff from 1st output channel */
+          MORPH_OP_MUL4TA(dacc1, 0, dvecI0, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 7));
+          MORPH_OP_MUL4TA(dacc1, 0, dvecI1, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 6));
+
+          /* Multiply 4th row with coeff from 2nd output channel */
+          MORPH_OP_MUL4TA(dacc2, 0, dvecI0, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 7));
+          MORPH_OP_MUL4TA(dacc2, 0, dvecI1, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 6));
+
+
+          /* rearrange input vectors */
+          IVP_DSEL2NX8I(dvecI1, dvecI0, dvecInData7, dvecInData5, IVP_DSELI_8B_DEINTERLEAVE_1);
+
+
+          /* Multiply 5th row with coeff from 1st output channel */
+          MORPH_OP_MUL4TA(dacc1, 0, dvecI0, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 8));
+          MORPH_OP_MUL4TA(dacc1, 0, dvecI1, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 9));
+
+          /* Multiply 5th row with coeff from 2nd output channel */
+          MORPH_OP_MUL4TA(dacc2, 0, dvecI0, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 8));
+          MORPH_OP_MUL4TA(dacc2, 0, dvecI1, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 9));
+
+
+          /* rearrange input vectors */
+          IVP_DSEL2NX8I(dvecI1, dvecI0, dvecInData8, dvecInData6, IVP_DSELI_8B_DEINTERLEAVE_1);
+
+          /* Multiply 6th row with coeff from 1st output channel */
+          MORPH_OP_MUL4TA(dacc1, 0, dvecI0, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 11));
+          MORPH_OP_MUL4TA(dacc1, 0, dvecI1, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 10));
+
+          /* Multiply 6th row with coeff from 2nd output channel */
+          MORPH_OP_MUL4TA(dacc2, 0, dvecI0, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 11));
+          MORPH_OP_MUL4TA(dacc2, 0, dvecI1, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 10));
+
+          /* rearrange input vectors */
+          IVP_DSEL2NX8I(dvecI1, dvecI0, dvecInData9, dvecInData7, IVP_DSELI_8B_DEINTERLEAVE_1);
+
+          /* Multiply 7th row with coeff from 1st output channel */
+          MORPH_OP_MUL4TA(dacc1, 0, dvecI0, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 12));
+          MORPH_OP_MUL4TA(dacc1, 0, dvecI1, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 13));
+
+          /* Multiply 7th row with coeff from 2nd output channel */
+          MORPH_OP_MUL4TA(dacc2, 0, dvecI0, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 12));
+          MORPH_OP_MUL4TA(dacc2, 0, dvecI1, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 13));
+        } /* END for (inCh = 0; inCh < numInCh; inCh++)*/
+
+        /* Pack, Output Scale, Output Shift and clamping (per-channel scale in the VQ build, single scale otherwise) */
+        xb_vec2Nx8 dvecOut1L, dvecOut2L;
+        xb_vec2Nx8 dvecOut1H, dvecOut2H;
+#if DILATED_VQ_CONV == VQ_TRUE
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \
+                                      pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \
+                                      pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag);
+#elif DILATED_VQ_CONV == VQ_FALSE
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+#endif
+        /* variable length for output stores: clipped to the columns remaining in this output row */
+        varLen = XT_MIN(vectorizationWidth, outW - x);
+
+        /* Storing the 1st row output from 1st channel */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput);
+        valign vaOutData = IVP_ZALIGN();
+        IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * varLen);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Storing the 2nd row output from 1st channel: upper half of the L vector when typeFlag is 0, the H vector when typeFlag is 1 (the other store has length 0) */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch1 * bytesPerPixel);
+        IVP_SAV2NX8_XP(IVP_SEL2NX8I(dvecOut1L, dvecOut1L, IVP_SELI_EXTRACT_HI_HALVES), vaOutData, \
+                       pdvecOut, (-typeFlag + 1) * varLen);
+        IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * 2 * varLen);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Storing the 1st row output from 2nd channel (store length is 0 when enable2ndCh is 0) */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch2 * enable2ndCh * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, enable2ndCh * bytesPerPixel * varLen);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Storing the 2nd row output from 2nd channel (store length is 0 when enable2ndCh is 0) */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + (outDataPitch1 + \
+                                              outDataPitch2 * enable2ndCh) * bytesPerPixel);
+        IVP_SAV2NX8_XP(IVP_SEL2NX8I(dvecOut2L, dvecOut2L, IVP_SELI_EXTRACT_HI_HALVES), vaOutData, \
+                       pdvecOut, enable2ndCh * (-typeFlag + 1) * varLen);
+        IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, enable2ndCh * typeFlag * 2 * varLen);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        pOutput += 2 * outDataPitch2 * bytesPerPixel; /* advance by two output channels */
+        pCoeff += 2 * coeffPitch3; /* advance by two kernels */
+      } /* END for (outCh = 0; outCh < numOutCh; outCh += 2)*/
+    } /* END for (y = 0; y < outH - 1; y += 2)*/
+    if (y < outH) /* odd output height: process the remaining last row on its own */
+    {
+      /* initialize output data pointer */
+      int8_t *pOutput = &pOutData[(y * outDataPitch1 + x) * bytesPerPixel];
+
+      /* initialize input data pointer */
+      MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * stride * (y) + stride * (x)];
+
+      /* initialize coeff data pointer and bias data pointer to outCh kernel */
+      int8_t *pCoeff = &pCoeffData[0];
+      int32_t *pBias = &pBiasData[0];
+
+      for (outCh = 0; outCh < numOutCh; outCh += 2) /* Loop across output channels */
+      {
+        /* In order to handle odd output depths: used as a 0/1 multiplier, presumably 1 when outCh < numOutCh - 1 */
+        int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1);
+
+        xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4 * enable2ndCh); /* pBias does not advance when there is no 2nd channel */
+        xb_vecN_2x32v hvecBias2; IVP_LSRN_2X32_XP(hvecBias2, pBias, 4);
+
+        /* Initialize the acc of 1st channel with bias data (only the 1st output row is stored in this tail block) */
+        xb_vec2Nx24 dacc1;
+        dacc1 = IVP_CVT24UNX32L(hvecBias1, hvecBias1);
+        IVP_CVT24UNX32H(dacc1, hvecBias1, hvecBias1);
+        /* Initialize the acc of 2nd channel with bias data (only the 1st output row is stored in this tail block) */
+        xb_vec2Nx24 dacc2;
+        dacc2 = IVP_CVT24UNX32L(hvecBias2, hvecBias2);
+        IVP_CVT24UNX32H(dacc2, hvecBias2, hvecBias2);
+
+        /* priming of coeff load is done outside the innermost loop*/
+        pdvecCoeff1 = (xb_vec2Nx8 *) (pCoeff);
+        valign vaCoeffData1; vaCoeffData1 = IVP_LA2NX8_PP(pdvecCoeff1);
+
+        pdvecCoeff2 = (xb_vec2Nx8 *) (pCoeff + coeffPitch3 * enable2ndCh); /* same kernel as channel 1 when there is no 2nd channel */
+        valign vaCoeffData2; vaCoeffData2 = IVP_LA2NX8_PP(pdvecCoeff2);
+
+        pdvecIn = (MORPH_IDT_2Nx8 *) (pInput);
+
+        for (inCh = 0; inCh < numInCh; inCh++) /* Loop across input channels */
+        {
+          xb_vec2Nx8 dvecCoeffData1, dvecCoeffData2;
+          xb_vec2Nx8 dvecI0, dvecI1;
+
+          /* Load vectors from first row; only 7 input rows are loaded in this tail block */
+          valign vaInData; vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+          xb_vec2Nx8 dvecInData1; IVP_LA2NX8_XP(dvecInData1, vaInData, pdvecIn, inDataPitch1);
+
+          /* Load vectors from 2nd row */
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+          xb_vec2Nx8 dvecInData2; IVP_LA2NX8_XP(dvecInData2, vaInData, pdvecIn, inDataPitch1);
+
+          /* Load vectors from 3rd row */
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+          xb_vec2Nx8 dvecInData3; IVP_LA2NX8_XP(dvecInData3, vaInData, pdvecIn, inDataPitch1);
+
+          /* Load vectors from 4th row */
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+          xb_vec2Nx8 dvecInData4; IVP_LA2NX8_XP(dvecInData4, vaInData, pdvecIn, inDataPitch1);
+
+          /* Load vectors from 5th row */
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+          xb_vec2Nx8 dvecInData5; IVP_LA2NX8_XP(dvecInData5, vaInData, pdvecIn, inDataPitch1);
+
+          /* Load vectors from 6th row */
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+          xb_vec2Nx8 dvecInData6; IVP_LA2NX8_XP(dvecInData6, vaInData, pdvecIn, inDataPitch1);
+
+          /* Load vectors from 7th row; the update leaves a net advance of inDataPitch2 (next input channel) */
+          vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn);
+          xb_vec2Nx8 dvecInData7; IVP_LA2NX8_XP(dvecInData7, vaInData, pdvecIn, \
+                                                inDataPitch2 - 6 * inDataPitch1);
+
+          /* Load the 7x7 coefficients for 2 output channels; pointers step by coeffPitch2 per input channel */
+          IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch2);
+          IVP_LAV2NX8_XP(dvecCoeffData2, vaCoeffData2, pdvecCoeff2, coeffPitch2);
+
+          /* rearrange the coeff in desired format, so that MUL4T can be used */
+          dvecCoeffData1 = IVP_SHFL2NX8(dvecCoeffData1, dvecShflIdx);
+          dvecCoeffData2 = IVP_SHFL2NX8(dvecCoeffData2, dvecShflIdx);
+
+          /* Re-arrange the data in the desired format */
+          /* Assume input as 0,1,2, .. 63 for two rows */
+          /* After re-arrangement using DSEL operation, updated vectors would be */
+          /* dvecI0 : 0, 2, 4, ... */
+          /* dvecI1 : 1, 3, 5, ... (the second DSEL source still carries row r + 2, but its results are not stored in this tail block) */
+          IVP_DSEL2NX8I(dvecI1, dvecI0, dvecInData3, dvecInData1, IVP_DSELI_8B_DEINTERLEAVE_1);
+
+          /* Multiply 1st row with coeff from 1st output channel (coeff word 0 with dvecI0, word 1 with dvecI1) */
+          MORPH_OP_MUL4TA(dacc1, 0, dvecI0, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0));
+          MORPH_OP_MUL4TA(dacc1, 0, dvecI1, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1));
+
+          /* Multiply 1st row with coeff from 2nd output channel */
+          MORPH_OP_MUL4TA(dacc2, 0, dvecI0, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 0));
+          MORPH_OP_MUL4TA(dacc2, 0, dvecI1, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 1));
+
+
+          /* rearrange input vectors */
+          IVP_DSEL2NX8I(dvecI1, dvecI0, dvecInData4, dvecInData2, IVP_DSELI_8B_DEINTERLEAVE_1);
+
+
+          /* Multiply 2nd row with coeff from 1st output channel (word order is swapped on even kernel rows, matching the dvecShflIdx layout) */
+          MORPH_OP_MUL4TA(dacc1, 0, dvecI0, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 3));
+          MORPH_OP_MUL4TA(dacc1, 0, dvecI1, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 2));
+
+          /* Multiply 2nd row with coeff from 2nd output channel */
+          MORPH_OP_MUL4TA(dacc2, 0, dvecI0, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 3));
+          MORPH_OP_MUL4TA(dacc2, 0, dvecI1, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 2));
+
+          /* rearrange input vectors */
+          IVP_DSEL2NX8I(dvecI1, dvecI0, dvecInData5, dvecInData3, IVP_DSELI_8B_DEINTERLEAVE_1);
+
+          /* Multiply 3rd row with coeff from 1st output channel */
+          MORPH_OP_MUL4TA(dacc1, 0, dvecI0, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 4));
+          MORPH_OP_MUL4TA(dacc1, 0, dvecI1, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 5));
+
+          /* Multiply 3rd row with coeff from 2nd output channel */
+          MORPH_OP_MUL4TA(dacc2, 0, dvecI0, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 4));
+          MORPH_OP_MUL4TA(dacc2, 0, dvecI1, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 5));
+
+          /* rearrange input vectors */
+          IVP_DSEL2NX8I(dvecI1, dvecI0, dvecInData6, dvecInData4, IVP_DSELI_8B_DEINTERLEAVE_1);
+
+          /* Multiply 4th row with coeff from 1st output channel */
+          MORPH_OP_MUL4TA(dacc1, 0, dvecI0, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 7));
+          MORPH_OP_MUL4TA(dacc1, 0, dvecI1, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 6));
+
+          /* Multiply 4th row with coeff from 2nd output channel */
+          MORPH_OP_MUL4TA(dacc2, 0, dvecI0, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 7));
+          MORPH_OP_MUL4TA(dacc2, 0, dvecI1, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 6));
+
+
+          /* rearrange input vectors */
+          IVP_DSEL2NX8I(dvecI1, dvecI0, dvecInData7, dvecInData5, IVP_DSELI_8B_DEINTERLEAVE_1);
+
+
+          /* Multiply 5th row with coeff from 1st output channel */
+          MORPH_OP_MUL4TA(dacc1, 0, dvecI0, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 8));
+          MORPH_OP_MUL4TA(dacc1, 0, dvecI1, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 9));
+
+          /* Multiply 5th row with coeff from 2nd output channel */
+          MORPH_OP_MUL4TA(dacc2, 0, dvecI0, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 8));
+          MORPH_OP_MUL4TA(dacc2, 0, dvecI1, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 9));
+
+
+          /* rearrange input vectors: rows 8 and 9 are not loaded in this tail block, so 0 is passed as the second DSEL source */
+          IVP_DSEL2NX8I(dvecI1, dvecI0, 0, dvecInData6, IVP_DSELI_8B_DEINTERLEAVE_1);
+
+          /* Multiply 6th row with coeff from 1st output channel */
+          MORPH_OP_MUL4TA(dacc1, 0, dvecI0, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 11));
+          MORPH_OP_MUL4TA(dacc1, 0, dvecI1, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 10));
+
+          /* Multiply 6th row with coeff from 2nd output channel */
+          MORPH_OP_MUL4TA(dacc2, 0, dvecI0, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 11));
+          MORPH_OP_MUL4TA(dacc2, 0, dvecI1, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 10));
+
+          /* rearrange input vectors: 0 again stands in for the row that is not loaded in this tail block */
+          IVP_DSEL2NX8I(dvecI1, dvecI0, 0, dvecInData7, IVP_DSELI_8B_DEINTERLEAVE_1);
+
+          /* Multiply 7th row with coeff from 1st output channel */
+          MORPH_OP_MUL4TA(dacc1, 0, dvecI0, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 12));
+          MORPH_OP_MUL4TA(dacc1, 0, dvecI1, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 13));
+
+          /* Multiply 7th row with coeff from 2nd output channel */
+          MORPH_OP_MUL4TA(dacc2, 0, dvecI0, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 12));
+          MORPH_OP_MUL4TA(dacc2, 0, dvecI1, IVP_EXTRN_2X32 \
+                            (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 13));
+        } /* END for (inCh = 0; inCh < numInCh; inCh++)*/
+
+        /* Pack, Output Scale, Output Shift and clamping (per-channel scale in the VQ build, single scale otherwise) */
+        xb_vec2Nx8 dvecOut1L, dvecOut2L;
+        xb_vec2Nx8 dvecOut1H, dvecOut2H;
+#if DILATED_VQ_CONV == VQ_TRUE
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \
+                                      pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \
+                                      pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag);
+#elif DILATED_VQ_CONV == VQ_FALSE
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+        PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \
+                                      outScale, outShiftU, minLim, maxLim, typeFlag);
+#endif
+        /* variable length for output stores: clipped to the columns remaining in this output row */
+        varLen = XT_MIN(vectorizationWidth, outW - x);
+
+        /* Storing the 1st row output from 1st channel */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput);
+        valign vaOutData = IVP_ZALIGN();
+        IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * varLen);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        /* Storing the 1st row output from 2nd channel (store length is 0 when enable2ndCh is 0) */
+        pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch2 * enable2ndCh * bytesPerPixel);
+        IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, enable2ndCh * bytesPerPixel * varLen);
+        IVP_SAPOS2NX8_FP(vaOutData, pdvecOut);
+
+        pOutput += 2 * outDataPitch2 * bytesPerPixel; /* advance by two output channels */
+        pCoeff += 2 * coeffPitch3; /* advance by two kernels */
+      } /* END for (outCh = 0; outCh < numOutCh; outCh += 2)*/
+    } /* end of if(y < outH) */
+  } /* END for (x = 0; x < outW; x += vectorizationWidth)*/
+  return(XAI_ERROR_STATUS());
+}
+
+/******************************************************************************************
+* 7x7 MOW WHD Stride 4 - DEPTH 3 *
+* If number of input channels is equal to 3 *
+* this function is called.
* +******************************************************************************************/ + +static _XAI_INLINE_ void MAKE_NAME(MAKE_NAME_VQ(convolved, 3D_S_7x7j4d1), S8IX_MOW_WHD_DEPTH3) MAKE_ARGUMENTS(inTile, coeffTile, biasArray, outTile, param) +{ + /* Depth-3 variant of the 7x7 MOW_WHD convolution: the three input channels are + * processed by three unrolled sections (1st/2nd/3rd inCh) instead of a loop over + * input channels. The xaiConvolved 7x7j4d1 S8IX_MOW_WHD entry point calls this + * variant when dim3 of inTile equals 3. */ /* Getting parameters from the tile structures (inW includes both dim1 edges) */ + const int32_t inW = XAI_TILE3D_GET_DIM1(inTile) + \ + XAI_TILE3D_GET_DIM1_EDGE1(inTile) + XAI_TILE3D_GET_DIM1_EDGE2(inTile); + const int32_t outW = XAI_TILE3D_GET_DIM1(outTile); + const int32_t outH = XAI_TILE3D_GET_DIM2(outTile); + const int32_t numOutCh = XAI_TILE3D_GET_DIM3(outTile); + + /* Kernel Size (WHDN)*/ + const int32_t kSizeU = XAI_TILE4D_GET_DIM1(coeffTile); + + /* CNN convolution parameters */ + const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param); + const uint8_t outShiftU = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param); + const uint8_t enableReLu = XAI_CNN_CONV_GET_FLAG_RELU(param); + const uint8_t stride = XAI_CNN_CONV_GET_STRIDE(param); + + /* Pitches of Coefficient Data (WHDN) in dim1, dim2 and dim3 */ + const int32_t coeffPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTile); + const int32_t coeffPitch2 = XAI_TILE4D_GET_DIM2_PITCH(coeffTile); + const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile); + + /* Pitches of Input Data (WHD) in dim1 and dim2 */ + const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile); + const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile); + + /* Pitches of Output Data (WHD) in dim1 and dim2 */ + const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile); + const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile); + + /* Data Pointers of input, output, coefficient and bias data */ + MORPH_IDT_SCALAR* pInData = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(inTile); + int8_t* pOutData = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile); + int32_t* pBiasData = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray); + int8_t* pCoeffData = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile); +#if DILATED_VQ_CONV == VQ_TRUE +
uint16_t* restrict pOutScaleData = (uint16_t *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray); +#elif DILATED_VQ_CONV == VQ_FALSE + const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param); +#endif + /* Move pointer to the start of the data (including edge), + * i.e. back by kSizeU / 2 rows and kSizeU / 2 columns */ + pInData = &pInData[-((kSizeU / 2) * inDataPitch1 + (kSizeU / 2))]; + + /* Setting the limits for output data according to ReLu Flag and outTileType: + * with ReLU the limits come from param, otherwise they are the full range + * of the output type (S16, S8, or 0..UCHAR_MAX for any other type) */ + int32_t minLim, maxLim; + if (enableReLu) + { + minLim = XAI_CNN_CONV_GET_RELU_MIN(param); + maxLim = XAI_CNN_CONV_GET_RELU_MAX(param); + } + else + { + minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \ + SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0); + maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \ + : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX); + } + /* typeFlag is 1 for S16 output tiles and 0 otherwise */ const int8_t typeFlag = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0; + const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile); + + MORPH_IDT_2Nx8 *restrict pdvecIn; + xb_vec2Nx8* restrict pdvecOut; + xb_vec2Nx8* restrict pdvecCoeff1, *restrict pdvecCoeff2, *restrict pdvecCoeff3, \ + * restrict pdvecCoeff4; + + /* Variable Declarations */ + int32_t outCh, x, y, ky; + int32_t varLen; + /* Number of output elements that can be generated + * with 2 input vector loads(64 way).*/ + const int32_t vectorizationWidth = (((4 * XCHAL_IVPN_SIMD_WIDTH) - kSizeU) / stride) + 1; + + /* loop across output depth is unrolled by 4 + * , producing lanes from 4 output channels + * in one iteration. Since vectorization width + * is just half the width of the accumulator, + * loop across output height is also unrolled by 2. + * Unrolling across output height makes it possible + * to utilize all the 64 MACs in the accumulator.
+ * + * Data loaded from the 2 input rows is concatenated + * in such a manner that lower half of the output + * vector gives the first output row and the upper + * half of the output vector gives the second output row. */ + for (x = 0; x < outW; x += vectorizationWidth) /* Loop across Output width */ + { + /* out of bound flag: scales the last argument of every MORPH_OP_LOAD_2Nx8 below. + * NOTE(review): presumably XT_SALT(a, b) is 1 when a < b, i.e. the flag is 1 while more + * than 2 * XCHAL_IVPN_SIMD_WIDTH input elements remain from column stride * x - confirm */ + int32_t flag = XT_SALT(2 * XCHAL_IVPN_SIMD_WIDTH, inW - stride * x); + + for (y = 0; y < outH; y += 2) /* Loop across Output height */ + { + /* In order to handle odd heights: enable2ndRow scales the 2nd row's input offset, + * output offset and store lengths */ + int32_t enable2ndRow = XT_SALT(y, outH - 1); + /* initialize output data pointer */ + int8_t *pOutput = &pOutData[(y * outDataPitch1 + x) * bytesPerPixel]; + + /* initialize coeff and bias data pointers to the first output channel */ + int8_t *pCoeff = &pCoeffData[0]; + int32_t *pBias = &pBiasData[0]; + + for (outCh = 0; outCh < numOutCh; outCh += 4) /* Loop across Output depth */ + { + /* In order to handle output depths that are not a multiple of 4: + * the enable flags scale the coeff/bias/output offsets and store lengths + * of the 2nd, 3rd and 4th channel of the group */ + int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1); + int32_t enable3rdCh = XT_SALT(outCh, numOutCh - 2); + int32_t enable4thCh = XT_SALT(outCh, numOutCh - 3); + + /* loads and replicate bias data */ + xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4 * enable2ndCh); + xb_vecN_2x32v hvecBias2; IVP_LSRN_2X32_XP(hvecBias2, pBias, 4 * enable3rdCh); + xb_vecN_2x32v hvecBias3; IVP_LSRN_2X32_XP(hvecBias3, pBias, 4 * enable4thCh); + xb_vecN_2x32v hvecBias4; IVP_LSRN_2X32_XP(hvecBias4, pBias, 4); + + /* wide vectors(accumulators) initialized with bias */ + xb_vec2Nx24 dacc1, dacc2, dacc3, dacc4; + dacc1 = IVP_CVT24UNX32L(hvecBias1, hvecBias1); + IVP_CVT24UNX32H(dacc1, hvecBias1, hvecBias1); + dacc2 = IVP_CVT24UNX32L(hvecBias2, hvecBias2); + IVP_CVT24UNX32H(dacc2, hvecBias2, hvecBias2); + dacc3 = IVP_CVT24UNX32L(hvecBias3, hvecBias3); + IVP_CVT24UNX32H(dacc3, hvecBias3, hvecBias3); + dacc4 = IVP_CVT24UNX32L(hvecBias4, hvecBias4); + IVP_CVT24UNX32H(dacc4, hvecBias4, hvecBias4); + + /* priming of coeff load is done outside the innermost loop*/ + pdvecCoeff1 = (xb_vec2Nx8 *) (pCoeff); + valign vaCoeffData1;
vaCoeffData1 = IVP_LA2NX8_PP(pdvecCoeff1); + + pdvecCoeff2 = (xb_vec2Nx8 *) (pCoeff + coeffPitch3 * enable2ndCh); + valign vaCoeffData2; vaCoeffData2 = IVP_LA2NX8_PP(pdvecCoeff2); + + pdvecCoeff3 = (xb_vec2Nx8 *) (pCoeff + 2 * coeffPitch3 * enable3rdCh); + valign vaCoeffData3; vaCoeffData3 = IVP_LA2NX8_PP(pdvecCoeff3); + + pdvecCoeff4 = (xb_vec2Nx8 *) (pCoeff + 3 * coeffPitch3 * enable4thCh); + valign vaCoeffData4; vaCoeffData4 = IVP_LA2NX8_PP(pdvecCoeff4); + /*************************** 1st inCh ******************************/ + /* initialize input data pointer (first input channel, input position of output (x, y)) */ + MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * stride * y + stride * x]; + + /* variable declarations for input and coeff vectors */ + xb_vec2Nx8 dvecCoeffData1, dvecCoeffData2, dvecCoeffData3, dvecCoeffData4; + MORPH_IDT_2Nx8 dvecInData11, dvecInData12; + MORPH_IDT_2Nx8 dvecInData21, dvecInData22; + MORPH_IDT_2Nx8 dvecData1, dvecData2, dvecData3, dvecData4, dvecData5, dvecData6, dvecData7; + + /* load coeff for all the 4 output channels*/ + IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch2); + IVP_LAV2NX8_XP(dvecCoeffData2, vaCoeffData2, pdvecCoeff2, coeffPitch2); + IVP_LAV2NX8_XP(dvecCoeffData3, vaCoeffData3, pdvecCoeff3, coeffPitch2); + IVP_LAV2NX8_XP(dvecCoeffData4, vaCoeffData4, pdvecCoeff4, coeffPitch2); + + for (ky = 0; ky < 7; ky++) /* Loop across kernel height */ + { + /* loads 1st input row */ + pdvecIn = (MORPH_IDT_2Nx8 *) (pInput); + valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn); + MORPH_OP_LOAD_2Nx8(dvecInData11, vaInData, pdvecIn, 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + MORPH_OP_LOAD_2Nx8(dvecInData12, vaInData, pdvecIn, 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + + /* loads 5th(corresponding to the 2nd output row) input row */ + pdvecIn = (MORPH_IDT_2Nx8 *) (pInput + stride * inDataPitch1 * enable2ndRow); + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn); + MORPH_OP_LOAD_2Nx8(dvecInData21, vaInData, pdvecIn, 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + MORPH_OP_LOAD_2Nx8(dvecInData22,
vaInData, pdvecIn, 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + pInput += inDataPitch1; + + /* 32 elements from 1st row and 32 elements from 2nd row are concatenated here + * If 1st input row is 0,1,2,3,4,5,6,7,8,9,...127, and the 2nd input row is + * 128,129,130,131.........252,253,254,255, Data should be arranged as + * + * dvecData1 : 0, 4, 8,...120,124,128,132,136,...248,252 + * dvecData2 : 1, 5, 9,...121,125,129,133,137,...249,253 + * dvecData3 : 2, 6,10,...122,126,130,134,138,...250,254 + * dvecData4 : 3, 7,11,...123,127,131,135,139,...251,255 + * + * Lower half of the vectors contain data from 1st input row and + * upper half of the vectors contain data from the 2nd input row + * (the input row that feeds the 2nd output row). + * + */ + + IVP_DSEL2NX8I(dvecInData12, dvecInData11, dvecInData12, dvecInData11, \ + IVP_DSELI_8B_DEINTERLEAVE_1); + IVP_DSEL2NX8I(dvecInData22, dvecInData21, dvecInData22, dvecInData21, \ + IVP_DSELI_8B_DEINTERLEAVE_1); + IVP_DSEL2NX8I(dvecData3, dvecData1, dvecInData21, dvecInData11, \ + IVP_DSELI_8B_DEINTERLEAVE_1); + IVP_DSEL2NX8I(dvecData4, dvecData2, dvecInData22, dvecInData12, \ + IVP_DSELI_8B_DEINTERLEAVE_1); + + /* NOTE(review): dvecData5..7 presumably supply kernel columns 4..6 (dvecData1..3 moved by one element) - confirm */ dvecData5 = IVP_SEL2NX8I(0, dvecData1, IVP_SELI_8B_ROTATE_RIGHT_1); + dvecData6 = IVP_SEL2NX8I(0, dvecData2, IVP_SELI_8B_ROTATE_RIGHT_1); + dvecData7 = IVP_SEL2NX8I(0, dvecData3, IVP_SELI_8B_ROTATE_RIGHT_1); + + /* multiplies data from two rows(Lower and upper half of dvecData) + * with coeff from 1st output channel and accumulate */ + + /* IVP_EXTRVRN_2X32 extracts the required coeff from the + * coeff vector. In every iteration ky is updated therefore it + * extracts coeff from the next coeff row in the successive ky + * iterations.
"ky * coeffPitch1 + 4" extracts coeffs next to + * first four coeff in a row + */ + + MORPH_OP_MULQA(dacc1, dvecData4, dvecData3, dvecData2, dvecData1, IVP_EXTRVRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), \ + ky * coeffPitch1)); + MORPH_OP_MULQA(dacc1, 0, dvecData7, dvecData6, dvecData5, IVP_EXTRVRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), \ + (ky * coeffPitch1 + 4))); + + /* multiplies data from two rows(Lower and upper half of dvecData) + * with coeff from 2nd output channel and accumulate */ + MORPH_OP_MULQA(dacc2, dvecData4, dvecData3, dvecData2, dvecData1, IVP_EXTRVRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), \ + ky * coeffPitch1)); + MORPH_OP_MULQA(dacc2, 0, dvecData7, dvecData6, dvecData5, IVP_EXTRVRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), \ + (ky * coeffPitch1 + 4))); + + /* multiplies data from two rows(Lower and upper half of dvecData) + * with coeff from 3rd output channel and accumulate */ + MORPH_OP_MULQA(dacc3, dvecData4, dvecData3, dvecData2, dvecData1, IVP_EXTRVRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), \ + ky * coeffPitch1)); + MORPH_OP_MULQA(dacc3, 0, dvecData7, dvecData6, dvecData5, IVP_EXTRVRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), \ + (ky * coeffPitch1 + 4))); + + /* multiplies data from two rows(Lower and upper half of dvecData) + * with coeff from 4th output channel and accumulate */ + MORPH_OP_MULQA(dacc4, dvecData4, dvecData3, dvecData2, dvecData1, IVP_EXTRVRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData4)), \ + ky * coeffPitch1)); + MORPH_OP_MULQA(dacc4, 0, dvecData7, dvecData6, dvecData5, IVP_EXTRVRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData4)), \ + (ky * coeffPitch1 + 4))); + } /* end of for (ky = 0; ky < 7; ky++)*/ + + /********************************* 2nd inCh ***************************************/
+ + /* initialize input data pointer (second input channel: one inDataPitch2 further) */ + pInput = &pInData[inDataPitch2 + inDataPitch1 * stride * y + stride * x]; + + /* load coeff for all the 4 output channels*/ + IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch2); + IVP_LAV2NX8_XP(dvecCoeffData2, vaCoeffData2, pdvecCoeff2, coeffPitch2); + IVP_LAV2NX8_XP(dvecCoeffData3, vaCoeffData3, pdvecCoeff3, coeffPitch2); + IVP_LAV2NX8_XP(dvecCoeffData4, vaCoeffData4, pdvecCoeff4, coeffPitch2); + + for (ky = 0; ky < 7; ky++) /* Loop across kernel height */ + { + /* loads 1st input row */ + pdvecIn = (MORPH_IDT_2Nx8 *) (pInput); + valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn); + MORPH_OP_LOAD_2Nx8(dvecInData11, vaInData, pdvecIn, 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + MORPH_OP_LOAD_2Nx8(dvecInData12, vaInData, pdvecIn, 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + + /* loads 5th(corresponding to the 2nd output row) input row */ + pdvecIn = (MORPH_IDT_2Nx8 *) (pInput + stride * inDataPitch1 * enable2ndRow); + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn); + MORPH_OP_LOAD_2Nx8(dvecInData21, vaInData, pdvecIn, 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + MORPH_OP_LOAD_2Nx8(dvecInData22, vaInData, pdvecIn, 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + pInput += inDataPitch1; + + IVP_DSEL2NX8I(dvecInData12, dvecInData11, dvecInData12, dvecInData11, \ + IVP_DSELI_8B_DEINTERLEAVE_1); + IVP_DSEL2NX8I(dvecInData22, dvecInData21, dvecInData22, dvecInData21, \ + IVP_DSELI_8B_DEINTERLEAVE_1); + IVP_DSEL2NX8I(dvecData3, dvecData1, dvecInData21, dvecInData11, \ + IVP_DSELI_8B_DEINTERLEAVE_1); + IVP_DSEL2NX8I(dvecData4, dvecData2, dvecInData22, dvecInData12, \ + IVP_DSELI_8B_DEINTERLEAVE_1); + + dvecData5 = IVP_SEL2NX8I(0, dvecData1, IVP_SELI_8B_ROTATE_RIGHT_1); + dvecData6 = IVP_SEL2NX8I(0, dvecData2, IVP_SELI_8B_ROTATE_RIGHT_1); + dvecData7 = IVP_SEL2NX8I(0, dvecData3, IVP_SELI_8B_ROTATE_RIGHT_1); + + /* multiplies data from two rows(Lower and upper half of dvecData) + * with coeff from 1st output channel and accumulate */ + + /* IVP_EXTRVRN_2X32
extracts the required coeff from the + * coeff vector. In every iteration ky is updated therefore it + * extracts coeff from the next coeff row in the successive ky + * iterations. "ky * coeffPitch1 + 4" extracts coeffs next to + * first four coeff in a row + */ + + MORPH_OP_MULQA(dacc1, dvecData4, dvecData3, dvecData2, dvecData1, IVP_EXTRVRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), \ + ky * coeffPitch1)); + MORPH_OP_MULQA(dacc1, 0, dvecData7, dvecData6, dvecData5, IVP_EXTRVRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), \ + (ky * coeffPitch1 + 4))); + + /* multiplies data from two rows(Lower and upper half of dvecData) + * with coeff from 2nd output channel and accumulate */ + MORPH_OP_MULQA(dacc2, dvecData4, dvecData3, dvecData2, dvecData1, IVP_EXTRVRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), \ + ky * coeffPitch1)); + MORPH_OP_MULQA(dacc2, 0, dvecData7, dvecData6, dvecData5, IVP_EXTRVRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), \ + (ky * coeffPitch1 + 4))); + + /* multiplies data from two rows(Lower and upper half of dvecData) + * with coeff from 3rd output channel and accumulate */ + MORPH_OP_MULQA(dacc3, dvecData4, dvecData3, dvecData2, dvecData1, IVP_EXTRVRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), \ + ky * coeffPitch1)); + MORPH_OP_MULQA(dacc3, 0, dvecData7, dvecData6, dvecData5, IVP_EXTRVRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), \ + (ky * coeffPitch1 + 4))); + + /* multiplies data from two rows(Lower and upper half of dvecData) + * with coeff from 4th output channel and accumulate */ + MORPH_OP_MULQA(dacc4, dvecData4, dvecData3, dvecData2, dvecData1, IVP_EXTRVRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData4)), \ + ky * coeffPitch1)); + MORPH_OP_MULQA(dacc4, 0, dvecData7, dvecData6, dvecData5, IVP_EXTRVRN_2X32 \ +
(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData4)), \ + (ky * coeffPitch1 + 4))); + } /* end of for (ky = 0; ky < 7; ky++)*/ + + /************************************ 3rd inCh *************************************/ + /* initialize input data pointer (third input channel: two inDataPitch2 further) */ + pInput = &pInData[2 * inDataPitch2 + inDataPitch1 * stride * y + stride * x]; + + /* load coeff for all the 4 output channels*/ + IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch2); + IVP_LAV2NX8_XP(dvecCoeffData2, vaCoeffData2, pdvecCoeff2, coeffPitch2); + IVP_LAV2NX8_XP(dvecCoeffData3, vaCoeffData3, pdvecCoeff3, coeffPitch2); + IVP_LAV2NX8_XP(dvecCoeffData4, vaCoeffData4, pdvecCoeff4, coeffPitch2); + + for (ky = 0; ky < 7; ky++) /* Loop across kernel height */ + { + /* loads 1st input row */ + pdvecIn = (MORPH_IDT_2Nx8 *) (pInput); + valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn); + MORPH_OP_LOAD_2Nx8(dvecInData11, vaInData, pdvecIn, 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + MORPH_OP_LOAD_2Nx8(dvecInData12, vaInData, pdvecIn, 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + + /* loads 5th(corresponding to the 2nd output row) input row */ + pdvecIn = (MORPH_IDT_2Nx8 *) (pInput + stride * inDataPitch1 * enable2ndRow); + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn); + MORPH_OP_LOAD_2Nx8(dvecInData21, vaInData, pdvecIn, 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + MORPH_OP_LOAD_2Nx8(dvecInData22, vaInData, pdvecIn, 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + pInput += inDataPitch1; + + IVP_DSEL2NX8I(dvecInData12, dvecInData11, dvecInData12, dvecInData11, \ + IVP_DSELI_8B_DEINTERLEAVE_1); + IVP_DSEL2NX8I(dvecInData22, dvecInData21, dvecInData22, dvecInData21, \ + IVP_DSELI_8B_DEINTERLEAVE_1); + IVP_DSEL2NX8I(dvecData3, dvecData1, dvecInData21, dvecInData11, \ + IVP_DSELI_8B_DEINTERLEAVE_1); + IVP_DSEL2NX8I(dvecData4, dvecData2, dvecInData22, dvecInData12, \ + IVP_DSELI_8B_DEINTERLEAVE_1); + + dvecData5 = IVP_SEL2NX8I(0, dvecData1, IVP_SELI_8B_ROTATE_RIGHT_1); + dvecData6 = IVP_SEL2NX8I(0, dvecData2,
IVP_SELI_8B_ROTATE_RIGHT_1); + dvecData7 = IVP_SEL2NX8I(0, dvecData3, IVP_SELI_8B_ROTATE_RIGHT_1); + + /* multiplies data from two rows(Lower and upper half of dvecData) + * with coeff from 1st output channel and accumulate */ + + /* IVP_EXTRVRN_2X32 extracts the required coeff from the + * coeff vector. In every iteration ky is updated therefore it + * extracts coeff from the next coeff row in the successive ky + * iterations. "ky * coeffPitch1 + 4" extracts coeffs next to + * first four coeff in a row + */ + + MORPH_OP_MULQA(dacc1, dvecData4, dvecData3, dvecData2, dvecData1, IVP_EXTRVRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), \ + ky * coeffPitch1)); + MORPH_OP_MULQA(dacc1, 0, dvecData7, dvecData6, dvecData5, IVP_EXTRVRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), \ + (ky * coeffPitch1 + 4))); + + /* multiplies data from two rows(Lower and upper half of dvecData) + * with coeff from 2nd output channel and accumulate */ + MORPH_OP_MULQA(dacc2, dvecData4, dvecData3, dvecData2, dvecData1, IVP_EXTRVRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), \ + ky * coeffPitch1)); + MORPH_OP_MULQA(dacc2, 0, dvecData7, dvecData6, dvecData5, IVP_EXTRVRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), \ + (ky * coeffPitch1 + 4))); + + /* multiplies data from two rows(Lower and upper half of dvecData) + * with coeff from 3rd output channel and accumulate */ + MORPH_OP_MULQA(dacc3, dvecData4, dvecData3, dvecData2, dvecData1, IVP_EXTRVRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), \ + ky * coeffPitch1)); + MORPH_OP_MULQA(dacc3, 0, dvecData7, dvecData6, dvecData5, IVP_EXTRVRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), \ + (ky * coeffPitch1 + 4))); + + /* multiplies data from two rows(Lower and upper half of dvecData) + * with coeff from 4th output channel and accumulate */ + MORPH_OP_MULQA(dacc4, dvecData4, dvecData3,
dvecData2, dvecData1, IVP_EXTRVRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData4)), \ + ky * coeffPitch1)); + MORPH_OP_MULQA(dacc4, 0, dvecData7, dvecData6, dvecData5, IVP_EXTRVRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData4)), \ + (ky * coeffPitch1 + 4))); + } /* end of for (ky = 0; ky < 7; ky++)*/ + + + /* Pack, Output Scale, Output Shift and clamping + * (per-output-channel scale from pOutScaleData in the VQ build, single outScale otherwise) */ + xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L; + xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H; +#if DILATED_VQ_CONV == VQ_TRUE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \ + pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \ + pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc3, packShiftAccU, \ + pOutScaleData[outCh + 2 * enable3rdCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc4, packShiftAccU, \ + pOutScaleData[outCh + 3 * enable4thCh], outShiftU, minLim, maxLim, typeFlag); +#elif DILATED_VQ_CONV == VQ_FALSE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc3, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc4, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); +#endif + /* variable store count: output elements still to be written in this row, + * capped at vectorizationWidth */ + varLen = XT_MIN(outW - x, vectorizationWidth); + + /* store the first half of the output vectors + * dvecOut1, dvecOut2, dvecOut3, dvecOut4 + */ + + /* Storing the first row outputs, first channel */ + pdvecOut = (xb_vec2Nx8 *) (pOutput); + valign
vaOutData = IVP_ZALIGN(); + IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * varLen); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the first row outputs, 2nd channel */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + enable2ndCh * outDataPitch2 * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * varLen * enable2ndCh); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the first row outputs, 3rd channel */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + 2 * outDataPitch2 * enable3rdCh * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * varLen * enable3rdCh); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the first row outputs, 4th channel */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + 3 * outDataPitch2 * enable4thCh * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * varLen * enable4thCh); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* extract the second-row half of the output vectors + * dvecOut1, dvecOut2, dvecOut3, dvecOut4 + * and store in the next row. + * When typeFlag is 0 the SWAP_32 shuffle of dvecOutNL is stored with length varLen; + * when typeFlag is 1 (S16 output) dvecOutNH is stored with length 2 * varLen. + * The other store of each pair has length 0. + */ + + /* Storing the 2nd row outputs, first channel */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch1 * enable2ndRow * bytesPerPixel); + IVP_SAV2NX8_XP(IVP_SHFL2NX8I(dvecOut1L, IVP_SHFLI_8B_SWAP_32), vaOutData, pdvecOut, \ + (-typeFlag + 1) * varLen * enable2ndRow); + IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * 2 * varLen * enable2ndRow); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the 2nd row outputs, 2nd channel */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + (outDataPitch2 * enable2ndCh + \ + outDataPitch1 * enable2ndRow) * bytesPerPixel); + IVP_SAV2NX8_XP(IVP_SHFL2NX8I(dvecOut2L, IVP_SHFLI_8B_SWAP_32), vaOutData, pdvecOut, \ + (-typeFlag + 1) * varLen * enable2ndCh * enable2ndRow); + IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * 2 * \ + varLen * enable2ndCh * enable2ndRow); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the 2nd row outputs, 3rd channel */ +
pdvecOut = (xb_vec2Nx8 *) (pOutput + (2 * outDataPitch2 * enable3rdCh + \ + outDataPitch1 * enable2ndRow) * bytesPerPixel); + IVP_SAV2NX8_XP(IVP_SHFL2NX8I(dvecOut3L, IVP_SHFLI_8B_SWAP_32), vaOutData, pdvecOut, \ + (-typeFlag + 1) * varLen * enable3rdCh * enable2ndRow); + IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * 2 * \ + varLen * enable3rdCh * enable2ndRow); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the 2nd row outputs, 4th channel */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + (3 * outDataPitch2 * enable4thCh + \ + outDataPitch1 * enable2ndRow) * bytesPerPixel); + IVP_SAV2NX8_XP(IVP_SHFL2NX8I(dvecOut4L, IVP_SHFLI_8B_SWAP_32), vaOutData, pdvecOut, \ + (-typeFlag + 1) * varLen * enable4thCh * enable2ndRow); + IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * 2 * \ + varLen * enable4thCh * enable2ndRow); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* advance to the next group of 4 output channels */ pOutput += 4 * outDataPitch2 * bytesPerPixel; + pCoeff += 4 * coeffPitch3; + } /* end of (outCh = 0; outCh < numOutCh; outCh += 4)*/ + } /* end of for (y = 0; y < outH; y += 2)*/ + } /* end of for (x = 0; x < outW; x += vectorizationWidth)*/ +} + +/****************************************************************************************** +* xaiConvolved(VQ)3D_S_7x7j4d1I8S8IX_MOW_WHD +* ***************************************************************************************/ + +/******************************************************************************/ +/* Description : P6 optimized generic implementation for 7x7 3D convolution. */ +/* Based on MORPH pre-processor specifiers, code implementation */ +/* is generated during preprocessing stage.
This method can be */ +/* used to generate 7x7 3D dilated convolution function and 7x7 */ +/* 3D VQ dilated convolution function for U8 bit and S8 bit */ +/* input data with input stride equal to 4 */ +/* Inputs : Input Data Tile, Coeff Data Tile, Bias Array, */ +/* Output scale array, CNN convolution params structure */ +/* Outputs : XI Error Code */ +/* InOuts : Output Tile */ +/* Assumptions : CoeffData is S8 */ +/* biasArray is signed 32b, value not exceeding signed 24b */ +/* Output scale array is U16 */ +/* OutData is S8 / U8 / S16 */ +/* Kernel Size is 7x7xDxN */ +/* Input and Output are in WHD format */ +/* Coeff is in WHDN format */ +/******************************************************************************/ + +/****************** xaiConvolvedVQ3D_S_7x7j4d1_S8S8IX_MOW_WHD ******************/ +/****************** xaiConvolvedVQ3D_S_7x7j4d1_U8S8IX_MOW_WHD ******************/ +/******************* xaiConvolved3D_S_7x7j4d1_S8S8IX_MOW_WHD *******************/ +/******************* xaiConvolved3D_S_7x7j4d1_U8S8IX_MOW_WHD *******************/ + +XAI_ERR_TYPE MAKE_NAME(MAKE_NAME_VQ(xaiConvolved, 3D_S_7x7j4d1), S8IX_MOW_WHD) MAKE_ARGUMENTS(inTile, coeffTile, biasArray, outTile, param) +{ + /* Error Checks */ + XAI_ERROR_CHECKS() + { + XAI_CHECK_TILE4D_S8(coeffTile); + XAI_CHECK_CONV_OUTPUT_TILE3D(outTile); + MORPH_IDT_CHECK(inTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile); + XAI_CHECK_TILE4D_IN_DRAM_BOUNDARY(coeffTile); + XAI_CHECK_POINTER(param); + XAI_CHECK_ARRAY_S32(biasArray); + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTile); + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(coeffTile, outTile); + XAI_CHECK_KERNEL_SIZE(coeffTile, 7); + XAI_CHECK_TILE3D_DATA_ORDER(inTile, XAI_WHD); + XAI_CHECK_TILE3D_DATA_ORDER(outTile, XAI_WHD); + XAI_CHECK_TILE4D_DATA_ORDER(coeffTile, XAI_WHDN); + XAI_CHECK_DILATION(param, 1); + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_DILATIONX(param) == XAI_CNN_CONV_GET_DILATIONY(param), \ + 
XAI_ERR_BADARG, "Dilation along width = %hhu and height = %hhu\nDilation along width and height should be equal", \ + XAI_CNN_CONV_GET_DILATIONX(param), XAI_CNN_CONV_GET_DILATIONY(param)); + XAI_CHECK_TILE3D_EDGE(inTile, 3); + XAI_CHECK_STRIDE(param, 4); + XAI_CHECK_ERROR((XAI_CNN_CONV_GET_STRIDEX(param) == XAI_CNN_CONV_GET_STRIDEY(param)), \ + XAI_ERR_BADARG, "\nStride along width = %hhu and height = %hhu\nStride along width and height should be equal", \ + XAI_CNN_CONV_GET_STRIDEX(param), XAI_CNN_CONV_GET_STRIDEY(param)); + XAI_CHECK_CONSISTENCY_MOW_WHD(inTile, coeffTile, biasArray, outTile, param); + XAI_CHECK_COEFFTILE_CONTIGUOUS(coeffTile, param); + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_ACCUM_SHIFT(param) < 24, \ + XAI_ERR_NORM, "\nThe accumulator shift = %hhu, value should be less than 24", \ + XAI_CNN_CONV_GET_ACCUM_SHIFT(param)); + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_OUTPUT_SHIFT(param) < 32, \ + XAI_ERR_NORM, "\nThe output shift = %hhu, value should be less than 32", \ + XAI_CNN_CONV_GET_OUTPUT_SHIFT(param)); + XAI_CHECK_CONV_RELU_LIMITS_IX(param, outTile); +#if DILATED_VQ_CONV == VQ_TRUE + XAI_CHECK_ARRAY_U16(outputScaleArray); + XAI_CHECK_ERROR(XAI_ARRAY_GET_WIDTH(outputScaleArray) >= XAI_TILE4D_GET_DIM4(coeffTile), \ + XAI_ERR_DATASIZE, "\nWidth of Output Scale Array = %d, Number of Kernels = %d\nWidth of Output Scale Array should be greater than or equal to Number of Kernels", \ + XAI_ARRAY_GET_WIDTH(outputScaleArray), XAI_TILE4D_GET_DIM4(coeffTile)); + XAI_CHECK_ERROR((((uintptr_t) (XAI_ARRAY_GET_DATA_PTR(outputScaleArray)) & \ + 0x1) == 0), XAI_ERR_NORM, "The output scale array is not aligned to 2 byte boundary"); +#endif + } +#if DILATED_VQ_CONV == VQ_FALSE + if (XAI_CNN_CONV_GET_OUTPUT_SCALE(param) == 0) + { + int32_t fillValue; + int32_t reluFlag = XAI_CNN_CONV_GET_FLAG_RELU(param); + fillValue = reluFlag ? 
(CLAMP(0, XAI_CNN_CONV_GET_RELU_MIN(param), XAI_CNN_CONV_GET_RELU_MAX(param))) : 0; + return(xaiFillTile3D(outTile, fillValue, 0)); + } +#endif + if (XAI_TILE3D_GET_DIM3(inTile) == 3) + { + MAKE_NAME(MAKE_NAME_VQ(convolved, 3D_S_7x7j4d1), S8IX_MOW_WHD_DEPTH3) MAKE_PARAMS(inTile, coeffTile, biasArray, outTile, param); + return(XAI_ERROR_STATUS()); + } + + /* Getting parameters from the tile structures */ + const int32_t inW = XAI_TILE3D_GET_DIM1(inTile) + \ + XAI_TILE3D_GET_DIM1_EDGE1(inTile) + XAI_TILE3D_GET_DIM1_EDGE2(inTile); + const int32_t outW = XAI_TILE3D_GET_DIM1(outTile); + const int32_t outH = XAI_TILE3D_GET_DIM2(outTile); + const int32_t numInCh = XAI_TILE3D_GET_DIM3(inTile); + const int32_t numOutCh = XAI_TILE3D_GET_DIM3(outTile); + + /* Kernel Size (WHDN)*/ + const int32_t kSizeU = XAI_TILE4D_GET_DIM1(coeffTile); + + /* CNN convolution parameters */ + const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param); + const uint8_t outShiftU = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param); + const uint8_t enableReLu = XAI_CNN_CONV_GET_FLAG_RELU(param); + const uint8_t stride = XAI_CNN_CONV_GET_STRIDE(param); + + /* Pitches of Coefficient Data (WHDN) dim3 */ + const int32_t coeffPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTile); + const int32_t coeffPitch2 = XAI_TILE4D_GET_DIM2_PITCH(coeffTile); + const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile); + + /* Pitches of Input Data (WHD) in dim1 and dim2 */ + const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile); + const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile); + + /* Pitch of Output Data (WHD) in dim1 and dim2 */ + const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile); + const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile); + + /* Data Pointers of input, output, coefficient and bias data */ + MORPH_IDT_SCALAR* pInData = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(inTile); + int8_t* pOutData = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile); + int32_t* pBiasData 
= (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray); + int8_t* pCoeffData = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile); +#if DILATED_VQ_CONV == VQ_TRUE + uint16_t* restrict pOutScaleData = (uint16_t *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray); +#elif DILATED_VQ_CONV == VQ_FALSE + const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param); +#endif + /* Move pointer to the start of the data (including edge) */ + pInData = &pInData[-((kSizeU / 2) * inDataPitch1 + (kSizeU / 2))]; + + /* Setting the limits for output data according to ReLu Flag and outTileType */ + int32_t minLim, maxLim; + if (enableReLu) + { + minLim = XAI_CNN_CONV_GET_RELU_MIN(param); + maxLim = XAI_CNN_CONV_GET_RELU_MAX(param); + } + else + { + minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \ + SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0); + maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \ + : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX); + } + const int8_t typeFlag = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0; + const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile); + + MORPH_IDT_2Nx8 *restrict pdvecIn; + xb_vec2Nx8* restrict pdvecOut; + xb_vec2Nx8* restrict pdvecCoeff1, * restrict pdvecCoeff2, * restrict pdvecCoeff3, \ + * restrict pdvecCoeff4; + + /* Variable Declarations */ + int32_t inCh, outCh, x, y, ky; + int32_t varLen; + /* Number of output elements that can be generated + * with 2 input vector loads(64 way).*/ + const int32_t vectorizationWidth = (((4 * XCHAL_IVPN_SIMD_WIDTH) - kSizeU) / stride) + 1; + + /* loop across output depth is unrolled by 4 + * , producing lanes from 4 output channels + * in one iteration. Since vectorization width + * is just half the width of the accumulator, + * loop across output height is also unrolled by 2. + * Unrolling across output height makes it possible + * to utilize all the 64 MACs in the accumulator. 
+ * + * Data loaded from the 2 input rows is concatenated + * in such a manner that lower half of the output + * vector gives the first output row and the upper + * half of the */ + for (x = 0; x < outW; x += vectorizationWidth) /* Loop across Output width */ + { + /* out of bound flag */ + int32_t flag = XT_SALT(2 * XCHAL_IVPN_SIMD_WIDTH, inW - stride * x); + + for (y = 0; y < outH; y += 2) /* Loop across Output height */ + { + /* In order to handle odd heights*/ + int32_t enable2ndRow = XT_SALT(y, outH - 1); + /* initialize output data pointer */ + int8_t *pOutput = &pOutData[(y * outDataPitch1 + x) * bytesPerPixel]; + + /* initialize coeff and Bias data pointer to */ + int8_t *pCoeff = &pCoeffData[0]; + int32_t *pBias = &pBiasData[0]; + + for (outCh = 0; outCh < numOutCh; outCh += 4) /* Loop across Output depth */ + { + /* In order to handle odd depths*/ + int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1); + int32_t enable3rdCh = XT_SALT(outCh, numOutCh - 2); + int32_t enable4thCh = XT_SALT(outCh, numOutCh - 3); + + /* loads and replicate bias data */ + xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4 * enable2ndCh); + xb_vecN_2x32v hvecBias2; IVP_LSRN_2X32_XP(hvecBias2, pBias, 4 * enable3rdCh); + xb_vecN_2x32v hvecBias3; IVP_LSRN_2X32_XP(hvecBias3, pBias, 4 * enable4thCh); + xb_vecN_2x32v hvecBias4; IVP_LSRN_2X32_XP(hvecBias4, pBias, 4); + + /* wide vectors(accumulators) initialized with bias */ + xb_vec2Nx24 dacc1, dacc2, dacc3, dacc4; + dacc1 = IVP_CVT24UNX32L(hvecBias1, hvecBias1); + IVP_CVT24UNX32H(dacc1, hvecBias1, hvecBias1); + dacc2 = IVP_CVT24UNX32L(hvecBias2, hvecBias2); + IVP_CVT24UNX32H(dacc2, hvecBias2, hvecBias2); + dacc3 = IVP_CVT24UNX32L(hvecBias3, hvecBias3); + IVP_CVT24UNX32H(dacc3, hvecBias3, hvecBias3); + dacc4 = IVP_CVT24UNX32L(hvecBias4, hvecBias4); + IVP_CVT24UNX32H(dacc4, hvecBias4, hvecBias4); + + /* priming of coeff load is done outside the innermost loop*/ + pdvecCoeff1 = (xb_vec2Nx8 *) (pCoeff); + valign vaCoeffData1; 
vaCoeffData1 = IVP_LA2NX8_PP(pdvecCoeff1); + + pdvecCoeff2 = (xb_vec2Nx8 *) (pCoeff + coeffPitch3 * enable2ndCh); + valign vaCoeffData2; vaCoeffData2 = IVP_LA2NX8_PP(pdvecCoeff2); + + pdvecCoeff3 = (xb_vec2Nx8 *) (pCoeff + 2 * coeffPitch3 * enable3rdCh); + valign vaCoeffData3; vaCoeffData3 = IVP_LA2NX8_PP(pdvecCoeff3); + + pdvecCoeff4 = (xb_vec2Nx8 *) (pCoeff + 3 * coeffPitch3 * enable4thCh); + valign vaCoeffData4; vaCoeffData4 = IVP_LA2NX8_PP(pdvecCoeff4); + + for (inCh = 0; inCh < numInCh; inCh++) /* Loop across input channels */ + { + /* initialize input data pointer */ + MORPH_IDT_SCALAR *pInput = &pInData[inCh * inDataPitch2 + \ + inDataPitch1 * stride * y + stride * x]; + + /* variable declarations for input and coeff vectors */ + xb_vec2Nx8 dvecCoeffData1, dvecCoeffData2, dvecCoeffData3, dvecCoeffData4; + MORPH_IDT_2Nx8 dvecInData11, dvecInData12; + MORPH_IDT_2Nx8 dvecInData21, dvecInData22; + MORPH_IDT_2Nx8 dvecData1, dvecData2, dvecData3, dvecData4, dvecData5, dvecData6, dvecData7; + + /* load coeff for all the 4 output channels*/ + IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch2); + IVP_LAV2NX8_XP(dvecCoeffData2, vaCoeffData2, pdvecCoeff2, coeffPitch2); + IVP_LAV2NX8_XP(dvecCoeffData3, vaCoeffData3, pdvecCoeff3, coeffPitch2); + IVP_LAV2NX8_XP(dvecCoeffData4, vaCoeffData4, pdvecCoeff4, coeffPitch2); + + for (ky = 0; ky < 7; ky++) /* Loop across kernel height */ + { + /* loads 1st input row */ + pdvecIn = (MORPH_IDT_2Nx8 *) (pInput); + valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn); + MORPH_OP_LOAD_2Nx8(dvecInData11, vaInData, pdvecIn, 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + MORPH_OP_LOAD_2Nx8(dvecInData12, vaInData, pdvecIn, 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + + /* loads 5th(corresponding to the 2nd output row) input row */ + pdvecIn = (MORPH_IDT_2Nx8 *) (pInput + stride * inDataPitch1 * enable2ndRow); + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn); + MORPH_OP_LOAD_2Nx8(dvecInData21, vaInData, pdvecIn, 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + 
MORPH_OP_LOAD_2Nx8(dvecInData22, vaInData, pdvecIn, 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + pInput += inDataPitch1; + + /* 32 elements from 1st row and 32 elements from 2nd row are concatenated here + * If 1st input row is 0,1,2,3,4,5,6,7,8,9,...127, and the 2nd input row is + * 128,129,130,131.........252,253,254,255, Data should be arranged as + * + * dvecData1 : 0, 4, 8,...120,124,128,132,136,...248,252 + * dvecData2 : 1, 5, 9,...121,125,129,133,137,...249,253 + * dvecData3 : 2, 6,10,...122,126,130,134,138,...250,254 + * dvecData4 : 3, 7,11,...123,127,131,135,139,...251,255 + * + * Lower half of the vectors contain data from 1st input row and + * upper half of the vectors contain data from 2nd output row. + * + */ + + IVP_DSEL2NX8I(dvecInData12, dvecInData11, dvecInData12, dvecInData11, \ + IVP_DSELI_8B_DEINTERLEAVE_1); + IVP_DSEL2NX8I(dvecInData22, dvecInData21, dvecInData22, dvecInData21, \ + IVP_DSELI_8B_DEINTERLEAVE_1); + IVP_DSEL2NX8I(dvecData3, dvecData1, dvecInData21, dvecInData11, \ + IVP_DSELI_8B_DEINTERLEAVE_1); + IVP_DSEL2NX8I(dvecData4, dvecData2, dvecInData22, dvecInData12, \ + IVP_DSELI_8B_DEINTERLEAVE_1); + + dvecData5 = IVP_SEL2NX8I(0, dvecData1, IVP_SELI_8B_ROTATE_RIGHT_1); + dvecData6 = IVP_SEL2NX8I(0, dvecData2, IVP_SELI_8B_ROTATE_RIGHT_1); + dvecData7 = IVP_SEL2NX8I(0, dvecData3, IVP_SELI_8B_ROTATE_RIGHT_1); + + /* multiplies data from two rows(Lower and upper half of dvecData) + * with coeff from 1st output channel and accumulate */ + + /* IVP_EXTRVRN_2X32 extracts the required coeff from the + * coeff vector. In every iteration ky is updated therefore it + * extracts coeff from the next coeff row in the successive ky + * iterations. 
"ky * coeffPitch1 + 4" extracts coeffs next to + * first four coeff in a row + */ + + MORPH_OP_MULQA(dacc1, dvecData4, dvecData3, dvecData2, dvecData1, IVP_EXTRVRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), \ + ky * coeffPitch1)); + MORPH_OP_MULQA(dacc1, 0, dvecData7, dvecData6, dvecData5, IVP_EXTRVRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), \ + (ky * coeffPitch1 + 4))); + + /* multiplies data from two rows(Lower and upper half of dvecData) + * with coeff from 2nd output channel and accumulate */ + MORPH_OP_MULQA(dacc2, dvecData4, dvecData3, dvecData2, dvecData1, IVP_EXTRVRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), \ + ky * coeffPitch1)); + MORPH_OP_MULQA(dacc2, 0, dvecData7, dvecData6, dvecData5, IVP_EXTRVRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), \ + (ky * coeffPitch1 + 4))); + + /* multiplies data from two rows(Lower and upper half of dvecData) + * with coeff from 3rd output channel and accumulate */ + MORPH_OP_MULQA(dacc3, dvecData4, dvecData3, dvecData2, dvecData1, IVP_EXTRVRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), \ + ky * coeffPitch1)); + MORPH_OP_MULQA(dacc3, 0, dvecData7, dvecData6, dvecData5, IVP_EXTRVRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), \ + (ky * coeffPitch1 + 4))); + + /* multiplies data from two rows(Lower and upper half of dvecData) + * with coeff from 4th output channel and accumulate */ + MORPH_OP_MULQA(dacc4, dvecData4, dvecData3, dvecData2, dvecData1, IVP_EXTRVRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData4)), \ + ky * coeffPitch1)); + MORPH_OP_MULQA(dacc4, 0, dvecData7, dvecData6, dvecData5, IVP_EXTRVRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData4)), \ + (ky * coeffPitch1 + 4))); + } /* end of for (ky = 0; ky < 7; ky++)*/ + } /* end of for (inCh = 0; inCh < numInCh; inCh++)*/ + + /* Pack, Output Scale, Output 
Shift and clamping */ + xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L; + xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H; +#if DILATED_VQ_CONV == VQ_TRUE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \ + pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \ + pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc3, packShiftAccU, \ + pOutScaleData[outCh + 2 * enable3rdCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc4, packShiftAccU, \ + pOutScaleData[outCh + 3 * enable4thCh], outShiftU, minLim, maxLim, typeFlag); +#elif DILATED_VQ_CONV == VQ_FALSE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc3, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc4, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); +#endif + /* variable store count */ + varLen = XT_MIN(outW - x, vectorizationWidth); + + /* store the first half of the output vectors + * dvecOut1, dvecOut2, dvecOut3, dvecOut4 + */ + + /* Storing the first row outputs, first channel */ + pdvecOut = (xb_vec2Nx8 *) (pOutput); + valign vaOutData = IVP_ZALIGN(); + IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * varLen); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the first row outputs, 2nd channel */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + enable2ndCh * outDataPitch2 * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * varLen * enable2ndCh); + 
IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the first row outputs, 3rd channel */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + 2 * outDataPitch2 * enable3rdCh * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * varLen * enable3rdCh); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the first row outputs, 4th channel */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + 3 * outDataPitch2 * enable4thCh * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * varLen * enable4thCh); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* extract the half of the output vectors + * dvecOut1, dvecOut2, dvecOut3, dvecOut4 + * and store in the next row + */ + + /* Storing the 2nd row outputs, first channel */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch1 * enable2ndRow * bytesPerPixel); + IVP_SAV2NX8_XP(IVP_SHFL2NX8I(dvecOut1L, IVP_SHFLI_8B_SWAP_32), vaOutData, pdvecOut, \ + (-typeFlag + 1) * varLen * enable2ndRow); + IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * 2 * varLen * enable2ndRow); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the 2nd row outputs, 2nd channel */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + (outDataPitch2 * enable2ndCh + \ + outDataPitch1 * enable2ndRow) * bytesPerPixel); + IVP_SAV2NX8_XP(IVP_SHFL2NX8I(dvecOut2L, IVP_SHFLI_8B_SWAP_32), vaOutData, pdvecOut, \ + (-typeFlag + 1) * varLen * enable2ndCh * enable2ndRow); + IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * 2 * \ + varLen * enable2ndCh * enable2ndRow); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the 2nd row outputs, 3rd channel */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + (2 * outDataPitch2 * enable3rdCh + \ + outDataPitch1 * enable2ndRow) * bytesPerPixel); + IVP_SAV2NX8_XP(IVP_SHFL2NX8I(dvecOut3L, IVP_SHFLI_8B_SWAP_32), vaOutData, pdvecOut, \ + (-typeFlag + 1) * varLen * enable3rdCh * enable2ndRow); + IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * 2 * \ + varLen * enable3rdCh * enable2ndRow); 
+ IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the 2nd row outputs, 4th channel */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + (3 * outDataPitch2 * enable4thCh + \ + outDataPitch1 * enable2ndRow) * bytesPerPixel); + IVP_SAV2NX8_XP(IVP_SHFL2NX8I(dvecOut4L, IVP_SHFLI_8B_SWAP_32), vaOutData, pdvecOut, \ + (-typeFlag + 1) * varLen * enable4thCh * enable2ndRow); + IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * 2 * \ + varLen * enable4thCh * enable2ndRow); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + pOutput += 4 * outDataPitch2 * bytesPerPixel; + pCoeff += 4 * coeffPitch3; + } /* end of (outCh = 0; outCh < numOutCh; outCh += 4)*/ + } /* end of for (y = 0; y < outH; y += 2)*/ + } /* end of for (x = 0; x < outW; x += vectorizationWidth)*/ + return(XAI_ERROR_STATUS()); +} + + +/****************************************************************************************** +* xaiConvolved(VQ)3D_S_7x7j1d2I8S8IX_MOW_WHD +* ***************************************************************************************/ + +/******************************************************************************/ +/* Description : P6 optimized generic implementation for 7x7 3D convolution */ +/* with dilation = 2 */ +/* Based on MORPH pre-processor specifiers, code implementation */ +/* is generated during preprocessing stage. 
This method can be */ +/* used to generate 5x5 3D dilated convolution function and 5x5 */ +/* 3D VQ dilated convolution function for U8 bit and S8 bit */ +/* input data with input stride equal to 1 */ +/* Inputs : Input Data Tile, Coeff Data Tile, Bias Array, */ +/* Output scale array, CNN convolution params structure */ +/* Outputs : XI Error Code */ +/* InOuts : Output Tile */ +/* Assumptions : CoeffData is S8 */ +/* biasArray is signed 32b, value not exceeding signed 24b */ +/* Output scale array is U16 */ +/* OutData is S8 / U8 / S16 */ +/* Kernel Size is 7x7xDxN */ +/* Input and Output are in WHD format */ +/* Coeff is in WHDN format */ +/******************************************************************************/ + +/****************** xaiConvolvedVQ3D_S_7x7j1d2_S8S8IX_MOW_WHD ******************/ +/****************** xaiConvolvedVQ3D_S_7x7j1d2_U8S8IX_MOW_WHD ******************/ +/******************* xaiConvolved3D_S_7x7j1d2_S8S8IX_MOW_WHD *******************/ +/******************* xaiConvolved3D_S_7x7j1d2_U8S8IX_MOW_WHD *******************/ + +XAI_ERR_TYPE MAKE_NAME(MAKE_NAME_VQ(xaiConvolved, 3D_S_7x7j1d2), S8IX_MOW_WHD) MAKE_ARGUMENTS(inTile, coeffTile, biasArray, outTile, param) +{ + /* Error Checks */ + XAI_ERROR_CHECKS() + { + MORPH_IDT_CHECK(inTile); + XAI_CHECK_CONV_OUTPUT_TILE3D(outTile); + XAI_CHECK_TILE4D_S8(coeffTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile); + XAI_CHECK_TILE4D_IN_DRAM_BOUNDARY(coeffTile); + XAI_CHECK_POINTER(param); + XAI_CHECK_ARRAY_S32(biasArray); + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTile); + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(coeffTile, outTile); + XAI_CHECK_KERNEL_SIZE(coeffTile, 7); + XAI_CHECK_TILE3D_DATA_ORDER(inTile, XAI_WHD); + XAI_CHECK_TILE3D_DATA_ORDER(outTile, XAI_WHD); + XAI_CHECK_TILE4D_DATA_ORDER(coeffTile, XAI_WHDN); + XAI_CHECK_TILE3D_EDGE(inTile, 6); + XAI_CHECK_STRIDE(param, 1); + XAI_CHECK_ERROR((XAI_CNN_CONV_GET_STRIDEX(param) == 
XAI_CNN_CONV_GET_STRIDEY(param)), \ + XAI_ERR_BADARG, "Stride along width = %hhu and height = %hhu\nStride along width and height should be equal", \ + XAI_CNN_CONV_GET_STRIDEX(param), XAI_CNN_CONV_GET_STRIDEY(param)); + XAI_CHECK_DILATION(param, 2); + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_DILATIONX(param) == XAI_CNN_CONV_GET_DILATIONY(param), \ + XAI_ERR_BADARG, "\nDilation along width = %hhu and height = %hhu\nDilation along width and height should be equal", \ + XAI_CNN_CONV_GET_STRIDEX(param), XAI_CNN_CONV_GET_STRIDEY(param)); + XAI_CHECK_CONSISTENCY_MOW_WHD(inTile, coeffTile, biasArray, outTile, param); + XAI_CHECK_COEFFTILE_CONTIGUOUS(coeffTile, param); + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_ACCUM_SHIFT(param) < 24, \ + XAI_ERR_NORM, "\nThe accumulator shift = %hhu, value should be less than 24", \ + XAI_CNN_CONV_GET_ACCUM_SHIFT(param)); + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_OUTPUT_SHIFT(param) < 32, \ + XAI_ERR_NORM, "\nThe output shift = %hhu, value should be less than 32", \ + XAI_CNN_CONV_GET_OUTPUT_SHIFT(param)); + XAI_CHECK_CONV_RELU_LIMITS_IX(param, outTile); +#if DILATED_VQ_CONV == VQ_TRUE + XAI_CHECK_ARRAY_U16(outputScaleArray); + XAI_CHECK_ERROR(XAI_ARRAY_GET_WIDTH(outputScaleArray) >= XAI_TILE4D_GET_DIM4(coeffTile), \ + XAI_ERR_DATASIZE, "\nWidth of Output Scale Array = %d, Number of Kernels = %d\nWidth of Output Scale Array should be greater than or equal to Number of Kernels", \ + XAI_ARRAY_GET_WIDTH(outputScaleArray), XAI_TILE4D_GET_DIM4(coeffTile)); + XAI_CHECK_ERROR((((uintptr_t) (XAI_ARRAY_GET_DATA_PTR(outputScaleArray)) & \ + 0x1) == 0), XAI_ERR_NORM, "The output scale array is not aligned to 2 byte boundary"); +#endif + } +#if DILATED_VQ_CONV == VQ_FALSE + if (XAI_CNN_CONV_GET_OUTPUT_SCALE(param) == 0) + { + int32_t fillValue; + int32_t reluFlag = XAI_CNN_CONV_GET_FLAG_RELU(param); + fillValue = reluFlag ? 
(CLAMP(0, XAI_CNN_CONV_GET_RELU_MIN(param), XAI_CNN_CONV_GET_RELU_MAX(param))) : 0; + return(xaiFillTile3D(outTile, fillValue, 0)); + } +#endif + /* Getting parameters from the tile structures */ + const int32_t outW = XAI_TILE3D_GET_DIM1(outTile); + const int32_t outH = XAI_TILE3D_GET_DIM2(outTile); + const int32_t numInCh = XAI_TILE3D_GET_DIM3(inTile); + const int32_t numOutCh = XAI_TILE3D_GET_DIM3(outTile); + + /* Pitches of Coefficient Data (WHDN) in dim3 */ + const int32_t coeffPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTile); + const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile); + + /* Pitches of Input Data (WHD) in dim1 and dim2 */ + const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile); + const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile); + + /* Pitch of Output Data (WHD) in dim1 and dim2 */ + const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile); + const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile); + + /* Kernel Size (WHDN) */ + const int32_t kSizeU = XAI_TILE4D_GET_DIM1(coeffTile); + + /* CNN convolution parameters */ + const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param); + const uint8_t outShiftU = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param); + const uint8_t enableReLu = XAI_CNN_CONV_GET_FLAG_RELU(param); + const uint8_t dilationU = XAI_CNN_CONV_GET_DILATION(param); + + int32_t dilatedkSizeU = dilationU * (kSizeU - 1) + 1; + + /* Data Pointers of input, output, coefficient and bias data */ + MORPH_IDT_SCALAR* pInData = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(inTile); + int8_t* pOutData = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile); + int32_t* pBiasData = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray); + int8_t* pCoeffData = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile); +#if DILATED_VQ_CONV == VQ_TRUE + uint16_t* restrict pOutScaleData = (uint16_t *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray); +#elif DILATED_VQ_CONV == VQ_FALSE + const uint16_t outScale = 
XAI_CNN_CONV_GET_OUTPUT_SCALE(param); +#endif + /* Move pointer to the start of the data (including edge) */ + pInData = &pInData[-((dilatedkSizeU / 2) * inDataPitch1 + (dilatedkSizeU / 2))]; + + /* Setting the limits for output data according to ReLu Flag and outTileType */ + int32_t minLim, maxLim; + if (enableReLu) + { + minLim = XAI_CNN_CONV_GET_RELU_MIN(param); + maxLim = XAI_CNN_CONV_GET_RELU_MAX(param); + } + else + { + minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \ + SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0); + maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \ + : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX); + } + const int8_t typeFlag = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0; + const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile); + + MORPH_IDT_2Nx8 * restrict pdvecIn; + xb_vec2Nx8* restrict pdvecOut; + xb_vec2Nx8* restrict pdvecCoeff1; + + /* Variable Declarations */ + int32_t inCh, outCh, x, y; + + /* 0 1 2 3 .. 62 63*/ + xb_vec2Nx8 dvecPattern1 = IVP_SEQ2NX8(); + /* 64 65 66 67 ...126 127*/ + xb_vec2Nx8 dvecPattern2 = IVP_ADD2NX8(dvecPattern1, 64); + + if (!typeFlag) + { + MORPH_OP_DSELI(dvecPattern2, dvecPattern1, \ + dvecPattern2, dvecPattern1, \ + IVP_DSELI_8B_INTERLEAVE_1); + } + else + { + MORPH_OP_DSELI(dvecPattern2, dvecPattern1, \ + dvecPattern2, dvecPattern1, \ + IVP_DSELI_INTERLEAVE_1); + } + + /* In order to make the loop multiply-bound we are reducing the vectorization width + by extra values required for the kernel */ + const int32_t vectorizationWidth = ((4 * XCHAL_IVPN_SIMD_WIDTH) - dilatedkSizeU) + 1; + + for (x = 0; x < outW; x += vectorizationWidth) /* Loop across output width */ + { + int32_t remX = XT_MIN(vectorizationWidth, outW - x); + + /* If (remX + kSizeEffU - 1) <= 2 * XCHAL_IVPN_SIMD_WIDTH, + * i.e. 
if the number of input data bytes corresponding to remX number of outputs + * is less than or equal to 2 * XCHAL_IVPN_SIMD_WIDTH, there is no need to load + * the next 64 input bytes*/ + int32_t remXLoad = ((remX + dilatedkSizeU - 1) > 2 * XCHAL_IVPN_SIMD_WIDTH) ? 1 : 0; + + for (y = 0; y < outH; y++) /* Loop across output height */ + { + /* initialize output data pointer */ + int8_t *pOutput = &pOutData[(y * outDataPitch1 + x) * bytesPerPixel]; + + /* initialize input data pointer */ + MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * y + x]; + + /* initialize coeff and Bias data pointer*/ + int8_t *pCoeff = &pCoeffData[0]; + int32_t *pBias = &pBiasData[0]; + + for (outCh = 0; outCh < numOutCh; outCh++) /* Loop across Output depth */ + { + /* load and replicate bias data */ + xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4); + + /* wide vectors(accumulators) initialized with bias */ + xb_vec2Nx24 dacc1, dacc2; + dacc1 = dacc2 = IVP_CVT24UNX32L(hvecBias1, hvecBias1); + IVP_CVT24UNX32H(dacc1, hvecBias1, hvecBias1); + IVP_CVT24UNX32H(dacc2, hvecBias1, hvecBias1); + + /* priming of coeff load is done outside the innermost loop*/ + pdvecCoeff1 = (xb_vec2Nx8 *) (pCoeff); + valign vaCoeffData1; vaCoeffData1 = IVP_LA2NX8_PP(pdvecCoeff1); + + pdvecIn = (MORPH_IDT_2Nx8 *) pInput; + + for (inCh = 0; inCh < numInCh; inCh++) /* Loop across input channels */ + { + /* vectors for coeff and input loads */ + xb_vec2Nx8 dvecCoeffData1; + xb_vec2Nx8 dvecInData11, dvecInData21, dvecInData31, dvecInData41, \ + dvecInData51, dvecInData61, dvecInData71; + xb_vec2Nx8 dvecInData12, dvecInData22, dvecInData32, dvecInData42, \ + dvecInData52, dvecInData62, dvecInData72; + + /* load data from first input row */ + valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn); + MORPH_OP_LOAD_2Nx8(dvecInData11, vaInData, pdvecIn, remXLoad * 2 * XCHAL_IVPN_SIMD_WIDTH); + MORPH_OP_LOAD_2Nx8(dvecInData12, vaInData, pdvecIn, dilationU * inDataPitch1 - \ + remXLoad * 2 * XCHAL_IVPN_SIMD_WIDTH); + + 
/*Separate odd and even indices */ + IVP_DSEL2NX8I(dvecInData12, dvecInData11, dvecInData12, dvecInData11, \ + IVP_DSELI_8B_DEINTERLEAVE_1); + + /* load data from 2nd input row */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn); + MORPH_OP_LOAD_2Nx8(dvecInData21, vaInData, pdvecIn, remXLoad * 2 * XCHAL_IVPN_SIMD_WIDTH); + MORPH_OP_LOAD_2Nx8(dvecInData22, vaInData, pdvecIn, dilationU * inDataPitch1 - \ + remXLoad * 2 * XCHAL_IVPN_SIMD_WIDTH); + + /*Separate odd and even indices */ + IVP_DSEL2NX8I(dvecInData22, dvecInData21, dvecInData22, dvecInData21, \ + IVP_DSELI_8B_DEINTERLEAVE_1); + + /* load data from 3rd input row */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn); + MORPH_OP_LOAD_2Nx8(dvecInData31, vaInData, pdvecIn, remXLoad * 2 * XCHAL_IVPN_SIMD_WIDTH); + MORPH_OP_LOAD_2Nx8(dvecInData32, vaInData, pdvecIn, dilationU * inDataPitch1 - \ + remXLoad * 2 * XCHAL_IVPN_SIMD_WIDTH); + + /*Separate odd and even indices */ + IVP_DSEL2NX8I(dvecInData32, dvecInData31, dvecInData32, dvecInData31, \ + IVP_DSELI_8B_DEINTERLEAVE_1); + + /* load data from 4th input row */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn); + MORPH_OP_LOAD_2Nx8(dvecInData41, vaInData, pdvecIn, remXLoad * 2 * XCHAL_IVPN_SIMD_WIDTH); + MORPH_OP_LOAD_2Nx8(dvecInData42, vaInData, pdvecIn, dilationU * inDataPitch1 - \ + remXLoad * 2 * XCHAL_IVPN_SIMD_WIDTH); + + /*Separate odd and even indices */ + IVP_DSEL2NX8I(dvecInData42, dvecInData41, dvecInData42, dvecInData41, \ + IVP_DSELI_8B_DEINTERLEAVE_1); + + /* load data from 5th input row */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn); + MORPH_OP_LOAD_2Nx8(dvecInData51, vaInData, pdvecIn, remXLoad * 2 * XCHAL_IVPN_SIMD_WIDTH); + MORPH_OP_LOAD_2Nx8(dvecInData52, vaInData, pdvecIn, dilationU * inDataPitch1 - \ + remXLoad * 2 * XCHAL_IVPN_SIMD_WIDTH); + + /*Separate odd and even indices */ + IVP_DSEL2NX8I(dvecInData52, dvecInData51, dvecInData52, dvecInData51, \ + IVP_DSELI_8B_DEINTERLEAVE_1); + + /* load data from 6th input row */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn); 
+ MORPH_OP_LOAD_2Nx8(dvecInData61, vaInData, pdvecIn, remXLoad * 2 * XCHAL_IVPN_SIMD_WIDTH); + MORPH_OP_LOAD_2Nx8(dvecInData62, vaInData, pdvecIn, dilationU * inDataPitch1 - \ + remXLoad * 2 * XCHAL_IVPN_SIMD_WIDTH); + + /*Separate odd and even indices */ + IVP_DSEL2NX8I(dvecInData62, dvecInData61, dvecInData62, dvecInData61, \ + IVP_DSELI_8B_DEINTERLEAVE_1); + + /* load data from 7th input row */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn); + MORPH_OP_LOAD_2Nx8(dvecInData71, vaInData, pdvecIn, remXLoad * 2 * XCHAL_IVPN_SIMD_WIDTH); + MORPH_OP_LOAD_2Nx8(dvecInData72, vaInData, pdvecIn, inDataPitch2 - \ + remXLoad * 2 * XCHAL_IVPN_SIMD_WIDTH - dilationU * 6 * inDataPitch1); + + /*Separate odd and even indices */ + IVP_DSEL2NX8I(dvecInData72, dvecInData71, dvecInData72, dvecInData71, \ + IVP_DSELI_8B_DEINTERLEAVE_1); + + /* load 1st row of coefficients */ + IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1); + + /* Multiply and accumulate 1st set of 4 coefficients from 1st row for all the outputs */ + MORPH_OP_MUL4TA(dacc1, 0, dvecInData11, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc2, 0, dvecInData12, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + + /* Multiply and accumulate 2nd set of 4 coefficients from 1st row for all the outputs */ + MORPH_OP_MUL4TA(dacc1, 0, IVP_SEL2NX8I(dvecInData11, dvecInData11, \ + IVP_SELI_8B_ROTATE_RIGHT_4), IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + MORPH_OP_MUL4TA(dacc2, 0, IVP_SEL2NX8I(dvecInData12, dvecInData12, \ + IVP_SELI_8B_ROTATE_RIGHT_4), IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + /* load 2nd row of coefficients */ + IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1); + + /* Multiply and accumulate 1st set of 4 coefficients from 2nd row for all the outputs */ + MORPH_OP_MUL4TA(dacc1, 0, 
dvecInData21, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc2, 0, dvecInData22, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + + /* Multiply and accumulate 2nd set of 4 coefficients from 2nd row for all the outputs */ + MORPH_OP_MUL4TA(dacc1, 0, IVP_SEL2NX8I(dvecInData21, dvecInData21, \ + IVP_SELI_8B_ROTATE_RIGHT_4), IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + MORPH_OP_MUL4TA(dacc2, 0, IVP_SEL2NX8I(dvecInData22, dvecInData22, \ + IVP_SELI_8B_ROTATE_RIGHT_4), IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + /* load 3rd row of coefficients */ + IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1); + + /* Multiply and accumulate 1st set of 4 coefficients from 3rd row for all the outputs */ + MORPH_OP_MUL4TA(dacc1, 0, dvecInData31, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc2, 0, dvecInData32, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + + /* Multiply and accumulate 2nd set of 4 coefficients from 3rd row for all the outputs */ + MORPH_OP_MUL4TA(dacc1, 0, IVP_SEL2NX8I(dvecInData31, dvecInData31, \ + IVP_SELI_8B_ROTATE_RIGHT_4), IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + MORPH_OP_MUL4TA(dacc2, 0, IVP_SEL2NX8I(dvecInData32, dvecInData32, + IVP_SELI_8B_ROTATE_RIGHT_4), IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + /* load 4th row of coefficients */ + IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1); + + /* Multiply and accumulate 1st set of 4 coefficients from 4th row for all the outputs */ + MORPH_OP_MUL4TA(dacc1, 0, dvecInData41, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc2, 0, 
dvecInData42, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + + /* Multiply and accumulate 2nd set of 4 coefficients from 4th row for all the outputs */ + MORPH_OP_MUL4TA(dacc1, 0, IVP_SEL2NX8I(dvecInData41, dvecInData41, \ + IVP_SELI_8B_ROTATE_RIGHT_4), IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + MORPH_OP_MUL4TA(dacc2, 0, IVP_SEL2NX8I(dvecInData42, dvecInData42, \ + IVP_SELI_8B_ROTATE_RIGHT_4), IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + /* load 5th row of coefficients */ + IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1); + + /* Multiply and accumulate 1st set of 4 coefficients from 5th row for all the outputs */ + MORPH_OP_MUL4TA(dacc1, 0, dvecInData51, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc2, 0, dvecInData52, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + + /* Multiply and accumulate 2nd set of 4 coefficients from 5th row for all the outputs */ + MORPH_OP_MUL4TA(dacc1, 0, IVP_SEL2NX8I(dvecInData51, dvecInData51, \ + IVP_SELI_8B_ROTATE_RIGHT_4), IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + MORPH_OP_MUL4TA(dacc2, 0, IVP_SEL2NX8I(dvecInData52, dvecInData52, \ + IVP_SELI_8B_ROTATE_RIGHT_4), IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + /* load 6th row of coefficients */ + IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1); + + /* Multiply and accumulate 1st set of 4 coefficients from 6th row for all the outputs */ + MORPH_OP_MUL4TA(dacc1, 0, dvecInData61, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc2, 0, dvecInData62, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + + /* Multiply and 
accumulate 2nd set of 4 coefficients from 6th row for all the outputs */ + MORPH_OP_MUL4TA(dacc1, 0, IVP_SEL2NX8I(dvecInData61, dvecInData61, \ + IVP_SELI_8B_ROTATE_RIGHT_4), IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + MORPH_OP_MUL4TA(dacc2, 0, IVP_SEL2NX8I(dvecInData62, dvecInData62, \ + IVP_SELI_8B_ROTATE_RIGHT_4), IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + /* load 7th row of coefficients */ + IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1); + + /* Multiply and accumulate 1st set of 4 coefficients from 7th row for all the outputs */ + MORPH_OP_MUL4TA(dacc1, 0, dvecInData71, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc2, 0, dvecInData72, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + + /* Multiply and accumulate 2nd set of 4 coefficients from 7th row for all the outputs */ + MORPH_OP_MUL4TA(dacc1, 0, IVP_SEL2NX8I(dvecInData71, dvecInData71, \ + IVP_SELI_8B_ROTATE_RIGHT_4), IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + MORPH_OP_MUL4TA(dacc2, 0, IVP_SEL2NX8I(dvecInData72, dvecInData72, \ + IVP_SELI_8B_ROTATE_RIGHT_4), IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + } /* end of for (inCh = 0; inCh < numInCh; inCh++)*/ + + /* Pack, Output Scale, Output Shift and clamping */ + xb_vec2Nx8 dvec1L, dvec2L; + xb_vec2Nx8 dvec1H, dvec2H; +#if DILATED_VQ_CONV == VQ_TRUE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvec1L, dvec1H, dacc1, packShiftAccU, \ + pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvec2L, dvec2H, dacc2, packShiftAccU, \ + pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag); +#elif DILATED_VQ_CONV == VQ_FALSE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvec1L, dvec1H, dacc1, packShiftAccU, \ + outScale, outShiftU, minLim, 
maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvec2L, dvec2H, dacc2, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); +#endif + /* Interleave odd and even indices */ + xb_vec2Nx8 dvecOut1L = MORPH_OP_SEL(dvec2L, dvec1L, dvecPattern1); + xb_vec2Nx8 dvecOut2L = MORPH_OP_SEL(dvec2L, dvec1L, dvecPattern2); + xb_vec2Nx8 dvecOut1H = MORPH_OP_SEL(dvec2H, dvec1H, dvecPattern1); + xb_vec2Nx8 dvecOut2H = MORPH_OP_SEL(dvec2H, dvec1H, dvecPattern2); + + /* Storing the first depth output , 1st row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput); + valign vaOutData = IVP_ZALIGN(); + + IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * remX); + IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * remX - \ + 2 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * \ + (2 * remX - 4 * XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * \ + (2 * remX - 6 * XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + pOutput += outDataPitch2 * bytesPerPixel; + pCoeff += coeffPitch3; + } /* end of for (outCh = 0; outCh < numOutCh; outCh++)*/ + } /* end of for (y = 0; y < outH; y++)*/ + } /* end of for (x = 0; x < outW; x += 2 * vectorizationWidth)*/ + return(XAI_ERROR_STATUS()); +} + +/****************************************************************************************** +* xaiConvolved(VQ)3D_S_7x7j1d4I8MOW_WHD +* ***************************************************************************************/ +/******************************************************************************/ +/* Description : P6 optimized generic implementation for 7x7 3D convolution */ +/* with dilation = 4 */ +/* Based on MORPH pre-processor specifiers, code implementation */ +/* is generated during preprocessing stage. 
This method can be */ +/* used to generate 7x7 3D dilated convolution function and 7x7 */ +/* 3D VQ dilated convolution function for U8 bit and S8 bit */ +/* input data with input stride equal to 1 */ +/* Inputs : Input Data Tile, Coeff Data Tile, Bias Array, */ +/* Output scale array, CNN convolution params structure */ +/* Outputs : XI Error Code */ +/* InOuts : Output Tile */ +/* Assumptions : CoeffData is S8 */ +/* biasArray is signed 32b, value not exceeding signed 24b */ +/* Output scale array is U16 */ +/* OutData is S8 / U8 / S16 */ +/* Kernel Size is 7x7xDxN */ +/* Input and Output are in WHD format */ +/* Coeff is in WHDN format */ +/******************************************************************************/ + +/****************** xaiConvolvedVQ3D_S_7x7j1d4_S8S8IX_MOW_WHD ******************/ +/****************** xaiConvolvedVQ3D_S_7x7j1d4_U8S8IX_MOW_WHD ******************/ +/******************* xaiConvolved3D_S_7x7j1d4_S8S8IX_MOW_WHD *******************/ +/******************* xaiConvolved3D_S_7x7j1d4_U8S8IX_MOW_WHD *******************/ + +XAI_ERR_TYPE MAKE_NAME(MAKE_NAME_VQ(xaiConvolved, 3D_S_7x7j1d4), S8IX_MOW_WHD) MAKE_ARGUMENTS(inTile, coeffTile, biasArray, outTile, param) +{ + /* Error Checks */ + XAI_ERROR_CHECKS() + { + MORPH_IDT_CHECK(inTile); + XAI_CHECK_CONV_OUTPUT_TILE3D(outTile); + XAI_CHECK_TILE4D_S8(coeffTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile); + XAI_CHECK_TILE4D_IN_DRAM_BOUNDARY(coeffTile); + XAI_CHECK_POINTER(param); + XAI_CHECK_ARRAY_S32(biasArray); + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTile); + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(coeffTile, outTile); + XAI_CHECK_KERNEL_SIZE(coeffTile, 7); + XAI_CHECK_TILE3D_DATA_ORDER(inTile, XAI_WHD); + XAI_CHECK_TILE3D_DATA_ORDER(outTile, XAI_WHD); + XAI_CHECK_TILE4D_DATA_ORDER(coeffTile, XAI_WHDN); + XAI_CHECK_TILE3D_EDGE(inTile, 12); + XAI_CHECK_STRIDE(param, 1); + XAI_CHECK_ERROR((XAI_CNN_CONV_GET_STRIDEX(param) ==
XAI_CNN_CONV_GET_STRIDEY(param)), \ + XAI_ERR_BADARG, "Stride along width = %hhu and height = %hhu\nStride along width and height should be equal", \ + XAI_CNN_CONV_GET_STRIDEX(param), XAI_CNN_CONV_GET_STRIDEY(param)); + XAI_CHECK_DILATION(param, 4); + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_DILATIONX(param) == XAI_CNN_CONV_GET_DILATIONY(param), \ + XAI_ERR_BADARG, "\nDilation along width = %hhu and height = %hhu\nDilation along width and height should be equal", \ + XAI_CNN_CONV_GET_DILATIONX(param), XAI_CNN_CONV_GET_DILATIONY(param)); + XAI_CHECK_CONSISTENCY_MOW_WHD(inTile, coeffTile, biasArray, outTile, param); + XAI_CHECK_COEFFTILE_CONTIGUOUS(coeffTile, param); + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_ACCUM_SHIFT(param) < 24, \ + XAI_ERR_NORM, "\nThe accumulator shift = %hhu, value should be less than 24", \ + XAI_CNN_CONV_GET_ACCUM_SHIFT(param)); + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_OUTPUT_SHIFT(param) < 32, \ + XAI_ERR_NORM, "\nThe output shift = %hhu, value should be less than 32", \ + XAI_CNN_CONV_GET_OUTPUT_SHIFT(param)); + XAI_CHECK_CONV_RELU_LIMITS_IX(param, outTile); +#if DILATED_VQ_CONV == VQ_TRUE + XAI_CHECK_ARRAY_U16(outputScaleArray); + XAI_CHECK_ERROR(XAI_ARRAY_GET_WIDTH(outputScaleArray) >= XAI_TILE4D_GET_DIM4(coeffTile), \ + XAI_ERR_DATASIZE, "\nWidth of Output Scale Array = %d, Number of Kernels = %d\nWidth of Output Scale Array should be greater than or equal to Number of Kernels", \ + XAI_ARRAY_GET_WIDTH(outputScaleArray), XAI_TILE4D_GET_DIM4(coeffTile)); + XAI_CHECK_ERROR((((uintptr_t) (XAI_ARRAY_GET_DATA_PTR(outputScaleArray)) & \ + 0x1) == 0), XAI_ERR_NORM, "The output scale array is not aligned to 2 byte boundary"); +#endif + } +#if DILATED_VQ_CONV == VQ_FALSE + if (XAI_CNN_CONV_GET_OUTPUT_SCALE(param) == 0) /* zero output scale: skip the convolution and fill outTile with fillValue (0, clamped into the ReLU range when ReLU is enabled) */ + { + int32_t fillValue; + int32_t reluFlag = XAI_CNN_CONV_GET_FLAG_RELU(param); + fillValue = reluFlag ?
(CLAMP(0, XAI_CNN_CONV_GET_RELU_MIN(param), XAI_CNN_CONV_GET_RELU_MAX(param))) : 0; + return(xaiFillTile3D(outTile, fillValue, 0)); + } +#endif + /* Getting parameters from the tile structures */ + const int32_t outW = XAI_TILE3D_GET_DIM1(outTile); + const int32_t outH = XAI_TILE3D_GET_DIM2(outTile); + const int32_t numInCh = XAI_TILE3D_GET_DIM3(inTile); + const int32_t numOutCh = XAI_TILE3D_GET_DIM3(outTile); + + /* Pitches of Coefficient Data (WHDN) in dim1 and dim3 */ + const int32_t coeffPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTile); + const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile); + + /* Pitches of Input Data (WHD) in dim1 and dim2 */ + const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile); + const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile); + + /* Pitch of Output Data (WHD) in dim1 and dim2 */ + const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile); + const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile); + + /* Kernel Size (WHDN) */ + const int32_t kSizeU = XAI_TILE4D_GET_DIM1(coeffTile); + + /* CNN convolution parameters */ + const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param); + const uint8_t outShiftU = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param); + const uint8_t enableReLu = XAI_CNN_CONV_GET_FLAG_RELU(param); + const uint8_t dilationU = XAI_CNN_CONV_GET_DILATION(param); + + int32_t dilatedkSizeU = dilationU * (kSizeU - 1) + 1; /* kernel extent along one axis once the dilation gaps are counted */ + + /* Data Pointers of input, output, coefficient and bias data */ + MORPH_IDT_SCALAR* pInData = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(inTile); + int8_t* pOutData = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile); + int32_t* pBiasData = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray); + int8_t* pCoeffData = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile); +#if DILATED_VQ_CONV == VQ_TRUE + uint16_t* restrict pOutScaleData = (uint16_t *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray); +#elif DILATED_VQ_CONV == VQ_FALSE + const uint16_t outScale =
XAI_CNN_CONV_GET_OUTPUT_SCALE(param); +#endif + /* Move pointer to the start of the data (including edge) */ + pInData = &pInData[-((dilatedkSizeU / 2) * inDataPitch1 + (dilatedkSizeU / 2))]; + + /* Setting the limits for output data according to ReLu Flag and outTileType */ + int32_t minLim, maxLim; + if (enableReLu) + { + minLim = XAI_CNN_CONV_GET_RELU_MIN(param); + maxLim = XAI_CNN_CONV_GET_RELU_MAX(param); + } + else + { + minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \ + SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0); + maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \ + : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX); + } + const int8_t typeFlag = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0; + const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile); + + MORPH_IDT_2Nx8 * restrict pdvecIn; + xb_vec2Nx8* restrict pdvecOut; + xb_vec2Nx8* restrict pdvecCoeff1; + + /* Variable Declarations */ + int32_t inCh, outCh, x, y; + + /* In order to make the loop multiply-bound we are reducing the vectorization width + by extra values required for the kernel */ + const int32_t vectorizationWidth = ((4 * XCHAL_IVPN_SIMD_WIDTH) - dilatedkSizeU) + 1; + + for (x = 0; x < outW; x += vectorizationWidth) /* Loop across output width */ + { + int32_t remX = XT_MIN(vectorizationWidth, outW - x); + + /* If (remX + dilatedkSizeU - 1) <= 2 * XCHAL_IVPN_SIMD_WIDTH, + * i.e. if the number of input data bytes corresponding to remX number of outputs + * is less than or equal to 2 * XCHAL_IVPN_SIMD_WIDTH, there is no need to load + * the next 2 * XCHAL_IVPN_SIMD_WIDTH input bytes (remXLoad = 0 keeps the first load of each row from advancing the pointer) */ + int32_t remXLoad = ((remX + dilatedkSizeU - 1) > 2 * XCHAL_IVPN_SIMD_WIDTH) ?
1 : 0; + + for (y = 0; y < outH; y++) /* Loop across output height */ + { + /* initialize output data pointer */ + int8_t *pOutput = &pOutData[(y * outDataPitch1 + x) * bytesPerPixel]; + + /* initialize input data pointer */ + MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * y + x]; + + /* initialize coeff and Bias data pointer*/ + int8_t *pCoeff = &pCoeffData[0]; + int32_t *pBias = &pBiasData[0]; + + for (outCh = 0; outCh < numOutCh; outCh++) /* Loop across Output depth */ + { + /* load and replicate bias data */ + xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4); + + /* wide vectors(accumulators) initialized with bias */ + xb_vec2Nx24 dacc1, dacc2; + dacc1 = dacc2 = IVP_CVT24UNX32L(hvecBias1, hvecBias1); + IVP_CVT24UNX32H(dacc1, hvecBias1, hvecBias1); + IVP_CVT24UNX32H(dacc2, hvecBias1, hvecBias1); + + /* priming of coeff load is done outside the innermost loop*/ + pdvecCoeff1 = (xb_vec2Nx8 *) (pCoeff); + valign vaCoeffData1; vaCoeffData1 = IVP_LA2NX8_PP(pdvecCoeff1); + + pdvecIn = (MORPH_IDT_2Nx8 *) pInput; + + for (inCh = 0; inCh < numInCh; inCh++) /* Loop across input channels */ + { + /* vectors for coeff and input loads */ + xb_vec2Nx8 dvecCoeffData1; + xb_vec2Nx8 dvecInData11, dvecInData21, dvecInData31, dvecInData41, \ + dvecInData51, dvecInData61, dvecInData71; + xb_vec2Nx8 dvecInData12, dvecInData22, dvecInData32, dvecInData42, \ + dvecInData52, dvecInData62, dvecInData72; + + /* load data from first input row */ + valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn); + MORPH_OP_LOAD_2Nx8(dvecInData11, vaInData, pdvecIn, remXLoad * 2 * XCHAL_IVPN_SIMD_WIDTH); + MORPH_OP_LOAD_2Nx8(dvecInData12, vaInData, pdvecIn, dilationU * inDataPitch1 - \ + remXLoad * 2 * XCHAL_IVPN_SIMD_WIDTH); + + /*Separate odd and even indices */ + IVP_DSEL2NX8I(dvecInData12, dvecInData11, dvecInData12, dvecInData11, \ + IVP_DSELI_8B_DEINTERLEAVE_1); + IVP_DSEL2NX8I(dvecInData12, dvecInData11, dvecInData12, dvecInData11, \ + IVP_DSELI_8B_DEINTERLEAVE_1); + + /* load data
from 2nd input row */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn); + MORPH_OP_LOAD_2Nx8(dvecInData21, vaInData, pdvecIn, remXLoad * 2 * XCHAL_IVPN_SIMD_WIDTH); + MORPH_OP_LOAD_2Nx8(dvecInData22, vaInData, pdvecIn, dilationU * inDataPitch1 - \ + remXLoad * 2 * XCHAL_IVPN_SIMD_WIDTH); + + /*Separate odd and even indices */ + IVP_DSEL2NX8I(dvecInData22, dvecInData21, dvecInData22, dvecInData21, \ + IVP_DSELI_8B_DEINTERLEAVE_1); + IVP_DSEL2NX8I(dvecInData22, dvecInData21, dvecInData22, dvecInData21, \ + IVP_DSELI_8B_DEINTERLEAVE_1); + + /* load data from 3rd input row */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn); + MORPH_OP_LOAD_2Nx8(dvecInData31, vaInData, pdvecIn, remXLoad * 2 * XCHAL_IVPN_SIMD_WIDTH); + MORPH_OP_LOAD_2Nx8(dvecInData32, vaInData, pdvecIn, dilationU * inDataPitch1 - \ + remXLoad * 2 * XCHAL_IVPN_SIMD_WIDTH); + + /*Separate odd and even indices */ + IVP_DSEL2NX8I(dvecInData32, dvecInData31, dvecInData32, dvecInData31, \ + IVP_DSELI_8B_DEINTERLEAVE_1); + IVP_DSEL2NX8I(dvecInData32, dvecInData31, dvecInData32, dvecInData31, \ + IVP_DSELI_8B_DEINTERLEAVE_1); + + /* load data from 4th input row */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn); + MORPH_OP_LOAD_2Nx8(dvecInData41, vaInData, pdvecIn, remXLoad * 2 * XCHAL_IVPN_SIMD_WIDTH); + MORPH_OP_LOAD_2Nx8(dvecInData42, vaInData, pdvecIn, dilationU * inDataPitch1 - \ + remXLoad * 2 * XCHAL_IVPN_SIMD_WIDTH); + + /*Separate odd and even indices */ + IVP_DSEL2NX8I(dvecInData42, dvecInData41, dvecInData42, dvecInData41, \ + IVP_DSELI_8B_DEINTERLEAVE_1); + IVP_DSEL2NX8I(dvecInData42, dvecInData41, dvecInData42, dvecInData41, \ + IVP_DSELI_8B_DEINTERLEAVE_1); + + /* load data from 5th input row */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn); + MORPH_OP_LOAD_2Nx8(dvecInData51, vaInData, pdvecIn, remXLoad * 2 * XCHAL_IVPN_SIMD_WIDTH); + MORPH_OP_LOAD_2Nx8(dvecInData52, vaInData, pdvecIn, dilationU * inDataPitch1 - \ + remXLoad * 2 * XCHAL_IVPN_SIMD_WIDTH); + + /*Separate odd and even indices */ +
IVP_DSEL2NX8I(dvecInData52, dvecInData51, dvecInData52, dvecInData51, \ + IVP_DSELI_8B_DEINTERLEAVE_1); + IVP_DSEL2NX8I(dvecInData52, dvecInData51, dvecInData52, dvecInData51, \ + IVP_DSELI_8B_DEINTERLEAVE_1); + + /* load data from 6th input row */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn); + MORPH_OP_LOAD_2Nx8(dvecInData61, vaInData, pdvecIn, remXLoad * 2 * XCHAL_IVPN_SIMD_WIDTH); + MORPH_OP_LOAD_2Nx8(dvecInData62, vaInData, pdvecIn, dilationU * inDataPitch1 - \ + remXLoad * 2 * XCHAL_IVPN_SIMD_WIDTH); + + /*Separate odd and even indices */ + IVP_DSEL2NX8I(dvecInData62, dvecInData61, dvecInData62, dvecInData61, \ + IVP_DSELI_8B_DEINTERLEAVE_1); + IVP_DSEL2NX8I(dvecInData62, dvecInData61, dvecInData62, dvecInData61, \ + IVP_DSELI_8B_DEINTERLEAVE_1); + + /* load data from 7th input row; the stride on the second load undoes the six (dilationU * inDataPitch1) row steps taken above and adds inDataPitch2, leaving pdvecIn at the first row of the next input channel */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn); + MORPH_OP_LOAD_2Nx8(dvecInData71, vaInData, pdvecIn, remXLoad * 2 * XCHAL_IVPN_SIMD_WIDTH); + MORPH_OP_LOAD_2Nx8(dvecInData72, vaInData, pdvecIn, inDataPitch2 - \ + remXLoad * 2 * XCHAL_IVPN_SIMD_WIDTH - dilationU * 6 * inDataPitch1); + + /*Separate odd and even indices */ + IVP_DSEL2NX8I(dvecInData72, dvecInData71, dvecInData72, dvecInData71, \ + IVP_DSELI_8B_DEINTERLEAVE_1); + IVP_DSEL2NX8I(dvecInData72, dvecInData71, dvecInData72, dvecInData71, \ + IVP_DSELI_8B_DEINTERLEAVE_1); + + /* load 1st row of coefficients */ + IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1); + + /* Multiply and accumulate 1st set of 4 coefficients from 1st row for all the outputs */ + MORPH_OP_MUL4TA(dacc1, 0, dvecInData11, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc2, 0, dvecInData12, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + + /* Multiply and accumulate 2nd set of 4 coefficients from 1st row for all the outputs */ + MORPH_OP_MUL4TA(dacc1, 0, IVP_SEL2NX8I(dvecInData11, dvecInData11, \ + IVP_SELI_8B_ROTATE_RIGHT_4),
IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + MORPH_OP_MUL4TA(dacc2, 0, IVP_SEL2NX8I(dvecInData12, dvecInData12, \ + IVP_SELI_8B_ROTATE_RIGHT_4), IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + /* load 2nd row of coefficients */ + IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1); + + /* Multiply and accumulate 1st set of 4 coefficients from 2nd row for all the outputs */ + MORPH_OP_MUL4TA(dacc1, 0, dvecInData21, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc2, 0, dvecInData22, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + + /* Multiply and accumulate 2nd set of 4 coefficients from 2nd row for all the outputs */ + MORPH_OP_MUL4TA(dacc1, 0, IVP_SEL2NX8I(dvecInData21, dvecInData21, \ + IVP_SELI_8B_ROTATE_RIGHT_4), IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + MORPH_OP_MUL4TA(dacc2, 0, IVP_SEL2NX8I(dvecInData22, dvecInData22, \ + IVP_SELI_8B_ROTATE_RIGHT_4), IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + /* load 3rd row of coefficients */ + IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1); + + /* Multiply and accumulate 1st set of 4 coefficients from 3rd row for all the outputs */ + MORPH_OP_MUL4TA(dacc1, 0, dvecInData31, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc2, 0, dvecInData32, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + + /* Multiply and accumulate 2nd set of 4 coefficients from 3rd row for all the outputs */ + MORPH_OP_MUL4TA(dacc1, 0, IVP_SEL2NX8I(dvecInData31, dvecInData31, \ + IVP_SELI_8B_ROTATE_RIGHT_4), IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + MORPH_OP_MUL4TA(dacc2, 0,
IVP_SEL2NX8I(dvecInData32, dvecInData32, \ + IVP_SELI_8B_ROTATE_RIGHT_4), IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + /* load 4th row of coefficients */ + IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1); + + /* Multiply and accumulate 1st set of 4 coefficients from 4th row for all the outputs */ + MORPH_OP_MUL4TA(dacc1, 0, dvecInData41, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc2, 0, dvecInData42, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + + /* Multiply and accumulate 2nd set of 4 coefficients from 4th row for all the outputs */ + MORPH_OP_MUL4TA(dacc1, 0, IVP_SEL2NX8I(dvecInData41, dvecInData41, \ + IVP_SELI_8B_ROTATE_RIGHT_4), IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + MORPH_OP_MUL4TA(dacc2, 0, IVP_SEL2NX8I(dvecInData42, dvecInData42, \ + IVP_SELI_8B_ROTATE_RIGHT_4), IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + /* load 5th row of coefficients */ + IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1); + + /* Multiply and accumulate 1st set of 4 coefficients from 5th row for all the outputs */ + MORPH_OP_MUL4TA(dacc1, 0, dvecInData51, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc2, 0, dvecInData52, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + + /* Multiply and accumulate 2nd set of 4 coefficients from 5th row for all the outputs */ + MORPH_OP_MUL4TA(dacc1, 0, IVP_SEL2NX8I(dvecInData51, dvecInData51, \ + IVP_SELI_8B_ROTATE_RIGHT_4), IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + MORPH_OP_MUL4TA(dacc2, 0, IVP_SEL2NX8I(dvecInData52, dvecInData52, \ + IVP_SELI_8B_ROTATE_RIGHT_4), IVP_EXTRN_2X32( \ +
IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + /* load 6th row of coefficients */ + IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1); + + /* Multiply and accumulate 1st set of 4 coefficients from 6th row for all the outputs */ + MORPH_OP_MUL4TA(dacc1, 0, dvecInData61, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc2, 0, dvecInData62, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + + /* Multiply and accumulate 2nd set of 4 coefficients from 6th row for all the outputs */ + MORPH_OP_MUL4TA(dacc1, 0, IVP_SEL2NX8I(dvecInData61, dvecInData61, \ + IVP_SELI_8B_ROTATE_RIGHT_4), IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + MORPH_OP_MUL4TA(dacc2, 0, IVP_SEL2NX8I(dvecInData62, dvecInData62, \ + IVP_SELI_8B_ROTATE_RIGHT_4), IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + /* load 7th row of coefficients */ + IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1); + + /* Multiply and accumulate 1st set of 4 coefficients from 7th row for all the outputs */ + MORPH_OP_MUL4TA(dacc1, 0, dvecInData71, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc2, 0, dvecInData72, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + + /* Multiply and accumulate 2nd set of 4 coefficients from 7th row for all the outputs */ + MORPH_OP_MUL4TA(dacc1, 0, IVP_SEL2NX8I(dvecInData71, dvecInData71, \ + IVP_SELI_8B_ROTATE_RIGHT_4), IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + MORPH_OP_MUL4TA(dacc2, 0, IVP_SEL2NX8I(dvecInData72, dvecInData72, \ + IVP_SELI_8B_ROTATE_RIGHT_4), IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + } /* end of for (inCh = 0; inCh < numInCh; inCh++)*/ + + /*
Pack, Output Scale, Output Shift and clamping */ + xb_vec2Nx8 dvecOut1L, dvecOut2L; + xb_vec2Nx8 dvecOut1H, dvecOut2H; +#if DILATED_VQ_CONV == VQ_TRUE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \ + pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \ + pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag); +#elif DILATED_VQ_CONV == VQ_FALSE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); +#endif + if (!typeFlag) /* NOTE(review): the interleave passes below appear to restore natural pixel order after the input deinterleaving; inferred from the paired DSELI patterns - confirm against the ISA reference */ + { + MORPH_OP_DSELI(dvecOut2L, dvecOut1L, dvecOut2L, dvecOut1L, \ + IVP_DSELI_8B_INTERLEAVE_1); + MORPH_OP_DSELI(dvecOut2L, dvecOut1L, dvecOut2L, dvecOut1L, \ + IVP_DSELI_8B_INTERLEAVE_1); + } + else + { + MORPH_OP_DSELI(dvecOut1H, dvecOut1L, dvecOut1H, dvecOut1L, \ + IVP_DSELI_INTERLEAVE_1); + MORPH_OP_DSELI(dvecOut2H, dvecOut2L, dvecOut2H, dvecOut2L, \ + IVP_DSELI_INTERLEAVE_1); + MORPH_OP_DSELI(dvecOut2L, dvecOut1L, dvecOut2L, dvecOut1L, \ + IVP_DSELI_INTERLEAVE_2); + MORPH_OP_DSELI(dvecOut2H, dvecOut1H, dvecOut2H, dvecOut1H, \ + IVP_DSELI_INTERLEAVE_2); + } + + /* Storing the first depth output , 1st row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput); + valign vaOutData = IVP_ZALIGN(); + + IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * remX); + IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * remX - \ + 2 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * \ + (2 * remX - 4 * XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * \ + (2 * remX - 6 * XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + pOutput += outDataPitch2 * bytesPerPixel; + pCoeff += coeffPitch3; + } /* end of for (outCh = 0; outCh < numOutCh; outCh++)*/
+ } /* end of for (y = 0; y < outH; y++)*/ + } /* end of for (x = 0; x < outW; x += vectorizationWidth)*/ + return(XAI_ERROR_STATUS()); +} + + +/****************************************************************************************** +* MxN MOW WHD Stride 1 - DEPTH 3 * +* If number of input channels is equal to 3 * +* this function is called. * +******************************************************************************************/ + +static _XAI_INLINE_ void MAKE_NAME(MAKE_NAME_VQ(convolved, 3D_S_MxNj1d1), S8IX_MOW_WHD_DEPTH3) MAKE_ARGUMENTS(inTile, coeffTile, biasArray, outTile, param) +{ + /* Getting parameters from the tile structures */ + const int32_t inW = XAI_TILE3D_GET_DIM1(inTile) + \ + XAI_TILE3D_GET_DIM1_EDGE1(inTile) + XAI_TILE3D_GET_DIM1_EDGE2(inTile); + const int32_t outW = XAI_TILE3D_GET_DIM1(outTile); + const int32_t outH = XAI_TILE3D_GET_DIM2(outTile); + const int32_t numOutCh = XAI_TILE3D_GET_DIM3(outTile); + + /* Kernel Size (WHDN)*/ + const int32_t kSizeX = XAI_TILE4D_GET_DIM1(coeffTile); + const int32_t kSizeY = XAI_TILE4D_GET_DIM2(coeffTile); + + /* CNN convolution parameters */ + const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param); + const uint8_t outShiftU = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param); + const uint8_t enableReLu = XAI_CNN_CONV_GET_FLAG_RELU(param); + const uint8_t leftEdgeFlag = XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param); + const uint8_t topEdgeFlag = XAI_CNN_CONV_GET_FLAG_TOPEDGE(param); + + /* Pitches of Coefficient Data (WHDN) */ + const int32_t coeffPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTile); + const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile); + + /* Pitches of Input Data (WHD) in dim1 and dim2 */ + const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile); + const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile); + + /* Pitch of Output Data (WHD) in dim1 and dim2 */ + const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile); + const int32_t outDataPitch2 =
XAI_TILE3D_GET_DIM2_PITCH(outTile); + + /* Data Pointers of input, output, coefficient and bias data */ + MORPH_IDT_SCALAR* pInData = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(inTile); + int8_t* pOutData = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile); + int32_t* pBiasData = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray); + int8_t* pCoeffData = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile); +#if DILATED_VQ_CONV == VQ_TRUE + uint16_t* restrict pOutScaleData = (uint16_t *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray); +#elif DILATED_VQ_CONV == VQ_FALSE + const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param); +#endif + uint8_t leftEdge, topEdge; + if ((kSizeX % 2) != 0) + { + leftEdge = kSizeX / 2; + } + else + { + leftEdge = leftEdgeFlag ? (kSizeX / 2) : ((kSizeX / 2) - 1); + } + + if ((kSizeY % 2) != 0) + { + topEdge = kSizeY / 2; + } + else + { + topEdge = topEdgeFlag ? (kSizeY / 2) : ((kSizeY / 2) - 1); + } + + /* Move pointer to the start of the active data (including edge) */ + pInData = &pInData[-(topEdge * inDataPitch1 + leftEdge)]; + + /* Setting the limits for output data according to ReLu Flag and outTileType */ + int32_t minLim, maxLim; + if (enableReLu) + { + minLim = XAI_CNN_CONV_GET_RELU_MIN(param); + maxLim = XAI_CNN_CONV_GET_RELU_MAX(param); + } + else + { + minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \ + SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0); + maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \ + : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX); + } + const int8_t typeFlag = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 
1 : 0; + const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile); + + MORPH_IDT_2Nx8 * restrict pdvecIn1; + MORPH_IDT_2Nx8 * restrict pdvecIn2; + xb_vec2Nx8* restrict pdvecOut; + xb_vec2Nx8* restrict pdvecCoeff1, * restrict pdvecCoeff2; + + /* Variable Declarations */ + int32_t outCh, x, y, ky; + const int32_t vectorizationWidth = 2 * XCHAL_IVPN_SIMD_WIDTH; + int32_t varLen; + + if (kSizeX > 12) + { + /* loop across output channels is unrolled twice + * to produce two output channels in 1 iteration. + * Also loop across output height by 2 , thereby + * producing 4 output vectors simultaneously. + */ + for (x = 0; x < outW; x += vectorizationWidth) /* Loop across Output width */ + { + /* out of bound flag */ + int32_t flag = XT_SALT(64, inW - x); + + for (y = 0; y < outH; y += 2) /* Loop across Output height */ + { + /* handles odd output row */ + int32_t enable2ndRow = XT_SALT(y, outH - 1); + + /* initialize output data pointer */ + int8_t *pOutput = &pOutData[(y * outDataPitch1 + x) * bytesPerPixel]; + + /* initialize input data pointer */ + MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * (y) + (x)]; + + /* initialize coeff and bias data pointer*/ + int8_t *pCoeff = &pCoeffData[0]; + int32_t *pBias = &pBiasData[0]; + + for (outCh = 0; outCh < numOutCh; outCh += 2) /* Loop across Output depth */ + { + /* handles odd output channel */ + int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1); + + xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4 * enable2ndCh); + xb_vecN_2x32v hvecBias2; IVP_LSRN_2X32_XP(hvecBias2, pBias, 4); + + /* wide vectors(accumulators) initialized with bias */ + xb_vec2Nx24 dacc11, dacc12, dacc21, dacc22; + + dacc12 = dacc11 = IVP_CVT24UNX32L(hvecBias1, hvecBias1); + IVP_CVT24UNX32H(dacc11, hvecBias1, hvecBias1); + IVP_CVT24UNX32H(dacc12, hvecBias1, hvecBias1); + + dacc22 = dacc21 = IVP_CVT24UNX32L(hvecBias2, hvecBias2); + IVP_CVT24UNX32H(dacc21, hvecBias2, hvecBias2); + IVP_CVT24UNX32H(dacc22, hvecBias2, hvecBias2); + + 
/* priming of coeff load is done outside the innermost loop*/ + pdvecCoeff1 = (xb_vec2Nx8 *) (pCoeff); + valign vaCoeffData1; vaCoeffData1 = IVP_LA2NX8_PP(pdvecCoeff1); + + pdvecCoeff2 = (xb_vec2Nx8 *) (pCoeff + coeffPitch3 * enable2ndCh); + valign vaCoeffData2; vaCoeffData2 = IVP_LA2NX8_PP(pdvecCoeff2); + + /* variable declarations for input and coeff vectors */ + + xb_vec2Nx8 dvecCoeffData11; + xb_vec2Nx8 dvecCoeffData21; + + /* dvecInData11 refers to 1st input row, first 64(or lesser) elements + * and dvecInData12 refers to next few left out elements of the same row + * required to compute one 64 way output vector(To compute one 64 way + * output vector, we require 64 + edge1 + edge2 number of input elements) + */ + xb_vec2Nx8 dvecInData11, dvecInData12; + xb_vec2Nx8 dvecInData21, dvecInData22; + /***************************** 1st inCh ****************************/ + pdvecIn1 = (MORPH_IDT_2Nx8 *) (pInput); + pdvecIn2 = (MORPH_IDT_2Nx8 *) (pInput + inDataPitch1 * enable2ndRow); + + for (ky = 0; ky < kSizeY; ky++) /* Loop across kernel height */ + { + /* loads 1st input row */ + valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1); + MORPH_OP_LOAD_2Nx8(dvecInData11, vaInData, pdvecIn1, 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + MORPH_OP_LOAD_2Nx8(dvecInData12, vaInData, pdvecIn1, inDataPitch1 - \ + 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + + /* loads 2nd input row */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn2); + MORPH_OP_LOAD_2Nx8(dvecInData21, vaInData, pdvecIn2, 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + MORPH_OP_LOAD_2Nx8(dvecInData22, vaInData, pdvecIn2, inDataPitch1 - \ + 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + + /* load 1 row of coeff for 1st output channel */ + IVP_LAV2NX8_XP(dvecCoeffData11, vaCoeffData1, pdvecCoeff1, coeffPitch1); + + /* load 1 row of coeff for 2nd output channel */ + IVP_LAV2NX8_XP(dvecCoeffData21, vaCoeffData2, pdvecCoeff2, coeffPitch1); + + /* multiples loaded input data with first four coeff */ + MORPH_OP_MUL4TA(dacc11, dvecInData12, dvecInData11, 
IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \ + 0)); + MORPH_OP_MUL4TA(dacc21, dvecInData12, dvecInData11, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \ + 0)); + + MORPH_OP_MUL4TA(dacc12, dvecInData22, dvecInData21, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \ + 0)); + MORPH_OP_MUL4TA(dacc22, dvecInData22, dvecInData21, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \ + 0)); + + /* right rotate the input vectors by 4 + * in order to multiply with next column of + * coeff in the next iteration + */ + dvecInData11 = IVP_SEL2NX8I(dvecInData12, dvecInData11, IVP_SELI_8B_ROTATE_RIGHT_4); + dvecInData12 = IVP_SEL2NX8I((xb_vec2Nx8) 0, dvecInData12, IVP_SELI_8B_ROTATE_RIGHT_4); + dvecInData21 = IVP_SEL2NX8I(dvecInData22, dvecInData21, IVP_SELI_8B_ROTATE_RIGHT_4); + dvecInData22 = IVP_SEL2NX8I((xb_vec2Nx8) 0, dvecInData22, IVP_SELI_8B_ROTATE_RIGHT_4); + + /* multiples input data with next four coeffs from the same row */ + MORPH_OP_MUL4TA(dacc11, dvecInData12, dvecInData11, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \ + 1)); + MORPH_OP_MUL4TA(dacc21, dvecInData12, dvecInData11, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \ + 1)); + + MORPH_OP_MUL4TA(dacc12, dvecInData22, dvecInData21, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \ + 1)); + MORPH_OP_MUL4TA(dacc22, dvecInData22, dvecInData21, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \ + 1)); + + /* right rotate the input vectors by 4 + * in order to multiply with next column of + * coeff in the next iteration + */ + dvecInData11 = IVP_SEL2NX8I(dvecInData12, dvecInData11, IVP_SELI_8B_ROTATE_RIGHT_4); + dvecInData12 = IVP_SEL2NX8I((xb_vec2Nx8) 0, dvecInData12, IVP_SELI_8B_ROTATE_RIGHT_4); + dvecInData21 = 
IVP_SEL2NX8I(dvecInData22, dvecInData21, IVP_SELI_8B_ROTATE_RIGHT_4); + dvecInData22 = IVP_SEL2NX8I((xb_vec2Nx8) 0, dvecInData22, IVP_SELI_8B_ROTATE_RIGHT_4); + + /* multiples input data with next four coeffs from the same row */ + MORPH_OP_MUL4TA(dacc11, dvecInData12, dvecInData11, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \ + 2)); + MORPH_OP_MUL4TA(dacc21, dvecInData12, dvecInData11, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \ + 2)); + + MORPH_OP_MUL4TA(dacc12, dvecInData22, dvecInData21, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \ + 2)); + MORPH_OP_MUL4TA(dacc22, dvecInData22, dvecInData21, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \ + 2)); + + /* right rotate the input vectors by 4 + * in order to multiply with next column of + * coeff in the next iteration + */ + dvecInData11 = IVP_SEL2NX8I(dvecInData12, dvecInData11, IVP_SELI_8B_ROTATE_RIGHT_4); + dvecInData12 = IVP_SEL2NX8I((xb_vec2Nx8) 0, dvecInData12, IVP_SELI_8B_ROTATE_RIGHT_4); + dvecInData21 = IVP_SEL2NX8I(dvecInData22, dvecInData21, IVP_SELI_8B_ROTATE_RIGHT_4); + dvecInData22 = IVP_SEL2NX8I((xb_vec2Nx8) 0, dvecInData22, IVP_SELI_8B_ROTATE_RIGHT_4); + + /* multiples input data with next four coeffs from the same row */ + MORPH_OP_MUL4TA(dacc11, dvecInData12, dvecInData11, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \ + 3)); + MORPH_OP_MUL4TA(dacc21, dvecInData12, dvecInData11, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \ + 3)); + + MORPH_OP_MUL4TA(dacc12, dvecInData22, dvecInData21, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \ + 3)); + MORPH_OP_MUL4TA(dacc22, dvecInData22, dvecInData21, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \ + 3)); + } /* end of for (ky = 0; ky < kSizeY; ky++)*/ + 
+ /***************************** 2nd inCh ****************************/ + pdvecIn1 = (MORPH_IDT_2Nx8 *) (pInput + inDataPitch2); + pdvecIn2 = (MORPH_IDT_2Nx8 *) (pInput + inDataPitch2 + inDataPitch1 * enable2ndRow); + + for (ky = 0; ky < kSizeY; ky++) /* Loop across kernel height */ + { + /* loads 1st input row */ + valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1); + MORPH_OP_LOAD_2Nx8(dvecInData11, vaInData, pdvecIn1, 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + MORPH_OP_LOAD_2Nx8(dvecInData12, vaInData, pdvecIn1, inDataPitch1 - \ + 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + + /* loads 2nd input row */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn2); + MORPH_OP_LOAD_2Nx8(dvecInData21, vaInData, pdvecIn2, 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + MORPH_OP_LOAD_2Nx8(dvecInData22, vaInData, pdvecIn2, inDataPitch1 - \ + 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + + /* load 1 row of coeff for 1st output channel */ + IVP_LAV2NX8_XP(dvecCoeffData11, vaCoeffData1, pdvecCoeff1, coeffPitch1); + + /* load 1 row of coeff for 2nd output channel */ + IVP_LAV2NX8_XP(dvecCoeffData21, vaCoeffData2, pdvecCoeff2, coeffPitch1); + + /* multiples loaded input data with first four coeff */ + MORPH_OP_MUL4TA(dacc11, dvecInData12, dvecInData11, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \ + 0)); + MORPH_OP_MUL4TA(dacc21, dvecInData12, dvecInData11, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \ + 0)); + + MORPH_OP_MUL4TA(dacc12, dvecInData22, dvecInData21, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \ + 0)); + MORPH_OP_MUL4TA(dacc22, dvecInData22, dvecInData21, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \ + 0)); + + /* right rotate the input vectors by 4 + * in order to multiply with next column of + * coeff in the next iteration + */ + dvecInData11 = IVP_SEL2NX8I(dvecInData12, dvecInData11, IVP_SELI_8B_ROTATE_RIGHT_4); + dvecInData12 = IVP_SEL2NX8I((xb_vec2Nx8) 0, 
dvecInData12, IVP_SELI_8B_ROTATE_RIGHT_4); + dvecInData21 = IVP_SEL2NX8I(dvecInData22, dvecInData21, IVP_SELI_8B_ROTATE_RIGHT_4); + dvecInData22 = IVP_SEL2NX8I((xb_vec2Nx8) 0, dvecInData22, IVP_SELI_8B_ROTATE_RIGHT_4); + + /* multiples input data with next four coeffs from the same row */ + MORPH_OP_MUL4TA(dacc11, dvecInData12, dvecInData11, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \ + 1)); + MORPH_OP_MUL4TA(dacc21, dvecInData12, dvecInData11, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \ + 1)); + + MORPH_OP_MUL4TA(dacc12, dvecInData22, dvecInData21, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \ + 1)); + MORPH_OP_MUL4TA(dacc22, dvecInData22, dvecInData21, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \ + 1)); + + /* right rotate the input vectors by 4 + * in order to multiply with next column of + * coeff in the next iteration + */ + dvecInData11 = IVP_SEL2NX8I(dvecInData12, dvecInData11, IVP_SELI_8B_ROTATE_RIGHT_4); + dvecInData12 = IVP_SEL2NX8I((xb_vec2Nx8) 0, dvecInData12, IVP_SELI_8B_ROTATE_RIGHT_4); + dvecInData21 = IVP_SEL2NX8I(dvecInData22, dvecInData21, IVP_SELI_8B_ROTATE_RIGHT_4); + dvecInData22 = IVP_SEL2NX8I((xb_vec2Nx8) 0, dvecInData22, IVP_SELI_8B_ROTATE_RIGHT_4); + + /* multiples input data with next four coeffs from the same row */ + MORPH_OP_MUL4TA(dacc11, dvecInData12, dvecInData11, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \ + 2)); + MORPH_OP_MUL4TA(dacc21, dvecInData12, dvecInData11, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \ + 2)); + + MORPH_OP_MUL4TA(dacc12, dvecInData22, dvecInData21, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \ + 2)); + MORPH_OP_MUL4TA(dacc22, dvecInData22, dvecInData21, IVP_EXTRN_2X32 \ + 
(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \ + 2)); + + /* right rotate the input vectors by 4 + * in order to multiply with next column of + * coeff in the next iteration + */ + dvecInData11 = IVP_SEL2NX8I(dvecInData12, dvecInData11, IVP_SELI_8B_ROTATE_RIGHT_4); + dvecInData12 = IVP_SEL2NX8I((xb_vec2Nx8) 0, dvecInData12, IVP_SELI_8B_ROTATE_RIGHT_4); + dvecInData21 = IVP_SEL2NX8I(dvecInData22, dvecInData21, IVP_SELI_8B_ROTATE_RIGHT_4); + dvecInData22 = IVP_SEL2NX8I((xb_vec2Nx8) 0, dvecInData22, IVP_SELI_8B_ROTATE_RIGHT_4); + + /* multiples input data with next four coeffs from the same row */ + MORPH_OP_MUL4TA(dacc11, dvecInData12, dvecInData11, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \ + 3)); + MORPH_OP_MUL4TA(dacc21, dvecInData12, dvecInData11, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \ + 3)); + + MORPH_OP_MUL4TA(dacc12, dvecInData22, dvecInData21, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \ + 3)); + MORPH_OP_MUL4TA(dacc22, dvecInData22, dvecInData21, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \ + 3)); + } /* end of for (ky = 0; ky < kSizeY; ky++)*/ + + /***************************** 3rd inCh ****************************/ + pdvecIn1 = (MORPH_IDT_2Nx8 *) (pInput + 2 * inDataPitch2); + pdvecIn2 = (MORPH_IDT_2Nx8 *) (pInput + 2 * inDataPitch2 + inDataPitch1 * enable2ndRow); + + for (ky = 0; ky < kSizeY; ky++) /* Loop across kernel height */ + { + /* loads 1st input row */ + valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1); + MORPH_OP_LOAD_2Nx8(dvecInData11, vaInData, pdvecIn1, 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + MORPH_OP_LOAD_2Nx8(dvecInData12, vaInData, pdvecIn1, inDataPitch1 - \ + 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + + /* loads 2nd input row */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn2); + MORPH_OP_LOAD_2Nx8(dvecInData21, vaInData, pdvecIn2, 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + 
MORPH_OP_LOAD_2Nx8(dvecInData22, vaInData, pdvecIn2, inDataPitch1 - \ + 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + + /* load 1 row of coeff for 1st output channel */ + IVP_LAV2NX8_XP(dvecCoeffData11, vaCoeffData1, pdvecCoeff1, coeffPitch1); + + /* load 1 row of coeff for 2nd output channel */ + IVP_LAV2NX8_XP(dvecCoeffData21, vaCoeffData2, pdvecCoeff2, coeffPitch1); + + /* multiples loaded input data with first four coeff */ + MORPH_OP_MUL4TA(dacc11, dvecInData12, dvecInData11, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \ + 0)); + MORPH_OP_MUL4TA(dacc21, dvecInData12, dvecInData11, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \ + 0)); + + MORPH_OP_MUL4TA(dacc12, dvecInData22, dvecInData21, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \ + 0)); + MORPH_OP_MUL4TA(dacc22, dvecInData22, dvecInData21, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \ + 0)); + + /* right rotate the input vectors by 4 + * in order to multiply with next column of + * coeff in the next iteration + */ + dvecInData11 = IVP_SEL2NX8I(dvecInData12, dvecInData11, IVP_SELI_8B_ROTATE_RIGHT_4); + dvecInData12 = IVP_SEL2NX8I((xb_vec2Nx8) 0, dvecInData12, IVP_SELI_8B_ROTATE_RIGHT_4); + dvecInData21 = IVP_SEL2NX8I(dvecInData22, dvecInData21, IVP_SELI_8B_ROTATE_RIGHT_4); + dvecInData22 = IVP_SEL2NX8I((xb_vec2Nx8) 0, dvecInData22, IVP_SELI_8B_ROTATE_RIGHT_4); + + /* multiples input data with next four coeffs from the same row */ + MORPH_OP_MUL4TA(dacc11, dvecInData12, dvecInData11, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \ + 1)); + MORPH_OP_MUL4TA(dacc21, dvecInData12, dvecInData11, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \ + 1)); + + MORPH_OP_MUL4TA(dacc12, dvecInData22, dvecInData21, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \ + 1)); + 
MORPH_OP_MUL4TA(dacc22, dvecInData22, dvecInData21, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \ + 1)); + + /* right rotate the input vectors by 4 + * in order to multiply with next column of + * coeff in the next iteration + */ + dvecInData11 = IVP_SEL2NX8I(dvecInData12, dvecInData11, IVP_SELI_8B_ROTATE_RIGHT_4); + dvecInData12 = IVP_SEL2NX8I((xb_vec2Nx8) 0, dvecInData12, IVP_SELI_8B_ROTATE_RIGHT_4); + dvecInData21 = IVP_SEL2NX8I(dvecInData22, dvecInData21, IVP_SELI_8B_ROTATE_RIGHT_4); + dvecInData22 = IVP_SEL2NX8I((xb_vec2Nx8) 0, dvecInData22, IVP_SELI_8B_ROTATE_RIGHT_4); + + /* multiples input data with next four coeffs from the same row */ + MORPH_OP_MUL4TA(dacc11, dvecInData12, dvecInData11, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \ + 2)); + MORPH_OP_MUL4TA(dacc21, dvecInData12, dvecInData11, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \ + 2)); + + MORPH_OP_MUL4TA(dacc12, dvecInData22, dvecInData21, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \ + 2)); + MORPH_OP_MUL4TA(dacc22, dvecInData22, dvecInData21, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \ + 2)); + + /* right rotate the input vectors by 4 + * in order to multiply with next column of + * coeff in the next iteration + */ + dvecInData11 = IVP_SEL2NX8I(dvecInData12, dvecInData11, IVP_SELI_8B_ROTATE_RIGHT_4); + dvecInData12 = IVP_SEL2NX8I((xb_vec2Nx8) 0, dvecInData12, IVP_SELI_8B_ROTATE_RIGHT_4); + dvecInData21 = IVP_SEL2NX8I(dvecInData22, dvecInData21, IVP_SELI_8B_ROTATE_RIGHT_4); + dvecInData22 = IVP_SEL2NX8I((xb_vec2Nx8) 0, dvecInData22, IVP_SELI_8B_ROTATE_RIGHT_4); + + /* multiples input data with next four coeffs from the same row */ + MORPH_OP_MUL4TA(dacc11, dvecInData12, dvecInData11, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \ + 3)); + MORPH_OP_MUL4TA(dacc21, 
dvecInData12, dvecInData11, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \ + 3)); + + MORPH_OP_MUL4TA(dacc12, dvecInData22, dvecInData21, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \ + 3)); + MORPH_OP_MUL4TA(dacc22, dvecInData22, dvecInData21, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \ + 3)); + } /* end of for (ky = 0; ky < kSizeY; ky++)*/ + + /* Pack, Output Scale, Output Shift and clamping */ + xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L; + xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H; +#if DILATED_VQ_CONV == VQ_TRUE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc11, packShiftAccU, \ + pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc12, packShiftAccU, \ + pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc21, packShiftAccU, \ + pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc22, packShiftAccU, \ + pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag); +#elif DILATED_VQ_CONV == VQ_FALSE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc11, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc12, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc21, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc22, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); +#endif + /* variable store count */ + varLen = XT_MIN(outW - x, vectorizationWidth); + + /* Storing the first row , first depth output */ + pdvecOut = (xb_vec2Nx8 *) (pOutput); + valign 
vaOutData = IVP_ZALIGN(); + IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * varLen); + IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * \ + 2 * (varLen - XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the first row , 2nd depth output */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + enable2ndCh * outDataPitch2 * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * enable2ndCh * varLen); + IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * \ + enable2ndCh * 2 * (varLen - XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the 2nd row , 1st depth output */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + enable2ndRow * outDataPitch1 * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * enable2ndRow * varLen); + IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * \ + enable2ndRow * 2 * (varLen - XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the 2nd row , 2nd depth output */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + (enable2ndCh * outDataPitch2 + \ + enable2ndRow * outDataPitch1) * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * \ + enable2ndRow * enable2ndCh * varLen); + IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * \ + enable2ndRow * enable2ndCh * 2 * (varLen - XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + pOutput += 2 * outDataPitch2 * bytesPerPixel; + pCoeff += 2 * coeffPitch3; + } /* end of (outCh = 0; outCh < numOutCh; outCh += 2)*/ + } /* end of for (y = 0; y < outH; y += 2)*/ + } /* end of for (x = 0; x < outW; x += vectorizationWidth)*/ + } + else if (kSizeX > 8) + { + /* loop across output channels is unrolled twice + * to produce two output channels in 1 iteration. + * Also loop across output height by 2 , thereby + * producing 4 output vectors simultaneously. 
+ */ + for (x = 0; x < outW; x += vectorizationWidth) /* Loop across Output width */ + { + /* out of bound flag */ + int32_t flag = XT_SALT(64, inW - x); + + for (y = 0; y < outH; y += 2) /* Loop across Output height */ + { + /* handles odd output row */ + int32_t enable2ndRow = XT_SALT(y, outH - 1); + /* initialize output data pointer */ + int8_t *pOutput = &pOutData[(y * outDataPitch1 + x) * bytesPerPixel]; + + /* initialize input data pointer */ + MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * (y) + (x)]; + + /* initialize coeff and bias data pointer*/ + int8_t *pCoeff = &pCoeffData[0]; + int32_t *pBias = &pBiasData[0]; + + for (outCh = 0; outCh < numOutCh; outCh += 2) /* Loop across Output depth */ + { + /* handles odd output channel*/ + int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1); + + xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4 * enable2ndCh); + xb_vecN_2x32v hvecBias2; IVP_LSRN_2X32_XP(hvecBias2, pBias, 4); + + /* wide vectors(accumulators) initialized with bias */ + xb_vec2Nx24 dacc11, dacc12, dacc21, dacc22; + + dacc12 = dacc11 = IVP_CVT24UNX32L(hvecBias1, hvecBias1); + IVP_CVT24UNX32H(dacc11, hvecBias1, hvecBias1); + IVP_CVT24UNX32H(dacc12, hvecBias1, hvecBias1); + + dacc22 = dacc21 = IVP_CVT24UNX32L(hvecBias2, hvecBias2); + IVP_CVT24UNX32H(dacc21, hvecBias2, hvecBias2); + IVP_CVT24UNX32H(dacc22, hvecBias2, hvecBias2); + + /* priming of coeff load is done outside the innermost loop*/ + pdvecCoeff1 = (xb_vec2Nx8 *) (pCoeff); + valign vaCoeffData1; vaCoeffData1 = IVP_LA2NX8_PP(pdvecCoeff1); + + pdvecCoeff2 = (xb_vec2Nx8 *) (pCoeff + coeffPitch3 * enable2ndCh); + valign vaCoeffData2; vaCoeffData2 = IVP_LA2NX8_PP(pdvecCoeff2); + + /* variable declarations for input and coeff vectors */ + + xb_vec2Nx8 dvecCoeffData11; + xb_vec2Nx8 dvecCoeffData21; + + /* dvecInData11 refers to 1st input row, first 64(or lesser) elements + * and dvecInData12 refers to next few left out elements of the same row + * required to compute one 64 way 
output vector(To compute one 64 way + * output vector, we require 64 + edge1 + edge2 number of input elements) + */ + xb_vec2Nx8 dvecInData11, dvecInData12; + xb_vec2Nx8 dvecInData21, dvecInData22; + /************************** 1st inCh *******************************/ + pdvecIn1 = (MORPH_IDT_2Nx8 *) (pInput); + pdvecIn2 = (MORPH_IDT_2Nx8 *) (pInput + inDataPitch1 * enable2ndRow); + + for (ky = 0; ky < kSizeY; ky++) /* Loop across kernel height */ + { + /* loads 1st input row */ + valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1); + MORPH_OP_LOAD_2Nx8(dvecInData11, vaInData, pdvecIn1, 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + MORPH_OP_LOAD_2Nx8(dvecInData12, vaInData, pdvecIn1, inDataPitch1 - \ + 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + + /* loads 2nd input row */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn2); + MORPH_OP_LOAD_2Nx8(dvecInData21, vaInData, pdvecIn2, 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + MORPH_OP_LOAD_2Nx8(dvecInData22, vaInData, pdvecIn2, inDataPitch1 - \ + 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + + /* load 1 row of coeff for 1st output channel */ + IVP_LAV2NX8_XP(dvecCoeffData11, vaCoeffData1, pdvecCoeff1, coeffPitch1); + + /* load 1 row of coeff for 2nd output channel */ + IVP_LAV2NX8_XP(dvecCoeffData21, vaCoeffData2, pdvecCoeff2, coeffPitch1); + + /* multiples loaded input data with first four coeff */ + MORPH_OP_MUL4TA(dacc11, dvecInData12, dvecInData11, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \ + 0)); + MORPH_OP_MUL4TA(dacc21, dvecInData12, dvecInData11, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \ + 0)); + + MORPH_OP_MUL4TA(dacc12, dvecInData22, dvecInData21, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \ + 0)); + MORPH_OP_MUL4TA(dacc22, dvecInData22, dvecInData21, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \ + 0)); + + /* right rotate the input vectors by 4 + * in order to multiply with next column of + 
* coeff in the next iteration + */ + dvecInData11 = IVP_SEL2NX8I(dvecInData12, dvecInData11, IVP_SELI_8B_ROTATE_RIGHT_4); + dvecInData12 = IVP_SEL2NX8I((xb_vec2Nx8) 0, dvecInData12, IVP_SELI_8B_ROTATE_RIGHT_4); + dvecInData21 = IVP_SEL2NX8I(dvecInData22, dvecInData21, IVP_SELI_8B_ROTATE_RIGHT_4); + dvecInData22 = IVP_SEL2NX8I((xb_vec2Nx8) 0, dvecInData22, IVP_SELI_8B_ROTATE_RIGHT_4); + + /* multiples input data with next four coeffs from the same row */ + MORPH_OP_MUL4TA(dacc11, dvecInData12, dvecInData11, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \ + 1)); + MORPH_OP_MUL4TA(dacc21, dvecInData12, dvecInData11, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \ + 1)); + + MORPH_OP_MUL4TA(dacc12, dvecInData22, dvecInData21, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \ + 1)); + MORPH_OP_MUL4TA(dacc22, dvecInData22, dvecInData21, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \ + 1)); + + /* right rotate the input vectors by 4 + * in order to multiply with next column of + * coeff in the next iteration + */ + dvecInData11 = IVP_SEL2NX8I(dvecInData12, dvecInData11, IVP_SELI_8B_ROTATE_RIGHT_4); + dvecInData12 = IVP_SEL2NX8I((xb_vec2Nx8) 0, dvecInData12, IVP_SELI_8B_ROTATE_RIGHT_4); + dvecInData21 = IVP_SEL2NX8I(dvecInData22, dvecInData21, IVP_SELI_8B_ROTATE_RIGHT_4); + dvecInData22 = IVP_SEL2NX8I((xb_vec2Nx8) 0, dvecInData22, IVP_SELI_8B_ROTATE_RIGHT_4); + + /* multiples input data with next four coeffs from the same row */ + MORPH_OP_MUL4TA(dacc11, dvecInData12, dvecInData11, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \ + 2)); + MORPH_OP_MUL4TA(dacc21, dvecInData12, dvecInData11, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \ + 2)); + + MORPH_OP_MUL4TA(dacc12, dvecInData22, dvecInData21, IVP_EXTRN_2X32 \ + 
(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \ + 2)); + MORPH_OP_MUL4TA(dacc22, dvecInData22, dvecInData21, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \ + 2)); + } /* end of for (ky = 0; ky < kSizeY; ky++)*/ + + /************************** 2nd inCh *******************************/ + pdvecIn1 = (MORPH_IDT_2Nx8 *) (pInput + inDataPitch2); + pdvecIn2 = (MORPH_IDT_2Nx8 *) (pInput + inDataPitch2 + inDataPitch1 * enable2ndRow); + + for (ky = 0; ky < kSizeY; ky++) /* Loop across kernel height */ + { + /* loads 1st input row */ + valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1); + MORPH_OP_LOAD_2Nx8(dvecInData11, vaInData, pdvecIn1, 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + MORPH_OP_LOAD_2Nx8(dvecInData12, vaInData, pdvecIn1, inDataPitch1 - \ + 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + + /* loads 2nd input row */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn2); + MORPH_OP_LOAD_2Nx8(dvecInData21, vaInData, pdvecIn2, 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + MORPH_OP_LOAD_2Nx8(dvecInData22, vaInData, pdvecIn2, inDataPitch1 - \ + 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + + /* load 1 row of coeff for 1st output channel */ + IVP_LAV2NX8_XP(dvecCoeffData11, vaCoeffData1, pdvecCoeff1, coeffPitch1); + + /* load 1 row of coeff for 2nd output channel */ + IVP_LAV2NX8_XP(dvecCoeffData21, vaCoeffData2, pdvecCoeff2, coeffPitch1); + + /* multiples loaded input data with first four coeff */ + MORPH_OP_MUL4TA(dacc11, dvecInData12, dvecInData11, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \ + 0)); + MORPH_OP_MUL4TA(dacc21, dvecInData12, dvecInData11, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \ + 0)); + + MORPH_OP_MUL4TA(dacc12, dvecInData22, dvecInData21, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \ + 0)); + MORPH_OP_MUL4TA(dacc22, dvecInData22, dvecInData21, IVP_EXTRN_2X32 \ + 
(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \ + 0)); + + /* right rotate the input vectors by 4 + * in order to multiply with next column of + * coeff in the next iteration + */ + dvecInData11 = IVP_SEL2NX8I(dvecInData12, dvecInData11, IVP_SELI_8B_ROTATE_RIGHT_4); + dvecInData12 = IVP_SEL2NX8I((xb_vec2Nx8) 0, dvecInData12, IVP_SELI_8B_ROTATE_RIGHT_4); + dvecInData21 = IVP_SEL2NX8I(dvecInData22, dvecInData21, IVP_SELI_8B_ROTATE_RIGHT_4); + dvecInData22 = IVP_SEL2NX8I((xb_vec2Nx8) 0, dvecInData22, IVP_SELI_8B_ROTATE_RIGHT_4); + + /* multiples input data with next four coeffs from the same row */ + MORPH_OP_MUL4TA(dacc11, dvecInData12, dvecInData11, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \ + 1)); + MORPH_OP_MUL4TA(dacc21, dvecInData12, dvecInData11, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \ + 1)); + + MORPH_OP_MUL4TA(dacc12, dvecInData22, dvecInData21, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \ + 1)); + MORPH_OP_MUL4TA(dacc22, dvecInData22, dvecInData21, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \ + 1)); + + /* right rotate the input vectors by 4 + * in order to multiply with next column of + * coeff in the next iteration + */ + dvecInData11 = IVP_SEL2NX8I(dvecInData12, dvecInData11, IVP_SELI_8B_ROTATE_RIGHT_4); + dvecInData12 = IVP_SEL2NX8I((xb_vec2Nx8) 0, dvecInData12, IVP_SELI_8B_ROTATE_RIGHT_4); + dvecInData21 = IVP_SEL2NX8I(dvecInData22, dvecInData21, IVP_SELI_8B_ROTATE_RIGHT_4); + dvecInData22 = IVP_SEL2NX8I((xb_vec2Nx8) 0, dvecInData22, IVP_SELI_8B_ROTATE_RIGHT_4); + + /* multiples input data with next four coeffs from the same row */ + MORPH_OP_MUL4TA(dacc11, dvecInData12, dvecInData11, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \ + 2)); + MORPH_OP_MUL4TA(dacc21, dvecInData12, dvecInData11, IVP_EXTRN_2X32 \ + 
(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \ + 2)); + + MORPH_OP_MUL4TA(dacc12, dvecInData22, dvecInData21, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \ + 2)); + MORPH_OP_MUL4TA(dacc22, dvecInData22, dvecInData21, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \ + 2)); + } /* end of for (ky = 0; ky < kSizeY; ky++)*/ + + /************************** 3rd inCh *******************************/ + pdvecIn1 = (MORPH_IDT_2Nx8 *) (pInput + 2 * inDataPitch2); + pdvecIn2 = (MORPH_IDT_2Nx8 *) (pInput + 2 * inDataPitch2 + inDataPitch1 * enable2ndRow); + + for (ky = 0; ky < kSizeY; ky++) /* Loop across kernel height */ + { + /* loads 1st input row */ + valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1); + MORPH_OP_LOAD_2Nx8(dvecInData11, vaInData, pdvecIn1, 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + MORPH_OP_LOAD_2Nx8(dvecInData12, vaInData, pdvecIn1, inDataPitch1 - \ + 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + + /* loads 2nd input row */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn2); + MORPH_OP_LOAD_2Nx8(dvecInData21, vaInData, pdvecIn2, 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + MORPH_OP_LOAD_2Nx8(dvecInData22, vaInData, pdvecIn2, inDataPitch1 - \ + 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + + /* load 1 row of coeff for 1st output channel */ + IVP_LAV2NX8_XP(dvecCoeffData11, vaCoeffData1, pdvecCoeff1, coeffPitch1); + + /* load 1 row of coeff for 2nd output channel */ + IVP_LAV2NX8_XP(dvecCoeffData21, vaCoeffData2, pdvecCoeff2, coeffPitch1); + + /* multiples loaded input data with first four coeff */ + MORPH_OP_MUL4TA(dacc11, dvecInData12, dvecInData11, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \ + 0)); + MORPH_OP_MUL4TA(dacc21, dvecInData12, dvecInData11, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \ + 0)); + + MORPH_OP_MUL4TA(dacc12, dvecInData22, dvecInData21, IVP_EXTRN_2X32 \ + 
(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \ + 0)); + MORPH_OP_MUL4TA(dacc22, dvecInData22, dvecInData21, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \ + 0)); + + /* right rotate the input vectors by 4 + * in order to multiply with next column of + * coeff in the next iteration + */ + dvecInData11 = IVP_SEL2NX8I(dvecInData12, dvecInData11, IVP_SELI_8B_ROTATE_RIGHT_4); + dvecInData12 = IVP_SEL2NX8I((xb_vec2Nx8) 0, dvecInData12, IVP_SELI_8B_ROTATE_RIGHT_4); + dvecInData21 = IVP_SEL2NX8I(dvecInData22, dvecInData21, IVP_SELI_8B_ROTATE_RIGHT_4); + dvecInData22 = IVP_SEL2NX8I((xb_vec2Nx8) 0, dvecInData22, IVP_SELI_8B_ROTATE_RIGHT_4); + + /* multiples input data with next four coeffs from the same row */ + MORPH_OP_MUL4TA(dacc11, dvecInData12, dvecInData11, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \ + 1)); + MORPH_OP_MUL4TA(dacc21, dvecInData12, dvecInData11, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \ + 1)); + + MORPH_OP_MUL4TA(dacc12, dvecInData22, dvecInData21, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \ + 1)); + MORPH_OP_MUL4TA(dacc22, dvecInData22, dvecInData21, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \ + 1)); + + /* right rotate the input vectors by 4 + * in order to multiply with next column of + * coeff in the next iteration + */ + dvecInData11 = IVP_SEL2NX8I(dvecInData12, dvecInData11, IVP_SELI_8B_ROTATE_RIGHT_4); + dvecInData12 = IVP_SEL2NX8I((xb_vec2Nx8) 0, dvecInData12, IVP_SELI_8B_ROTATE_RIGHT_4); + dvecInData21 = IVP_SEL2NX8I(dvecInData22, dvecInData21, IVP_SELI_8B_ROTATE_RIGHT_4); + dvecInData22 = IVP_SEL2NX8I((xb_vec2Nx8) 0, dvecInData22, IVP_SELI_8B_ROTATE_RIGHT_4); + + /* multiples input data with next four coeffs from the same row */ + MORPH_OP_MUL4TA(dacc11, dvecInData12, dvecInData11, IVP_EXTRN_2X32 \ + 
(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \ + 2)); + MORPH_OP_MUL4TA(dacc21, dvecInData12, dvecInData11, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \ + 2)); + + MORPH_OP_MUL4TA(dacc12, dvecInData22, dvecInData21, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \ + 2)); + MORPH_OP_MUL4TA(dacc22, dvecInData22, dvecInData21, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \ + 2)); + } /* end of for (ky = 0; ky < kSizeY; ky++)*/ + + /* Pack, Output Scale, Output Shift and clamping */ + xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L; + xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H; +#if DILATED_VQ_CONV == VQ_TRUE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc11, packShiftAccU, \ + pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc12, packShiftAccU, \ + pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc21, packShiftAccU, \ + pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc22, packShiftAccU, \ + pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag); +#elif DILATED_VQ_CONV == VQ_FALSE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc11, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc12, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc21, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc22, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); +#endif + /* variable store count */ + varLen = XT_MIN(outW - x, vectorizationWidth); + + /* 
Storing the first row , first depth output */ + pdvecOut = (xb_vec2Nx8 *) (pOutput); + valign vaOutData = IVP_ZALIGN(); + IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * varLen); + IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * \ + 2 * (varLen - XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the first row , 2nd depth output */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + enable2ndCh * outDataPitch2 * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * enable2ndCh * varLen); + IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * \ + enable2ndCh * 2 * (varLen - XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the 2nd row , 1st depth output */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + enable2ndRow * outDataPitch1 * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * enable2ndRow * varLen); + IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * \ + enable2ndRow * 2 * (varLen - XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the 2nd row , 2nd depth output */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + (enable2ndCh * outDataPitch2 + \ + enable2ndRow * outDataPitch1) * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * \ + enable2ndRow * enable2ndCh * varLen); + IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * \ + enable2ndRow * enable2ndCh * 2 * (varLen - XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + pOutput += 2 * outDataPitch2 * bytesPerPixel; + pCoeff += 2 * coeffPitch3; + } /* end of (outCh = 0; outCh < numOutCh; outCh += 2)*/ + } /* end of for (y = 0; y < outH; y += 2)*/ + } /* end of for (x = 0; x < outW; x += vectorizationWidth)*/ + } + else if (kSizeX > 4) + { + /* loop across output channels is unrolled twice + * to produce two output channels in 1 iteration. 
+ * Also loop across output height by 2 , thereby + * producing 4 output vectors simultaneously. + */ + for (x = 0; x < outW; x += vectorizationWidth) /* Loop across Output width */ + { + /* out of bound flag */ + int32_t flag = XT_SALT(64, inW - x); + + for (y = 0; y < outH; y += 2) /* Loop across Output height */ + { + /* handles odd output row */ + int32_t enable2ndRow = XT_SALT(y, outH - 1); + + /* initialize output data pointer */ + int8_t *pOutput = &pOutData[(y * outDataPitch1 + x) * bytesPerPixel]; + + /* initialize input data pointer */ + MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * (y) + (x)]; + + /* initialize coeff and bias data pointer*/ + int8_t *pCoeff = &pCoeffData[0]; + int32_t *pBias = &pBiasData[0]; + + for (outCh = 0; outCh < numOutCh; outCh += 2) /* Loop across Output depth */ + { + /* handles odd output channel */ + int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1); + + xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4 * enable2ndCh); + xb_vecN_2x32v hvecBias2; IVP_LSRN_2X32_XP(hvecBias2, pBias, 4); + + /* wide vectors(accumulators) initialized with bias */ + xb_vec2Nx24 dacc11, dacc12, dacc21, dacc22; + + dacc12 = dacc11 = IVP_CVT24UNX32L(hvecBias1, hvecBias1); + IVP_CVT24UNX32H(dacc11, hvecBias1, hvecBias1); + IVP_CVT24UNX32H(dacc12, hvecBias1, hvecBias1); + + dacc22 = dacc21 = IVP_CVT24UNX32L(hvecBias2, hvecBias2); + IVP_CVT24UNX32H(dacc21, hvecBias2, hvecBias2); + IVP_CVT24UNX32H(dacc22, hvecBias2, hvecBias2); + + /* priming of coeff load is done outside the innermost loop*/ + pdvecCoeff1 = (xb_vec2Nx8 *) (pCoeff); + valign vaCoeffData1; vaCoeffData1 = IVP_LA2NX8_PP(pdvecCoeff1); + + pdvecCoeff2 = (xb_vec2Nx8 *) (pCoeff + coeffPitch3 * enable2ndCh); + valign vaCoeffData2; vaCoeffData2 = IVP_LA2NX8_PP(pdvecCoeff2); + + /* variable declarations for input and coeff vectors */ + + xb_vec2Nx8 dvecCoeffData11; + xb_vec2Nx8 dvecCoeffData21; + + /* dvecInData11 refers to 1st input row, first 64(or lesser) elements + * and 
dvecInData12 refers to next few left out elements of the same row + * required to compute one 64 way output vector(To compute one 64 way + * output vector, we require 64 + edge1 + edge2 number of input elements) + */ + xb_vec2Nx8 dvecInData11, dvecInData12; + xb_vec2Nx8 dvecInData21, dvecInData22; + + /******************************* 1st inCh ************************************/ + pdvecIn1 = (MORPH_IDT_2Nx8 *) (pInput); + pdvecIn2 = (MORPH_IDT_2Nx8 *) (pInput + inDataPitch1 * enable2ndRow); + + for (ky = 0; ky < kSizeY; ky++) /* Loop across kernel height */ + { + /* loads 1st input row */ + valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1); + MORPH_OP_LOAD_2Nx8(dvecInData11, vaInData, pdvecIn1, 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + MORPH_OP_LOAD_2Nx8(dvecInData12, vaInData, pdvecIn1, inDataPitch1 - \ + 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + + /* loads 2nd input row */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn2); + MORPH_OP_LOAD_2Nx8(dvecInData21, vaInData, pdvecIn2, 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + MORPH_OP_LOAD_2Nx8(dvecInData22, vaInData, pdvecIn2, inDataPitch1 - \ + 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + + /* load 1 row of coeff for 1st output channel */ + IVP_LAV2NX8_XP(dvecCoeffData11, vaCoeffData1, pdvecCoeff1, coeffPitch1); + + /* load 1 row of coeff for 2nd output channel */ + IVP_LAV2NX8_XP(dvecCoeffData21, vaCoeffData2, pdvecCoeff2, coeffPitch1); + + /* multiples loaded input data with first four coeff */ + MORPH_OP_MUL4TA(dacc11, dvecInData12, dvecInData11, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \ + 0)); + MORPH_OP_MUL4TA(dacc21, dvecInData12, dvecInData11, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \ + 0)); + + MORPH_OP_MUL4TA(dacc12, dvecInData22, dvecInData21, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \ + 0)); + MORPH_OP_MUL4TA(dacc22, dvecInData22, dvecInData21, IVP_EXTRN_2X32 \ + 
(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \ + 0)); + + /* right rotate the input vectors by 4 + * in order to multiply with next column of + * coeff in the next iteration + */ + dvecInData11 = IVP_SEL2NX8I(dvecInData12, dvecInData11, IVP_SELI_8B_ROTATE_RIGHT_4); + dvecInData12 = IVP_SEL2NX8I((xb_vec2Nx8) 0, dvecInData12, IVP_SELI_8B_ROTATE_RIGHT_4); + dvecInData21 = IVP_SEL2NX8I(dvecInData22, dvecInData21, IVP_SELI_8B_ROTATE_RIGHT_4); + dvecInData22 = IVP_SEL2NX8I((xb_vec2Nx8) 0, dvecInData22, IVP_SELI_8B_ROTATE_RIGHT_4); + + /* multiples input data with next four coeffs from the same row */ + MORPH_OP_MUL4TA(dacc11, dvecInData12, dvecInData11, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \ + 1)); + MORPH_OP_MUL4TA(dacc21, dvecInData12, dvecInData11, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \ + 1)); + + MORPH_OP_MUL4TA(dacc12, dvecInData22, dvecInData21, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \ + 1)); + MORPH_OP_MUL4TA(dacc22, dvecInData22, dvecInData21, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \ + 1)); + } /* end of for (ky = 0; ky < kSizeY; ky++)*/ + + /******************************* 2nd inCh ************************************/ + pdvecIn1 = (MORPH_IDT_2Nx8 *) (pInput + inDataPitch2); + pdvecIn2 = (MORPH_IDT_2Nx8 *) (pInput + inDataPitch2 + inDataPitch1 * enable2ndRow); + + for (ky = 0; ky < kSizeY; ky++) /* Loop across kernel height */ + { + /* loads 1st input row */ + valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1); + MORPH_OP_LOAD_2Nx8(dvecInData11, vaInData, pdvecIn1, 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + MORPH_OP_LOAD_2Nx8(dvecInData12, vaInData, pdvecIn1, inDataPitch1 - \ + 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + + /* loads 2nd input row */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn2); + MORPH_OP_LOAD_2Nx8(dvecInData21, vaInData, pdvecIn2, 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + 
MORPH_OP_LOAD_2Nx8(dvecInData22, vaInData, pdvecIn2, inDataPitch1 - \ + 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + + /* load 1 row of coeff for 1st output channel */ + IVP_LAV2NX8_XP(dvecCoeffData11, vaCoeffData1, pdvecCoeff1, coeffPitch1); + + /* load 1 row of coeff for 2nd output channel */ + IVP_LAV2NX8_XP(dvecCoeffData21, vaCoeffData2, pdvecCoeff2, coeffPitch1); + + /* multiples loaded input data with first four coeff */ + MORPH_OP_MUL4TA(dacc11, dvecInData12, dvecInData11, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \ + 0)); + MORPH_OP_MUL4TA(dacc21, dvecInData12, dvecInData11, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \ + 0)); + + MORPH_OP_MUL4TA(dacc12, dvecInData22, dvecInData21, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \ + 0)); + MORPH_OP_MUL4TA(dacc22, dvecInData22, dvecInData21, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \ + 0)); + + /* right rotate the input vectors by 4 + * in order to multiply with next column of + * coeff in the next iteration + */ + dvecInData11 = IVP_SEL2NX8I(dvecInData12, dvecInData11, IVP_SELI_8B_ROTATE_RIGHT_4); + dvecInData12 = IVP_SEL2NX8I((xb_vec2Nx8) 0, dvecInData12, IVP_SELI_8B_ROTATE_RIGHT_4); + dvecInData21 = IVP_SEL2NX8I(dvecInData22, dvecInData21, IVP_SELI_8B_ROTATE_RIGHT_4); + dvecInData22 = IVP_SEL2NX8I((xb_vec2Nx8) 0, dvecInData22, IVP_SELI_8B_ROTATE_RIGHT_4); + + /* multiples input data with next four coeffs from the same row */ + MORPH_OP_MUL4TA(dacc11, dvecInData12, dvecInData11, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \ + 1)); + MORPH_OP_MUL4TA(dacc21, dvecInData12, dvecInData11, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \ + 1)); + + MORPH_OP_MUL4TA(dacc12, dvecInData22, dvecInData21, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \ + 1)); + 
MORPH_OP_MUL4TA(dacc22, dvecInData22, dvecInData21, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \ + 1)); + } /* end of for (ky = 0; ky < kSizeY; ky++)*/ + + /******************************* 3rd inCh ************************************/ + pdvecIn1 = (MORPH_IDT_2Nx8 *) (pInput + 2 * inDataPitch2); + pdvecIn2 = (MORPH_IDT_2Nx8 *) (pInput + 2 * inDataPitch2 + inDataPitch1 * enable2ndRow); + + for (ky = 0; ky < kSizeY; ky++) /* Loop across kernel height */ + { + /* loads 1st input row */ + valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1); + MORPH_OP_LOAD_2Nx8(dvecInData11, vaInData, pdvecIn1, 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + MORPH_OP_LOAD_2Nx8(dvecInData12, vaInData, pdvecIn1, inDataPitch1 - \ + 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + + /* loads 2nd input row */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn2); + MORPH_OP_LOAD_2Nx8(dvecInData21, vaInData, pdvecIn2, 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + MORPH_OP_LOAD_2Nx8(dvecInData22, vaInData, pdvecIn2, inDataPitch1 - \ + 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + + /* load 1 row of coeff for 1st output channel */ + IVP_LAV2NX8_XP(dvecCoeffData11, vaCoeffData1, pdvecCoeff1, coeffPitch1); + + /* load 1 row of coeff for 2nd output channel */ + IVP_LAV2NX8_XP(dvecCoeffData21, vaCoeffData2, pdvecCoeff2, coeffPitch1); + + /* multiples loaded input data with first four coeff */ + MORPH_OP_MUL4TA(dacc11, dvecInData12, dvecInData11, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \ + 0)); + MORPH_OP_MUL4TA(dacc21, dvecInData12, dvecInData11, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \ + 0)); + + MORPH_OP_MUL4TA(dacc12, dvecInData22, dvecInData21, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \ + 0)); + MORPH_OP_MUL4TA(dacc22, dvecInData22, dvecInData21, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \ + 0)); + + /* right rotate the input vectors by 4 + * 
in order to multiply with next column of + * coeff in the next iteration + */ + dvecInData11 = IVP_SEL2NX8I(dvecInData12, dvecInData11, IVP_SELI_8B_ROTATE_RIGHT_4); + dvecInData12 = IVP_SEL2NX8I((xb_vec2Nx8) 0, dvecInData12, IVP_SELI_8B_ROTATE_RIGHT_4); + dvecInData21 = IVP_SEL2NX8I(dvecInData22, dvecInData21, IVP_SELI_8B_ROTATE_RIGHT_4); + dvecInData22 = IVP_SEL2NX8I((xb_vec2Nx8) 0, dvecInData22, IVP_SELI_8B_ROTATE_RIGHT_4); + + /* multiples input data with next four coeffs from the same row */ + MORPH_OP_MUL4TA(dacc11, dvecInData12, dvecInData11, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \ + 1)); + MORPH_OP_MUL4TA(dacc21, dvecInData12, dvecInData11, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \ + 1)); + + MORPH_OP_MUL4TA(dacc12, dvecInData22, dvecInData21, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \ + 1)); + MORPH_OP_MUL4TA(dacc22, dvecInData22, dvecInData21, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \ + 1)); + } /* end of for (ky = 0; ky < kSizeY; ky++)*/ + + /* Pack, Output Scale, Output Shift and clamping */ + xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L; + xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H; +#if DILATED_VQ_CONV == VQ_TRUE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc11, packShiftAccU, \ + pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc12, packShiftAccU, \ + pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc21, packShiftAccU, \ + pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc22, packShiftAccU, \ + pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag); +#elif DILATED_VQ_CONV == VQ_FALSE + 
PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc11, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc12, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc21, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc22, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); +#endif + /* variable store count */ + varLen = XT_MIN(outW - x, vectorizationWidth); + + /* Storing the first row , first depth output */ + pdvecOut = (xb_vec2Nx8 *) (pOutput); + valign vaOutData = IVP_ZALIGN(); + IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * varLen); + IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * \ + 2 * (varLen - XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the first row , 2nd depth output */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + enable2ndCh * outDataPitch2 * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * enable2ndCh * varLen); + IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * \ + enable2ndCh * 2 * (varLen - XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the 2nd row , 1st depth output */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + enable2ndRow * outDataPitch1 * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * enable2ndRow * varLen); + IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * \ + enable2ndRow * 2 * (varLen - XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the 2nd row , 2nd depth output */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + (enable2ndCh * outDataPitch2 + \ + enable2ndRow * outDataPitch1) * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * \ + enable2ndRow * enable2ndCh * varLen); + 
IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * \ + enable2ndRow * enable2ndCh * 2 * (varLen - XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + pOutput += 2 * outDataPitch2 * bytesPerPixel; + pCoeff += 2 * coeffPitch3; + } /* end of (outCh = 0; outCh < numOutCh; outCh += 2)*/ + } /* end of for (y = 0; y < outH; y += 2)*/ + } /* end of for (x = 0; x < outW; x += vectorizationWidth)*/ + } + else + { + /* loop across output channels is unrolled twice + * to produce two output channels in 1 iteration. + * Also loop across output height by 2 , thereby + * producing 4 output vectors simultaneously. + */ + for (x = 0; x < outW; x += vectorizationWidth) /* Loop across Output width */ + { + /* out of bound flag */ + int32_t flag = XT_SALT(64, inW - x); + + for (y = 0; y < outH; y += 2) /* Loop across Output height */ + { + /* handles odd output row */ + int32_t enable2ndRow = XT_SALT(y, outH - 1); + + /* initialize output data pointer */ + int8_t *pOutput = &pOutData[(y * outDataPitch1 + x) * bytesPerPixel]; + + /* initialize input data pointer */ + MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * (y) + (x)]; + + /* initialize coeff and bias data pointer*/ + int8_t *pCoeff = &pCoeffData[0]; + int32_t *pBias = &pBiasData[0]; + + for (outCh = 0; outCh < numOutCh; outCh += 2) /* Loop across Output depth */ + { + /* handles odd output channel */ + int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1); + + xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4 * enable2ndCh); + xb_vecN_2x32v hvecBias2; IVP_LSRN_2X32_XP(hvecBias2, pBias, 4); + + + /* wide vectors(accumulators) initialized with bias */ + xb_vec2Nx24 dacc11, dacc12, dacc21, dacc22; + + dacc12 = dacc11 = IVP_CVT24UNX32L(hvecBias1, hvecBias1); + IVP_CVT24UNX32H(dacc11, hvecBias1, hvecBias1); + IVP_CVT24UNX32H(dacc12, hvecBias1, hvecBias1); + + dacc22 = dacc21 = IVP_CVT24UNX32L(hvecBias2, hvecBias2); + IVP_CVT24UNX32H(dacc21, hvecBias2, hvecBias2); + IVP_CVT24UNX32H(dacc22, 
hvecBias2, hvecBias2); + + /* priming of coeff load is done outside the innermost loop*/ + pdvecCoeff1 = (xb_vec2Nx8 *) (pCoeff); + valign vaCoeffData1; vaCoeffData1 = IVP_LA2NX8_PP(pdvecCoeff1); + + pdvecCoeff2 = (xb_vec2Nx8 *) (pCoeff + coeffPitch3 * enable2ndCh); + valign vaCoeffData2; vaCoeffData2 = IVP_LA2NX8_PP(pdvecCoeff2); + + /* variable declarations for input and coeff vectors */ + + xb_vec2Nx8 dvecCoeffData11; + xb_vec2Nx8 dvecCoeffData21; + + /* dvecInData11 refers to 1st input row, first 64(or lesser) elements + * and dvecInData12 refers to next few left out elements of the same row + * required to compute one 64 way output vector(To compute one 64 way + * output vector, we require 64 + edge1 + edge2 number of input elements) + */ + xb_vec2Nx8 dvecInData11, dvecInData12; + xb_vec2Nx8 dvecInData21, dvecInData22; + + /************************* 1st inCh *******************************/ + pdvecIn1 = (MORPH_IDT_2Nx8 *) (pInput); + pdvecIn2 = (MORPH_IDT_2Nx8 *) (pInput + inDataPitch1 * enable2ndRow); + + for (ky = 0; ky < kSizeY; ky++) /* Loop across kernel height */ + { + /* loads 1st input row */ + valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1); + MORPH_OP_LOAD_2Nx8(dvecInData11, vaInData, pdvecIn1, 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + MORPH_OP_LOAD_2Nx8(dvecInData12, vaInData, pdvecIn1, inDataPitch1 - \ + 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + + /* loads 2nd input row */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn2); + MORPH_OP_LOAD_2Nx8(dvecInData21, vaInData, pdvecIn2, 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + MORPH_OP_LOAD_2Nx8(dvecInData22, vaInData, pdvecIn2, inDataPitch1 - \ + 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + + /* load 1 row of coeff for 1st output channel */ + IVP_LAV2NX8_XP(dvecCoeffData11, vaCoeffData1, pdvecCoeff1, coeffPitch1); + + /* load 1 row of coeff for 2nd output channel */ + IVP_LAV2NX8_XP(dvecCoeffData21, vaCoeffData2, pdvecCoeff2, coeffPitch1); + + /* multiples loaded input data with first four coeff */ + MORPH_OP_MUL4TA(dacc11, 
dvecInData12, dvecInData11, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \ + 0)); + MORPH_OP_MUL4TA(dacc21, dvecInData12, dvecInData11, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \ + 0)); + + MORPH_OP_MUL4TA(dacc12, dvecInData22, dvecInData21, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \ + 0)); + MORPH_OP_MUL4TA(dacc22, dvecInData22, dvecInData21, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \ + 0)); + } /* for (ky = 0; ky < kSizeY; ky++)*/ + + /************************* 2nd inCh *******************************/ + pdvecIn1 = (MORPH_IDT_2Nx8 *) (pInput + inDataPitch2); + pdvecIn2 = (MORPH_IDT_2Nx8 *) (pInput + inDataPitch2 + inDataPitch1 * enable2ndRow); + + for (ky = 0; ky < kSizeY; ky++) /* Loop across kernel height */ + { + /* loads 1st input row */ + valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1); + MORPH_OP_LOAD_2Nx8(dvecInData11, vaInData, pdvecIn1, 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + MORPH_OP_LOAD_2Nx8(dvecInData12, vaInData, pdvecIn1, inDataPitch1 - \ + 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + + /* loads 2nd input row */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn2); + MORPH_OP_LOAD_2Nx8(dvecInData21, vaInData, pdvecIn2, 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + MORPH_OP_LOAD_2Nx8(dvecInData22, vaInData, pdvecIn2, inDataPitch1 - \ + 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + + /* load 1 row of coeff for 1st output channel */ + IVP_LAV2NX8_XP(dvecCoeffData11, vaCoeffData1, pdvecCoeff1, coeffPitch1); + + /* load 1 row of coeff for 2nd output channel */ + IVP_LAV2NX8_XP(dvecCoeffData21, vaCoeffData2, pdvecCoeff2, coeffPitch1); + + /* multiples loaded input data with first four coeff */ + MORPH_OP_MUL4TA(dacc11, dvecInData12, dvecInData11, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \ + 0)); + MORPH_OP_MUL4TA(dacc21, dvecInData12, dvecInData11, IVP_EXTRN_2X32 \ + 
(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \ + 0)); + + MORPH_OP_MUL4TA(dacc12, dvecInData22, dvecInData21, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \ + 0)); + MORPH_OP_MUL4TA(dacc22, dvecInData22, dvecInData21, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \ + 0)); + } /* for (ky = 0; ky < kSizeY; ky++)*/ + + /************************* 3rd inCh *******************************/ + pdvecIn1 = (MORPH_IDT_2Nx8 *) (pInput + 2 * inDataPitch2); + pdvecIn2 = (MORPH_IDT_2Nx8 *) (pInput + 2 * inDataPitch2 + inDataPitch1 * enable2ndRow); + + for (ky = 0; ky < kSizeY; ky++) /* Loop across kernel height */ + { + /* loads 1st input row */ + valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1); + MORPH_OP_LOAD_2Nx8(dvecInData11, vaInData, pdvecIn1, 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + MORPH_OP_LOAD_2Nx8(dvecInData12, vaInData, pdvecIn1, inDataPitch1 - \ + 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + + /* loads 2nd input row */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn2); + MORPH_OP_LOAD_2Nx8(dvecInData21, vaInData, pdvecIn2, 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + MORPH_OP_LOAD_2Nx8(dvecInData22, vaInData, pdvecIn2, inDataPitch1 - \ + 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + + /* load 1 row of coeff for 1st output channel */ + IVP_LAV2NX8_XP(dvecCoeffData11, vaCoeffData1, pdvecCoeff1, coeffPitch1); + + /* load 1 row of coeff for 2nd output channel */ + IVP_LAV2NX8_XP(dvecCoeffData21, vaCoeffData2, pdvecCoeff2, coeffPitch1); + + /* multiples loaded input data with first four coeff */ + MORPH_OP_MUL4TA(dacc11, dvecInData12, dvecInData11, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \ + 0)); + MORPH_OP_MUL4TA(dacc21, dvecInData12, dvecInData11, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \ + 0)); + + MORPH_OP_MUL4TA(dacc12, dvecInData22, dvecInData21, IVP_EXTRN_2X32 \ + 
(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \ + 0)); + MORPH_OP_MUL4TA(dacc22, dvecInData22, dvecInData21, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \ + 0)); + } /* for (ky = 0; ky < kSizeY; ky++)*/ + + /* Pack, Output Scale, Output Shift and clamping */ + xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L; + xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H; +#if DILATED_VQ_CONV == VQ_TRUE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc11, packShiftAccU, \ + pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc12, packShiftAccU, \ + pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc21, packShiftAccU, \ + pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc22, packShiftAccU, \ + pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag); +#elif DILATED_VQ_CONV == VQ_FALSE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc11, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc12, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc21, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc22, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); +#endif + /* variable store count */ + varLen = XT_MIN(outW - x, vectorizationWidth); + + /* Storing the first row , first depth output */ + pdvecOut = (xb_vec2Nx8 *) (pOutput); + valign vaOutData = IVP_ZALIGN(); + IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * varLen); + IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * \ + 2 * (varLen - XCHAL_IVPN_SIMD_WIDTH)); + 
IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the first row , 2nd depth output */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + enable2ndCh * outDataPitch2 * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * enable2ndCh * varLen); + IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * \ + enable2ndCh * 2 * (varLen - XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the 2nd row , 1st depth output */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + enable2ndRow * outDataPitch1 * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * enable2ndRow * varLen); + IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * \ + enable2ndRow * 2 * (varLen - XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the 2nd row , 2nd depth output */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + (enable2ndCh * outDataPitch2 + \ + enable2ndRow * outDataPitch1) * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * \ + enable2ndRow * enable2ndCh * varLen); + IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * \ + enable2ndRow * enable2ndCh * 2 * (varLen - XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + pOutput += 2 * outDataPitch2 * bytesPerPixel; + pCoeff += 2 * coeffPitch3; + } /* end of (outCh = 0; outCh < numOutCh; outCh += 2)*/ + } /* end of for (y = 0; y < outH; y += 2)*/ + } /* end of for (x = 0; x < outW; x += vectorizationWidth)*/ + } +} + +/****************************************************************************************** +* xaiConvolved(VQ)3D_S_MxNj1d1I8S8IX_MOW_WHD +* ***************************************************************************************/ +/******************************************************************************/ +/* Description : P6 optimized generic implementation for MxN 3D convolution. 
*/ +/* Based on MORPH pre-processor specifiers, code implementation */ +/* is generated during preprocessing stage. This method can be */ +/* used to generate MxN 3D dilated convolution function and MxN */ +/* 3D VQ dilated convolution function for U8 bit and S8 bit */ +/* input data with input stride equal to 1 */ +/* Inputs : Input Data Tile, Coeff Data Tile, Bias Array, */ +/* Output scale array, CNN convolution params structure */ +/* Outputs : XI Error Code */ +/* InOuts : Output Tile */ +/* Assumptions : CoeffData is S8 */ +/* biasArray is signed 32b, value not exceeding signed 24b */ +/* Output scale array is U16 */ +/* OutData is S8 / U8 / S16 */ +/* Kernel Size is MxNxDxN */ +/* Input and Output are in WHD format */ +/* Coeff is in WHDN format */ +/******************************************************************************/ + +/****************** xaiConvolvedVQ3D_S_MxNj1d1_S8S8IX_MOW_WHD ******************/ +/****************** xaiConvolvedVQ3D_S_MxNj1d1_U8S8IX_MOW_WHD ******************/ +/******************* xaiConvolved3D_S_MxNj1d1_S8S8IX_MOW_WHD *******************/ +/******************* xaiConvolved3D_S_MxNj1d1_U8S8IX_MOW_WHD *******************/ + +XAI_ERR_TYPE MAKE_NAME(MAKE_NAME_VQ(xaiConvolved, 3D_S_MxNj1d1), S8IX_MOW_WHD) MAKE_ARGUMENTS(inTile, coeffTile, biasArray, outTile, param) +{ + /* Error Checks */ + XAI_ERROR_CHECKS() + { + MORPH_IDT_CHECK(inTile); + XAI_CHECK_CONV_OUTPUT_TILE3D(outTile); + XAI_CHECK_TILE4D_S8(coeffTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile); + XAI_CHECK_TILE4D_IN_DRAM_BOUNDARY(coeffTile); + XAI_CHECK_POINTER(param); + XAI_CHECK_ARRAY_S32(biasArray); + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTile); + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(coeffTile, outTile); + XAI_CHECK_ERROR(XAI_TILE4D_GET_DIM1(coeffTile) <= 16, XAI_ERR_KSIZE, \ + "Kernel width = %d, which should be less than or equal to 16", \ + XAI_TILE4D_GET_DIM1(coeffTile)); + 
XAI_CHECK_ERROR(XAI_TILE4D_GET_DIM2(coeffTile) <= 16, XAI_ERR_KSIZE, \ + "\nKernel height = %d, which should be less than or equal to 16", \ + XAI_TILE4D_GET_DIM2(coeffTile)); + XAI_CHECK_DILATION(param, 1); + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_DILATIONX(param) == XAI_CNN_CONV_GET_DILATIONY(param), \ + XAI_ERR_BADARG, "\nDilation along width = %hhu and height = %hhu\nDilation along width and height should be equal", \ + XAI_CNN_CONV_GET_STRIDEX(param), XAI_CNN_CONV_GET_STRIDEY(param)); + XAI_CHECK_EDGES_MOW_WHD(inTile, coeffTile, param); + XAI_CHECK_TILE3D_DATA_ORDER(inTile, XAI_WHD); + XAI_CHECK_TILE3D_DATA_ORDER(outTile, XAI_WHD); + XAI_CHECK_TILE4D_DATA_ORDER(coeffTile, XAI_WHDN); + XAI_CHECK_STRIDE(param, 1); + XAI_CHECK_ERROR((XAI_CNN_CONV_GET_STRIDEX(param) == XAI_CNN_CONV_GET_STRIDEY(param)), \ + XAI_ERR_BADARG, "\nStride along width = %hhu and height = %hhu\nStride along width and height should be equal", \ + XAI_CNN_CONV_GET_STRIDEX(param), XAI_CNN_CONV_GET_STRIDEY(param)); + XAI_CHECK_CONSISTENCY_MOW_WHD(inTile, coeffTile, biasArray, outTile, param); + XAI_CHECK_COEFFTILE_CONTIGUOUS(coeffTile, param); + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_ACCUM_SHIFT(param) < 24, \ + XAI_ERR_NORM, "\nThe accumulator shift = %hhu, value should be less than 24", \ + XAI_CNN_CONV_GET_ACCUM_SHIFT(param)); + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_OUTPUT_SHIFT(param) < 32, \ + XAI_ERR_NORM, "\nThe output shift = %hhu, value should be less than 32", \ + XAI_CNN_CONV_GET_OUTPUT_SHIFT(param)); + XAI_CHECK_CONV_RELU_LIMITS_IX(param, outTile); +#if DILATED_VQ_CONV == VQ_TRUE + XAI_CHECK_ARRAY_U16(outputScaleArray); + XAI_CHECK_ERROR(XAI_ARRAY_GET_WIDTH(outputScaleArray) >= XAI_TILE4D_GET_DIM4(coeffTile), \ + XAI_ERR_DATASIZE, "\nWidth of Output Scale Array = %d, Number of Kernels = %d\nWidth of Output Scale Array should be greater than or equal to Number of Kernels", \ + XAI_ARRAY_GET_WIDTH(outputScaleArray), XAI_TILE4D_GET_DIM4(coeffTile)); + XAI_CHECK_ERROR((((uintptr_t) 
(XAI_ARRAY_GET_DATA_PTR(outputScaleArray)) & \ + 0x1) == 0), XAI_ERR_NORM, "The output scale array is not aligned to 2 byte boundary"); +#endif + } +#if DILATED_VQ_CONV == VQ_FALSE + if (XAI_CNN_CONV_GET_OUTPUT_SCALE(param) == 0) + { + int32_t fillValue; + int32_t reluFlag = XAI_CNN_CONV_GET_FLAG_RELU(param); + fillValue = reluFlag ? (CLAMP(0, XAI_CNN_CONV_GET_RELU_MIN(param), XAI_CNN_CONV_GET_RELU_MAX(param))) : 0; + return(xaiFillTile3D(outTile, fillValue, 0)); + } +#endif + if (XAI_TILE3D_GET_DIM3(inTile) == 3) + { + MAKE_NAME(MAKE_NAME_VQ(convolved, 3D_S_MxNj1d1), S8IX_MOW_WHD_DEPTH3) MAKE_PARAMS(inTile, coeffTile, biasArray, outTile, param); + return(XAI_ERROR_STATUS()); + } + + /* Getting parameters from the tile structures */ + const int32_t inW = XAI_TILE3D_GET_DIM1(inTile) + \ + XAI_TILE3D_GET_DIM1_EDGE1(inTile) + XAI_TILE3D_GET_DIM1_EDGE2(inTile); + const int32_t outW = XAI_TILE3D_GET_DIM1(outTile); + const int32_t outH = XAI_TILE3D_GET_DIM2(outTile); + const int32_t numInCh = XAI_TILE3D_GET_DIM3(inTile); + const int32_t numOutCh = XAI_TILE3D_GET_DIM3(outTile); + + /* Kernel Size (WHDN)*/ + const int32_t kSizeX = XAI_TILE4D_GET_DIM1(coeffTile); + const int32_t kSizeY = XAI_TILE4D_GET_DIM2(coeffTile); + + /* CNN convolution parameters */ + const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param); + const uint8_t outShiftU = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param); + const uint8_t enableReLu = XAI_CNN_CONV_GET_FLAG_RELU(param); + const uint8_t leftEdgeFlag = XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param); + const uint8_t topEdgeFlag = XAI_CNN_CONV_GET_FLAG_TOPEDGE(param); + + /* Pitches of Coefficient Data (WHDN) */ + const int32_t coeffPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTile); + const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile); + + /* Pitches of Input Data (WHD) in dim1 and dim2 */ + const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile); + const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile); + + /* Pitch of Output Data 
(WHD) in dim1 and dim2 */ + const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile); + const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile); + + /* Data Pointers of input, output, coefficient and bias data */ + MORPH_IDT_SCALAR* pInData = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(inTile); + int8_t* pOutData = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile); + int32_t* pBiasData = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray); + int8_t* pCoeffData = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile); +#if DILATED_VQ_CONV == VQ_TRUE + uint16_t* restrict pOutScaleData = (uint16_t *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray); +#elif DILATED_VQ_CONV == VQ_FALSE + const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param); +#endif + uint8_t leftEdge, topEdge; + if ((kSizeX % 2) != 0) + { + leftEdge = kSizeX / 2; + } + else + { + leftEdge = leftEdgeFlag ? (kSizeX / 2) : ((kSizeX / 2) - 1); + } + + if ((kSizeY % 2) != 0) + { + topEdge = kSizeY / 2; + } + else + { + topEdge = topEdgeFlag ? (kSizeY / 2) : ((kSizeY / 2) - 1); + } + + /* Move pointer to the start of the active data (including edge) */ + pInData = &pInData[-(topEdge * inDataPitch1 + leftEdge)]; + + /* Setting the limits for output data according to ReLu Flag and outTileType */ + int32_t minLim, maxLim; + if (enableReLu) + { + minLim = XAI_CNN_CONV_GET_RELU_MIN(param); + maxLim = XAI_CNN_CONV_GET_RELU_MAX(param); + } + else + { + minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \ + SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0); + maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \ + : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX); + } + const int8_t typeFlag = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 
1 : 0; + const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile); + + MORPH_IDT_2Nx8 * restrict pdvecIn1; + MORPH_IDT_2Nx8 * restrict pdvecIn2; + xb_vec2Nx8* restrict pdvecOut; + xb_vec2Nx8* restrict pdvecCoeff1, * restrict pdvecCoeff2; + + /* Variable Declarations */ + int32_t inCh, outCh, x, y, ky; + const int32_t vectorizationWidth = 2 * XCHAL_IVPN_SIMD_WIDTH; + int32_t varLen; + + if (kSizeX > 12) + { + /* loop across output channels is unrolled twice + * to produce two output channels in 1 iteration. + * Also loop across output height by 2 , thereby + * producing 4 output vectors simultaneously. + */ + for (x = 0; x < outW; x += vectorizationWidth) /* Loop across Output width */ + { + /* out of bound flag */ + int32_t flag = XT_SALT(2 * XCHAL_IVPN_SIMD_WIDTH, inW - x); + + for (y = 0; y < outH; y += 2) /* Loop across Output height */ + { + /* handles odd output row */ + int32_t enable2ndRow = XT_SALT(y, outH - 1); + + /* initialize output data pointer */ + int8_t *pOutput = &pOutData[(y * outDataPitch1 + x) * bytesPerPixel]; + + /* initialize input data pointer */ + MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * (y) + (x)]; + + /* initialize coeff and bias data pointer*/ + int8_t *pCoeff = &pCoeffData[0]; + int32_t *pBias = &pBiasData[0]; + + for (outCh = 0; outCh < numOutCh; outCh += 2) /* Loop across Output depth */ + { + /* handles odd output channel */ + int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1); + + xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4 * enable2ndCh); + xb_vecN_2x32v hvecBias2; IVP_LSRN_2X32_XP(hvecBias2, pBias, 4); + + /* wide vectors(accumulators) initialized with bias */ + xb_vec2Nx24 dacc11, dacc12, dacc21, dacc22; + + dacc12 = dacc11 = IVP_CVT24UNX32L(hvecBias1, hvecBias1); + IVP_CVT24UNX32H(dacc11, hvecBias1, hvecBias1); + IVP_CVT24UNX32H(dacc12, hvecBias1, hvecBias1); + + dacc22 = dacc21 = IVP_CVT24UNX32L(hvecBias2, hvecBias2); + IVP_CVT24UNX32H(dacc21, hvecBias2, hvecBias2); + 
IVP_CVT24UNX32H(dacc22, hvecBias2, hvecBias2); + + /* priming of coeff load is done outside the innermost loop*/ + pdvecCoeff1 = (xb_vec2Nx8 *) (pCoeff); + valign vaCoeffData1; vaCoeffData1 = IVP_LA2NX8_PP(pdvecCoeff1); + + pdvecCoeff2 = (xb_vec2Nx8 *) (pCoeff + coeffPitch3 * enable2ndCh); + valign vaCoeffData2; vaCoeffData2 = IVP_LA2NX8_PP(pdvecCoeff2); + + for (inCh = 0; inCh < numInCh; inCh++) /* Loop across input channels */ + { + /* variable declarations for input and coeff vectors */ + + xb_vec2Nx8 dvecCoeffData11; + xb_vec2Nx8 dvecCoeffData21; + + /* dvecInData11 refers to 1st input row, first 64(or lesser) elements + * and dvecInData12 refers to next few left out elements of the same row + * required to compute one 64 way output vector(To compute one 64 way + * output vector, we require 64 + edge1 + edge2 number of input elements) + */ + xb_vec2Nx8 dvecInData11, dvecInData12; + xb_vec2Nx8 dvecInData21, dvecInData22; + + pdvecIn1 = (MORPH_IDT_2Nx8 *) (pInput + inCh * inDataPitch2); + pdvecIn2 = (MORPH_IDT_2Nx8 *) (pInput + inCh * inDataPitch2 + inDataPitch1 * enable2ndRow); + + for (ky = 0; ky < kSizeY; ky++) /* Loop across kernel height */ + { + /* loads 1st input row */ + valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1); + MORPH_OP_LOAD_2Nx8(dvecInData11, vaInData, pdvecIn1, 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + MORPH_OP_LOAD_2Nx8(dvecInData12, vaInData, pdvecIn1, inDataPitch1 - 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + + /* loads 2nd input row */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn2); + MORPH_OP_LOAD_2Nx8(dvecInData21, vaInData, pdvecIn2, 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + MORPH_OP_LOAD_2Nx8(dvecInData22, vaInData, pdvecIn2, inDataPitch1 - 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + + /* load 1 row of coeff for 1st output channel */ + IVP_LAV2NX8_XP(dvecCoeffData11, vaCoeffData1, pdvecCoeff1, coeffPitch1); + + /* load 1 row of coeff for 2nd output channel */ + IVP_LAV2NX8_XP(dvecCoeffData21, vaCoeffData2, pdvecCoeff2, coeffPitch1); + + /* multiples loaded input 
data with first four coeff */ + MORPH_OP_MUL4TA(dacc11, dvecInData12, dvecInData11, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \ + 0)); + MORPH_OP_MUL4TA(dacc21, dvecInData12, dvecInData11, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \ + 0)); + + MORPH_OP_MUL4TA(dacc12, dvecInData22, dvecInData21, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \ + 0)); + MORPH_OP_MUL4TA(dacc22, dvecInData22, dvecInData21, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \ + 0)); + + /* right rotate the input vectors by 4 + * in order to multiply with next column of + * coeff in the next iteration + */ + dvecInData11 = IVP_SEL2NX8I(dvecInData12, dvecInData11, IVP_SELI_8B_ROTATE_RIGHT_4); + dvecInData12 = IVP_SEL2NX8I((xb_vec2Nx8) 0, dvecInData12, IVP_SELI_8B_ROTATE_RIGHT_4); + dvecInData21 = IVP_SEL2NX8I(dvecInData22, dvecInData21, IVP_SELI_8B_ROTATE_RIGHT_4); + dvecInData22 = IVP_SEL2NX8I((xb_vec2Nx8) 0, dvecInData22, IVP_SELI_8B_ROTATE_RIGHT_4); + + /* multiples input data with next four coeffs from the same row */ + MORPH_OP_MUL4TA(dacc11, dvecInData12, dvecInData11, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \ + 1)); + MORPH_OP_MUL4TA(dacc21, dvecInData12, dvecInData11, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \ + 1)); + + MORPH_OP_MUL4TA(dacc12, dvecInData22, dvecInData21, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \ + 1)); + MORPH_OP_MUL4TA(dacc22, dvecInData22, dvecInData21, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \ + 1)); + + /* right rotate the input vectors by 4 + * in order to multiply with next column of + * coeff in the next iteration + */ + dvecInData11 = IVP_SEL2NX8I(dvecInData12, dvecInData11, IVP_SELI_8B_ROTATE_RIGHT_4); + dvecInData12 = 
IVP_SEL2NX8I((xb_vec2Nx8) 0, dvecInData12, IVP_SELI_8B_ROTATE_RIGHT_4); + dvecInData21 = IVP_SEL2NX8I(dvecInData22, dvecInData21, IVP_SELI_8B_ROTATE_RIGHT_4); + dvecInData22 = IVP_SEL2NX8I((xb_vec2Nx8) 0, dvecInData22, IVP_SELI_8B_ROTATE_RIGHT_4); + + /* multiples input data with next four coeffs from the same row */ + MORPH_OP_MUL4TA(dacc11, dvecInData12, dvecInData11, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \ + 2)); + MORPH_OP_MUL4TA(dacc21, dvecInData12, dvecInData11, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \ + 2)); + + MORPH_OP_MUL4TA(dacc12, dvecInData22, dvecInData21, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \ + 2)); + MORPH_OP_MUL4TA(dacc22, dvecInData22, dvecInData21, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \ + 2)); + + /* right rotate the input vectors by 4 + * in order to multiply with next column of + * coeff in the next iteration + */ + dvecInData11 = IVP_SEL2NX8I(dvecInData12, dvecInData11, IVP_SELI_8B_ROTATE_RIGHT_4); + dvecInData12 = IVP_SEL2NX8I((xb_vec2Nx8) 0, dvecInData12, IVP_SELI_8B_ROTATE_RIGHT_4); + dvecInData21 = IVP_SEL2NX8I(dvecInData22, dvecInData21, IVP_SELI_8B_ROTATE_RIGHT_4); + dvecInData22 = IVP_SEL2NX8I((xb_vec2Nx8) 0, dvecInData22, IVP_SELI_8B_ROTATE_RIGHT_4); + + /* multiples input data with next four coeffs from the same row */ + MORPH_OP_MUL4TA(dacc11, dvecInData12, dvecInData11, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \ + 3)); + MORPH_OP_MUL4TA(dacc21, dvecInData12, dvecInData11, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \ + 3)); + + MORPH_OP_MUL4TA(dacc12, dvecInData22, dvecInData21, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \ + 3)); + MORPH_OP_MUL4TA(dacc22, dvecInData22, dvecInData21, IVP_EXTRN_2X32 \ + 
(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \ + 3)); + } /* end of for (ky = 0; ky < kSizeY; ky++)*/ + } /* end of for (inCh = 0; inCh < numInCh; inCh++)*/ + + /* Pack, Output Scale, Output Shift and clamping */ + xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L; + xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H; +#if DILATED_VQ_CONV == VQ_TRUE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc11, packShiftAccU, \ + pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc12, packShiftAccU, \ + pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc21, packShiftAccU, \ + pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc22, packShiftAccU, \ + pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag); +#elif DILATED_VQ_CONV == VQ_FALSE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc11, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc12, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc21, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc22, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); +#endif + /* variable store count */ + varLen = XT_MIN(outW - x, vectorizationWidth); + + /* Storing the first row , first depth output */ + pdvecOut = (xb_vec2Nx8 *) (pOutput); + valign vaOutData = IVP_ZALIGN(); + IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * varLen); + IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * \ + 2 * (varLen - XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the first row , 2nd depth output 
*/ + pdvecOut = (xb_vec2Nx8 *) (pOutput + enable2ndCh * outDataPitch2 * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * enable2ndCh * varLen); + IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * \ + enable2ndCh * 2 * (varLen - XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the 2nd row , 1st depth output */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + enable2ndRow * outDataPitch1 * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * enable2ndRow * varLen); + IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * \ + enable2ndRow * 2 * (varLen - XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the 2nd row , 2nd depth output */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + (enable2ndCh * outDataPitch2 + \ + enable2ndRow * outDataPitch1) * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * \ + enable2ndRow * enable2ndCh * varLen); + IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * \ + enable2ndRow * enable2ndCh * 2 * (varLen - XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + pOutput += 2 * outDataPitch2 * bytesPerPixel; + pCoeff += 2 * coeffPitch3; + } /* end of (outCh = 0; outCh < numOutCh; outCh += 2)*/ + } /* end of for (y = 0; y < outH; y += 2)*/ + } /* end of for (x = 0; x < outW; x += vectorizationWidth)*/ + } + else if (kSizeX > 8) + { + /* loop across output channels is unrolled twice + * to produce two output channels in 1 iteration. + * Also loop across output height by 2 , thereby + * producing 4 output vectors simultaneously. 
+ */ + for (x = 0; x < outW; x += vectorizationWidth) /* Loop across Output width */ + { + /* out of bound flag */ + int32_t flag = XT_SALT(2 * XCHAL_IVPN_SIMD_WIDTH, inW - x); + + for (y = 0; y < outH; y += 2) /* Loop across Output height */ + { + /* handles odd output row */ + int32_t enable2ndRow = XT_SALT(y, outH - 1); + /* initialize output data pointer */ + int8_t *pOutput = &pOutData[(y * outDataPitch1 + x) * bytesPerPixel]; + + /* initialize input data pointer */ + MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * (y) + (x)]; + + /* initialize coeff and bias data pointer*/ + int8_t *pCoeff = &pCoeffData[0]; + int32_t *pBias = &pBiasData[0]; + + for (outCh = 0; outCh < numOutCh; outCh += 2) /* Loop across Output depth */ + { + /* handles odd output channel*/ + int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1); + + xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4 * enable2ndCh); + xb_vecN_2x32v hvecBias2; IVP_LSRN_2X32_XP(hvecBias2, pBias, 4); + + /* wide vectors(accumulators) initialized with bias */ + xb_vec2Nx24 dacc11, dacc12, dacc21, dacc22; + + dacc12 = dacc11 = IVP_CVT24UNX32L(hvecBias1, hvecBias1); + IVP_CVT24UNX32H(dacc11, hvecBias1, hvecBias1); + IVP_CVT24UNX32H(dacc12, hvecBias1, hvecBias1); + + dacc22 = dacc21 = IVP_CVT24UNX32L(hvecBias2, hvecBias2); + IVP_CVT24UNX32H(dacc21, hvecBias2, hvecBias2); + IVP_CVT24UNX32H(dacc22, hvecBias2, hvecBias2); + + /* priming of coeff load is done outside the innermost loop*/ + pdvecCoeff1 = (xb_vec2Nx8 *) (pCoeff); + valign vaCoeffData1; vaCoeffData1 = IVP_LA2NX8_PP(pdvecCoeff1); + + pdvecCoeff2 = (xb_vec2Nx8 *) (pCoeff + coeffPitch3 * enable2ndCh); + valign vaCoeffData2; vaCoeffData2 = IVP_LA2NX8_PP(pdvecCoeff2); + + for (inCh = 0; inCh < numInCh; inCh++) /* Loop across input channels */ + { + /* variable declarations for input and coeff vectors */ + + xb_vec2Nx8 dvecCoeffData11; + xb_vec2Nx8 dvecCoeffData21; + + /* dvecInData11 refers to 1st input row, first 64(or lesser) elements + * and 
dvecInData12 refers to next few left out elements of the same row + * required to compute one 64 way output vector(To compute one 64 way + * output vector, we require 64 + edge1 + edge2 number of input elements) + */ + xb_vec2Nx8 dvecInData11, dvecInData12; + xb_vec2Nx8 dvecInData21, dvecInData22; + + pdvecIn1 = (MORPH_IDT_2Nx8 *) (pInput + inCh * inDataPitch2); + pdvecIn2 = (MORPH_IDT_2Nx8 *) (pInput + inCh * inDataPitch2 + inDataPitch1 * enable2ndRow); + + for (ky = 0; ky < kSizeY; ky++) /* Loop across kernel height */ + { + /* loads 1st input row */ + valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1); + MORPH_OP_LOAD_2Nx8(dvecInData11, vaInData, pdvecIn1, 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + MORPH_OP_LOAD_2Nx8(dvecInData12, vaInData, pdvecIn1, inDataPitch1 - 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + + /* loads 2nd input row */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn2); + MORPH_OP_LOAD_2Nx8(dvecInData21, vaInData, pdvecIn2, 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + MORPH_OP_LOAD_2Nx8(dvecInData22, vaInData, pdvecIn2, inDataPitch1 - 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + + /* load 1 row of coeff for 1st output channel */ + IVP_LAV2NX8_XP(dvecCoeffData11, vaCoeffData1, pdvecCoeff1, coeffPitch1); + + /* load 1 row of coeff for 2nd output channel */ + IVP_LAV2NX8_XP(dvecCoeffData21, vaCoeffData2, pdvecCoeff2, coeffPitch1); + + /* multiples loaded input data with first four coeff */ + MORPH_OP_MUL4TA(dacc11, dvecInData12, dvecInData11, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \ + 0)); + MORPH_OP_MUL4TA(dacc21, dvecInData12, dvecInData11, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \ + 0)); + + MORPH_OP_MUL4TA(dacc12, dvecInData22, dvecInData21, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \ + 0)); + MORPH_OP_MUL4TA(dacc22, dvecInData22, dvecInData21, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \ + 0)); + + /* right rotate the 
input vectors by 4 + * in order to multiply with next column of + * coeff in the next iteration + */ + dvecInData11 = IVP_SEL2NX8I(dvecInData12, dvecInData11, IVP_SELI_8B_ROTATE_RIGHT_4); + dvecInData12 = IVP_SEL2NX8I((xb_vec2Nx8) 0, dvecInData12, IVP_SELI_8B_ROTATE_RIGHT_4); + dvecInData21 = IVP_SEL2NX8I(dvecInData22, dvecInData21, IVP_SELI_8B_ROTATE_RIGHT_4); + dvecInData22 = IVP_SEL2NX8I((xb_vec2Nx8) 0, dvecInData22, IVP_SELI_8B_ROTATE_RIGHT_4); + + /* multiples input data with next four coeffs from the same row */ + MORPH_OP_MUL4TA(dacc11, dvecInData12, dvecInData11, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \ + 1)); + MORPH_OP_MUL4TA(dacc21, dvecInData12, dvecInData11, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \ + 1)); + + MORPH_OP_MUL4TA(dacc12, dvecInData22, dvecInData21, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \ + 1)); + MORPH_OP_MUL4TA(dacc22, dvecInData22, dvecInData21, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \ + 1)); + + /* right rotate the input vectors by 4 + * in order to multiply with next column of + * coeff in the next iteration + */ + dvecInData11 = IVP_SEL2NX8I(dvecInData12, dvecInData11, IVP_SELI_8B_ROTATE_RIGHT_4); + dvecInData12 = IVP_SEL2NX8I((xb_vec2Nx8) 0, dvecInData12, IVP_SELI_8B_ROTATE_RIGHT_4); + dvecInData21 = IVP_SEL2NX8I(dvecInData22, dvecInData21, IVP_SELI_8B_ROTATE_RIGHT_4); + dvecInData22 = IVP_SEL2NX8I((xb_vec2Nx8) 0, dvecInData22, IVP_SELI_8B_ROTATE_RIGHT_4); + + /* multiples input data with next four coeffs from the same row */ + MORPH_OP_MUL4TA(dacc11, dvecInData12, dvecInData11, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \ + 2)); + MORPH_OP_MUL4TA(dacc21, dvecInData12, dvecInData11, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \ + 2)); + + MORPH_OP_MUL4TA(dacc12, dvecInData22, 
dvecInData21, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \ + 2)); + MORPH_OP_MUL4TA(dacc22, dvecInData22, dvecInData21, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \ + 2)); + } /* end of for (ky = 0; ky < kSizeY; ky++)*/ + } /* end of for (inCh = 0; inCh < numInCh; inCh++)*/ + + /* Pack, Output Scale, Output Shift and clamping */ + xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L; + xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H; +#if DILATED_VQ_CONV == VQ_TRUE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc11, packShiftAccU, \ + pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc12, packShiftAccU, \ + pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc21, packShiftAccU, \ + pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc22, packShiftAccU, \ + pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag); +#elif DILATED_VQ_CONV == VQ_FALSE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc11, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc12, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc21, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc22, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); +#endif + /* variable store count */ + varLen = XT_MIN(outW - x, vectorizationWidth); + + /* Storing the first row , first depth output */ + pdvecOut = (xb_vec2Nx8 *) (pOutput); + valign vaOutData = IVP_ZALIGN(); + IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * varLen); + 
IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * \ + 2 * (varLen - XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the first row , 2nd depth output */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + enable2ndCh * outDataPitch2 * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * enable2ndCh * varLen); + IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * \ + enable2ndCh * 2 * (varLen - XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the 2nd row , 1st depth output */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + enable2ndRow * outDataPitch1 * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * enable2ndRow * varLen); + IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * \ + enable2ndRow * 2 * (varLen - XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the 2nd row , 2nd depth output */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + (enable2ndCh * outDataPitch2 + \ + enable2ndRow * outDataPitch1) * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * \ + enable2ndRow * enable2ndCh * varLen); + IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * \ + enable2ndRow * enable2ndCh * 2 * (varLen - XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + pOutput += 2 * outDataPitch2 * bytesPerPixel; + pCoeff += 2 * coeffPitch3; + } /* end of (outCh = 0; outCh < numOutCh; outCh += 2)*/ + } /* end of for (y = 0; y < outH; y += 2)*/ + } /* end of for (x = 0; x < outW; x += vectorizationWidth)*/ + } + else if (kSizeX > 4) + { + /* loop across output channels is unrolled twice + * to produce two output channels in 1 iteration. + * Also loop across output height by 2 , thereby + * producing 4 output vectors simultaneously. 
+ */ + for (x = 0; x < outW; x += vectorizationWidth) /* Loop across Output width */ + { + /* out of bound flag */ + int32_t flag = XT_SALT(2 * XCHAL_IVPN_SIMD_WIDTH, inW - x); + + for (y = 0; y < outH; y += 2) /* Loop across Output height */ + { + /* handles odd output row */ + int32_t enable2ndRow = XT_SALT(y, outH - 1); + + /* initialize output data pointer */ + int8_t *pOutput = &pOutData[(y * outDataPitch1 + x) * bytesPerPixel]; + + /* initialize input data pointer */ + MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * (y) + (x)]; + + /* initialize coeff and bias data pointer*/ + int8_t *pCoeff = &pCoeffData[0]; + int32_t *pBias = &pBiasData[0]; + + for (outCh = 0; outCh < numOutCh; outCh += 2) /* Loop across Output depth */ + { + /* handles odd output channel */ + int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1); + + xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4 * enable2ndCh); + xb_vecN_2x32v hvecBias2; IVP_LSRN_2X32_XP(hvecBias2, pBias, 4); + + /* wide vectors(accumulators) initialized with bias */ + xb_vec2Nx24 dacc11, dacc12, dacc21, dacc22; + + dacc12 = dacc11 = IVP_CVT24UNX32L(hvecBias1, hvecBias1); + IVP_CVT24UNX32H(dacc11, hvecBias1, hvecBias1); + IVP_CVT24UNX32H(dacc12, hvecBias1, hvecBias1); + + dacc22 = dacc21 = IVP_CVT24UNX32L(hvecBias2, hvecBias2); + IVP_CVT24UNX32H(dacc21, hvecBias2, hvecBias2); + IVP_CVT24UNX32H(dacc22, hvecBias2, hvecBias2); + + /* priming of coeff load is done outside the innermost loop*/ + pdvecCoeff1 = (xb_vec2Nx8 *) (pCoeff); + valign vaCoeffData1; vaCoeffData1 = IVP_LA2NX8_PP(pdvecCoeff1); + + pdvecCoeff2 = (xb_vec2Nx8 *) (pCoeff + coeffPitch3 * enable2ndCh); + valign vaCoeffData2; vaCoeffData2 = IVP_LA2NX8_PP(pdvecCoeff2); + + for (inCh = 0; inCh < numInCh; inCh++) /* Loop across input channels */ + { + /* variable declarations for input and coeff vectors */ + + xb_vec2Nx8 dvecCoeffData11; + xb_vec2Nx8 dvecCoeffData21; + + /* dvecInData11 refers to 1st input row, first 64(or lesser) elements + * and 
dvecInData12 refers to next few left out elements of the same row + * required to compute one 64 way output vector(To compute one 64 way + * output vector, we require 64 + edge1 + edge2 number of input elements) + */ + xb_vec2Nx8 dvecInData11, dvecInData12; + xb_vec2Nx8 dvecInData21, dvecInData22; + + pdvecIn1 = (MORPH_IDT_2Nx8 *) (pInput + inCh * inDataPitch2); + pdvecIn2 = (MORPH_IDT_2Nx8 *) (pInput + inCh * inDataPitch2 + inDataPitch1 * enable2ndRow); + + for (ky = 0; ky < kSizeY; ky++) /* Loop across kernel height */ + { + /* loads 1st input row */ + valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1); + MORPH_OP_LOAD_2Nx8(dvecInData11, vaInData, pdvecIn1, 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + MORPH_OP_LOAD_2Nx8(dvecInData12, vaInData, pdvecIn1, inDataPitch1 - 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + + /* loads 2nd input row */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn2); + MORPH_OP_LOAD_2Nx8(dvecInData21, vaInData, pdvecIn2, 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + MORPH_OP_LOAD_2Nx8(dvecInData22, vaInData, pdvecIn2, inDataPitch1 - 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + + /* load 1 row of coeff for 1st output channel */ + IVP_LAV2NX8_XP(dvecCoeffData11, vaCoeffData1, pdvecCoeff1, coeffPitch1); + + /* load 1 row of coeff for 2nd output channel */ + IVP_LAV2NX8_XP(dvecCoeffData21, vaCoeffData2, pdvecCoeff2, coeffPitch1); + + /* multiples loaded input data with first four coeff */ + MORPH_OP_MUL4TA(dacc11, dvecInData12, dvecInData11, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \ + 0)); + MORPH_OP_MUL4TA(dacc21, dvecInData12, dvecInData11, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \ + 0)); + + MORPH_OP_MUL4TA(dacc12, dvecInData22, dvecInData21, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \ + 0)); + MORPH_OP_MUL4TA(dacc22, dvecInData22, dvecInData21, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \ + 0)); + + /* right rotate the 
input vectors by 4 + * in order to multiply with next column of + * coeff in the next iteration + */ + dvecInData11 = IVP_SEL2NX8I(dvecInData12, dvecInData11, IVP_SELI_8B_ROTATE_RIGHT_4); + dvecInData12 = IVP_SEL2NX8I((xb_vec2Nx8) 0, dvecInData12, IVP_SELI_8B_ROTATE_RIGHT_4); + dvecInData21 = IVP_SEL2NX8I(dvecInData22, dvecInData21, IVP_SELI_8B_ROTATE_RIGHT_4); + dvecInData22 = IVP_SEL2NX8I((xb_vec2Nx8) 0, dvecInData22, IVP_SELI_8B_ROTATE_RIGHT_4); + + /* multiples input data with next four coeffs from the same row */ + MORPH_OP_MUL4TA(dacc11, dvecInData12, dvecInData11, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \ + 1)); + MORPH_OP_MUL4TA(dacc21, dvecInData12, dvecInData11, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \ + 1)); + + MORPH_OP_MUL4TA(dacc12, dvecInData22, dvecInData21, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \ + 1)); + MORPH_OP_MUL4TA(dacc22, dvecInData22, dvecInData21, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \ + 1)); + } /* end of for (ky = 0; ky < kSizeY; ky++)*/ + } /* end of for (inCh = 0; inCh < numInCh; inCh++)*/ + + /* Pack, Output Scale, Output Shift and clamping */ + xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L; + xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H; +#if DILATED_VQ_CONV == VQ_TRUE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc11, packShiftAccU, \ + pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc12, packShiftAccU, \ + pOutScaleData[outCh ], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc21, packShiftAccU, \ + pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc22, packShiftAccU, \ + pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, 
typeFlag); +#elif DILATED_VQ_CONV == VQ_FALSE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc11, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc12, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc21, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc22, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); +#endif + /* variable store count */ + varLen = XT_MIN(outW - x, vectorizationWidth); + + /* Storing the first row , first depth output */ + pdvecOut = (xb_vec2Nx8 *) (pOutput); + valign vaOutData = IVP_ZALIGN(); + IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * varLen); + IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * \ + 2 * (varLen - XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the first row , 2nd depth output */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + enable2ndCh * outDataPitch2 * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * enable2ndCh * varLen); + IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * \ + enable2ndCh * 2 * (varLen - XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the 2nd row , 1st depth output */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + enable2ndRow * outDataPitch1 * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * enable2ndRow * varLen); + IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * \ + enable2ndRow * 2 * (varLen - XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the 2nd row , 2nd depth output */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + (enable2ndCh * outDataPitch2 + \ + enable2ndRow * outDataPitch1) * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * \ + 
enable2ndRow * enable2ndCh * varLen); + IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * \ + enable2ndRow * enable2ndCh * 2 * (varLen - XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + pOutput += 2 * outDataPitch2 * bytesPerPixel; + pCoeff += 2 * coeffPitch3; + } /* end of (outCh = 0; outCh < numOutCh; outCh += 2)*/ + } /* end of for (y = 0; y < outH; y += 2)*/ + } /* end of for (x = 0; x < outW; x += vectorizationWidth)*/ + } + else + { + /* loop across output channels is unrolled twice + * to produce two output channels in 1 iteration. + * Also loop across output height by 2 , thereby + * producing 4 output vectors simultaneously. + */ + for (x = 0; x < outW; x += vectorizationWidth) /* Loop across Output width */ + { + /* out of bound flag */ + int32_t flag = XT_SALT(2 * XCHAL_IVPN_SIMD_WIDTH, inW - x); + + for (y = 0; y < outH; y += 2) /* Loop across Output height */ + { + /* handles odd output row */ + int32_t enable2ndRow = XT_SALT(y, outH - 1); + + /* initialize output data pointer */ + int8_t *pOutput = &pOutData[(y * outDataPitch1 + x) * bytesPerPixel]; + + /* initialize input data pointer */ + MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * (y) + (x)]; + + /* initialize coeff and bias data pointer*/ + int8_t *pCoeff = &pCoeffData[0]; + int32_t *pBias = &pBiasData[0]; + + for (outCh = 0; outCh < numOutCh; outCh += 2) /* Loop across Output depth */ + { + /* handles odd output channel */ + int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1); + + xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4 * enable2ndCh); + xb_vecN_2x32v hvecBias2; IVP_LSRN_2X32_XP(hvecBias2, pBias, 4); + + + /* wide vectors(accumulators) initialized with bias */ + xb_vec2Nx24 dacc11, dacc12, dacc21, dacc22; + + dacc12 = dacc11 = IVP_CVT24UNX32L(hvecBias1, hvecBias1); + IVP_CVT24UNX32H(dacc11, hvecBias1, hvecBias1); + IVP_CVT24UNX32H(dacc12, hvecBias1, hvecBias1); + + dacc22 = dacc21 = IVP_CVT24UNX32L(hvecBias2, hvecBias2); + 
IVP_CVT24UNX32H(dacc21, hvecBias2, hvecBias2); + IVP_CVT24UNX32H(dacc22, hvecBias2, hvecBias2); + + /* priming of coeff load is done outside the innermost loop*/ + pdvecCoeff1 = (xb_vec2Nx8 *) (pCoeff); + valign vaCoeffData1; vaCoeffData1 = IVP_LA2NX8_PP(pdvecCoeff1); + + pdvecCoeff2 = (xb_vec2Nx8 *) (pCoeff + coeffPitch3 * enable2ndCh); + valign vaCoeffData2; vaCoeffData2 = IVP_LA2NX8_PP(pdvecCoeff2); + + + for (inCh = 0; inCh < numInCh; inCh++) /* Loop across input channels */ + { + /* variable declarations for input and coeff vectors */ + + xb_vec2Nx8 dvecCoeffData11; + xb_vec2Nx8 dvecCoeffData21; + + /* dvecInData11 refers to 1st input row, first 64(or lesser) elements + * and dvecInData12 refers to next few left out elements of the same row + * required to compute one 64 way output vector(To compute one 64 way + * output vector, we require 64 + edge1 + edge2 number of input elements) + */ + xb_vec2Nx8 dvecInData11, dvecInData12; + xb_vec2Nx8 dvecInData21, dvecInData22; + + pdvecIn1 = (MORPH_IDT_2Nx8 *) (pInput + inCh * inDataPitch2); + pdvecIn2 = (MORPH_IDT_2Nx8 *) (pInput + inCh * inDataPitch2 + inDataPitch1 * enable2ndRow); + + for (ky = 0; ky < kSizeY; ky++) /* Loop across kernel height */ + { + /* loads 1st input row */ + valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1); + MORPH_OP_LOAD_2Nx8(dvecInData11, vaInData, pdvecIn1, 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + MORPH_OP_LOAD_2Nx8(dvecInData12, vaInData, pdvecIn1, inDataPitch1 - 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + + /* loads 2nd input row */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn2); + MORPH_OP_LOAD_2Nx8(dvecInData21, vaInData, pdvecIn2, 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + MORPH_OP_LOAD_2Nx8(dvecInData22, vaInData, pdvecIn2, inDataPitch1 - 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + + /* load 1 row of coeff for 1st output channel */ + IVP_LAV2NX8_XP(dvecCoeffData11, vaCoeffData1, pdvecCoeff1, coeffPitch1); + + /* load 1 row of coeff for 2nd output channel */ + IVP_LAV2NX8_XP(dvecCoeffData21, vaCoeffData2, 
pdvecCoeff2, coeffPitch1); + + /* multiples loaded input data with first four coeff */ + MORPH_OP_MUL4TA(dacc11, dvecInData12, dvecInData11, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \ + 0)); + MORPH_OP_MUL4TA(dacc21, dvecInData12, dvecInData11, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \ + 0)); + + MORPH_OP_MUL4TA(dacc12, dvecInData22, dvecInData21, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \ + 0)); + MORPH_OP_MUL4TA(dacc22, dvecInData22, dvecInData21, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \ + 0)); + } /* for (ky = 0; ky < kSizeY; ky++)*/ + } /* end of for (inCh = 0; inCh < numInCh; inCh++)*/ + + /* Pack, Output Scale, Output Shift and clamping */ + xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L; + xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H; +#if DILATED_VQ_CONV == VQ_TRUE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc11, packShiftAccU, \ + pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc12, packShiftAccU, \ + pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc21, packShiftAccU, \ + pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc22, packShiftAccU, \ + pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag); +#elif DILATED_VQ_CONV == VQ_FALSE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc11, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc12, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc21, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + 
PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc22, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); +#endif + /* variable store count */ + varLen = XT_MIN(outW - x, vectorizationWidth); + + /* Storing the first row , first depth output */ + pdvecOut = (xb_vec2Nx8 *) (pOutput); + valign vaOutData = IVP_ZALIGN(); + IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * varLen); + IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * \ + 2 * (varLen - XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the first row , 2nd depth output */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + enable2ndCh * outDataPitch2 * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * enable2ndCh * varLen); + IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * \ + enable2ndCh * 2 * (varLen - XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the 2nd row , 1st depth output */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + enable2ndRow * outDataPitch1 * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * enable2ndRow * varLen); + IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * \ + enable2ndRow * 2 * (varLen - XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the 2nd row , 2nd depth output */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + (enable2ndCh * outDataPitch2 + \ + enable2ndRow * outDataPitch1) * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * \ + enable2ndRow * enable2ndCh * varLen); + IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * \ + enable2ndRow * enable2ndCh * 2 * (varLen - XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + pOutput += 2 * outDataPitch2 * bytesPerPixel; + pCoeff += 2 * coeffPitch3; + } /* end of (outCh = 0; outCh < numOutCh; outCh += 2)*/ + } /* end of for (y = 0; y < outH; y += 2)*/ + } /* end of for (x = 0; x < outW; x += 
vectorizationWidth)*/ + } + + return(XAI_ERROR_STATUS()); +} + +/****************************************************************************************** +* MxN MOW WHD Stride 2 - DEPTH 3 * +* If number of input channels is equal to 3 * +* this function is called. * +******************************************************************************************/ + +static _XAI_INLINE_ void MAKE_NAME(MAKE_NAME_VQ(convolved, 3D_S_MxNj2d1), S8IX_MOW_WHD_DEPTH3) MAKE_ARGUMENTS(inTile, coeffTile, biasArray, outTile, param) +{ + /* Getting parameters from the tile structures */ + const int32_t inW = XAI_TILE3D_GET_DIM1(inTile) + \ + XAI_TILE3D_GET_DIM1_EDGE1(inTile) + XAI_TILE3D_GET_DIM1_EDGE2(inTile); + const int32_t outW = XAI_TILE3D_GET_DIM1(outTile); + const int32_t outH = XAI_TILE3D_GET_DIM2(outTile); + const int32_t numOutCh = XAI_TILE3D_GET_DIM3(outTile); + + /* Kernel Size (WHDN)*/ + const int32_t kSizeX = XAI_TILE4D_GET_DIM1(coeffTile); + const int32_t kSizeY = XAI_TILE4D_GET_DIM2(coeffTile); + + /* CNN convolution parameters */ + const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param); + const uint8_t outShiftU = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param); + const uint8_t enableReLu = XAI_CNN_CONV_GET_FLAG_RELU(param); + const uint8_t stride = XAI_CNN_CONV_GET_STRIDE(param); + const uint8_t leftEdgeFlag = XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param); + const uint8_t topEdgeFlag = XAI_CNN_CONV_GET_FLAG_TOPEDGE(param); + + /* Pitches of Coefficient Data (WHDN) dim3 */ + const int32_t coeffPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTile); + const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile); + + /* Pitches of Input Data (WHD) in dim1 and dim2 */ + const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile); + const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile); + + /* Pitch of Output Data (WHD) in dim1 and dim2 */ + const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile); + const int32_t outDataPitch2 = 
XAI_TILE3D_GET_DIM2_PITCH(outTile); + + /* Data Pointers of input, output, coefficient and bias data */ + MORPH_IDT_SCALAR* pInData = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(inTile); + int8_t* pOutData = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile); + int32_t* pBiasData = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray); + int8_t* pCoeffData = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile); +#if DILATED_VQ_CONV == VQ_TRUE + uint16_t* restrict pOutScaleData = (uint16_t *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray); +#elif DILATED_VQ_CONV == VQ_FALSE + const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param); +#endif + uint8_t leftEdge, topEdge; + if ((kSizeX % 2) != 0) + { + leftEdge = kSizeX / 2; + } + else + { + leftEdge = leftEdgeFlag ? (kSizeX / 2) : ((kSizeX / 2) - 1); + } + + if ((kSizeY % 2) != 0) + { + topEdge = kSizeY / 2; + } + else + { + topEdge = topEdgeFlag ? (kSizeY / 2) : ((kSizeY / 2) - 1); + } + + /* Move pointer to the start of the active data (including edge) */ + pInData = &pInData[-(topEdge * inDataPitch1 + leftEdge)]; + + /* Setting the limits for output data according to ReLu Flag and outTileType */ + int32_t minLim, maxLim; + if (enableReLu) + { + minLim = XAI_CNN_CONV_GET_RELU_MIN(param); + maxLim = XAI_CNN_CONV_GET_RELU_MAX(param); + } + else + { + minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \ + SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0); + maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \ + : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX); + } + const int8_t typeFlag = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 
1 : 0; + const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile); + + MORPH_IDT_2Nx8* restrict pdvecIn1; + MORPH_IDT_2Nx8* restrict pdvecIn2; + xb_vec2Nx8* restrict pdvecOut; + xb_vec2Nx8* restrict pdvecCoeff1, * restrict pdvecCoeff2; + + /* Variable Declarations */ + int32_t outCh, x, y, ky; + int32_t varLen; + + /* Number of output elements that can be generated + * with 2 input vector loads(64 way).*/ + const int32_t vectorizationWidth = (((4 * XCHAL_IVPN_SIMD_WIDTH) - kSizeX) / stride) + 1; + + if (kSizeX > 8) + { + /* loop across output channels is unrolled twice + * to produce two output channels in 1 iteration. + * Also loop across output height by 2 , thereby + * producing 4 output vectors simultaneously. + */ + for (x = 0; x < outW; x += vectorizationWidth) /* Loop across Output width */ + { + /* out of bound flag */ + int32_t flag = XT_SALT(2 * XCHAL_IVPN_SIMD_WIDTH, inW - stride * x); + + for (y = 0; y < outH; y += 2) /* Loop across Output height */ + { + /* In order to handle odd output height */ + int32_t enable2Row = XT_SALT(y, outH - 1); + + /* initialize output data pointer */ + int8_t *pOutput = &pOutData[(y * outDataPitch1 + x) * bytesPerPixel]; + + /* initialize input data pointer */ + MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * stride * (y) + stride * (x)]; + + /* initialize coeff and Bias data pointer */ + int8_t *pCoeff = &pCoeffData[0]; + int32_t *pBias = &pBiasData[0]; + + for (outCh = 0; outCh < numOutCh; outCh += 2) /* Loop across Output depth */ + { + /* In order to handle odd output depth */ + int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1); + + xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4 * enable2ndCh); + xb_vecN_2x32v hvecBias2; IVP_LSRN_2X32_XP(hvecBias2, pBias, 4); + + /* wide vectors(accumulators) initialized with bias */ + xb_vec2Nx24 dacc11, dacc12, dacc21, dacc22; + + dacc12 = dacc11 = IVP_CVT24UNX32L(hvecBias1, hvecBias1); + IVP_CVT24UNX32H(dacc11, hvecBias1, hvecBias1); + 
IVP_CVT24UNX32H(dacc12, hvecBias1, hvecBias1); + + dacc22 = dacc21 = IVP_CVT24UNX32L(hvecBias2, hvecBias2); + IVP_CVT24UNX32H(dacc21, hvecBias2, hvecBias2); + IVP_CVT24UNX32H(dacc22, hvecBias2, hvecBias2); + + /* priming of coeff load is done outside the innermost loop*/ + pdvecCoeff1 = (xb_vec2Nx8 *) (pCoeff); + valign vaCoeffData1; vaCoeffData1 = IVP_LA2NX8_PP(pdvecCoeff1); + + pdvecCoeff2 = (xb_vec2Nx8 *) (pCoeff + coeffPitch3 * enable2ndCh); + valign vaCoeffData2; vaCoeffData2 = IVP_LA2NX8_PP(pdvecCoeff2); + + xb_vec2Nx8 dvecCoeffData1; + xb_vec2Nx8 dvecCoeffData2; + + xb_vec2Nx8 dvecInData11, dvecInData12; + xb_vec2Nx8 dvecInData21, dvecInData22; + + /************************* 1st inCh *****************************************/ + pdvecIn1 = (MORPH_IDT_2Nx8 *) (pInput); + pdvecIn2 = (MORPH_IDT_2Nx8 *) (pInput + stride * inDataPitch1 * enable2Row); + + for (ky = 0; ky < kSizeY; ky++) /* Loop across kernel height */ + { + /* loads 1st input row */ + + valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1); + IVP_LA2NX8_XP(dvecInData11, vaInData, pdvecIn1, 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + IVP_LA2NX8_XP(dvecInData12, vaInData, pdvecIn1, \ + inDataPitch1 - 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + + /* loads Next(3rd) input row, corresponding to 2nd output row */ + + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn2); + IVP_LA2NX8_XP(dvecInData21, vaInData, pdvecIn2, 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + IVP_LA2NX8_XP(dvecInData22, vaInData, pdvecIn2, \ + inDataPitch1 - 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + + /* Re-arrange the data in the desired format */ + /* Assume input as 1,2,3,4,5,6,7...127 */ + /* After re-arrangement using DSEL operation, updated vectors would be */ + /* dvecInData1 : 1, 3, 5,...121 */ + /* dvecInData2 : 2, 4, 6,...122 */ + + IVP_DSEL2NX8I(dvecInData12, dvecInData11, + dvecInData12, dvecInData11, IVP_DSELI_8B_DEINTERLEAVE_1); + IVP_DSEL2NX8I(dvecInData22, dvecInData21, + dvecInData22, dvecInData21, IVP_DSELI_8B_DEINTERLEAVE_1); + + /* load 1st row of coeffs for 
both output channels */ + IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1); + IVP_LAV2NX8_XP(dvecCoeffData2, vaCoeffData2, pdvecCoeff2, coeffPitch1); + + /* rearrange the coeff vectors. Separate even and odd coeff + * so that MUL4T can be used + */ + IVP_DSEL2NX8I(dvecCoeffData2, dvecCoeffData1, + dvecCoeffData2, dvecCoeffData1, IVP_DSELI_8B_DEINTERLEAVE_1); + + + /* multiply 1st input row with 1st 8 coeff from 1st output channel*/ + MORPH_OP_MUL4TA(dacc11, 0, dvecInData11, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc11, 0, dvecInData12, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 0)); + + /* multiply 2nd input row with 1st 8 coeff from 1st output channel*/ + MORPH_OP_MUL4TA(dacc12, 0, dvecInData21, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc12, 0, dvecInData22, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 0)); + + /* multiply 1st input row with 1st 8 coeff from 2nd output channel*/ + MORPH_OP_MUL4TA(dacc21, 0, dvecInData11, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 8)); + MORPH_OP_MUL4TA(dacc21, 0, dvecInData12, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 8)); + + /* multiply 2nd input row with 1st 8 coeff from 2nd output channel*/ + MORPH_OP_MUL4TA(dacc22, 0, dvecInData21, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 8)); + MORPH_OP_MUL4TA(dacc22, 0, dvecInData22, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 8)); + + /* right rotate the input vectors + * in order to multiply with next columns of + * coeff in the next iteration + */ + dvecInData11 = IVP_SEL2NX8I(dvecInData12, dvecInData11, IVP_SELI_8B_ROTATE_RIGHT_4); + dvecInData12 = IVP_SEL2NX8I((xb_vec2Nx8) 0, dvecInData12, 
IVP_SELI_8B_ROTATE_RIGHT_4); + dvecInData21 = IVP_SEL2NX8I(dvecInData22, dvecInData21, IVP_SELI_8B_ROTATE_RIGHT_4); + dvecInData22 = IVP_SEL2NX8I((xb_vec2Nx8) 0, dvecInData22, IVP_SELI_8B_ROTATE_RIGHT_4); + + + /* multiply 1st input row with next 8 coeff from 1st output channel*/ + MORPH_OP_MUL4TA(dacc11, 0, dvecInData11, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + MORPH_OP_MUL4TA(dacc11, 0, dvecInData12, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 1)); + + /* multiply 2nd input row with next 8 coeff from 1st output channel*/ + MORPH_OP_MUL4TA(dacc12, 0, dvecInData21, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + MORPH_OP_MUL4TA(dacc12, 0, dvecInData22, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 1)); + + + /* multiply 1st input row with next 8 coeff from 2nd output channel*/ + MORPH_OP_MUL4TA(dacc21, 0, dvecInData11, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 9)); + MORPH_OP_MUL4TA(dacc21, 0, dvecInData12, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 9)); + + /* multiply 2nd input row with next 8 coeff from 2nd output channel*/ + MORPH_OP_MUL4TA(dacc22, 0, dvecInData21, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 9)); + MORPH_OP_MUL4TA(dacc22, 0, dvecInData22, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 9)); + } /* for (ky = 0; ky < kSizeY; ky++)*/ + + /************************* 2nd inCh *****************************************/ + pdvecIn1 = (MORPH_IDT_2Nx8 *) (pInput + inDataPitch2); + pdvecIn2 = (MORPH_IDT_2Nx8 *) (pInput + inDataPitch2 + \ + stride * inDataPitch1 * enable2Row); + + for (ky = 0; ky < kSizeY; ky++) /* Loop across kernel height */ + { + /* loads 1st input row */ + + valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1); + 
IVP_LA2NX8_XP(dvecInData11, vaInData, pdvecIn1, 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + IVP_LA2NX8_XP(dvecInData12, vaInData, pdvecIn1, \ + inDataPitch1 - 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + + /* loads Next(3rd) input row, corresponding to 2nd output row */ + + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn2); + IVP_LA2NX8_XP(dvecInData21, vaInData, pdvecIn2, 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + IVP_LA2NX8_XP(dvecInData22, vaInData, pdvecIn2, \ + inDataPitch1 - 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + + /* Arrange input vectors required for Quad multiply*/ + IVP_DSEL2NX8I(dvecInData12, dvecInData11, + dvecInData12, dvecInData11, IVP_DSELI_8B_DEINTERLEAVE_1); + IVP_DSEL2NX8I(dvecInData22, dvecInData21, + dvecInData22, dvecInData21, IVP_DSELI_8B_DEINTERLEAVE_1); + + /* load 1st row of coeffs for both output channels */ + IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1); + IVP_LAV2NX8_XP(dvecCoeffData2, vaCoeffData2, pdvecCoeff2, coeffPitch1); + + /* rearrange the coeff vectors. Separate even and odd coeff + * so that MUL4T can be used + */ + IVP_DSEL2NX8I(dvecCoeffData2, dvecCoeffData1, + dvecCoeffData2, dvecCoeffData1, IVP_DSELI_8B_DEINTERLEAVE_1); + + + /* multiply 1st input row with 1st 8 coeff from 1st output channel*/ + MORPH_OP_MUL4TA(dacc11, 0, dvecInData11, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc11, 0, dvecInData12, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 0)); + + /* multiply 2nd input row with 1st 8 coeff from 1st output channel*/ + MORPH_OP_MUL4TA(dacc12, 0, dvecInData21, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc12, 0, dvecInData22, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 0)); + + /* multiply 1st input row with 1st 8 coeff from 2nd output channel*/ + MORPH_OP_MUL4TA(dacc21, 0, dvecInData11, IVP_EXTRN_2X32( \ + 
IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 8)); + MORPH_OP_MUL4TA(dacc21, 0, dvecInData12, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 8)); + + /* multiply 2nd input row with 1st 8 coeff from 2nd output channel*/ + MORPH_OP_MUL4TA(dacc22, 0, dvecInData21, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 8)); + MORPH_OP_MUL4TA(dacc22, 0, dvecInData22, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 8)); + + /* right rotate the input vectors + * in order to multiply with next columns of + * coeff in the next iteration + */ + dvecInData11 = IVP_SEL2NX8I(dvecInData12, dvecInData11, IVP_SELI_8B_ROTATE_RIGHT_4); + dvecInData12 = IVP_SEL2NX8I((xb_vec2Nx8) 0, dvecInData12, IVP_SELI_8B_ROTATE_RIGHT_4); + dvecInData21 = IVP_SEL2NX8I(dvecInData22, dvecInData21, IVP_SELI_8B_ROTATE_RIGHT_4); + dvecInData22 = IVP_SEL2NX8I((xb_vec2Nx8) 0, dvecInData22, IVP_SELI_8B_ROTATE_RIGHT_4); + + + /* multiply 1st input row with next 8 coeff from 1st output channel*/ + MORPH_OP_MUL4TA(dacc11, 0, dvecInData11, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + MORPH_OP_MUL4TA(dacc11, 0, dvecInData12, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 1)); + + /* multiply 2nd input row with next 8 coeff from 1st output channel*/ + MORPH_OP_MUL4TA(dacc12, 0, dvecInData21, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + MORPH_OP_MUL4TA(dacc12, 0, dvecInData22, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 1)); + + + /* multiply 1st input row with next 8 coeff from 2nd output channel*/ + MORPH_OP_MUL4TA(dacc21, 0, dvecInData11, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 9)); + MORPH_OP_MUL4TA(dacc21, 0, dvecInData12, IVP_EXTRN_2X32( \ + 
IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 9)); + + /* multiply 2nd input row with next 8 coeff from 2nd output channel*/ + MORPH_OP_MUL4TA(dacc22, 0, dvecInData21, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 9)); + MORPH_OP_MUL4TA(dacc22, 0, dvecInData22, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 9)); + } /* for (ky = 0; ky < kSizeY; ky++)*/ + + /************************* 3rd inCh *****************************************/ + pdvecIn1 = (MORPH_IDT_2Nx8 *) (pInput + 2 * inDataPitch2); + pdvecIn2 = (MORPH_IDT_2Nx8 *) (pInput + 2 * inDataPitch2 + \ + stride * inDataPitch1 * enable2Row); + + for (ky = 0; ky < kSizeY; ky++) /* Loop across kernel height */ + { + /* loads 1st input row */ + + valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1); + IVP_LA2NX8_XP(dvecInData11, vaInData, pdvecIn1, 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + IVP_LA2NX8_XP(dvecInData12, vaInData, pdvecIn1, \ + inDataPitch1 - 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + + /* loads Next(3rd) input row, corresponding to 2nd output row */ + + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn2); + IVP_LA2NX8_XP(dvecInData21, vaInData, pdvecIn2, 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + IVP_LA2NX8_XP(dvecInData22, vaInData, pdvecIn2, \ + inDataPitch1 - 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + + /* Arrange input vectors required for Quad multiply*/ + IVP_DSEL2NX8I(dvecInData12, dvecInData11, + dvecInData12, dvecInData11, IVP_DSELI_8B_DEINTERLEAVE_1); + IVP_DSEL2NX8I(dvecInData22, dvecInData21, + dvecInData22, dvecInData21, IVP_DSELI_8B_DEINTERLEAVE_1); + + /* load 1st row of coeffs for both output channels */ + IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1); + IVP_LAV2NX8_XP(dvecCoeffData2, vaCoeffData2, pdvecCoeff2, coeffPitch1); + + /* rearrange the coeff vectors. 
Separate even and odd coeff + * so that MUL4T can be used + */ + IVP_DSEL2NX8I(dvecCoeffData2, dvecCoeffData1, + dvecCoeffData2, dvecCoeffData1, IVP_DSELI_8B_DEINTERLEAVE_1); + + + /* multiply 1st input row with 1st 8 coeff from 1st output channel*/ + MORPH_OP_MUL4TA(dacc11, 0, dvecInData11, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc11, 0, dvecInData12, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 0)); + + /* multiply 2nd input row with 1st 8 coeff from 1st output channel*/ + MORPH_OP_MUL4TA(dacc12, 0, dvecInData21, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc12, 0, dvecInData22, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 0)); + + /* multiply 1st input row with 1st 8 coeff from 2nd output channel*/ + MORPH_OP_MUL4TA(dacc21, 0, dvecInData11, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 8)); + MORPH_OP_MUL4TA(dacc21, 0, dvecInData12, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 8)); + + /* multiply 2nd input row with 1st 8 coeff from 2nd output channel*/ + MORPH_OP_MUL4TA(dacc22, 0, dvecInData21, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 8)); + MORPH_OP_MUL4TA(dacc22, 0, dvecInData22, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 8)); + + /* right rotate the input vectors + * in order to multiply with next columns of + * coeff in the next iteration + */ + dvecInData11 = IVP_SEL2NX8I(dvecInData12, dvecInData11, IVP_SELI_8B_ROTATE_RIGHT_4); + dvecInData12 = IVP_SEL2NX8I((xb_vec2Nx8) 0, dvecInData12, IVP_SELI_8B_ROTATE_RIGHT_4); + dvecInData21 = IVP_SEL2NX8I(dvecInData22, dvecInData21, IVP_SELI_8B_ROTATE_RIGHT_4); + dvecInData22 = IVP_SEL2NX8I((xb_vec2Nx8) 0, dvecInData22, IVP_SELI_8B_ROTATE_RIGHT_4); + + + /* multiply 
1st input row with next 8 coeff from 1st output channel*/ + MORPH_OP_MUL4TA(dacc11, 0, dvecInData11, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + MORPH_OP_MUL4TA(dacc11, 0, dvecInData12, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 1)); + + /* multiply 2nd input row with next 8 coeff from 1st output channel*/ + MORPH_OP_MUL4TA(dacc12, 0, dvecInData21, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + MORPH_OP_MUL4TA(dacc12, 0, dvecInData22, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 1)); + + + /* multiply 1st input row with next 8 coeff from 2nd output channel*/ + MORPH_OP_MUL4TA(dacc21, 0, dvecInData11, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 9)); + MORPH_OP_MUL4TA(dacc21, 0, dvecInData12, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 9)); + + /* multiply 2nd input row with next 8 coeff from 2nd output channel*/ + MORPH_OP_MUL4TA(dacc22, 0, dvecInData21, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 9)); + MORPH_OP_MUL4TA(dacc22, 0, dvecInData22, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 9)); + } /* for (ky = 0; ky < kSizeY; ky++)*/ + + /* Pack, Output Scale, Output Shift and clamping */ + xb_vec2Nx8 dvecOut11L, dvecOut12L, dvecOut21L, dvecOut22L; + xb_vec2Nx8 dvecOut11H, dvecOut12H, dvecOut21H, dvecOut22H; +#if DILATED_VQ_CONV == VQ_TRUE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut11L, dvecOut11H, dacc11, packShiftAccU, \ + pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut12L, dvecOut12H, dacc12, packShiftAccU, \ + pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut21L, dvecOut21H, dacc21, packShiftAccU, \ + pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, 
maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut22L, dvecOut22H, dacc22, packShiftAccU, \ + pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag); +#elif DILATED_VQ_CONV == VQ_FALSE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut11L, dvecOut11H, dacc11, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut12L, dvecOut12H, dacc12, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut21L, dvecOut21H, dacc21, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut22L, dvecOut22H, dacc22, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); +#endif + /* variable length for output stores */ + varLen = XT_MIN(vectorizationWidth, outW - x); + + /* Storing the first output channel, first row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput); + valign vaOutData = IVP_ZALIGN(); + IVP_SAV2NX8_XP(dvecOut11L, vaOutData, pdvecOut, bytesPerPixel * varLen); + IVP_SAV2NX8_XP(dvecOut11H, vaOutData, pdvecOut, typeFlag * \ + 2 * (varLen - XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the first output channel, second row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch1 * enable2Row * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut12L, vaOutData, pdvecOut, bytesPerPixel * varLen * enable2Row); + IVP_SAV2NX8_XP(dvecOut12H, vaOutData, pdvecOut, typeFlag * \ + 2 * (varLen - XCHAL_IVPN_SIMD_WIDTH) * enable2Row); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the 2nd output channel, first row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch2 * enable2ndCh * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut21L, vaOutData, pdvecOut, bytesPerPixel * varLen * enable2ndCh); + IVP_SAV2NX8_XP(dvecOut21H, vaOutData, pdvecOut, typeFlag * \ + 2 * (varLen - XCHAL_IVPN_SIMD_WIDTH) * enable2ndCh); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the 2nd output channel, 2nd row 
*/ + pdvecOut = (xb_vec2Nx8 *) (pOutput + (outDataPitch2 * enable2ndCh + \ + outDataPitch1 * enable2Row) * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut22L, vaOutData, pdvecOut, bytesPerPixel * \ + varLen * enable2ndCh * enable2Row); + IVP_SAV2NX8_XP(dvecOut22H, vaOutData, pdvecOut, typeFlag * \ + 2 * (varLen - XCHAL_IVPN_SIMD_WIDTH) * enable2ndCh * enable2Row); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + pOutput += 2 * outDataPitch2 * bytesPerPixel; + pCoeff += 2 * coeffPitch3; + } /* for (outCh = 0; outCh < numOutCh; outCh += 2)*/ + } /* for (y = 0; y < outH; y += 2)*/ + } /* end of for (x = 0; x < outW; x += vectorizationWidth)*/ + } + else + { + /* loop across output channels is unrolled twice + * to produce two output channels in 1 iteration. + * Also loop across output height by 2 , thereby + * producing 4 output vectors simultaneously. + */ + for (x = 0; x < outW; x += vectorizationWidth) /* Loop across Output width */ + { + /* out of bound flag */ + int32_t flag = XT_SALT(2 * XCHAL_IVPN_SIMD_WIDTH, inW - stride * x); + + for (y = 0; y < outH; y += 2) /* Loop across Output height */ + { + /* In order to handle odd output height */ + int32_t enable2Row = XT_SALT(y, outH - 1); + /* initialize output data pointer */ + int8_t *pOutput = &pOutData[(y * outDataPitch1 + x) * bytesPerPixel]; + + /* initialize input data pointer */ + MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * stride * (y) + stride * (x)]; + + /* initialize coeff and Bias data pointer */ + int8_t *pCoeff = &pCoeffData[0]; + int32_t *pBias = &pBiasData[0]; + + for (outCh = 0; outCh < numOutCh; outCh += 2) /* Loop across Output depth */ + { + /* In order to handle odd output depth */ + int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1); + + xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4 * enable2ndCh); + xb_vecN_2x32v hvecBias2; IVP_LSRN_2X32_XP(hvecBias2, pBias, 4); + + /* wide vectors(accumulators) initialized with bias */ + xb_vec2Nx24 dacc11, dacc12, dacc21, dacc22; + + dacc12 = 
dacc11 = IVP_CVT24UNX32L(hvecBias1, hvecBias1); + IVP_CVT24UNX32H(dacc11, hvecBias1, hvecBias1); + IVP_CVT24UNX32H(dacc12, hvecBias1, hvecBias1); + + dacc22 = dacc21 = IVP_CVT24UNX32L(hvecBias2, hvecBias2); + IVP_CVT24UNX32H(dacc21, hvecBias2, hvecBias2); + IVP_CVT24UNX32H(dacc22, hvecBias2, hvecBias2); + + /* priming of coeff load is done outside the innermost loop*/ + pdvecCoeff1 = (xb_vec2Nx8 *) (pCoeff); + valign vaCoeffData1; vaCoeffData1 = IVP_LA2NX8_PP(pdvecCoeff1); + + pdvecCoeff2 = (xb_vec2Nx8 *) (pCoeff + coeffPitch3 * enable2ndCh); + valign vaCoeffData2; vaCoeffData2 = IVP_LA2NX8_PP(pdvecCoeff2); + + xb_vec2Nx8 dvecCoeffData1; + xb_vec2Nx8 dvecCoeffData2; + + xb_vec2Nx8 dvecInData11, dvecInData12; + xb_vec2Nx8 dvecInData21, dvecInData22; + /****************************** 1st inCh ******************************/ + pdvecIn1 = (MORPH_IDT_2Nx8 *) (pInput); + pdvecIn2 = (MORPH_IDT_2Nx8 *) (pInput + stride * inDataPitch1 * enable2Row); + + for (ky = 0; ky < kSizeY; ky++) /* Loop across kernel height */ + { + /* loads 1st input row */ + + valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1); + IVP_LA2NX8_XP(dvecInData11, vaInData, pdvecIn1, 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + IVP_LA2NX8_XP(dvecInData12, vaInData, pdvecIn1, \ + inDataPitch1 - 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + + /* loads Next(3rd) input row, corresponding to 2nd output row */ + + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn2); + IVP_LA2NX8_XP(dvecInData21, vaInData, pdvecIn2, 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + IVP_LA2NX8_XP(dvecInData22, vaInData, pdvecIn2, \ + inDataPitch1 - 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + + /* Re-arrange the data in the desired format */ + /* Assume input as 1,2,3,4,5,6,7...127 */ + /* After re-arrangement using DSEL operation, updated vectors would be */ + /* dvecInData1 : 1, 3, 5,...121 */ + /* dvecInData2 : 2, 4, 6,...122 */ + + IVP_DSEL2NX8I(dvecInData12, dvecInData11, + dvecInData12, dvecInData11, IVP_DSELI_8B_DEINTERLEAVE_1); + IVP_DSEL2NX8I(dvecInData22, dvecInData21, + 
dvecInData22, dvecInData21, IVP_DSELI_8B_DEINTERLEAVE_1); + + /* load 1st row of coeffs for both output channels */ + IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1); + IVP_LAV2NX8_XP(dvecCoeffData2, vaCoeffData2, pdvecCoeff2, coeffPitch1); + + /* rearrange the coeff vectors. Separate even and odd coeff + * so that MUL4T can be used + */ + IVP_DSEL2NX8I(dvecCoeffData2, dvecCoeffData1, + dvecCoeffData2, dvecCoeffData1, IVP_DSELI_8B_DEINTERLEAVE_1); + + + /* multiply 1st input row with 1st 8 coeff from 1st output channel*/ + MORPH_OP_MUL4TA(dacc11, 0, dvecInData11, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc11, 0, dvecInData12, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 0)); + + /* multiply 2nd input row with 1st 8 coeff from 1st output channel*/ + MORPH_OP_MUL4TA(dacc12, 0, dvecInData21, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc12, 0, dvecInData22, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 0)); + + /* multiply 1st input row with 1st 8 coeff from 2nd output channel*/ + MORPH_OP_MUL4TA(dacc21, 0, dvecInData11, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 8)); + MORPH_OP_MUL4TA(dacc21, 0, dvecInData12, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 8)); + + /* multiply 2nd input row with 1st 8 coeff from 2nd output channel*/ + MORPH_OP_MUL4TA(dacc22, 0, dvecInData21, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 8)); + MORPH_OP_MUL4TA(dacc22, 0, dvecInData22, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 8)); + } /* for (ky = 0; ky < kSizeY; ky++)*/ + + /****************************** 2nd inCh ******************************/ + pdvecIn1 = (MORPH_IDT_2Nx8 *) (pInput + inDataPitch2); + pdvecIn2 = 
(MORPH_IDT_2Nx8 *) (pInput + inDataPitch2 + \ + stride * inDataPitch1 * enable2Row); + + for (ky = 0; ky < kSizeY; ky++) /* Loop across kernel height */ + { + /* loads 1st input row */ + + valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1); + IVP_LA2NX8_XP(dvecInData11, vaInData, pdvecIn1, 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + IVP_LA2NX8_XP(dvecInData12, vaInData, pdvecIn1, \ + inDataPitch1 - 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + + /* loads Next(3rd) input row, corresponding to 2nd output row */ + + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn2); + IVP_LA2NX8_XP(dvecInData21, vaInData, pdvecIn2, 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + IVP_LA2NX8_XP(dvecInData22, vaInData, pdvecIn2, \ + inDataPitch1 - 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + + /* Arrange input vectors required for Quad multiply*/ + IVP_DSEL2NX8I(dvecInData12, dvecInData11, + dvecInData12, dvecInData11, IVP_DSELI_8B_DEINTERLEAVE_1); + IVP_DSEL2NX8I(dvecInData22, dvecInData21, + dvecInData22, dvecInData21, IVP_DSELI_8B_DEINTERLEAVE_1); + + /* load 1st row of coeffs for both output channels */ + IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1); + IVP_LAV2NX8_XP(dvecCoeffData2, vaCoeffData2, pdvecCoeff2, coeffPitch1); + + /* rearrange the coeff vectors. 
Separate even and odd coeff + * so that MUL4T can be used + */ + IVP_DSEL2NX8I(dvecCoeffData2, dvecCoeffData1, + dvecCoeffData2, dvecCoeffData1, IVP_DSELI_8B_DEINTERLEAVE_1); + + + /* multiply 1st input row with 1st 8 coeff from 1st output channel*/ + MORPH_OP_MUL4TA(dacc11, 0, dvecInData11, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc11, 0, dvecInData12, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 0)); + + /* multiply 2nd input row with 1st 8 coeff from 1st output channel*/ + MORPH_OP_MUL4TA(dacc12, 0, dvecInData21, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc12, 0, dvecInData22, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 0)); + + /* multiply 1st input row with 1st 8 coeff from 2nd output channel*/ + MORPH_OP_MUL4TA(dacc21, 0, dvecInData11, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 8)); + MORPH_OP_MUL4TA(dacc21, 0, dvecInData12, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 8)); + + /* multiply 2nd input row with 1st 8 coeff from 2nd output channel*/ + MORPH_OP_MUL4TA(dacc22, 0, dvecInData21, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 8)); + MORPH_OP_MUL4TA(dacc22, 0, dvecInData22, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 8)); + } /* for (ky = 0; ky < kSizeY; ky++)*/ + + /****************************** 3rd inCh ******************************/ + pdvecIn1 = (MORPH_IDT_2Nx8 *) (pInput + 2 * inDataPitch2); + pdvecIn2 = (MORPH_IDT_2Nx8 *) (pInput + 2 * inDataPitch2 + \ + stride * inDataPitch1 * enable2Row); + + for (ky = 0; ky < kSizeY; ky++) /* Loop across kernel height */ + { + /* loads 1st input row */ + + valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1); + IVP_LA2NX8_XP(dvecInData11, vaInData, pdvecIn1, 2 * 
XCHAL_IVPN_SIMD_WIDTH * flag); + IVP_LA2NX8_XP(dvecInData12, vaInData, pdvecIn1, \ + inDataPitch1 - 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + + /* loads Next(3rd) input row, corresponding to 2nd output row */ + + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn2); + IVP_LA2NX8_XP(dvecInData21, vaInData, pdvecIn2, 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + IVP_LA2NX8_XP(dvecInData22, vaInData, pdvecIn2, \ + inDataPitch1 - 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + + /* Arrange input vectors required for Quad multiply*/ + IVP_DSEL2NX8I(dvecInData12, dvecInData11, + dvecInData12, dvecInData11, IVP_DSELI_8B_DEINTERLEAVE_1); + IVP_DSEL2NX8I(dvecInData22, dvecInData21, + dvecInData22, dvecInData21, IVP_DSELI_8B_DEINTERLEAVE_1); + + /* load 1st row of coeffs for both output channels */ + IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1); + IVP_LAV2NX8_XP(dvecCoeffData2, vaCoeffData2, pdvecCoeff2, coeffPitch1); + + /* rearrange the coeff vectors. Separate even and odd coeff + * so that MUL4T can be used + */ + IVP_DSEL2NX8I(dvecCoeffData2, dvecCoeffData1, + dvecCoeffData2, dvecCoeffData1, IVP_DSELI_8B_DEINTERLEAVE_1); + + + /* multiply 1st input row with 1st 8 coeff from 1st output channel*/ + MORPH_OP_MUL4TA(dacc11, 0, dvecInData11, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc11, 0, dvecInData12, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 0)); + + /* multiply 2nd input row with 1st 8 coeff from 1st output channel*/ + MORPH_OP_MUL4TA(dacc12, 0, dvecInData21, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc12, 0, dvecInData22, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 0)); + + /* multiply 1st input row with 1st 8 coeff from 2nd output channel*/ + MORPH_OP_MUL4TA(dacc21, 0, dvecInData11, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 8)); + 
MORPH_OP_MUL4TA(dacc21, 0, dvecInData12, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 8)); + + /* multiply 2nd input row with 1st 8 coeff from 2nd output channel*/ + MORPH_OP_MUL4TA(dacc22, 0, dvecInData21, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 8)); + MORPH_OP_MUL4TA(dacc22, 0, dvecInData22, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 8)); + } /* for (ky = 0; ky < kSizeY; ky++)*/ + + /* Pack, Output Scale, Output Shift and clamping */ + xb_vec2Nx8 dvecOut11L, dvecOut12L, dvecOut21L, dvecOut22L; + xb_vec2Nx8 dvecOut11H, dvecOut12H, dvecOut21H, dvecOut22H; +#if DILATED_VQ_CONV == VQ_TRUE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut11L, dvecOut11H, dacc11, packShiftAccU, \ + pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut12L, dvecOut12H, dacc12, packShiftAccU, \ + pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut21L, dvecOut21H, dacc21, packShiftAccU, \ + pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut22L, dvecOut22H, dacc22, packShiftAccU, \ + pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag); +#elif DILATED_VQ_CONV == VQ_FALSE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut11L, dvecOut11H, dacc11, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut12L, dvecOut12H, dacc12, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut21L, dvecOut21H, dacc21, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut22L, dvecOut22H, dacc22, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); +#endif + /* variable length for output stores */ + varLen = XT_MIN(vectorizationWidth, outW - x); + + /* Storing the first 
output channel, first row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput); + valign vaOutData = IVP_ZALIGN(); + IVP_SAV2NX8_XP(dvecOut11L, vaOutData, pdvecOut, bytesPerPixel * varLen); + IVP_SAV2NX8_XP(dvecOut11H, vaOutData, pdvecOut, typeFlag * \ + 2 * (varLen - XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the first output channel, second row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch1 * enable2Row * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut12L, vaOutData, pdvecOut, bytesPerPixel * varLen * enable2Row); + IVP_SAV2NX8_XP(dvecOut12H, vaOutData, pdvecOut, typeFlag * \ + 2 * (varLen - XCHAL_IVPN_SIMD_WIDTH) * enable2Row); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the 2nd output channel, first row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch2 * enable2ndCh * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut21L, vaOutData, pdvecOut, bytesPerPixel * varLen * enable2ndCh); + IVP_SAV2NX8_XP(dvecOut21H, vaOutData, pdvecOut, typeFlag * \ + 2 * (varLen - XCHAL_IVPN_SIMD_WIDTH) * enable2ndCh); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the 2nd output channel, 2nd row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + (outDataPitch2 * enable2ndCh + \ + outDataPitch1 * enable2Row) * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut22L, vaOutData, pdvecOut, bytesPerPixel * \ + varLen * enable2ndCh * enable2Row); + IVP_SAV2NX8_XP(dvecOut22H, vaOutData, pdvecOut, typeFlag * \ + 2 * (varLen - XCHAL_IVPN_SIMD_WIDTH) * enable2ndCh * enable2Row); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + pOutput += 2 * outDataPitch2 * bytesPerPixel; + pCoeff += 2 * coeffPitch3; + } /* for (outCh = 0; outCh < numOutCh; outCh += 2)*/ + } /* for (y = 0; y < outH; y += 2)*/ + } /* end of for (x = 0; x < outW; x += vectorizationWidth)*/ + } +} + +/****************************************************************************************** +* xaiConvolved(VQ)3D_S_MxNj2d1I8S8IX_MOW_WHD +* 
***************************************************************************************/ +/******************************************************************************/ +/* Description : P6 optimized generic implementation for MxN 3D convolution. */ +/* Based on MORPH pre-processor specifiers, code implementation */ +/* is generated during preprocessing stage. This method can be */ +/* used to generate MxN 3D dilated convolution function and MxN */ +/* 3D VQ dilated convolution function for U8 bit and S8 bit */ +/* input data with input stride equal to 2 and dilation equal to 1 */ +/* Inputs : Input Data Tile, Coeff Data Tile, Bias Array, */ +/* Output scale array (VQ variants only), CNN convolution params structure */ +/* Outputs : XI Error Code */ +/* InOuts : Output Tile */ +/* Assumptions : CoeffData is S8 */ +/* biasArray is signed 32b, value not exceeding signed 24b */ +/* Output scale array is U16 */ +/* OutData is S8 / U8 / S16 */ +/* Kernel Size is MxNxDxN, kernel width and height each at most 16 */ +/* Input and Output are in WHD format */ +/* Coeff is in WHDN format */ +/******************************************************************************/ + +/****************** xaiConvolvedVQ3D_S_MxNj2d1_S8S8IX_MOW_WHD ******************/ +/****************** xaiConvolvedVQ3D_S_MxNj2d1_U8S8IX_MOW_WHD ******************/ +/******************* xaiConvolved3D_S_MxNj2d1_S8S8IX_MOW_WHD *******************/ +/******************* xaiConvolved3D_S_MxNj2d1_U8S8IX_MOW_WHD *******************/ + +XAI_ERR_TYPE MAKE_NAME(MAKE_NAME_VQ(xaiConvolved, 3D_S_MxNj2d1), S8IX_MOW_WHD) MAKE_ARGUMENTS(inTile, coeffTile, biasArray, outTile, param) +{ + /* Error Checks */ + XAI_ERROR_CHECKS() + { + MORPH_IDT_CHECK(inTile); + XAI_CHECK_CONV_OUTPUT_TILE3D(outTile); + XAI_CHECK_TILE4D_S8(coeffTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile); + XAI_CHECK_TILE4D_IN_DRAM_BOUNDARY(coeffTile); + XAI_CHECK_POINTER(param); + XAI_CHECK_ARRAY_S32(biasArray); + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTile); + 
XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(coeffTile, outTile); + XAI_CHECK_ERROR(XAI_TILE4D_GET_DIM1(coeffTile) <= 16, XAI_ERR_KSIZE, \ + "Kernel width = %d, which should be less than or equal to 16", \ + XAI_TILE4D_GET_DIM1(coeffTile)); + XAI_CHECK_ERROR(XAI_TILE4D_GET_DIM2(coeffTile) <= 16, XAI_ERR_KSIZE, \ + "\nKernel height = %d, which should be less than or equal to 16", \ + XAI_TILE4D_GET_DIM2(coeffTile)); + XAI_CHECK_DILATION(param, 1); + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_DILATIONX(param) == XAI_CNN_CONV_GET_DILATIONY(param), \ + XAI_ERR_BADARG, "\nDilation along width = %hhu and height = %hhu\nDilation along width and height should be equal", \ + XAI_CNN_CONV_GET_DILATIONX(param), XAI_CNN_CONV_GET_DILATIONY(param)); + XAI_CHECK_EDGES_MOW_WHD(inTile, coeffTile, param); + XAI_CHECK_TILE3D_DATA_ORDER(inTile, XAI_WHD); + XAI_CHECK_TILE3D_DATA_ORDER(outTile, XAI_WHD); + XAI_CHECK_TILE4D_DATA_ORDER(coeffTile, XAI_WHDN); + XAI_CHECK_STRIDE(param, 2); + XAI_CHECK_ERROR((XAI_CNN_CONV_GET_STRIDEX(param) == XAI_CNN_CONV_GET_STRIDEY(param)), \ + XAI_ERR_BADARG, "\nStride along width = %hhu and height = %hhu\nStride along width and height should be equal", \ + XAI_CNN_CONV_GET_STRIDEX(param), XAI_CNN_CONV_GET_STRIDEY(param)); + XAI_CHECK_CONSISTENCY_MOW_WHD(inTile, coeffTile, biasArray, outTile, param); + XAI_CHECK_COEFFTILE_CONTIGUOUS(coeffTile, param); + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_ACCUM_SHIFT(param) < 24, \ + XAI_ERR_NORM, "\nThe accumulator shift = %hhu, value should be less than 24", \ + XAI_CNN_CONV_GET_ACCUM_SHIFT(param)); + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_OUTPUT_SHIFT(param) < 32, \ + XAI_ERR_NORM, "\nThe output shift = %hhu, value should be less than 32", \ + XAI_CNN_CONV_GET_OUTPUT_SHIFT(param)); + XAI_CHECK_CONV_RELU_LIMITS_IX(param, outTile); +#if DILATED_VQ_CONV == VQ_TRUE + XAI_CHECK_ARRAY_U16(outputScaleArray); + XAI_CHECK_ERROR(XAI_ARRAY_GET_WIDTH(outputScaleArray) >= XAI_TILE4D_GET_DIM4(coeffTile), \ + XAI_ERR_DATASIZE, "\nWidth of Output Scale Array = %d, 
Number of Kernels = %d\nWidth of Output Scale Array should be greater than or equal to Number of Kernels", \ + XAI_ARRAY_GET_WIDTH(outputScaleArray), XAI_TILE4D_GET_DIM4(coeffTile)); + XAI_CHECK_ERROR((((uintptr_t) (XAI_ARRAY_GET_DATA_PTR(outputScaleArray)) & \ + 0x1) == 0), XAI_ERR_NORM, "The output scale array is not aligned to 2 byte boundary"); +#endif + } +#if DILATED_VQ_CONV == VQ_FALSE + if (XAI_CNN_CONV_GET_OUTPUT_SCALE(param) == 0) /* zero output scale: fill the output tile with a constant (0, clamped to the ReLU limits when ReLU is enabled) and return without convolving */ + { + int32_t fillValue; + int32_t reluFlag = XAI_CNN_CONV_GET_FLAG_RELU(param); + fillValue = reluFlag ? (CLAMP(0, XAI_CNN_CONV_GET_RELU_MIN(param), XAI_CNN_CONV_GET_RELU_MAX(param))) : 0; + return(xaiFillTile3D(outTile, fillValue, 0)); + } +#endif + + if (XAI_TILE3D_GET_DIM3(inTile) == 3) /* exactly 3 input channels: hand the call off to the DEPTH3 variant and return */ + { + MAKE_NAME(MAKE_NAME_VQ(convolved, 3D_S_MxNj2d1), S8IX_MOW_WHD_DEPTH3) MAKE_PARAMS(inTile, coeffTile, biasArray, outTile, param); + return(XAI_ERROR_STATUS()); + } + + /* Getting parameters from the tile structures (inW includes both dim1 edges) */ + const int32_t inW = XAI_TILE3D_GET_DIM1(inTile) + \ + XAI_TILE3D_GET_DIM1_EDGE1(inTile) + XAI_TILE3D_GET_DIM1_EDGE2(inTile); + const int32_t outW = XAI_TILE3D_GET_DIM1(outTile); + const int32_t outH = XAI_TILE3D_GET_DIM2(outTile); + const int32_t numInCh = XAI_TILE3D_GET_DIM3(inTile); + const int32_t numOutCh = XAI_TILE3D_GET_DIM3(outTile); + + /* Kernel Size (WHDN)*/ + const int32_t kSizeX = XAI_TILE4D_GET_DIM1(coeffTile); + const int32_t kSizeY = XAI_TILE4D_GET_DIM2(coeffTile); + + /* CNN convolution parameters */ + const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param); + const uint8_t outShiftU = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param); + const uint8_t enableReLu = XAI_CNN_CONV_GET_FLAG_RELU(param); + const uint8_t stride = XAI_CNN_CONV_GET_STRIDE(param); + const uint8_t leftEdgeFlag = XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param); + const uint8_t topEdgeFlag = XAI_CNN_CONV_GET_FLAG_TOPEDGE(param); + + /* Pitches of Coefficient Data (WHDN) in dim1 and dim3 */ + const int32_t coeffPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTile); + 
const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile); + + /* Pitches of Input Data (WHD) in dim1 and dim2 */ + const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile); + const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile); + + /* Pitches of Output Data (WHD) in dim1 and dim2 */ + const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile); + const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile); + + /* Data Pointers of input, output, coefficient and bias data */ + MORPH_IDT_SCALAR* pInData = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(inTile); + int8_t* pOutData = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile); + int32_t* pBiasData = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray); + int8_t* pCoeffData = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile); +#if DILATED_VQ_CONV == VQ_TRUE + uint16_t* restrict pOutScaleData = (uint16_t *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray); +#elif DILATED_VQ_CONV == VQ_FALSE + const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param); +#endif + uint8_t leftEdge, topEdge; /* odd kernel size k: edge is k / 2; even k: k / 2 when the matching edge flag is set, else k / 2 - 1 */ + if ((kSizeX % 2) != 0) + { + leftEdge = kSizeX / 2; + } + else + { + leftEdge = leftEdgeFlag ? (kSizeX / 2) : ((kSizeX / 2) - 1); + } + + if ((kSizeY % 2) != 0) + { + topEdge = kSizeY / 2; + } + else + { + topEdge = topEdgeFlag ? (kSizeY / 2) : ((kSizeY / 2) - 1); + } + + /* Move pointer back by topEdge rows and leftEdge columns, to the start of the active data (including edge) */ + pInData = &pInData[-(topEdge * inDataPitch1 + leftEdge)]; + + /* Setting the limits for output data: the ReLU min/max when ReLU is enabled, otherwise the full range of the output tile type (S16, S8, else unsigned 8 bit) */ + int32_t minLim, maxLim; + if (enableReLu) + { + minLim = XAI_CNN_CONV_GET_RELU_MIN(param); + maxLim = XAI_CNN_CONV_GET_RELU_MAX(param); + } + else + { + minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \ + SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0); + maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \ + : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? 
SCHAR_MAX : UCHAR_MAX); + } + const int8_t typeFlag = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0; /* 1 for an S16 output tile, else 0; scales the length of the second (H) store below */ + const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile); + + MORPH_IDT_2Nx8* restrict pdvecIn1; + MORPH_IDT_2Nx8* restrict pdvecIn2; + xb_vec2Nx8* restrict pdvecOut; + xb_vec2Nx8* restrict pdvecCoeff1, * restrict pdvecCoeff2; + + /* Variable Declarations */ + int32_t inCh, outCh, x, y, ky; + int32_t varLen; + + /* Number of output elements that can be generated + * with 2 input vector loads(64 way).*/ + const int32_t vectorizationWidth = (((4 * XCHAL_IVPN_SIMD_WIDTH) - kSizeX) / stride) + 1; + + if (kSizeX > 8) + { + /* kSizeX > 8: loop across output channels is unrolled twice + * to produce two output channels in 1 iteration. + * The loop across output height is also unrolled by 2, thereby + * producing 4 output vectors simultaneously. + */ + for (x = 0; x < outW; x += vectorizationWidth) /* Loop across Output width */ + { + /* out of bound flag: presumably 1 when 2 * XCHAL_IVPN_SIMD_WIDTH is less than the input width remaining from column stride * x, else 0; with 0 the first load below does not advance the pointer */ + int32_t flag = XT_SALT(2 * XCHAL_IVPN_SIMD_WIDTH, inW - stride * x); + + for (y = 0; y < outH; y += 2) /* Loop across Output height */ + { + /* In order to handle odd output height: presumably 0 when y is the last output row, which disables the 2nd row */ + int32_t enable2Row = XT_SALT(y, outH - 1); + + /* initialize output data pointer */ + int8_t *pOutput = &pOutData[(y * outDataPitch1 + x) * bytesPerPixel]; + + /* initialize input data pointer */ + MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * stride * (y) + stride * (x)]; + + /* initialize coeff and Bias data pointer */ + int8_t *pCoeff = &pCoeffData[0]; + int32_t *pBias = &pBiasData[0]; + + for (outCh = 0; outCh < numOutCh; outCh += 2) /* Loop across Output depth */ + { + /* In order to handle odd output depth: presumably 0 when outCh is the last output channel, which disables the 2nd channel */ + int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1); + + xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4 * enable2ndCh); + xb_vecN_2x32v hvecBias2; IVP_LSRN_2X32_XP(hvecBias2, pBias, 4); + + /* wide vectors(accumulators) initialized with bias */ + xb_vec2Nx24 dacc11, dacc12, dacc21, dacc22; + + dacc12 = dacc11 = 
IVP_CVT24UNX32L(hvecBias1, hvecBias1); + IVP_CVT24UNX32H(dacc11, hvecBias1, hvecBias1); + IVP_CVT24UNX32H(dacc12, hvecBias1, hvecBias1); + + dacc22 = dacc21 = IVP_CVT24UNX32L(hvecBias2, hvecBias2); + IVP_CVT24UNX32H(dacc21, hvecBias2, hvecBias2); + IVP_CVT24UNX32H(dacc22, hvecBias2, hvecBias2); + + /* priming of coeff load is done outside the innermost loop*/ + pdvecCoeff1 = (xb_vec2Nx8 *) (pCoeff); + valign vaCoeffData1; vaCoeffData1 = IVP_LA2NX8_PP(pdvecCoeff1); + + pdvecCoeff2 = (xb_vec2Nx8 *) (pCoeff + coeffPitch3 * enable2ndCh); + valign vaCoeffData2; vaCoeffData2 = IVP_LA2NX8_PP(pdvecCoeff2); + + for (inCh = 0; inCh < numInCh; inCh++) /* Loop across input channels */ + { + xb_vec2Nx8 dvecCoeffData1; + xb_vec2Nx8 dvecCoeffData2; + + xb_vec2Nx8 dvecInData11, dvecInData12; + xb_vec2Nx8 dvecInData21, dvecInData22; + + pdvecIn1 = (MORPH_IDT_2Nx8 *) (pInput + inCh * inDataPitch2); + pdvecIn2 = (MORPH_IDT_2Nx8 *) (pInput + inCh * inDataPitch2 + \ + stride * inDataPitch1 * enable2Row); + + for (ky = 0; ky < kSizeY; ky++) /* Loop across kernel height */ + { + /* loads 1st input row; the two pointer updates add up to inDataPitch1, i.e. one input row per ky iteration */ + + valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1); + IVP_LA2NX8_XP(dvecInData11, vaInData, pdvecIn1, 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + IVP_LA2NX8_XP(dvecInData12, vaInData, pdvecIn1, \ + inDataPitch1 - 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + + /* loads the input row that is stride rows below (3rd row for stride 2), corresponding to 2nd output row */ + + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn2); + IVP_LA2NX8_XP(dvecInData21, vaInData, pdvecIn2, 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + IVP_LA2NX8_XP(dvecInData22, vaInData, pdvecIn2, \ + inDataPitch1 - 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + + /* Re-arrange the data in the desired format */ + /* Assume input as 1,2,3,4,5,6,7...127 */ + /* After re-arrangement using DSEL operation, updated vectors would be */ + /* dvecInData1 : 1, 3, 5,...121 */ + /* dvecInData2 : 2, 4, 6,...122 */ + + IVP_DSEL2NX8I(dvecInData12, dvecInData11, + dvecInData12, dvecInData11, IVP_DSELI_8B_DEINTERLEAVE_1); + 
IVP_DSEL2NX8I(dvecInData22, dvecInData21, + dvecInData22, dvecInData21, IVP_DSELI_8B_DEINTERLEAVE_1); + + /* load the current kernel row of coeffs for both output channels; each load advances by coeffPitch1 */ + IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1); + IVP_LAV2NX8_XP(dvecCoeffData2, vaCoeffData2, pdvecCoeff2, coeffPitch1); + + /* rearrange the coeff vectors. Separate even and odd coeff + * so that MUL4T can be used + */ + IVP_DSEL2NX8I(dvecCoeffData2, dvecCoeffData1, + dvecCoeffData2, dvecCoeffData1, IVP_DSELI_8B_DEINTERLEAVE_1); + + + /* multiply 1st input row with 1st 8 coeff from 1st output channel*/ + MORPH_OP_MUL4TA(dacc11, 0, dvecInData11, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc11, 0, dvecInData12, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 0)); + + /* multiply 2nd input row with 1st 8 coeff from 1st output channel*/ + MORPH_OP_MUL4TA(dacc12, 0, dvecInData21, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc12, 0, dvecInData22, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 0)); + + /* multiply 1st input row with 1st 8 coeff from 2nd output channel*/ + MORPH_OP_MUL4TA(dacc21, 0, dvecInData11, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 8)); + MORPH_OP_MUL4TA(dacc21, 0, dvecInData12, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 8)); + + /* multiply 2nd input row with 1st 8 coeff from 2nd output channel*/ + MORPH_OP_MUL4TA(dacc22, 0, dvecInData21, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 8)); + MORPH_OP_MUL4TA(dacc22, 0, dvecInData22, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 8)); + + /* right rotate the input vectors + * in order to multiply with next columns of + * coeff in the multiplies that follow + */ + dvecInData11 = 
IVP_SEL2NX8I(dvecInData12, dvecInData11, IVP_SELI_8B_ROTATE_RIGHT_4); + dvecInData12 = IVP_SEL2NX8I((xb_vec2Nx8) 0, dvecInData12, IVP_SELI_8B_ROTATE_RIGHT_4); + dvecInData21 = IVP_SEL2NX8I(dvecInData22, dvecInData21, IVP_SELI_8B_ROTATE_RIGHT_4); + dvecInData22 = IVP_SEL2NX8I((xb_vec2Nx8) 0, dvecInData22, IVP_SELI_8B_ROTATE_RIGHT_4); + + + /* multiply 1st input row with next 8 coeff from 1st output channel*/ + MORPH_OP_MUL4TA(dacc11, 0, dvecInData11, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + MORPH_OP_MUL4TA(dacc11, 0, dvecInData12, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 1)); + + /* multiply 2nd input row with next 8 coeff from 1st output channel*/ + MORPH_OP_MUL4TA(dacc12, 0, dvecInData21, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + MORPH_OP_MUL4TA(dacc12, 0, dvecInData22, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 1)); + + + /* multiply 1st input row with next 8 coeff from 2nd output channel*/ + MORPH_OP_MUL4TA(dacc21, 0, dvecInData11, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 9)); + MORPH_OP_MUL4TA(dacc21, 0, dvecInData12, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 9)); + + /* multiply 2nd input row with next 8 coeff from 2nd output channel*/ + MORPH_OP_MUL4TA(dacc22, 0, dvecInData21, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 9)); + MORPH_OP_MUL4TA(dacc22, 0, dvecInData22, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 9)); + } /* for (ky = 0; ky < kSizeY; ky++)*/ + } /* end of for (inCh = 0; inCh < numInCh; inCh++)*/ + + /* Pack, Output Scale, Output Shift and clamping */ + xb_vec2Nx8 dvecOut11L, dvecOut12L, dvecOut21L, dvecOut22L; + xb_vec2Nx8 dvecOut11H, dvecOut12H, dvecOut21H, dvecOut22H; +#if DILATED_VQ_CONV == VQ_TRUE + 
PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut11L, dvecOut11H, dacc11, packShiftAccU, \ + pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut12L, dvecOut12H, dacc12, packShiftAccU, \ + pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut21L, dvecOut21H, dacc21, packShiftAccU, \ + pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut22L, dvecOut22H, dacc22, packShiftAccU, \ + pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag); +#elif DILATED_VQ_CONV == VQ_FALSE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut11L, dvecOut11H, dacc11, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut12L, dvecOut12H, dacc12, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut21L, dvecOut21H, dacc21, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut22L, dvecOut22H, dacc22, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); +#endif + /* variable length for output stores: vectorizationWidth, or the columns left in the row if fewer */ + varLen = XT_MIN(vectorizationWidth, outW - x); + + /* Storing the first output channel, first row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput); + valign vaOutData = IVP_ZALIGN(); + IVP_SAV2NX8_XP(dvecOut11L, vaOutData, pdvecOut, bytesPerPixel * varLen); + IVP_SAV2NX8_XP(dvecOut11H, vaOutData, pdvecOut, typeFlag * \ + 2 * (varLen - XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the first output channel, second row (store length is multiplied by enable2Row, so it is 0 when the 2nd row is disabled) */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch1 * enable2Row * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut12L, vaOutData, pdvecOut, bytesPerPixel * varLen * enable2Row); + IVP_SAV2NX8_XP(dvecOut12H, vaOutData, pdvecOut, typeFlag * \ + 2 * (varLen - XCHAL_IVPN_SIMD_WIDTH) * enable2Row); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* 
Storing the 2nd output channel, first row (store length is multiplied by enable2ndCh) */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch2 * enable2ndCh * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut21L, vaOutData, pdvecOut, bytesPerPixel * varLen * enable2ndCh); + IVP_SAV2NX8_XP(dvecOut21H, vaOutData, pdvecOut, typeFlag * \ + 2 * (varLen - XCHAL_IVPN_SIMD_WIDTH) * enable2ndCh); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the 2nd output channel, 2nd row (store length is multiplied by both enable2ndCh and enable2Row) */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + (outDataPitch2 * enable2ndCh + \ + outDataPitch1 * enable2Row) * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut22L, vaOutData, pdvecOut, bytesPerPixel * \ + varLen * enable2ndCh * enable2Row); + IVP_SAV2NX8_XP(dvecOut22H, vaOutData, pdvecOut, typeFlag * \ + 2 * (varLen - XCHAL_IVPN_SIMD_WIDTH) * enable2ndCh * enable2Row); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + pOutput += 2 * outDataPitch2 * bytesPerPixel; + pCoeff += 2 * coeffPitch3; + } /* for (outCh = 0; outCh < numOutCh; outCh += 2)*/ + } /* for (y = 0; y < outH; y += 2)*/ + } /* end of for (x = 0; x < outW; x += vectorizationWidth)*/ + } + else + { + /* kSizeX <= 8: one group of MUL4T multiplies per kernel row, no rotate step. + * Loop across output channels is unrolled twice to produce two output channels in 1 iteration. + * The loop across output height is also unrolled by 2, thereby + * producing 4 output vectors simultaneously. 
+ */ + for (x = 0; x < outW; x += vectorizationWidth) /* Loop across Output width */ + { + /* out of bound flag: presumably 1 when 2 * XCHAL_IVPN_SIMD_WIDTH is less than the input width remaining from column stride * x, else 0; with 0 the first load below does not advance the pointer */ + int32_t flag = XT_SALT(2 * XCHAL_IVPN_SIMD_WIDTH, inW - stride * x); + + for (y = 0; y < outH; y += 2) /* Loop across Output height */ + { + /* In order to handle odd output height: presumably 0 when y is the last output row, which disables the 2nd row */ + int32_t enable2Row = XT_SALT(y, outH - 1); + /* initialize output data pointer */ + int8_t *pOutput = &pOutData[(y * outDataPitch1 + x) * bytesPerPixel]; + + /* initialize input data pointer */ + MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * stride * (y) + stride * (x)]; + + /* initialize coeff and Bias data pointer */ + int8_t *pCoeff = &pCoeffData[0]; + int32_t *pBias = &pBiasData[0]; + + for (outCh = 0; outCh < numOutCh; outCh += 2) /* Loop across Output depth */ + { + /* In order to handle odd output depth: presumably 0 when outCh is the last output channel, which disables the 2nd channel */ + int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1); + + xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4 * enable2ndCh); + xb_vecN_2x32v hvecBias2; IVP_LSRN_2X32_XP(hvecBias2, pBias, 4); + + /* wide vectors(accumulators) initialized with bias */ + xb_vec2Nx24 dacc11, dacc12, dacc21, dacc22; + + dacc12 = dacc11 = IVP_CVT24UNX32L(hvecBias1, hvecBias1); + IVP_CVT24UNX32H(dacc11, hvecBias1, hvecBias1); + IVP_CVT24UNX32H(dacc12, hvecBias1, hvecBias1); + + dacc22 = dacc21 = IVP_CVT24UNX32L(hvecBias2, hvecBias2); + IVP_CVT24UNX32H(dacc21, hvecBias2, hvecBias2); + IVP_CVT24UNX32H(dacc22, hvecBias2, hvecBias2); + + /* priming of coeff load is done outside the innermost loop*/ + pdvecCoeff1 = (xb_vec2Nx8 *) (pCoeff); + valign vaCoeffData1; vaCoeffData1 = IVP_LA2NX8_PP(pdvecCoeff1); + + pdvecCoeff2 = (xb_vec2Nx8 *) (pCoeff + coeffPitch3 * enable2ndCh); + valign vaCoeffData2; vaCoeffData2 = IVP_LA2NX8_PP(pdvecCoeff2); + + for (inCh = 0; inCh < numInCh; inCh++) /* Loop across input channels */ + { + xb_vec2Nx8 dvecCoeffData1; + xb_vec2Nx8 dvecCoeffData2; + + xb_vec2Nx8 dvecInData11, dvecInData12; + xb_vec2Nx8 dvecInData21, dvecInData22; + + pdvecIn1 = 
(MORPH_IDT_2Nx8 *) (pInput + inCh * inDataPitch2); + pdvecIn2 = (MORPH_IDT_2Nx8 *) (pInput + inCh * inDataPitch2 + \ + stride * inDataPitch1 * enable2Row); + + for (ky = 0; ky < kSizeY; ky++) /* Loop across kernel height */ + { + /* loads 1st input row; the two pointer updates add up to inDataPitch1, i.e. one input row per ky iteration */ + + valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1); + IVP_LA2NX8_XP(dvecInData11, vaInData, pdvecIn1, 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + IVP_LA2NX8_XP(dvecInData12, vaInData, pdvecIn1, \ + inDataPitch1 - 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + + /* loads the input row that is stride rows below (3rd row for stride 2), corresponding to 2nd output row */ + + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn2); + IVP_LA2NX8_XP(dvecInData21, vaInData, pdvecIn2, 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + IVP_LA2NX8_XP(dvecInData22, vaInData, pdvecIn2, \ + inDataPitch1 - 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + + /* Re-arrange the data in the desired format */ + /* Assume input as 1,2,3,4,5,6,7...127 */ + /* After re-arrangement using DSEL operation, updated vectors would be */ + /* dvecInData1 : 1, 3, 5,...121 */ + /* dvecInData2 : 2, 4, 6,...122 */ + + IVP_DSEL2NX8I(dvecInData12, dvecInData11, + dvecInData12, dvecInData11, IVP_DSELI_8B_DEINTERLEAVE_1); + IVP_DSEL2NX8I(dvecInData22, dvecInData21, + dvecInData22, dvecInData21, IVP_DSELI_8B_DEINTERLEAVE_1); + + /* load the current kernel row of coeffs for both output channels; each load advances by coeffPitch1 */ + IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1); + IVP_LAV2NX8_XP(dvecCoeffData2, vaCoeffData2, pdvecCoeff2, coeffPitch1); + + /* rearrange the coeff vectors. 
Separate even and odd coeff + * so that MUL4T can be used + */ + IVP_DSEL2NX8I(dvecCoeffData2, dvecCoeffData1, + dvecCoeffData2, dvecCoeffData1, IVP_DSELI_8B_DEINTERLEAVE_1); + + + /* multiply 1st input row with 1st 8 coeff from 1st output channel*/ + MORPH_OP_MUL4TA(dacc11, 0, dvecInData11, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc11, 0, dvecInData12, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 0)); + + /* multiply 2nd input row with 1st 8 coeff from 1st output channel*/ + MORPH_OP_MUL4TA(dacc12, 0, dvecInData21, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc12, 0, dvecInData22, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 0)); + + /* multiply 1st input row with 1st 8 coeff from 2nd output channel*/ + MORPH_OP_MUL4TA(dacc21, 0, dvecInData11, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 8)); + MORPH_OP_MUL4TA(dacc21, 0, dvecInData12, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 8)); + + /* multiply 2nd input row with 1st 8 coeff from 2nd output channel*/ + MORPH_OP_MUL4TA(dacc22, 0, dvecInData21, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 8)); + MORPH_OP_MUL4TA(dacc22, 0, dvecInData22, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 8)); + } /* for (ky = 0; ky < kSizeY; ky++)*/ + } /* end of for (inCh = 0; inCh < numInCh; inCh++)*/ + + /* Pack, Output Scale, Output Shift and clamping */ + xb_vec2Nx8 dvecOut11L, dvecOut12L, dvecOut21L, dvecOut22L; + xb_vec2Nx8 dvecOut11H, dvecOut12H, dvecOut21H, dvecOut22H; +#if DILATED_VQ_CONV == VQ_TRUE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut11L, dvecOut11H, dacc11, packShiftAccU, \ + pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag); + 
PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut12L, dvecOut12H, dacc12, packShiftAccU, \ + pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut21L, dvecOut21H, dacc21, packShiftAccU, \ + pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut22L, dvecOut22H, dacc22, packShiftAccU, \ + pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag); +#elif DILATED_VQ_CONV == VQ_FALSE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut11L, dvecOut11H, dacc11, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut12L, dvecOut12H, dacc12, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut21L, dvecOut21H, dacc21, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut22L, dvecOut22H, dacc22, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); +#endif + /* variable length for output stores: vectorizationWidth, or the columns left in the row if fewer */ + varLen = XT_MIN(vectorizationWidth, outW - x); + + /* Storing the first output channel, first row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput); + valign vaOutData = IVP_ZALIGN(); + IVP_SAV2NX8_XP(dvecOut11L, vaOutData, pdvecOut, bytesPerPixel * varLen); + IVP_SAV2NX8_XP(dvecOut11H, vaOutData, pdvecOut, typeFlag * \ + 2 * (varLen - XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the first output channel, second row (store length is multiplied by enable2Row, so it is 0 when the 2nd row is disabled) */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch1 * enable2Row * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut12L, vaOutData, pdvecOut, bytesPerPixel * varLen * enable2Row); + IVP_SAV2NX8_XP(dvecOut12H, vaOutData, pdvecOut, typeFlag * \ + 2 * (varLen - XCHAL_IVPN_SIMD_WIDTH) * enable2Row); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the 2nd output channel, first row (store length is multiplied by enable2ndCh) */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch2 * enable2ndCh * bytesPerPixel); + 
IVP_SAV2NX8_XP(dvecOut21L, vaOutData, pdvecOut, bytesPerPixel * varLen * enable2ndCh); + IVP_SAV2NX8_XP(dvecOut21H, vaOutData, pdvecOut, typeFlag * \ + 2 * (varLen - XCHAL_IVPN_SIMD_WIDTH) * enable2ndCh); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the 2nd output channel, 2nd row (store length is multiplied by both enable2ndCh and enable2Row) */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + (outDataPitch2 * enable2ndCh + \ + outDataPitch1 * enable2Row) * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut22L, vaOutData, pdvecOut, bytesPerPixel * \ + varLen * enable2ndCh * enable2Row); + IVP_SAV2NX8_XP(dvecOut22H, vaOutData, pdvecOut, typeFlag * \ + 2 * (varLen - XCHAL_IVPN_SIMD_WIDTH) * enable2ndCh * enable2Row); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + pOutput += 2 * outDataPitch2 * bytesPerPixel; + pCoeff += 2 * coeffPitch3; + } /* for (outCh = 0; outCh < numOutCh; outCh += 2)*/ + } /* for (y = 0; y < outH; y += 2)*/ + } /* end of for (x = 0; x < outW; x += vectorizationWidth)*/ + } + return(XAI_ERROR_STATUS()); +} + +/****************************************************************************************** +* MxN MOW WHD Stride 4 - DEPTH 3 * +* If number of input channels is equal to 3 * +* this function is called. 
* +******************************************************************************************/ + +static _XAI_INLINE_ void MAKE_NAME(MAKE_NAME_VQ(convolved, 3D_S_MxNj4d1), S8IX_MOW_WHD_DEPTH3) MAKE_ARGUMENTS(inTile, coeffTile, biasArray, outTile, param) +{ + /* Getting parameters from the tile structures */ + const int32_t outW = XAI_TILE3D_GET_DIM1(outTile); + const int32_t outH = XAI_TILE3D_GET_DIM2(outTile); + const int32_t numOutCh = XAI_TILE3D_GET_DIM3(outTile); + + /* Kernel Size (WHDN)*/ + const int32_t kSizeX = XAI_TILE4D_GET_DIM1(coeffTile); + const int32_t kSizeY = XAI_TILE4D_GET_DIM2(coeffTile); + + /* CNN convolution parameters */ + const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param); + const uint8_t outShiftU = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param); + const uint8_t enableReLu = XAI_CNN_CONV_GET_FLAG_RELU(param); + const uint8_t stride = XAI_CNN_CONV_GET_STRIDE(param); + const uint8_t leftEdgeFlag = XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param); + const uint8_t topEdgeFlag = XAI_CNN_CONV_GET_FLAG_TOPEDGE(param); + + /* Pitches of Coefficient Data (WHDN) dim3 */ + const int32_t coeffPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTile); + const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile); + + /* Pitches of Input Data (WHD) in dim1 and dim2 */ + const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile); + const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile); + + /* Pitch of Output Data (WHD) in dim1 and dim2 */ + const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile); + const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile); + + /* Data Pointers of input, output, coefficient and bias data */ + MORPH_IDT_SCALAR* pInData = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(inTile); + int8_t* pOutData = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile); + int32_t* pBiasData = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray); + int8_t* pCoeffData = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile); +#if DILATED_VQ_CONV == VQ_TRUE + 
uint16_t* restrict pOutScaleData = (uint16_t *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray); +#elif DILATED_VQ_CONV == VQ_FALSE + const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param); +#endif + uint8_t leftEdge, topEdge; + if ((kSizeX % 2) != 0) + { + leftEdge = kSizeX / 2; + } + else + { + leftEdge = leftEdgeFlag ? (kSizeX / 2) : ((kSizeX / 2) - 1); + } + + if ((kSizeY % 2) != 0) + { + topEdge = kSizeY / 2; + } + else + { + topEdge = topEdgeFlag ? (kSizeY / 2) : ((kSizeY / 2) - 1); + } + + /* Move pointer to the start of the active data (including edge) */ + pInData = &pInData[-(topEdge * inDataPitch1 + leftEdge)]; + + + /* Setting the limits for output data according to ReLu Flag and outTileType */ + int32_t minLim, maxLim; + if (enableReLu) + { + minLim = XAI_CNN_CONV_GET_RELU_MIN(param); + maxLim = XAI_CNN_CONV_GET_RELU_MAX(param); + } + else + { + minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \ + SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0); + maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \ + : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX); + } + const int8_t typeFlag = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 
1 : 0; + const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile); + + MORPH_IDT_2Nx8 *restrict pdvecIn1; + MORPH_IDT_2Nx8 *restrict pdvecIn2; + xb_vec2Nx8* restrict pdvecOut; + xb_vec2Nx8* restrict pdvecCoeff1, * restrict pdvecCoeff2, \ + * restrict pdvecCoeff3, * restrict pdvecCoeff4; + + /* Variable Declarations */ + int32_t outCh, x, y, ky; + int32_t varLen; + + /* Number of output elements that can be generated + * with 2 input vector loads(64 way).*/ + const int32_t vectorizationWidth = (((4 * XCHAL_IVPN_SIMD_WIDTH) - kSizeX) / stride) + 1; + + /* Vearable count to handle the last iteration + * of X loop seprately if only 1 i/p load is + * sufficient + */ + const int32_t remX = (((2 * XCHAL_IVPN_SIMD_WIDTH) - kSizeX) / stride) + 1; + + /* generates the shuffle sequence for the coeff, so that MUL4T can be used. + * Rearranges coeff from c0,c1,..c13,c14 in the following manner: + * + * c0,c4,c8,c12 + * c1,c5,c9,c13 + * c2,c6,c10,c14 + * c3,c7,c11,0 + * */ + xb_vec2Nx8 dvecIdx = IVP_SEQ2NX8(); + xb_vec2Nx8 dvec1, dvec2; + IVP_DSEL2NX8I(dvec2, dvec1, 0, dvecIdx, IVP_DSELI_8B_DEINTERLEAVE_1); + IVP_DSEL2NX8I(dvecIdx, dvec1, 0, dvec1, IVP_DSELI_8B_DEINTERLEAVE_1); + dvec1 = IVP_SEL2NX8I(dvecIdx, dvec1, IVP_SELI_8B_INTERLEAVE_4_LO); + IVP_DSEL2NX8I(dvecIdx, dvec2, 0, dvec2, IVP_DSELI_8B_DEINTERLEAVE_1); + dvec2 = IVP_SEL2NX8I(dvecIdx, dvec2, IVP_SELI_8B_INTERLEAVE_4_LO); + dvecIdx = IVP_SEL2NX8I(dvec2, dvec1, IVP_SELI_8B_INTERLEAVE_4_LO); + + /* loop across output depth is unrolled by 4 + * , producing lanes from 4 output channels + * in one iteration. Since vectorization width + * is just half the width of the accumulator, + * loop across output height is also unrolled by 2. + * Unrolling across output height makes it possible + * to utilize all the 64 MACs in the accumulator. 
+ * + * Data loaded from the 2 input rows is concatenated + * in such a manner that lower half of the output + * vector gives the first output row and the upper + * half of the */ + for (x = 0; x < outW - remX; x += vectorizationWidth) /* Loop across Output width */ + { + for (y = 0; y < outH; y += 2) /* Loop across Output height */ + { + /* In order to handle odd height*/ + int32_t enable2ndRow = XT_SALT(y, outH - 1); + + /* initialize output data pointer */ + int8_t *pOutput = &pOutData[(y * outDataPitch1 + x) * bytesPerPixel]; + + /* initialize input data pointer */ + MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * stride * y + stride * x]; + + /* initialize coeff and Bias data pointer to */ + int8_t *pCoeff = &pCoeffData[0]; + int32_t *pBias = &pBiasData[0]; + + for (outCh = 0; outCh < numOutCh; outCh += 4) /* Loop across Output depth */ + { + /* In order to handle odd depths*/ + + int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1); + int32_t enable3rdCh = XT_SALT(outCh, numOutCh - 2); + int32_t enable4thCh = XT_SALT(outCh, numOutCh - 3); + + /* loads and replicate bias data */ + xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4 * enable2ndCh); + xb_vecN_2x32v hvecBias2; IVP_LSRN_2X32_XP(hvecBias2, pBias, 4 * enable3rdCh); + xb_vecN_2x32v hvecBias3; IVP_LSRN_2X32_XP(hvecBias3, pBias, 4 * enable4thCh); + xb_vecN_2x32v hvecBias4; IVP_LSRN_2X32_XP(hvecBias4, pBias, 4); + + /* wide vectors(accumulators) initialized with bias */ + xb_vec2Nx24 dacc1, dacc2, dacc3, dacc4; + dacc1 = IVP_CVT24UNX32L(hvecBias1, hvecBias1); + IVP_CVT24UNX32H(dacc1, hvecBias1, hvecBias1); + dacc2 = IVP_CVT24UNX32L(hvecBias2, hvecBias2); + IVP_CVT24UNX32H(dacc2, hvecBias2, hvecBias2); + dacc3 = IVP_CVT24UNX32L(hvecBias3, hvecBias3); + IVP_CVT24UNX32H(dacc3, hvecBias3, hvecBias3); + dacc4 = IVP_CVT24UNX32L(hvecBias4, hvecBias4); + IVP_CVT24UNX32H(dacc4, hvecBias4, hvecBias4); + + /* priming of coeff load is done outside the innermost loop*/ + pdvecCoeff1 = (xb_vec2Nx8 *) 
(pCoeff); + valign vaCoeffData1; vaCoeffData1 = IVP_LA2NX8_PP(pdvecCoeff1); + + pdvecCoeff2 = (xb_vec2Nx8 *) (pCoeff + coeffPitch3 * enable2ndCh); + valign vaCoeffData2; vaCoeffData2 = IVP_LA2NX8_PP(pdvecCoeff2); + + pdvecCoeff3 = (xb_vec2Nx8 *) (pCoeff + 2 * coeffPitch3 * enable3rdCh); + valign vaCoeffData3; vaCoeffData3 = IVP_LA2NX8_PP(pdvecCoeff3); + + pdvecCoeff4 = (xb_vec2Nx8 *) (pCoeff + 3 * coeffPitch3 * enable4thCh); + valign vaCoeffData4; vaCoeffData4 = IVP_LA2NX8_PP(pdvecCoeff4); + + /* variable declarations for input and coeff vectors */ + xb_vec2Nx8 dvecCoeffData1, dvecCoeffData2, dvecCoeffData3, dvecCoeffData4; + + MORPH_IDT_2Nx8 dvecInData11, dvecInData12; + MORPH_IDT_2Nx8 dvecInData21, dvecInData22; + + MORPH_IDT_2Nx8 dvecData1, dvecData2, dvecData3, dvecData4; + /************************** 1st inCh **************************/ + pdvecIn1 = (MORPH_IDT_2Nx8 *) (pInput); + pdvecIn2 = (MORPH_IDT_2Nx8 *) (pInput + stride * inDataPitch1 * enable2ndRow); + + for (ky = 0; ky < kSizeY; ky++) /* Loop across kernel height */ + { + /* loads 1st input row */ + valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1); + MORPH_OP_LOAD_2Nx8_IP(dvecInData11, vaInData, pdvecIn1); + MORPH_OP_LOAD_2Nx8(dvecInData12, vaInData, pdvecIn1, inDataPitch1 - \ + 2 * XCHAL_IVPN_SIMD_WIDTH); + + /* loads 5th(corresponding to the 2nd output row) input row */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn2); + MORPH_OP_LOAD_2Nx8_IP(dvecInData21, vaInData, pdvecIn2); + MORPH_OP_LOAD_2Nx8(dvecInData22, vaInData, pdvecIn2, inDataPitch1 - \ + 2 * XCHAL_IVPN_SIMD_WIDTH); + + + /* 32 elements from 1st row and 32 elements from 2nd row are concatenated here + * If 1st input row is 0,1,2,3,4,5,6,7,8,9,...127, and the 2nd input row is + * 128,129,130,131.........252,253,254,255, Data should be arranged as + * + * dvecData1 : 0, 4, 8,...120,124,128,132,136,...248,252 + * dvecData2 : 1, 5, 9,...121,125,129,133,137,...249,253 + * dvecData3 : 2, 6,10,...122,126,130,134,138,...250,254 + * dvecData4 : 3, 
7,11,...123,127,131,135,139,...251,255 + * + * Lower half of the vectors contain data from 1st input row and + * upper half of the vectors contain data from 2nd output row. + * + */ + + IVP_DSEL2NX8I(dvecData2, dvecData1, + IVP_SEL2NX8I(dvecInData22, dvecInData21, IVP_SELI_8B_EXTRACT_2_OF_4_OFF_0), + IVP_SEL2NX8I(dvecInData12, dvecInData11, IVP_SELI_8B_EXTRACT_2_OF_4_OFF_0), + IVP_DSELI_8B_DEINTERLEAVE_1); + IVP_DSEL2NX8I(dvecData4, dvecData3, + IVP_SEL2NX8I(dvecInData22, dvecInData21, IVP_SELI_8B_EXTRACT_2_OF_4_OFF_2), + IVP_SEL2NX8I(dvecInData12, dvecInData11, IVP_SELI_8B_EXTRACT_2_OF_4_OFF_2), + IVP_DSELI_8B_DEINTERLEAVE_1); + + /* load 1 row of coeff for all the the 4 output channels */ + IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1); + IVP_LAV2NX8_XP(dvecCoeffData2, vaCoeffData2, pdvecCoeff2, coeffPitch1); + IVP_LAV2NX8_XP(dvecCoeffData3, vaCoeffData3, pdvecCoeff3, coeffPitch1); + IVP_LAV2NX8_XP(dvecCoeffData4, vaCoeffData4, pdvecCoeff4, coeffPitch1); + + /* shuffles the coeff in desired manner */ + dvecCoeffData1 = IVP_SHFL2NX8(dvecCoeffData1, dvecIdx); + dvecCoeffData2 = IVP_SHFL2NX8(dvecCoeffData2, dvecIdx); + dvecCoeffData3 = IVP_SHFL2NX8(dvecCoeffData3, dvecIdx); + dvecCoeffData4 = IVP_SHFL2NX8(dvecCoeffData4, dvecIdx); + + /* mulitples coeff c0,c4,c8,c12 with input data */ + MORPH_OP_MUL4TA(dacc1, 0, dvecData1, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc2, 0, dvecData1, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 0)); + MORPH_OP_MUL4TA(dacc3, 0, dvecData1, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 0)); + MORPH_OP_MUL4TA(dacc4, 0, dvecData1, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData4)), 0)); + + /* mulitples coeff c1,c5,c9,c13 with input data */ + MORPH_OP_MUL4TA(dacc1, 0, dvecData2, IVP_EXTRN_2X32( \ + 
IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + MORPH_OP_MUL4TA(dacc2, 0, dvecData2, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 1)); + MORPH_OP_MUL4TA(dacc3, 0, dvecData2, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 1)); + MORPH_OP_MUL4TA(dacc4, 0, dvecData2, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData4)), 1)); + + /* mulitples coeff c2,c6,c10,c14 with input data */ + MORPH_OP_MUL4TA(dacc1, 0, dvecData3, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 2)); + MORPH_OP_MUL4TA(dacc2, 0, dvecData3, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 2)); + MORPH_OP_MUL4TA(dacc3, 0, dvecData3, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 2)); + MORPH_OP_MUL4TA(dacc4, 0, dvecData3, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData4)), 2)); + + /* mulitples coeff c3,c7,c11,0 with input data */ + MORPH_OP_MUL4TA(dacc1, 0, dvecData4, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 3)); + MORPH_OP_MUL4TA(dacc2, 0, dvecData4, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 3)); + MORPH_OP_MUL4TA(dacc3, 0, dvecData4, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 3)); + MORPH_OP_MUL4TA(dacc4, 0, dvecData4, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData4)), 3)); + } /* end of for (ky = 0; ky < kSizeY; ky++)*/ + + /************************** 2nd inCh **************************/ + pdvecIn1 = (MORPH_IDT_2Nx8 *) (pInput + inDataPitch2); + pdvecIn2 = (MORPH_IDT_2Nx8 *) (pInput + inDataPitch2 + \ + stride * inDataPitch1 * enable2ndRow); + + for (ky = 0; ky < kSizeY; ky++) /* Loop across kernel height */ + { + /* loads 1st input row */ + valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1); + 
MORPH_OP_LOAD_2Nx8_IP(dvecInData11, vaInData, pdvecIn1); + MORPH_OP_LOAD_2Nx8(dvecInData12, vaInData, pdvecIn1, inDataPitch1 - \ + 2 * XCHAL_IVPN_SIMD_WIDTH); + + /* loads 5th(corresponding to the 2nd output row) input row */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn2); + MORPH_OP_LOAD_2Nx8_IP(dvecInData21, vaInData, pdvecIn2); + MORPH_OP_LOAD_2Nx8(dvecInData22, vaInData, pdvecIn2, inDataPitch1 - \ + 2 * XCHAL_IVPN_SIMD_WIDTH); + + /* Arrange input vectors required for Quad multiply*/ + IVP_DSEL2NX8I(dvecData2, dvecData1, + IVP_SEL2NX8I(dvecInData22, dvecInData21, IVP_SELI_8B_EXTRACT_2_OF_4_OFF_0), + IVP_SEL2NX8I(dvecInData12, dvecInData11, IVP_SELI_8B_EXTRACT_2_OF_4_OFF_0), + IVP_DSELI_8B_DEINTERLEAVE_1); + IVP_DSEL2NX8I(dvecData4, dvecData3, + IVP_SEL2NX8I(dvecInData22, dvecInData21, IVP_SELI_8B_EXTRACT_2_OF_4_OFF_2), + IVP_SEL2NX8I(dvecInData12, dvecInData11, IVP_SELI_8B_EXTRACT_2_OF_4_OFF_2), + IVP_DSELI_8B_DEINTERLEAVE_1); + + /* load 1 row of coeff for all the the 4 output channels */ + IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1); + IVP_LAV2NX8_XP(dvecCoeffData2, vaCoeffData2, pdvecCoeff2, coeffPitch1); + IVP_LAV2NX8_XP(dvecCoeffData3, vaCoeffData3, pdvecCoeff3, coeffPitch1); + IVP_LAV2NX8_XP(dvecCoeffData4, vaCoeffData4, pdvecCoeff4, coeffPitch1); + + /* shuffles the coeff in desired manner */ + dvecCoeffData1 = IVP_SHFL2NX8(dvecCoeffData1, dvecIdx); + dvecCoeffData2 = IVP_SHFL2NX8(dvecCoeffData2, dvecIdx); + dvecCoeffData3 = IVP_SHFL2NX8(dvecCoeffData3, dvecIdx); + dvecCoeffData4 = IVP_SHFL2NX8(dvecCoeffData4, dvecIdx); + + /* mulitples coeff c0,c4,c8,c12 with input data */ + MORPH_OP_MUL4TA(dacc1, 0, dvecData1, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc2, 0, dvecData1, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 0)); + MORPH_OP_MUL4TA(dacc3, 0, dvecData1, IVP_EXTRN_2X32( \ + 
IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 0)); + MORPH_OP_MUL4TA(dacc4, 0, dvecData1, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData4)), 0)); + + /* mulitples coeff c1,c5,c9,c13 with input data */ + MORPH_OP_MUL4TA(dacc1, 0, dvecData2, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + MORPH_OP_MUL4TA(dacc2, 0, dvecData2, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 1)); + MORPH_OP_MUL4TA(dacc3, 0, dvecData2, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 1)); + MORPH_OP_MUL4TA(dacc4, 0, dvecData2, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData4)), 1)); + + /* mulitples coeff c2,c6,c10,c14 with input data */ + MORPH_OP_MUL4TA(dacc1, 0, dvecData3, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 2)); + MORPH_OP_MUL4TA(dacc2, 0, dvecData3, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 2)); + MORPH_OP_MUL4TA(dacc3, 0, dvecData3, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 2)); + MORPH_OP_MUL4TA(dacc4, 0, dvecData3, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData4)), 2)); + + /* mulitples coeff c3,c7,c11,0 with input data */ + MORPH_OP_MUL4TA(dacc1, 0, dvecData4, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 3)); + MORPH_OP_MUL4TA(dacc2, 0, dvecData4, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 3)); + MORPH_OP_MUL4TA(dacc3, 0, dvecData4, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 3)); + MORPH_OP_MUL4TA(dacc4, 0, dvecData4, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData4)), 3)); + } /* end of for (ky = 0; ky < kSizeY; ky++)*/ + + /************************** 3rd inCh **************************/ + pdvecIn1 = 
(MORPH_IDT_2Nx8 *) (pInput + 2 * inDataPitch2); + pdvecIn2 = (MORPH_IDT_2Nx8 *) (pInput + 2 * inDataPitch2 + \ + stride * inDataPitch1 * enable2ndRow); + + for (ky = 0; ky < kSizeY; ky++) /* Loop across kernel height */ + { + /* loads 1st input row */ + valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1); + MORPH_OP_LOAD_2Nx8_IP(dvecInData11, vaInData, pdvecIn1); + MORPH_OP_LOAD_2Nx8(dvecInData12, vaInData, pdvecIn1, inDataPitch1 - \ + 2 * XCHAL_IVPN_SIMD_WIDTH); + + /* loads 5th(corresponding to the 2nd output row) input row */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn2); + MORPH_OP_LOAD_2Nx8_IP(dvecInData21, vaInData, pdvecIn2); + MORPH_OP_LOAD_2Nx8(dvecInData22, vaInData, pdvecIn2, inDataPitch1 - \ + 2 * XCHAL_IVPN_SIMD_WIDTH); + + /* Arrange input vectors required for Quad multiply*/ + IVP_DSEL2NX8I(dvecData2, dvecData1, + IVP_SEL2NX8I(dvecInData22, dvecInData21, IVP_SELI_8B_EXTRACT_2_OF_4_OFF_0), + IVP_SEL2NX8I(dvecInData12, dvecInData11, IVP_SELI_8B_EXTRACT_2_OF_4_OFF_0), + IVP_DSELI_8B_DEINTERLEAVE_1); + IVP_DSEL2NX8I(dvecData4, dvecData3, + IVP_SEL2NX8I(dvecInData22, dvecInData21, IVP_SELI_8B_EXTRACT_2_OF_4_OFF_2), + IVP_SEL2NX8I(dvecInData12, dvecInData11, IVP_SELI_8B_EXTRACT_2_OF_4_OFF_2), + IVP_DSELI_8B_DEINTERLEAVE_1); + + /* load 1 row of coeff for all the the 4 output channels */ + IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1); + IVP_LAV2NX8_XP(dvecCoeffData2, vaCoeffData2, pdvecCoeff2, coeffPitch1); + IVP_LAV2NX8_XP(dvecCoeffData3, vaCoeffData3, pdvecCoeff3, coeffPitch1); + IVP_LAV2NX8_XP(dvecCoeffData4, vaCoeffData4, pdvecCoeff4, coeffPitch1); + + /* shuffles the coeff in desired manner */ + dvecCoeffData1 = IVP_SHFL2NX8(dvecCoeffData1, dvecIdx); + dvecCoeffData2 = IVP_SHFL2NX8(dvecCoeffData2, dvecIdx); + dvecCoeffData3 = IVP_SHFL2NX8(dvecCoeffData3, dvecIdx); + dvecCoeffData4 = IVP_SHFL2NX8(dvecCoeffData4, dvecIdx); + + /* mulitples coeff c0,c4,c8,c12 with input data */ + MORPH_OP_MUL4TA(dacc1, 0, dvecData1, IVP_EXTRN_2X32( \ + 
IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc2, 0, dvecData1, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 0)); + MORPH_OP_MUL4TA(dacc3, 0, dvecData1, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 0)); + MORPH_OP_MUL4TA(dacc4, 0, dvecData1, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData4)), 0)); + + /* mulitples coeff c1,c5,c9,c13 with input data */ + MORPH_OP_MUL4TA(dacc1, 0, dvecData2, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + MORPH_OP_MUL4TA(dacc2, 0, dvecData2, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 1)); + MORPH_OP_MUL4TA(dacc3, 0, dvecData2, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 1)); + MORPH_OP_MUL4TA(dacc4, 0, dvecData2, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData4)), 1)); + + /* mulitples coeff c2,c6,c10,c14 with input data */ + MORPH_OP_MUL4TA(dacc1, 0, dvecData3, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 2)); + MORPH_OP_MUL4TA(dacc2, 0, dvecData3, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 2)); + MORPH_OP_MUL4TA(dacc3, 0, dvecData3, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 2)); + MORPH_OP_MUL4TA(dacc4, 0, dvecData3, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData4)), 2)); + + /* mulitples coeff c3,c7,c11,0 with input data */ + MORPH_OP_MUL4TA(dacc1, 0, dvecData4, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 3)); + MORPH_OP_MUL4TA(dacc2, 0, dvecData4, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 3)); + MORPH_OP_MUL4TA(dacc3, 0, dvecData4, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 3)); + 
MORPH_OP_MUL4TA(dacc4, 0, dvecData4, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData4)), 3)); + } /* end of for (ky = 0; ky < kSizeY; ky++)*/ + + /* Pack, Output Scale, Output Shift and clamping */ + xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L; + xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H; +#if DILATED_VQ_CONV == VQ_TRUE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \ + pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \ + pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc3, packShiftAccU, \ + pOutScaleData[outCh + 2 * enable3rdCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc4, packShiftAccU, \ + pOutScaleData[outCh + 3 * enable4thCh], outShiftU, minLim, maxLim, typeFlag); +#elif DILATED_VQ_CONV == VQ_FALSE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc3, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc4, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); +#endif + /* variable store count */ + varLen = XT_MIN(outW - x, vectorizationWidth); + + /* store the first half of the output vectors + * dvecOut1, dvecOut2, dvecOut3, dvecOut4 + */ + + /* Storing the first row outputs, first channel */ + pdvecOut = (xb_vec2Nx8 *) (pOutput); + valign vaOutData = IVP_ZALIGN(); + IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * varLen); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the first row outputs, 
2nd channel */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + enable2ndCh * outDataPitch2 * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * varLen * enable2ndCh); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the first row outputs, 3rd channel */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + 2 * outDataPitch2 * enable3rdCh * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * varLen * enable3rdCh); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the first row outputs, 4th channel */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + 3 * outDataPitch2 * enable4thCh * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * varLen * enable4thCh); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* extract the half of the output vectors + * dvecOut1, dvecOut2, dvecOut3, dvecOut4 + * and store in the next row + */ + + /* Storing the 2nd row outputs, first channel */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch1 * enable2ndRow * bytesPerPixel); + IVP_SAV2NX8_XP(IVP_SHFL2NX8I(dvecOut1L, IVP_SHFLI_8B_SWAP_32), vaOutData, pdvecOut, \ + (-typeFlag + 1) * varLen * enable2ndRow); + IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * 2 * varLen * enable2ndRow); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the 2nd row outputs, 2nd channel */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + (outDataPitch2 * enable2ndCh + \ + outDataPitch1 * enable2ndRow) * bytesPerPixel); + IVP_SAV2NX8_XP(IVP_SHFL2NX8I(dvecOut2L, IVP_SHFLI_8B_SWAP_32), vaOutData, pdvecOut, \ + (-typeFlag + 1) * varLen * enable2ndCh * enable2ndRow); + IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * 2 * \ + varLen * enable2ndCh * enable2ndRow); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the 2nd row outputs, 3rd channel */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + (2 * outDataPitch2 * enable3rdCh + \ + outDataPitch1 * enable2ndRow) * bytesPerPixel); + IVP_SAV2NX8_XP(IVP_SHFL2NX8I(dvecOut3L, 
IVP_SHFLI_8B_SWAP_32), vaOutData, pdvecOut, \ + (-typeFlag + 1) * varLen * enable3rdCh * enable2ndRow); + IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * 2 * \ + varLen * enable3rdCh * enable2ndRow); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the 2nd row outputs, 4th channel */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + (3 * outDataPitch2 * enable4thCh + \ + outDataPitch1 * enable2ndRow) * bytesPerPixel); + IVP_SAV2NX8_XP(IVP_SHFL2NX8I(dvecOut4L, IVP_SHFLI_8B_SWAP_32), vaOutData, pdvecOut, \ + (-typeFlag + 1) * varLen * enable4thCh * enable2ndRow); + IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * 2 * \ + varLen * enable4thCh * enable2ndRow); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + pOutput += 4 * outDataPitch2 * bytesPerPixel; + pCoeff += 4 * coeffPitch3; + } /* end of (outCh = 0; outCh < numOutCh; outCh += 4)*/ + } /* end of for (y = 0; y < outH; y += 2)*/ + } /* end of for (x = 0; x < outW; x += vectorizationWidth)*/ + if (x < outW) + { + for (y = 0; y < outH; y += 2) /* Loop across Output height */ + { + /* In order to handle odd height*/ + int32_t enable2ndRow = XT_SALT(y, outH - 1); + + /* initialize output data pointer */ + int8_t *pOutput = &pOutData[(y * outDataPitch1 + x) * bytesPerPixel]; + + /* initialize input data pointer */ + MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * stride * y + stride * x]; + + /* initialize coeff and Bias data pointer to */ + int8_t *pCoeff = &pCoeffData[0]; + int32_t *pBias = &pBiasData[0]; + + for (outCh = 0; outCh < numOutCh; outCh += 4) /* Loop across Output depth */ + { + /* In order to handle odd depths*/ + int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1); + int32_t enable3rdCh = XT_SALT(outCh, numOutCh - 2); + int32_t enable4thCh = XT_SALT(outCh, numOutCh - 3); + + /* loads and replicate bias data */ + xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4 * enable2ndCh); + xb_vecN_2x32v hvecBias2; IVP_LSRN_2X32_XP(hvecBias2, pBias, 4 * enable3rdCh); + xb_vecN_2x32v 
hvecBias3; IVP_LSRN_2X32_XP(hvecBias3, pBias, 4 * enable4thCh); + xb_vecN_2x32v hvecBias4; IVP_LSRN_2X32_XP(hvecBias4, pBias, 4); + + /* wide vectors(accumulators) initialized with bias */ + xb_vec2Nx24 dacc1, dacc2, dacc3, dacc4; + dacc1 = IVP_CVT24UNX32L(hvecBias1, hvecBias1); + IVP_CVT24UNX32H(dacc1, hvecBias1, hvecBias1); + dacc2 = IVP_CVT24UNX32L(hvecBias2, hvecBias2); + IVP_CVT24UNX32H(dacc2, hvecBias2, hvecBias2); + dacc3 = IVP_CVT24UNX32L(hvecBias3, hvecBias3); + IVP_CVT24UNX32H(dacc3, hvecBias3, hvecBias3); + dacc4 = IVP_CVT24UNX32L(hvecBias4, hvecBias4); + IVP_CVT24UNX32H(dacc4, hvecBias4, hvecBias4); + + /* priming of coeff load is done outside the innermost loop*/ + pdvecCoeff1 = (xb_vec2Nx8 *) (pCoeff); + valign vaCoeffData1; vaCoeffData1 = IVP_LA2NX8_PP(pdvecCoeff1); + + pdvecCoeff2 = (xb_vec2Nx8 *) (pCoeff + coeffPitch3 * enable2ndCh); + valign vaCoeffData2; vaCoeffData2 = IVP_LA2NX8_PP(pdvecCoeff2); + + pdvecCoeff3 = (xb_vec2Nx8 *) (pCoeff + 2 * coeffPitch3 * enable3rdCh); + valign vaCoeffData3; vaCoeffData3 = IVP_LA2NX8_PP(pdvecCoeff3); + + pdvecCoeff4 = (xb_vec2Nx8 *) (pCoeff + 3 * coeffPitch3 * enable4thCh); + valign vaCoeffData4; vaCoeffData4 = IVP_LA2NX8_PP(pdvecCoeff4); + + /* variable declarations for input and coeff vectors */ + xb_vec2Nx8 dvecCoeffData1, dvecCoeffData2, dvecCoeffData3, dvecCoeffData4; + + MORPH_IDT_2Nx8 dvecInData11; + MORPH_IDT_2Nx8 dvecInData21; + + MORPH_IDT_2Nx8 dvecData1, dvecData2, dvecData3, dvecData4; + /**************************** 1st inCh ***************************/ + pdvecIn1 = (MORPH_IDT_2Nx8 *) (pInput); + pdvecIn2 = (MORPH_IDT_2Nx8 *) (pInput + \ + stride * inDataPitch1 * enable2ndRow); + + for (ky = 0; ky < kSizeY; ky++) /* Loop across kernel height */ + { + /* loads 1st input row */ + valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1); + MORPH_OP_LOAD_2Nx8(dvecInData11, vaInData, pdvecIn1, inDataPitch1); + + /* loads 5th(corresponding to the 2nd output row) input row */ + vaInData = 
MORPH_OP_PRIME_2Nx8(pdvecIn2); + MORPH_OP_LOAD_2Nx8(dvecInData21, vaInData, pdvecIn2, inDataPitch1); + + + /* 32 elements from 1st row and 32 elements from 2nd row are concatenated here + * If 1st input row is 0,1,2,3,4,5,6,7,8,9,...127, and the 2nd input row is + * 128,129,130,131.........252,253,254,255, Data should be arranged as + * + * dvecData1 : 0, 4, 8,...120,124,128,132,136,...248,252 + * dvecData2 : 1, 5, 9,...121,125,129,133,137,...249,253 + * dvecData3 : 2, 6,10,...122,126,130,134,138,...250,254 + * dvecData4 : 3, 7,11,...123,127,131,135,139,...251,255 + * + * Lower half of the vectors contain data from 1st input row and + * upper half of the vectors contain data from 2nd output row. + * + */ + + IVP_DSEL2NX8I(dvecData2, dvecData1, + IVP_SEL2NX8I(dvecInData21, dvecInData21, IVP_SELI_8B_EXTRACT_2_OF_4_OFF_0), + IVP_SEL2NX8I(dvecInData11, dvecInData11, IVP_SELI_8B_EXTRACT_2_OF_4_OFF_0), + IVP_DSELI_8B_DEINTERLEAVE_1); + IVP_DSEL2NX8I(dvecData4, dvecData3, + IVP_SEL2NX8I(dvecInData21, dvecInData21, IVP_SELI_8B_EXTRACT_2_OF_4_OFF_2), + IVP_SEL2NX8I(dvecInData11, dvecInData11, IVP_SELI_8B_EXTRACT_2_OF_4_OFF_2), + IVP_DSELI_8B_DEINTERLEAVE_1); + + /* load 1 row of coeff for all the the 4 output channels */ + IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1); + IVP_LAV2NX8_XP(dvecCoeffData2, vaCoeffData2, pdvecCoeff2, coeffPitch1); + IVP_LAV2NX8_XP(dvecCoeffData3, vaCoeffData3, pdvecCoeff3, coeffPitch1); + IVP_LAV2NX8_XP(dvecCoeffData4, vaCoeffData4, pdvecCoeff4, coeffPitch1); + + /* shuffles the coeff in desired manner */ + dvecCoeffData1 = IVP_SHFL2NX8(dvecCoeffData1, dvecIdx); + dvecCoeffData2 = IVP_SHFL2NX8(dvecCoeffData2, dvecIdx); + dvecCoeffData3 = IVP_SHFL2NX8(dvecCoeffData3, dvecIdx); + dvecCoeffData4 = IVP_SHFL2NX8(dvecCoeffData4, dvecIdx); + + /* mulitples coeff c0,c4,c8,c12 with input data */ + MORPH_OP_MUL4TA(dacc1, 0, dvecData1, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + 
MORPH_OP_MUL4TA(dacc2, 0, dvecData1, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 0)); + MORPH_OP_MUL4TA(dacc3, 0, dvecData1, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 0)); + MORPH_OP_MUL4TA(dacc4, 0, dvecData1, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData4)), 0)); + + /* mulitples coeff c1,c5,c9,c13 with input data */ + MORPH_OP_MUL4TA(dacc1, 0, dvecData2, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + MORPH_OP_MUL4TA(dacc2, 0, dvecData2, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 1)); + MORPH_OP_MUL4TA(dacc3, 0, dvecData2, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 1)); + MORPH_OP_MUL4TA(dacc4, 0, dvecData2, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData4)), 1)); + + /* mulitples coeff c2,c6,c10,c14 with input data */ + MORPH_OP_MUL4TA(dacc1, 0, dvecData3, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 2)); + MORPH_OP_MUL4TA(dacc2, 0, dvecData3, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 2)); + MORPH_OP_MUL4TA(dacc3, 0, dvecData3, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 2)); + MORPH_OP_MUL4TA(dacc4, 0, dvecData3, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData4)), 2)); + + /* mulitples coeff c3,c7,c11,0 with input data */ + MORPH_OP_MUL4TA(dacc1, 0, dvecData4, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 3)); + MORPH_OP_MUL4TA(dacc2, 0, dvecData4, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 3)); + MORPH_OP_MUL4TA(dacc3, 0, dvecData4, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 3)); + MORPH_OP_MUL4TA(dacc4, 0, dvecData4, IVP_EXTRN_2X32( \ + 
IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData4)), 3)); + } /* end of for (ky = 0; ky < kSizeY; ky++)*/ + + /**************************** 2nd inCh ***************************/ + pdvecIn1 = (MORPH_IDT_2Nx8 *) (pInput + inDataPitch2); + pdvecIn2 = (MORPH_IDT_2Nx8 *) (pInput + inDataPitch2 + \ + stride * inDataPitch1 * enable2ndRow); + + for (ky = 0; ky < kSizeY; ky++) /* Loop across kernel height */ + { + /* loads 1st input row */ + valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1); + MORPH_OP_LOAD_2Nx8(dvecInData11, vaInData, pdvecIn1, inDataPitch1); + + /* loads 5th(corresponding to the 2nd output row) input row */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn2); + MORPH_OP_LOAD_2Nx8(dvecInData21, vaInData, pdvecIn2, inDataPitch1); + + /* Arrange input vectors required for Quad multiply*/ + IVP_DSEL2NX8I(dvecData2, dvecData1, + IVP_SEL2NX8I(dvecInData21, dvecInData21, IVP_SELI_8B_EXTRACT_2_OF_4_OFF_0), + IVP_SEL2NX8I(dvecInData11, dvecInData11, IVP_SELI_8B_EXTRACT_2_OF_4_OFF_0), + IVP_DSELI_8B_DEINTERLEAVE_1); + IVP_DSEL2NX8I(dvecData4, dvecData3, + IVP_SEL2NX8I(dvecInData21, dvecInData21, IVP_SELI_8B_EXTRACT_2_OF_4_OFF_2), + IVP_SEL2NX8I(dvecInData11, dvecInData11, IVP_SELI_8B_EXTRACT_2_OF_4_OFF_2), + IVP_DSELI_8B_DEINTERLEAVE_1); + + /* load 1 row of coeff for all the the 4 output channels */ + IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1); + IVP_LAV2NX8_XP(dvecCoeffData2, vaCoeffData2, pdvecCoeff2, coeffPitch1); + IVP_LAV2NX8_XP(dvecCoeffData3, vaCoeffData3, pdvecCoeff3, coeffPitch1); + IVP_LAV2NX8_XP(dvecCoeffData4, vaCoeffData4, pdvecCoeff4, coeffPitch1); + + /* shuffles the coeff in desired manner */ + dvecCoeffData1 = IVP_SHFL2NX8(dvecCoeffData1, dvecIdx); + dvecCoeffData2 = IVP_SHFL2NX8(dvecCoeffData2, dvecIdx); + dvecCoeffData3 = IVP_SHFL2NX8(dvecCoeffData3, dvecIdx); + dvecCoeffData4 = IVP_SHFL2NX8(dvecCoeffData4, dvecIdx); + + /* mulitples coeff c0,c4,c8,c12 with input data */ + MORPH_OP_MUL4TA(dacc1, 0, dvecData1, 
IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc2, 0, dvecData1, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 0)); + MORPH_OP_MUL4TA(dacc3, 0, dvecData1, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 0)); + MORPH_OP_MUL4TA(dacc4, 0, dvecData1, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData4)), 0)); + + /* mulitples coeff c1,c5,c9,c13 with input data */ + MORPH_OP_MUL4TA(dacc1, 0, dvecData2, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + MORPH_OP_MUL4TA(dacc2, 0, dvecData2, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 1)); + MORPH_OP_MUL4TA(dacc3, 0, dvecData2, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 1)); + MORPH_OP_MUL4TA(dacc4, 0, dvecData2, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData4)), 1)); + + /* mulitples coeff c2,c6,c10,c14 with input data */ + MORPH_OP_MUL4TA(dacc1, 0, dvecData3, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 2)); + MORPH_OP_MUL4TA(dacc2, 0, dvecData3, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 2)); + MORPH_OP_MUL4TA(dacc3, 0, dvecData3, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 2)); + MORPH_OP_MUL4TA(dacc4, 0, dvecData3, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData4)), 2)); + + /* mulitples coeff c3,c7,c11,0 with input data */ + MORPH_OP_MUL4TA(dacc1, 0, dvecData4, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 3)); + MORPH_OP_MUL4TA(dacc2, 0, dvecData4, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 3)); + MORPH_OP_MUL4TA(dacc3, 0, dvecData4, IVP_EXTRN_2X32( \ + 
IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 3)); + MORPH_OP_MUL4TA(dacc4, 0, dvecData4, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData4)), 3)); + } /* end of for (ky = 0; ky < kSizeY; ky++)*/ + + /**************************** 3rd inCh ***************************/ + pdvecIn1 = (MORPH_IDT_2Nx8 *) (pInput + 2 * inDataPitch2); + pdvecIn2 = (MORPH_IDT_2Nx8 *) (pInput + 2 * inDataPitch2 + \ + stride * inDataPitch1 * enable2ndRow); + + for (ky = 0; ky < kSizeY; ky++) /* Loop across kernel height */ + { + /* loads 1st input row */ + valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1); + MORPH_OP_LOAD_2Nx8(dvecInData11, vaInData, pdvecIn1, inDataPitch1); + + /* loads 5th(corresponding to the 2nd output row) input row */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn2); + MORPH_OP_LOAD_2Nx8(dvecInData21, vaInData, pdvecIn2, inDataPitch1); + + /* Arrange input vectors required for Quad multiply*/ + IVP_DSEL2NX8I(dvecData2, dvecData1, + IVP_SEL2NX8I(dvecInData21, dvecInData21, IVP_SELI_8B_EXTRACT_2_OF_4_OFF_0), + IVP_SEL2NX8I(dvecInData11, dvecInData11, IVP_SELI_8B_EXTRACT_2_OF_4_OFF_0), + IVP_DSELI_8B_DEINTERLEAVE_1); + IVP_DSEL2NX8I(dvecData4, dvecData3, + IVP_SEL2NX8I(dvecInData21, dvecInData21, IVP_SELI_8B_EXTRACT_2_OF_4_OFF_2), + IVP_SEL2NX8I(dvecInData11, dvecInData11, IVP_SELI_8B_EXTRACT_2_OF_4_OFF_2), + IVP_DSELI_8B_DEINTERLEAVE_1); + + /* load 1 row of coeff for all the the 4 output channels */ + IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1); + IVP_LAV2NX8_XP(dvecCoeffData2, vaCoeffData2, pdvecCoeff2, coeffPitch1); + IVP_LAV2NX8_XP(dvecCoeffData3, vaCoeffData3, pdvecCoeff3, coeffPitch1); + IVP_LAV2NX8_XP(dvecCoeffData4, vaCoeffData4, pdvecCoeff4, coeffPitch1); + + /* shuffles the coeff in desired manner */ + dvecCoeffData1 = IVP_SHFL2NX8(dvecCoeffData1, dvecIdx); + dvecCoeffData2 = IVP_SHFL2NX8(dvecCoeffData2, dvecIdx); + dvecCoeffData3 = IVP_SHFL2NX8(dvecCoeffData3, dvecIdx); + dvecCoeffData4 = 
IVP_SHFL2NX8(dvecCoeffData4, dvecIdx); + + /* mulitples coeff c0,c4,c8,c12 with input data */ + MORPH_OP_MUL4TA(dacc1, 0, dvecData1, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc2, 0, dvecData1, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 0)); + MORPH_OP_MUL4TA(dacc3, 0, dvecData1, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 0)); + MORPH_OP_MUL4TA(dacc4, 0, dvecData1, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData4)), 0)); + + /* mulitples coeff c1,c5,c9,c13 with input data */ + MORPH_OP_MUL4TA(dacc1, 0, dvecData2, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + MORPH_OP_MUL4TA(dacc2, 0, dvecData2, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 1)); + MORPH_OP_MUL4TA(dacc3, 0, dvecData2, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 1)); + MORPH_OP_MUL4TA(dacc4, 0, dvecData2, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData4)), 1)); + + /* mulitples coeff c2,c6,c10,c14 with input data */ + MORPH_OP_MUL4TA(dacc1, 0, dvecData3, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 2)); + MORPH_OP_MUL4TA(dacc2, 0, dvecData3, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 2)); + MORPH_OP_MUL4TA(dacc3, 0, dvecData3, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 2)); + MORPH_OP_MUL4TA(dacc4, 0, dvecData3, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData4)), 2)); + + /* mulitples coeff c3,c7,c11,0 with input data */ + MORPH_OP_MUL4TA(dacc1, 0, dvecData4, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 3)); + MORPH_OP_MUL4TA(dacc2, 0, dvecData4, IVP_EXTRN_2X32( \ + 
IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 3)); + MORPH_OP_MUL4TA(dacc3, 0, dvecData4, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 3)); + MORPH_OP_MUL4TA(dacc4, 0, dvecData4, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData4)), 3)); + } /* end of for (ky = 0; ky < kSizeY; ky++)*/ + + /* Pack, Output Scale, Output Shift and clamping */ + xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L; + xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H; +#if DILATED_VQ_CONV == VQ_TRUE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \ + pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \ + pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc3, packShiftAccU, \ + pOutScaleData[outCh + 2 * enable3rdCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc4, packShiftAccU, \ + pOutScaleData[outCh + 3 * enable4thCh], outShiftU, minLim, maxLim, typeFlag); +#elif DILATED_VQ_CONV == VQ_FALSE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc3, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc4, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); +#endif + /* variable store count */ + varLen = XT_MIN(outW - x, vectorizationWidth); + + /* store the first half of the output vectors + * dvecOut1, dvecOut2, dvecOut3, dvecOut4 + */ + + /* Storing the first row outputs, first channel */ + pdvecOut = (xb_vec2Nx8 *) 
(pOutput); + valign vaOutData = IVP_ZALIGN(); + IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * varLen); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the first row outputs, 2nd channel */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + enable2ndCh * outDataPitch2 * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * varLen * enable2ndCh); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the first row outputs, 3rd channel */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + 2 * outDataPitch2 * enable3rdCh * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * varLen * enable3rdCh); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the first row outputs, 4th channel */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + 3 * outDataPitch2 * enable4thCh * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * varLen * enable4thCh); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* extract the half of the output vectors + * dvecOut1, dvecOut2, dvecOut3, dvecOut4 + * and store in the next row + */ + + /* Storing the 2nd row outputs, first channel */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch1 * enable2ndRow * bytesPerPixel); + IVP_SAV2NX8_XP(IVP_SHFL2NX8I(dvecOut1L, IVP_SHFLI_8B_SWAP_32), vaOutData, pdvecOut, \ + (-typeFlag + 1) * varLen * enable2ndRow); + IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * 2 * varLen * enable2ndRow); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the 2nd row outputs, 2nd channel */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + (outDataPitch2 * enable2ndCh + \ + outDataPitch1 * enable2ndRow) * bytesPerPixel); + IVP_SAV2NX8_XP(IVP_SHFL2NX8I(dvecOut2L, IVP_SHFLI_8B_SWAP_32), vaOutData, pdvecOut, \ + (-typeFlag + 1) * varLen * enable2ndCh * enable2ndRow); + IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * 2 * \ + varLen * enable2ndCh * enable2ndRow); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the 2nd row outputs, 3rd 
channel */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + (2 * outDataPitch2 * enable3rdCh + \ + outDataPitch1 * enable2ndRow) * bytesPerPixel); + IVP_SAV2NX8_XP(IVP_SHFL2NX8I(dvecOut3L, IVP_SHFLI_8B_SWAP_32), vaOutData, pdvecOut, \ + (-typeFlag + 1) * varLen * enable3rdCh * enable2ndRow); + IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * 2 * \ + varLen * enable3rdCh * enable2ndRow); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the 2nd row outputs, 4th channel */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + (3 * outDataPitch2 * enable4thCh + \ + outDataPitch1 * enable2ndRow) * bytesPerPixel); + IVP_SAV2NX8_XP(IVP_SHFL2NX8I(dvecOut4L, IVP_SHFLI_8B_SWAP_32), vaOutData, pdvecOut, \ + (-typeFlag + 1) * varLen * enable4thCh * enable2ndRow); + IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * 2 * \ + varLen * enable4thCh * enable2ndRow); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + pOutput += 4 * outDataPitch2 * bytesPerPixel; + pCoeff += 4 * coeffPitch3; + } /* end of (outCh = 0; outCh < numOutCh; outCh += 4)*/ + } /* end of for (y = 0; y < outH; y += 2)*/ + } +} + +/****************************************************************************************** + convolved3D_S_11x11j4d1_S8S8IX_MOW_WHD + convolved3D_S_11x11j4d1_U8S8IX_MOW_WHD + convolvedVQ3D_S_11x11j4d1_S8S8IX_MOW_WHD + convolvedVQ3D_S_11x11j4d1_U8S8IX_MOW_WHD +* 11x11 MOW WHD Stride 4 dilation -1 * +******************************************************************************************/ + +static _XAI_INLINE_ void MAKE_NAME(MAKE_NAME_VQ(convolved, 3D_S_11x11j4d1), S8IX_MOW_WHD) \ + MAKE_ARGUMENTS(inTile, coeffTile, biasArray, outTile, param) +{ + /* Getting parameters from the tile structures */ + const int32_t outW = XAI_TILE3D_GET_DIM1(outTile); + const int32_t outH = XAI_TILE3D_GET_DIM2(outTile); + const int32_t numInCh = XAI_TILE3D_GET_DIM3(inTile); + const int32_t numOutCh = XAI_TILE3D_GET_DIM3(outTile); + + /* CNN convolution parameters */ + const uint8_t packShiftAccU = 
XAI_CNN_CONV_GET_ACCUM_SHIFT(param); + const uint8_t outShiftU = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param); + const uint8_t enableReLu = XAI_CNN_CONV_GET_FLAG_RELU(param); + const uint8_t stride = XAI_CNN_CONV_GET_STRIDE(param); + + /* Pitches of Coefficient Data (WHDN) dim3 */ + const int32_t coeffPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTile); + const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile); + + /* Pitches of Input Data (WHD) in dim1 and dim2 */ + const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile); + const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile); + + /* Pitch of Output Data (WHD) in dim1 and dim2 */ + const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile); + const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile); + + /* Kernel Size (WHDN)*/ + const int32_t kSizeU = XAI_TILE4D_GET_DIM1(coeffTile); + + /* Data Pointers of input, output, coefficient and bias data */ + MORPH_IDT_SCALAR* pInData = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(inTile); + int8_t* pOutData = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile); + int32_t* pBiasData = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray); + int8_t* pCoeffData = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile); +#if DILATED_VQ_CONV == VQ_TRUE + uint16_t* restrict pOutScaleData = (uint16_t *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray); +#elif DILATED_VQ_CONV == VQ_FALSE + const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param); +#endif + + /* Move pointer to the start of the active data (including edge) */ + pInData = &pInData[-((kSizeU / 2) * inDataPitch1 + (kSizeU / 2))]; + + /* Setting the limits for output data according to ReLu Flag and outTileType */ + int32_t minLim, maxLim; + if (enableReLu) + { + minLim = XAI_CNN_CONV_GET_RELU_MIN(param); + maxLim = XAI_CNN_CONV_GET_RELU_MAX(param); + } + else + { + minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \ + SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? 
SCHAR_MIN : 0); + maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \ + : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX); + } + const int8_t typeFlag = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0; + const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile); + + MORPH_IDT_2Nx8 *restrict pdvecIn1; + MORPH_IDT_2Nx8 *restrict pdvecIn2; + xb_vec2Nx8* restrict pdvecOut; + xb_vec2Nx8* restrict pdvecCoeff1, *restrict pdvecCoeff2, \ + * restrict pdvecCoeff3, *restrict pdvecCoeff4; + + /* Variable Declarations */ + int32_t inCh, outCh, x, y, ky; + int32_t varLen; + + /* variable declarations for input and coeff vectors */ + MORPH_IDT_2Nx8 dvecCoeffData1, dvecCoeffData2, dvecCoeffData3; + MORPH_IDT_2Nx8 dvecCoeffData4, dvecInData11, dvecInData12; + MORPH_IDT_2Nx8 dvecInData21, dvecInData22; + MORPH_IDT_2Nx8 dvecData1, dvecData2, dvecData3, dvecData4; + MORPH_IDT_2Nx8 dvecData5, dvecData6, dvecData7, dvecData8; + MORPH_IDT_2Nx8 dvecData9, dvecData10, dvecData11; + + /* Number of output elements that can be generated + * with 2 input vector loads(64 way).*/ + const int32_t vectorizationWidth = (((4 * XCHAL_IVPN_SIMD_WIDTH) - kSizeU) / stride) + 1; + + /* Vearable count to handle the last iteration + * of X loop seprately if only 1 i/p load is + * sufficient + */ + const int32_t remX = (((2 * XCHAL_IVPN_SIMD_WIDTH) - kSizeU) / stride) + 1; + + /* loop across output depth is unrolled by 4 + * , producing lanes from 4 output channels + * in one iteration. Since vectorization width + * is just half the width of the accumulator, + * loop across output height is also unrolled by 2. + * Unrolling across output height makes it possible + * to utilize all the 64 MACs in the accumulator. 
+ * + * Data loaded from the 2 input rows is concatenated + * in such a manner that lower half of the output + * vector gives the first output row and the upper + * half of the */ + for (x = 0; x < outW - remX; x += vectorizationWidth) /* Loop across Output width */ + { + for (y = 0; y < outH; y += 2) /* Loop across Output height */ + { + /* In order to handle odd height*/ + int32_t enable2ndRow = XT_SALT(y, outH - 1); + + /* initialize output data pointer */ + int8_t *pOutput = &pOutData[(y * outDataPitch1 + x) * bytesPerPixel]; + + /* initialize input data pointer */ + MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * stride * y + stride * x]; + + /* initialize coeff and Bias data pointer to */ + int8_t *pCoeff = &pCoeffData[0]; + int32_t *pBias = &pBiasData[0]; + + for (outCh = 0; outCh < numOutCh; outCh += 4) /* Loop across Output depth */ + { + /* In order to handle odd depths*/ + int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1); + int32_t enable3rdCh = XT_SALT(outCh, numOutCh - 2); + int32_t enable4thCh = XT_SALT(outCh, numOutCh - 3); + + /* loads and replicate bias data */ + xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4 * enable2ndCh); + xb_vecN_2x32v hvecBias2; IVP_LSRN_2X32_XP(hvecBias2, pBias, 4 * enable3rdCh); + xb_vecN_2x32v hvecBias3; IVP_LSRN_2X32_XP(hvecBias3, pBias, 4 * enable4thCh); + xb_vecN_2x32v hvecBias4; IVP_LSRN_2X32_XP(hvecBias4, pBias, 4); + + /* wide vectors(accumulators) initialized with bias */ + xb_vec2Nx24 dacc1, dacc2, dacc3, dacc4; + dacc1 = IVP_CVT24UNX32L(hvecBias1, hvecBias1); + IVP_CVT24UNX32H(dacc1, hvecBias1, hvecBias1); + dacc2 = IVP_CVT24UNX32L(hvecBias2, hvecBias2); + IVP_CVT24UNX32H(dacc2, hvecBias2, hvecBias2); + dacc3 = IVP_CVT24UNX32L(hvecBias3, hvecBias3); + IVP_CVT24UNX32H(dacc3, hvecBias3, hvecBias3); + dacc4 = IVP_CVT24UNX32L(hvecBias4, hvecBias4); + IVP_CVT24UNX32H(dacc4, hvecBias4, hvecBias4); + + /* priming of coeff load is done outside the innermost loop*/ + pdvecCoeff1 = (xb_vec2Nx8 *) 
(pCoeff); + pdvecCoeff2 = (xb_vec2Nx8 *) (pCoeff + coeffPitch3 * enable2ndCh); + pdvecCoeff3 = (xb_vec2Nx8 *) (pCoeff + 2 * coeffPitch3 * enable3rdCh); + pdvecCoeff4 = (xb_vec2Nx8 *) (pCoeff + 3 * coeffPitch3 * enable4thCh); + + for (inCh = 0; inCh < numInCh; inCh++) /* Loop across input channels */ + { + pdvecIn1 = (MORPH_IDT_2Nx8 *) (pInput + inCh * inDataPitch2); + pdvecIn2 = (MORPH_IDT_2Nx8 *) (pInput + inCh * inDataPitch2 + \ + stride * inDataPitch1 * enable2ndRow); + + for (ky = 0; ky < 11; ky++) /* Loop across kernel height */ + { + /* loads 1st input row */ + valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1); + MORPH_OP_LOAD_2Nx8_IP(dvecInData11, vaInData, pdvecIn1); + MORPH_OP_LOAD_2Nx8(dvecInData12, vaInData, pdvecIn1, + inDataPitch1 - 2 * XCHAL_IVPN_SIMD_WIDTH); + + /* loads 5th(corresponding to the 2nd output row) input row */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn2); + MORPH_OP_LOAD_2Nx8_IP(dvecInData21, vaInData, pdvecIn2); + MORPH_OP_LOAD_2Nx8(dvecInData22, vaInData, pdvecIn2, + inDataPitch1 - 2 * XCHAL_IVPN_SIMD_WIDTH); + + /* 32 elements from 1st row and 32 elements from 2nd row are concatenated here + * If 1st input row is 0,1,2,3,4,5,6,7,8,9,...127, and the 2nd input row is + * 128,129,130,131.........252,253,254,255, Data should be arranged as + * + * dvecData1 : 0, 4, 8,...120,124,128,132,136,...248,252 + * dvecData2 : 1, 5, 9,...121,125,129,133,137,...249,253 + * dvecData3 : 2, 6,10,...122,126,130,134,138,...250,254 + * dvecData4 : 3, 7,11,...123,127,131,135,139,...251,255 + * + * Lower half of the vectors contain data from 1st input row and + * upper half of the vectors contain data from 2nd output row. 
+ * + */ + IVP_DSEL2NX8I(dvecInData12, dvecInData11, dvecInData12, dvecInData11, \ + IVP_DSELI_8B_DEINTERLEAVE_1); + IVP_DSEL2NX8I(dvecInData22, dvecInData21, dvecInData22, dvecInData21, \ + IVP_DSELI_8B_DEINTERLEAVE_1); + IVP_DSEL2NX8I(dvecData3, dvecData1, dvecInData21, dvecInData11, \ + IVP_DSELI_8B_DEINTERLEAVE_1); + IVP_DSEL2NX8I(dvecData4, dvecData2, dvecInData22, dvecInData12, \ + IVP_DSELI_8B_DEINTERLEAVE_1); + + dvecData5 = IVP_SEL2NX8I(0, dvecData1, IVP_SELI_8B_ROTATE_RIGHT_1); + dvecData6 = IVP_SEL2NX8I(0, dvecData2, IVP_SELI_8B_ROTATE_RIGHT_1); + dvecData7 = IVP_SEL2NX8I(0, dvecData3, IVP_SELI_8B_ROTATE_RIGHT_1); + dvecData8 = IVP_SEL2NX8I(0, dvecData4, IVP_SELI_8B_ROTATE_RIGHT_1); + + dvecData9 = IVP_SEL2NX8I(0, dvecData5, IVP_SELI_8B_ROTATE_RIGHT_1); + dvecData10 = IVP_SEL2NX8I(0, dvecData6, IVP_SELI_8B_ROTATE_RIGHT_1); + dvecData11 = IVP_SEL2NX8I(0, dvecData7, IVP_SELI_8B_ROTATE_RIGHT_1); + + /* load 1 row of coeff for all the the 4 output channels */ + valign vaCoeffData1; vaCoeffData1 = IVP_LA2NX8_PP(pdvecCoeff1); + IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1); + + valign vaCoeffData2; vaCoeffData2 = IVP_LA2NX8_PP(pdvecCoeff2); + IVP_LAV2NX8_XP(dvecCoeffData2, vaCoeffData2, pdvecCoeff2, coeffPitch1); + + valign vaCoeffData3; vaCoeffData3 = IVP_LA2NX8_PP(pdvecCoeff3); + IVP_LAV2NX8_XP(dvecCoeffData3, vaCoeffData3, pdvecCoeff3, coeffPitch1); + + valign vaCoeffData4; vaCoeffData4 = IVP_LA2NX8_PP(pdvecCoeff4); + IVP_LAV2NX8_XP(dvecCoeffData4, vaCoeffData4, pdvecCoeff4, coeffPitch1); + + /* multiplies data from two rows(Lower and upper half of dvecData) + * with coeff from 1st output channel and accumulate */ + + /* IVP_EXTRVRN_2X32 extracts the required coeff from the + * coeff vector. In every iteration ky is updated therefore it + * extracts coeff from the next coeff row in the successive ky + * iterations. 
"ky * coeffPitch1 + 4" extracts coeffs next to + * first four coeff in a row + */ + MORPH_OP_MULQA(dacc1, dvecData4, dvecData3, dvecData2, dvecData1, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + + MORPH_OP_MULQA(dacc1, dvecData8, dvecData7, dvecData6, dvecData5, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + MORPH_OP_MULQA(dacc1, 0, dvecData11, dvecData10, dvecData9, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 2)); + + /* multiplies data from two rows(Lower and upper half of dvecData) + * with coeff from 2nd output channel and accumulate */ + MORPH_OP_MULQA(dacc2, dvecData4, dvecData3, dvecData2, dvecData1, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 0)); + + MORPH_OP_MULQA(dacc2, dvecData8, dvecData7, dvecData6, dvecData5, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 1)); + + MORPH_OP_MULQA(dacc2, 0, dvecData11, dvecData10, dvecData9, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 2)); + + /* multiplies data from two rows(Lower and upper half of dvecData) + * with coeff from 3rd output channel and accumulate */ + MORPH_OP_MULQA(dacc3, dvecData4, dvecData3, dvecData2, dvecData1, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 0)); + + MORPH_OP_MULQA(dacc3, dvecData8, dvecData7, dvecData6, dvecData5, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 1)); + + MORPH_OP_MULQA(dacc3, 0, dvecData11, dvecData10, dvecData9, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 2)); + + /* multiplies data from two rows(Lower and upper half of dvecData) + * with coeff from 4th output channel and accumulate */ + MORPH_OP_MULQA(dacc4, dvecData4, dvecData3, dvecData2, dvecData1, IVP_EXTRN_2X32 \ + 
(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData4)), 0)); + + MORPH_OP_MULQA(dacc4, dvecData8, dvecData7, dvecData6, dvecData5, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData4)), 1)); + + MORPH_OP_MULQA(dacc4, 0, dvecData11, dvecData10, dvecData9, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData4)), 2)); + } /* end of for (ky = 0; ky < kSizeY; ky++)*/ + } /* end of for (inCh = 0; inCh < numInCh; inCh++)*/ + + /* Pack, Output Scale, Output Shift and clamping */ + xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L; + xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H; +#if DILATED_VQ_CONV == VQ_TRUE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \ + pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \ + pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc3, packShiftAccU, \ + pOutScaleData[outCh + 2 * enable3rdCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc4, packShiftAccU, \ + pOutScaleData[outCh + 3 * enable4thCh], outShiftU, minLim, maxLim, typeFlag); +#elif DILATED_VQ_CONV == VQ_FALSE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc3, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc4, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); +#endif + /* variable store count */ + varLen = XT_MIN(outW - x, vectorizationWidth); + + /* store the first half of the output vectors + * dvecOut1, dvecOut2, 
dvecOut3, dvecOut4 + */ + + /* Storing the first row outputs, first channel */ + pdvecOut = (xb_vec2Nx8 *) (pOutput); + valign vaOutData = IVP_ZALIGN(); + IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * varLen); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the first row outputs, 2nd channel */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + enable2ndCh * outDataPitch2 * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * varLen * enable2ndCh); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the first row outputs, 3rd channel */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + 2 * outDataPitch2 * enable3rdCh * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * varLen * enable3rdCh); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the first row outputs, 4th channel */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + 3 * outDataPitch2 * enable4thCh * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * varLen * enable4thCh); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* extract the half of the output vectors + * dvecOut1, dvecOut2, dvecOut3, dvecOut4 + * and store in the next row + */ + + /* Storing the 2nd row outputs, first channel */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch1 * enable2ndRow * bytesPerPixel); + IVP_SAV2NX8_XP(IVP_SHFL2NX8I(dvecOut1L, IVP_SHFLI_8B_SWAP_32), vaOutData, pdvecOut, \ + (-typeFlag + 1) * varLen * enable2ndRow); + IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * 2 * varLen * enable2ndRow); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the 2nd row outputs, 2nd channel */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + (outDataPitch2 * enable2ndCh + \ + outDataPitch1 * enable2ndRow) * bytesPerPixel); + IVP_SAV2NX8_XP(IVP_SHFL2NX8I(dvecOut2L, IVP_SHFLI_8B_SWAP_32), vaOutData, pdvecOut, \ + (-typeFlag + 1) * varLen * enable2ndCh * enable2ndRow); + IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * 2 * \ + varLen * 
enable2ndCh * enable2ndRow); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the 2nd row outputs, 3rd channel */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + (2 * outDataPitch2 * enable3rdCh + \ + outDataPitch1 * enable2ndRow) * bytesPerPixel); + IVP_SAV2NX8_XP(IVP_SHFL2NX8I(dvecOut3L, IVP_SHFLI_8B_SWAP_32), vaOutData, pdvecOut, \ + (-typeFlag + 1) * varLen * enable3rdCh * enable2ndRow); + IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * 2 * \ + varLen * enable3rdCh * enable2ndRow); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the 2nd row outputs, 4th channel */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + (3 * outDataPitch2 * enable4thCh + \ + outDataPitch1 * enable2ndRow) * bytesPerPixel); + IVP_SAV2NX8_XP(IVP_SHFL2NX8I(dvecOut4L, IVP_SHFLI_8B_SWAP_32), vaOutData, pdvecOut, \ + (-typeFlag + 1) * varLen * enable4thCh * enable2ndRow); + IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * 2 * \ + varLen * enable4thCh * enable2ndRow); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + pOutput += 4 * outDataPitch2 * bytesPerPixel; + pCoeff += 4 * coeffPitch3; + } /* end of (outCh = 0; outCh < numOutCh; outCh += 4)*/ + } /* end of for (y = 0; y < outH; y += 2)*/ + } /* end of for (x = 0; x < outW; x += vectorizationWidth)*/ + + if (x < outW) + { + for (y = 0; y < outH; y += 2) /* Loop across Output height */ + { + /* In order to handle odd height*/ + int32_t enable2ndRow = XT_SALT(y, outH - 1); + + /* initialize output data pointer */ + int8_t *pOutput = &pOutData[(y * outDataPitch1 + x) * bytesPerPixel]; + + /* initialize input data pointer */ + MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * stride * y + stride * x]; + + /* initialize coeff and Bias data pointer to */ + int8_t *pCoeff = &pCoeffData[0]; + int32_t *pBias = &pBiasData[0]; + + for (outCh = 0; outCh < numOutCh; outCh += 4) /* Loop across Output depth */ + { + /* In order to handle odd depths*/ + int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1); + int32_t enable3rdCh = XT_SALT(outCh, 
numOutCh - 2); + int32_t enable4thCh = XT_SALT(outCh, numOutCh - 3); + + /* loads and replicate bias data */ + xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4 * enable2ndCh); + xb_vecN_2x32v hvecBias2; IVP_LSRN_2X32_XP(hvecBias2, pBias, 4 * enable3rdCh); + xb_vecN_2x32v hvecBias3; IVP_LSRN_2X32_XP(hvecBias3, pBias, 4 * enable4thCh); + xb_vecN_2x32v hvecBias4; IVP_LSRN_2X32_XP(hvecBias4, pBias, 4); + + /* wide vectors(accumulators) initialized with bias */ + xb_vec2Nx24 dacc1, dacc2, dacc3, dacc4; + dacc1 = IVP_CVT24UNX32L(hvecBias1, hvecBias1); + IVP_CVT24UNX32H(dacc1, hvecBias1, hvecBias1); + dacc2 = IVP_CVT24UNX32L(hvecBias2, hvecBias2); + IVP_CVT24UNX32H(dacc2, hvecBias2, hvecBias2); + dacc3 = IVP_CVT24UNX32L(hvecBias3, hvecBias3); + IVP_CVT24UNX32H(dacc3, hvecBias3, hvecBias3); + dacc4 = IVP_CVT24UNX32L(hvecBias4, hvecBias4); + IVP_CVT24UNX32H(dacc4, hvecBias4, hvecBias4); + + /* priming of coeff load is done outside the innermost loop*/ + pdvecCoeff1 = (xb_vec2Nx8 *) (pCoeff); + pdvecCoeff2 = (xb_vec2Nx8 *) (pCoeff + coeffPitch3 * enable2ndCh); + pdvecCoeff3 = (xb_vec2Nx8 *) (pCoeff + 2 * coeffPitch3 * enable3rdCh); + pdvecCoeff4 = (xb_vec2Nx8 *) (pCoeff + 3 * coeffPitch3 * enable4thCh); + + for (inCh = 0; inCh < numInCh; inCh++) /* Loop across input channels */ + { + pdvecIn1 = (MORPH_IDT_2Nx8 *) (pInput + inCh * inDataPitch2); + pdvecIn2 = (MORPH_IDT_2Nx8 *) (pInput + inCh * inDataPitch2 + \ + stride * inDataPitch1 * enable2ndRow); + + for (ky = 0; ky < 11; ky++) /* Loop across kernel height */ + { + /* loads 1st input row */ + valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1); + MORPH_OP_LOAD_2Nx8(dvecInData11, vaInData, pdvecIn1, inDataPitch1); + + /* loads 5th(corresponding to the 2nd output row) input row */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn2); + MORPH_OP_LOAD_2Nx8(dvecInData21, vaInData, pdvecIn2, inDataPitch1); + + + /* 32 elements from 1st row and 32 elements from 2nd row are concatenated here + * If 1st input row is 
0,1,2,3,4,5,6,7,8,9,...63, and the 2nd input row is + * 64, 65, 66, 67.........124,125,126,127, Data should be arranged as + * + * dvecData1 : 0, 4, 8,...120,124 + * dvecData2 : 1, 5, 9,...121,125 + * dvecData3 : 2, 6,10,...122,126 + * dvecData4 : 3, 7,11,...123,127 + * + * Lower half of the vectors contain data from 1st input row and + * upper half of the vectors contain data from 2nd output row. + * + */ + IVP_DSEL2NX8I(dvecInData12, dvecInData11, dvecInData11, dvecInData11, \ + IVP_DSELI_8B_DEINTERLEAVE_1); + + IVP_DSEL2NX8I(dvecInData22, dvecInData21, dvecInData21, dvecInData21, \ + IVP_DSELI_8B_DEINTERLEAVE_1); + + IVP_DSEL2NX8I(dvecData3, dvecData1, dvecInData21, dvecInData11, \ + IVP_DSELI_8B_DEINTERLEAVE_1); + + IVP_DSEL2NX8I(dvecData4, dvecData2, dvecInData22, dvecInData12, \ + IVP_DSELI_8B_DEINTERLEAVE_1); + + dvecData5 = IVP_SEL2NX8I(0, dvecData1, IVP_SELI_8B_ROTATE_RIGHT_1); + dvecData6 = IVP_SEL2NX8I(0, dvecData2, IVP_SELI_8B_ROTATE_RIGHT_1); + dvecData7 = IVP_SEL2NX8I(0, dvecData3, IVP_SELI_8B_ROTATE_RIGHT_1); + dvecData8 = IVP_SEL2NX8I(0, dvecData4, IVP_SELI_8B_ROTATE_RIGHT_1); + + dvecData9 = IVP_SEL2NX8I(0, dvecData5, IVP_SELI_8B_ROTATE_RIGHT_1); + dvecData10 = IVP_SEL2NX8I(0, dvecData6, IVP_SELI_8B_ROTATE_RIGHT_1); + dvecData11 = IVP_SEL2NX8I(0, dvecData7, IVP_SELI_8B_ROTATE_RIGHT_1); + + /* load 1 row of coeff for all the the 4 output channels */ + valign vaCoeffData1; vaCoeffData1 = IVP_LA2NX8_PP(pdvecCoeff1); + IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1); + + valign vaCoeffData2; vaCoeffData2 = IVP_LA2NX8_PP(pdvecCoeff2); + IVP_LAV2NX8_XP(dvecCoeffData2, vaCoeffData2, pdvecCoeff2, coeffPitch1); + + valign vaCoeffData3; vaCoeffData3 = IVP_LA2NX8_PP(pdvecCoeff3); + IVP_LAV2NX8_XP(dvecCoeffData3, vaCoeffData3, pdvecCoeff3, coeffPitch1); + + valign vaCoeffData4; vaCoeffData4 = IVP_LA2NX8_PP(pdvecCoeff4); + IVP_LAV2NX8_XP(dvecCoeffData4, vaCoeffData4, pdvecCoeff4, coeffPitch1); + + /* multiplies data from two rows(Lower 
and upper half of dvecData) + * with coeff from 1st output channel and accumulate */ + + /* IVP_EXTRVRN_2X32 extracts the required coeff from the + * coeff vector. In every iteration ky is updated therefore it + * extracts coeff from the next coeff row in the successive ky + * iterations. "ky * coeffPitch1 + 4" extracts coeffs next to + * first four coeff in a row + */ + MORPH_OP_MULQA(dacc1, dvecData4, dvecData3, dvecData2, dvecData1, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + + MORPH_OP_MULQA(dacc1, dvecData8, dvecData7, dvecData6, dvecData5, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + + MORPH_OP_MULQA(dacc1, 0, dvecData11, dvecData10, dvecData9, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 2)); + + /* multiplies data from two rows(Lower and upper half of dvecData) + * with coeff from 2nd output channel and accumulate */ + MORPH_OP_MULQA(dacc2, dvecData4, dvecData3, dvecData2, dvecData1, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 0)); + + MORPH_OP_MULQA(dacc2, dvecData8, dvecData7, dvecData6, dvecData5, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 1)); + + MORPH_OP_MULQA(dacc2, 0, dvecData11, dvecData10, dvecData9, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 2)); + + /* multiplies data from two rows(Lower and upper half of dvecData) + * with coeff from 3rd output channel and accumulate */ + MORPH_OP_MULQA(dacc3, dvecData4, dvecData3, dvecData2, dvecData1, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 0)); + + MORPH_OP_MULQA(dacc3, dvecData8, dvecData7, dvecData6, dvecData5, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 1)); + + MORPH_OP_MULQA(dacc3, 0, dvecData11, dvecData10, dvecData9, IVP_EXTRN_2X32 \ + 
(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 2)); + + /* multiplies data from two rows(Lower and upper half of dvecData) + * with coeff from 4th output channel and accumulate */ + MORPH_OP_MULQA(dacc4, dvecData4, dvecData3, dvecData2, dvecData1, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData4)), 0)); + + MORPH_OP_MULQA(dacc4, dvecData8, dvecData7, dvecData6, dvecData5, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData4)), 1)); + + MORPH_OP_MULQA(dacc4, 0, dvecData11, dvecData10, dvecData9, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData4)), 2)); + } /* end of for (ky = 0; ky < kSizeY; ky++)*/ + } /* end of for (inCh = 0; inCh < numInCh; inCh++)*/ + + /* Pack, Output Scale, Output Shift and clamping */ + xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L; + xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H; +#if DILATED_VQ_CONV == VQ_TRUE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \ + pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \ + pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc3, packShiftAccU, \ + pOutScaleData[outCh + 2 * enable3rdCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc4, packShiftAccU, \ + pOutScaleData[outCh + 3 * enable4thCh], outShiftU, minLim, maxLim, typeFlag); +#elif DILATED_VQ_CONV == VQ_FALSE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc3, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + 
PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc4, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); +#endif + /* variable store count */ + varLen = XT_MIN(outW - x, vectorizationWidth); + + /* store the first half of the output vectors + * dvecOut1, dvecOut2, dvecOut3, dvecOut4 + */ + + /* Storing the first row outputs, first channel */ + pdvecOut = (xb_vec2Nx8 *) (pOutput); + valign vaOutData = IVP_ZALIGN(); + IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * varLen); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the first row outputs, 2nd channel */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + enable2ndCh * outDataPitch2 * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * varLen * enable2ndCh); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the first row outputs, 3rd channel */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + 2 * outDataPitch2 * enable3rdCh * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * varLen * enable3rdCh); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the first row outputs, 4th channel */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + 3 * outDataPitch2 * enable4thCh * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * varLen * enable4thCh); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* extract the half of the output vectors + * dvecOut1, dvecOut2, dvecOut3, dvecOut4 + * and store in the next row + */ + + /* Storing the 2nd row outputs, first channel */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch1 * enable2ndRow * bytesPerPixel); + IVP_SAV2NX8_XP(IVP_SHFL2NX8I(dvecOut1L, IVP_SHFLI_8B_SWAP_32), vaOutData, pdvecOut, \ + (-typeFlag + 1) * varLen * enable2ndRow); + IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * 2 * varLen * enable2ndRow); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the 2nd row outputs, 2nd channel */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + (outDataPitch2 * 
enable2ndCh + \ + outDataPitch1 * enable2ndRow) * bytesPerPixel); + IVP_SAV2NX8_XP(IVP_SHFL2NX8I(dvecOut2L, IVP_SHFLI_8B_SWAP_32), vaOutData, pdvecOut, \ + (-typeFlag + 1) * varLen * enable2ndCh * enable2ndRow); + IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * 2 * \ + varLen * enable2ndCh * enable2ndRow); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the 2nd row outputs, 3rd channel */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + (2 * outDataPitch2 * enable3rdCh + \ + outDataPitch1 * enable2ndRow) * bytesPerPixel); + IVP_SAV2NX8_XP(IVP_SHFL2NX8I(dvecOut3L, IVP_SHFLI_8B_SWAP_32), vaOutData, pdvecOut, \ + (-typeFlag + 1) * varLen * enable3rdCh * enable2ndRow); + IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * 2 * \ + varLen * enable3rdCh * enable2ndRow); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the 2nd row outputs, 4th channel */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + (3 * outDataPitch2 * enable4thCh + \ + outDataPitch1 * enable2ndRow) * bytesPerPixel); + IVP_SAV2NX8_XP(IVP_SHFL2NX8I(dvecOut4L, IVP_SHFLI_8B_SWAP_32), vaOutData, pdvecOut, \ + (-typeFlag + 1) * varLen * enable4thCh * enable2ndRow); + IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * 2 * \ + varLen * enable4thCh * enable2ndRow); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + pOutput += 4 * outDataPitch2 * bytesPerPixel; + pCoeff += 4 * coeffPitch3; + } /* end of (outCh = 0; outCh < numOutCh; outCh += 4)*/ + } /* end of for (y = 0; y < outH; y += 2)*/ + } +} + +/****************************************************************************************** +* xaiConvolved(VQ)3D_S_MxNj4d1I8S8IX_MOW_WHD +* ***************************************************************************************/ + +/******************************************************************************/ +/* Description : P6 optimized generic implementation for MxN 3D convolution. */ +/* Based on MORPH pre-processor specifiers, code implementation */ +/* is generated during preprocessing stage. 
This method can be */ +/* used to generate MxN 3D dilated convolution function and MxN */ +/* 3D VQ dilated convolution function for U8 bit and S8 bit */ +/* input data with input stride equal to 4 */ +/* Inputs : Input Data Tile, Coeff Data Tile, Bias Array, */ +/* Output scale array, CNN convolution params structure */ +/* Outputs : XI Error Code */ +/* InOuts : Output Tile */ +/* Assumptions : CoeffData is S8 */ +/* biasArray is signed 32b, value not exceeding signed 24b */ +/* Output scale array is U16 */ +/* OutData is S8 / U8 / S16 */ +/* Kernel Size is MxNxDxN */ +/* Input and Output are in WHD format */ +/* Coeff is in WHDN format */ +/******************************************************************************/ + +/****************** xaiConvolvedVQ3D_S_MxNj4d1_S8S8IX_MOW_WHD ******************/ +/****************** xaiConvolvedVQ3D_S_MxNj4d1_U8S8IX_MOW_WHD ******************/ +/******************* xaiConvolved3D_S_MxNj4d1_S8S8IX_MOW_WHD *******************/ +/******************* xaiConvolved3D_S_MxNj4d1_U8S8IX_MOW_WHD *******************/ + +XAI_ERR_TYPE MAKE_NAME(MAKE_NAME_VQ(xaiConvolved, 3D_S_MxNj4d1), S8IX_MOW_WHD) MAKE_ARGUMENTS(inTile, coeffTile, biasArray, outTile, param) +{ + /* Error Checks */ + XAI_ERROR_CHECKS() + { + XAI_CHECK_TILE4D_S8(coeffTile); + XAI_CHECK_CONV_OUTPUT_TILE3D(outTile); + MORPH_IDT_CHECK(inTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile); + XAI_CHECK_TILE4D_IN_DRAM_BOUNDARY(coeffTile); + XAI_CHECK_POINTER(param); + XAI_CHECK_ARRAY_S32(biasArray); + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTile); + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(coeffTile, outTile); + XAI_CHECK_ERROR(XAI_TILE4D_GET_DIM1(coeffTile) <= 16, XAI_ERR_KSIZE, \ + "kernel width = %d, which should be less than or equal to 16", \ + XAI_TILE4D_GET_DIM1(coeffTile)); + XAI_CHECK_ERROR(XAI_TILE4D_GET_DIM2(coeffTile) <= 16, XAI_ERR_KSIZE, \ + "\nkernel height = %d, which should be less than or equal to 16", \ + 
XAI_TILE4D_GET_DIM2(coeffTile)); + XAI_CHECK_DILATION(param, 1); + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_DILATIONX(param) == XAI_CNN_CONV_GET_DILATIONY(param), \ + XAI_ERR_BADARG, "\nDilation along width = %u and height = %u\nDilation along width and height should be equal", \ + XAI_CNN_CONV_GET_DILATIONX(param), XAI_CNN_CONV_GET_DILATIONY(param)); + XAI_CHECK_EDGES_MOW_WHD(inTile, coeffTile, param); + XAI_CHECK_TILE3D_DATA_ORDER(inTile, XAI_WHD); + XAI_CHECK_TILE3D_DATA_ORDER(outTile, XAI_WHD); + XAI_CHECK_TILE4D_DATA_ORDER(coeffTile, XAI_WHDN); + XAI_CHECK_STRIDE(param, 4); + XAI_CHECK_ERROR((XAI_CNN_CONV_GET_STRIDEX(param) == XAI_CNN_CONV_GET_STRIDEY(param)), \ + XAI_ERR_BADARG, "\nStride along width = %u and height = %u\nStride along width and height should be equal", \ + XAI_CNN_CONV_GET_STRIDEX(param), XAI_CNN_CONV_GET_STRIDEY(param)); + XAI_CHECK_CONSISTENCY_MOW_WHD(inTile, coeffTile, biasArray, outTile, param); + XAI_CHECK_COEFFTILE_CONTIGUOUS(coeffTile, param); + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_ACCUM_SHIFT(param) < 24, \ + XAI_ERR_NORM, "\nThe accumulator shift = %u, value should be less than 24", \ + XAI_CNN_CONV_GET_ACCUM_SHIFT(param)); + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_OUTPUT_SHIFT(param) < 32, \ + XAI_ERR_NORM, "\nThe output shift = %u, value should be less than 32", \ + XAI_CNN_CONV_GET_OUTPUT_SHIFT(param)); + XAI_CHECK_CONV_RELU_LIMITS_IX(param, outTile); +#if DILATED_VQ_CONV == VQ_TRUE + XAI_CHECK_ARRAY_U16(outputScaleArray); + XAI_CHECK_ERROR(XAI_ARRAY_GET_WIDTH(outputScaleArray) >= XAI_TILE4D_GET_DIM4(coeffTile), \ + XAI_ERR_DATASIZE, "\nWidth of Output Scale Array = %d, Number of Kernels = %d\nWidth of Output Scale Array should be greater than or equal to Number of Kernels", \ + XAI_ARRAY_GET_WIDTH(outputScaleArray), XAI_TILE4D_GET_DIM4(coeffTile)); + XAI_CHECK_ERROR((((uintptr_t) (XAI_ARRAY_GET_DATA_PTR(outputScaleArray)) & \ + 0x1) == 0), XAI_ERR_NORM, "The output scale array is not aligned to 2 byte boundary"); +#endif + } +#if DILATED_VQ_CONV == 
VQ_FALSE + if (XAI_CNN_CONV_GET_OUTPUT_SCALE(param) == 0) + { + int32_t fillValue; + int32_t reluFlag = XAI_CNN_CONV_GET_FLAG_RELU(param); + fillValue = reluFlag ? (CLAMP(0, XAI_CNN_CONV_GET_RELU_MIN(param), XAI_CNN_CONV_GET_RELU_MAX(param))) : 0; + return(xaiFillTile3D(outTile, fillValue, 0)); + } +#endif + if (XAI_TILE4D_GET_DIM1(coeffTile) == 11 && XAI_TILE4D_GET_DIM2(coeffTile) == 11 && + XAI_CNN_CONV_GET_STRIDEX(param) == 4) + { + MAKE_NAME(MAKE_NAME_VQ(convolved, 3D_S_11x11j4d1), S8IX_MOW_WHD) MAKE_PARAMS(inTile, coeffTile, biasArray, outTile, param); + return(XAI_ERROR_STATUS()); + } + + if (XAI_TILE3D_GET_DIM3(inTile) == 3) + { + MAKE_NAME(MAKE_NAME_VQ(convolved, 3D_S_MxNj4d1), S8IX_MOW_WHD_DEPTH3) MAKE_PARAMS(inTile, coeffTile, biasArray, outTile, param); + return(XAI_ERROR_STATUS()); + } + + + /* Getting parameters from the tile structures */ + const int32_t outW = XAI_TILE3D_GET_DIM1(outTile); + const int32_t outH = XAI_TILE3D_GET_DIM2(outTile); + const int32_t numInCh = XAI_TILE3D_GET_DIM3(inTile); + const int32_t numOutCh = XAI_TILE3D_GET_DIM3(outTile); + + /* Kernel Size (WHDN)*/ + const int32_t kSizeX = XAI_TILE4D_GET_DIM1(coeffTile); + const int32_t kSizeY = XAI_TILE4D_GET_DIM2(coeffTile); + + /* CNN convolution parameters */ + const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param); + const uint8_t outShiftU = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param); + const uint8_t enableReLu = XAI_CNN_CONV_GET_FLAG_RELU(param); + const uint8_t stride = XAI_CNN_CONV_GET_STRIDE(param); + const uint8_t leftEdgeFlag = XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param); + const uint8_t topEdgeFlag = XAI_CNN_CONV_GET_FLAG_TOPEDGE(param); + + /* Pitches of Coefficient Data (WHDN) dim3 */ + const int32_t coeffPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTile); + const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile); + + /* Pitches of Input Data (WHD) in dim1 and dim2 */ + const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile); + const int32_t inDataPitch2 = 
XAI_TILE3D_GET_DIM2_PITCH(inTile); + + /* Pitch of Output Data (WHD) in dim1 and dim2 */ + const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile); + const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile); + + /* Data Pointers of input, output, coefficient and bias data */ + MORPH_IDT_SCALAR* pInData = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(inTile); + int8_t* pOutData = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile); + int32_t* pBiasData = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray); + int8_t* pCoeffData = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile); +#if DILATED_VQ_CONV == VQ_TRUE + uint16_t* restrict pOutScaleData = (uint16_t *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray); +#elif DILATED_VQ_CONV == VQ_FALSE + const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param); +#endif + uint8_t leftEdge, topEdge; + if ((kSizeX % 2) != 0) + { + leftEdge = kSizeX / 2; + } + else + { + leftEdge = leftEdgeFlag ? (kSizeX / 2) : ((kSizeX / 2) - 1); + } + + if ((kSizeY % 2) != 0) + { + topEdge = kSizeY / 2; + } + else + { + topEdge = topEdgeFlag ? (kSizeY / 2) : ((kSizeY / 2) - 1); + } + + /* Move pointer to the start of the active data (including edge) */ + pInData = &pInData[-(topEdge * inDataPitch1 + leftEdge)]; + + + /* Setting the limits for output data according to ReLu Flag and outTileType */ + int32_t minLim, maxLim; + if (enableReLu) + { + minLim = XAI_CNN_CONV_GET_RELU_MIN(param); + maxLim = XAI_CNN_CONV_GET_RELU_MAX(param); + } + else + { + minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \ + SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0); + maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \ + : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX); + } + const int8_t typeFlag = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 
1 : 0; + const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile); + + MORPH_IDT_2Nx8 *restrict pdvecIn1; + MORPH_IDT_2Nx8 *restrict pdvecIn2; + xb_vec2Nx8* restrict pdvecOut; + xb_vec2Nx8* restrict pdvecCoeff1, * restrict pdvecCoeff2, \ + * restrict pdvecCoeff3, * restrict pdvecCoeff4; + + /* Variable Declarations */ + int32_t inCh, outCh, x, y, ky; + int32_t varLen; + + /* Number of output elements that can be generated + * with 2 input vector loads(64 way).*/ + const int32_t vectorizationWidth = (((4 * XCHAL_IVPN_SIMD_WIDTH) - kSizeX) / stride) + 1; + + /* Vearable count to handle the last iteration + * of X loop seprately if only 1 i/p load is + * sufficient + */ + const int32_t remX = (((2 * XCHAL_IVPN_SIMD_WIDTH) - kSizeX) / stride) + 1; + + /* generates the shuffle sequence for the coeff, so that MUL4T can be used. + * Rearranges coeff from c0,c1,..c13,c14 in the following manner: + * + * c0,c4,c8,c12 + * c1,c5,c9,c13 + * c2,c6,c10,c14 + * c3,c7,c11,0 + * */ + xb_vec2Nx8 dvecIdx = IVP_SEQ2NX8(); + xb_vec2Nx8 dvec1, dvec2; + IVP_DSEL2NX8I(dvec2, dvec1, 0, dvecIdx, IVP_DSELI_8B_DEINTERLEAVE_1); + IVP_DSEL2NX8I(dvecIdx, dvec1, 0, dvec1, IVP_DSELI_8B_DEINTERLEAVE_1); + dvec1 = IVP_SEL2NX8I(dvecIdx, dvec1, IVP_SELI_8B_INTERLEAVE_4_LO); + IVP_DSEL2NX8I(dvecIdx, dvec2, 0, dvec2, IVP_DSELI_8B_DEINTERLEAVE_1); + dvec2 = IVP_SEL2NX8I(dvecIdx, dvec2, IVP_SELI_8B_INTERLEAVE_4_LO); + dvecIdx = IVP_SEL2NX8I(dvec2, dvec1, IVP_SELI_8B_INTERLEAVE_4_LO); + + /* loop across output depth is unrolled by 4 + * , producing lanes from 4 output channels + * in one iteration. Since vectorization width + * is just half the width of the accumulator, + * loop across output height is also unrolled by 2. + * Unrolling across output height makes it possible + * to utilize all the 64 MACs in the accumulator. 
+ * + * Data loaded from the 2 input rows is concatenated + * in such a manner that lower half of the output + * vector gives the first output row and the upper + * half of the */ + for (x = 0; x < outW - remX; x += vectorizationWidth) /* Loop across Output width */ + { + for (y = 0; y < outH; y += 2) /* Loop across Output height */ + { + /* In order to handle odd height*/ + int32_t enable2ndRow = XT_SALT(y, outH - 1); + + /* initialize output data pointer */ + int8_t *pOutput = &pOutData[(y * outDataPitch1 + x) * bytesPerPixel]; + + /* initialize input data pointer */ + MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * stride * y + stride * x]; + + /* initialize coeff and Bias data pointer to */ + int8_t *pCoeff = &pCoeffData[0]; + int32_t *pBias = &pBiasData[0]; + + for (outCh = 0; outCh < numOutCh; outCh += 4) /* Loop across Output depth */ + { + /* In order to handle odd depths*/ + + int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1); + int32_t enable3rdCh = XT_SALT(outCh, numOutCh - 2); + int32_t enable4thCh = XT_SALT(outCh, numOutCh - 3); + + /* loads and replicate bias data */ + xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4 * enable2ndCh); + xb_vecN_2x32v hvecBias2; IVP_LSRN_2X32_XP(hvecBias2, pBias, 4 * enable3rdCh); + xb_vecN_2x32v hvecBias3; IVP_LSRN_2X32_XP(hvecBias3, pBias, 4 * enable4thCh); + xb_vecN_2x32v hvecBias4; IVP_LSRN_2X32_XP(hvecBias4, pBias, 4); + + /* wide vectors(accumulators) initialized with bias */ + xb_vec2Nx24 dacc1, dacc2, dacc3, dacc4; + dacc1 = IVP_CVT24UNX32L(hvecBias1, hvecBias1); + IVP_CVT24UNX32H(dacc1, hvecBias1, hvecBias1); + dacc2 = IVP_CVT24UNX32L(hvecBias2, hvecBias2); + IVP_CVT24UNX32H(dacc2, hvecBias2, hvecBias2); + dacc3 = IVP_CVT24UNX32L(hvecBias3, hvecBias3); + IVP_CVT24UNX32H(dacc3, hvecBias3, hvecBias3); + dacc4 = IVP_CVT24UNX32L(hvecBias4, hvecBias4); + IVP_CVT24UNX32H(dacc4, hvecBias4, hvecBias4); + + /* priming of coeff load is done outside the innermost loop*/ + pdvecCoeff1 = (xb_vec2Nx8 *) 
(pCoeff); + valign vaCoeffData1; vaCoeffData1 = IVP_LA2NX8_PP(pdvecCoeff1); + + pdvecCoeff2 = (xb_vec2Nx8 *) (pCoeff + coeffPitch3 * enable2ndCh); + valign vaCoeffData2; vaCoeffData2 = IVP_LA2NX8_PP(pdvecCoeff2); + + pdvecCoeff3 = (xb_vec2Nx8 *) (pCoeff + 2 * coeffPitch3 * enable3rdCh); + valign vaCoeffData3; vaCoeffData3 = IVP_LA2NX8_PP(pdvecCoeff3); + + pdvecCoeff4 = (xb_vec2Nx8 *) (pCoeff + 3 * coeffPitch3 * enable4thCh); + valign vaCoeffData4; vaCoeffData4 = IVP_LA2NX8_PP(pdvecCoeff4); + + for (inCh = 0; inCh < numInCh; inCh++) /* Loop across input channels */ + { + /* variable declarations for input and coeff vectors */ + xb_vec2Nx8 dvecCoeffData1, dvecCoeffData2, dvecCoeffData3, dvecCoeffData4; + + MORPH_IDT_2Nx8 dvecInData11, dvecInData12; + MORPH_IDT_2Nx8 dvecInData21, dvecInData22; + + MORPH_IDT_2Nx8 dvecData1, dvecData2, dvecData3, dvecData4; + + pdvecIn1 = (MORPH_IDT_2Nx8 *) (pInput + inCh * inDataPitch2); + pdvecIn2 = (MORPH_IDT_2Nx8 *) (pInput + inCh * inDataPitch2 + \ + stride * inDataPitch1 * enable2ndRow); + + for (ky = 0; ky < kSizeY; ky++) /* Loop across kernel height */ + { + /* loads 1st input row */ + valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1); + MORPH_OP_LOAD_2Nx8_IP(dvecInData11, vaInData, pdvecIn1); + MORPH_OP_LOAD_2Nx8(dvecInData12, vaInData, pdvecIn1, inDataPitch1 - 2 * XCHAL_IVPN_SIMD_WIDTH); + + /* loads 5th(corresponding to the 2nd output row) input row */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn2); + MORPH_OP_LOAD_2Nx8_IP(dvecInData21, vaInData, pdvecIn2); + MORPH_OP_LOAD_2Nx8(dvecInData22, vaInData, pdvecIn2, inDataPitch1 - 2 * XCHAL_IVPN_SIMD_WIDTH); + + + /* 32 elements from 1st row and 32 elements from 2nd row are concatenated here + * If 1st input row is 0,1,2,3,4,5,6,7,8,9,...127, and the 2nd input row is + * 128,129,130,131.........252,253,254,255, Data should be arranged as + * + * dvecData1 : 0, 4, 8,...120,124,128,132,136,...248,252 + * dvecData2 : 1, 5, 9,...121,125,129,133,137,...249,253 + * dvecData3 : 2, 
6,10,...122,126,130,134,138,...250,254 + * dvecData4 : 3, 7,11,...123,127,131,135,139,...251,255 + * + * Lower half of the vectors contain data from 1st input row and + * upper half of the vectors contain data from 2nd output row. + * + */ + + IVP_DSEL2NX8I(dvecData2, dvecData1, + IVP_SEL2NX8I(dvecInData22, dvecInData21, IVP_SELI_8B_EXTRACT_2_OF_4_OFF_0), + IVP_SEL2NX8I(dvecInData12, dvecInData11, IVP_SELI_8B_EXTRACT_2_OF_4_OFF_0), + IVP_DSELI_8B_DEINTERLEAVE_1); + IVP_DSEL2NX8I(dvecData4, dvecData3, + IVP_SEL2NX8I(dvecInData22, dvecInData21, IVP_SELI_8B_EXTRACT_2_OF_4_OFF_2), + IVP_SEL2NX8I(dvecInData12, dvecInData11, IVP_SELI_8B_EXTRACT_2_OF_4_OFF_2), + IVP_DSELI_8B_DEINTERLEAVE_1); + + /* load 1 row of coeff for all the the 4 output channels */ + IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1); + IVP_LAV2NX8_XP(dvecCoeffData2, vaCoeffData2, pdvecCoeff2, coeffPitch1); + IVP_LAV2NX8_XP(dvecCoeffData3, vaCoeffData3, pdvecCoeff3, coeffPitch1); + IVP_LAV2NX8_XP(dvecCoeffData4, vaCoeffData4, pdvecCoeff4, coeffPitch1); + + /* shuffles the coeff in desired manner */ + dvecCoeffData1 = IVP_SHFL2NX8(dvecCoeffData1, dvecIdx); + dvecCoeffData2 = IVP_SHFL2NX8(dvecCoeffData2, dvecIdx); + dvecCoeffData3 = IVP_SHFL2NX8(dvecCoeffData3, dvecIdx); + dvecCoeffData4 = IVP_SHFL2NX8(dvecCoeffData4, dvecIdx); + + /* mulitples coeff c0,c4,c8,c12 with input data */ + MORPH_OP_MUL4TA(dacc1, 0, dvecData1, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc2, 0, dvecData1, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 0)); + MORPH_OP_MUL4TA(dacc3, 0, dvecData1, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 0)); + MORPH_OP_MUL4TA(dacc4, 0, dvecData1, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData4)), 0)); + + /* mulitples coeff c1,c5,c9,c13 with input data */ + MORPH_OP_MUL4TA(dacc1, 0, dvecData2, 
IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + MORPH_OP_MUL4TA(dacc2, 0, dvecData2, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 1)); + MORPH_OP_MUL4TA(dacc3, 0, dvecData2, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 1)); + MORPH_OP_MUL4TA(dacc4, 0, dvecData2, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData4)), 1)); + + /* mulitples coeff c2,c6,c10,c14 with input data */ + MORPH_OP_MUL4TA(dacc1, 0, dvecData3, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 2)); + MORPH_OP_MUL4TA(dacc2, 0, dvecData3, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 2)); + MORPH_OP_MUL4TA(dacc3, 0, dvecData3, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 2)); + MORPH_OP_MUL4TA(dacc4, 0, dvecData3, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData4)), 2)); + + /* mulitples coeff c3,c7,c11,0 with input data */ + MORPH_OP_MUL4TA(dacc1, 0, dvecData4, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 3)); + MORPH_OP_MUL4TA(dacc2, 0, dvecData4, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 3)); + MORPH_OP_MUL4TA(dacc3, 0, dvecData4, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 3)); + MORPH_OP_MUL4TA(dacc4, 0, dvecData4, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData4)), 3)); + } /* end of for (ky = 0; ky < kSizeY; ky++)*/ + } /* end of for (inCh = 0; inCh < numInCh; inCh++)*/ + + /* Pack, Output Scale, Output Shift and clamping */ + xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L; + xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H; +#if DILATED_VQ_CONV == VQ_TRUE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \ + pOutScaleData[outCh], 
outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \ + pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc3, packShiftAccU, \ + pOutScaleData[outCh + 2 * enable3rdCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc4, packShiftAccU, \ + pOutScaleData[outCh + 3 * enable4thCh], outShiftU, minLim, maxLim, typeFlag); +#elif DILATED_VQ_CONV == VQ_FALSE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc3, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc4, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); +#endif + /* variable store count */ + varLen = XT_MIN(outW - x, vectorizationWidth); + + /* store the first half of the output vectors + * dvecOut1, dvecOut2, dvecOut3, dvecOut4 + */ + + /* Storing the first row outputs, first channel */ + pdvecOut = (xb_vec2Nx8 *) (pOutput); + valign vaOutData = IVP_ZALIGN(); + IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * varLen); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the first row outputs, 2nd channel */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + enable2ndCh * outDataPitch2 * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * varLen * enable2ndCh); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the first row outputs, 3rd channel */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + 2 * outDataPitch2 * enable3rdCh * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * varLen * enable3rdCh); + 
IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the first row outputs, 4th channel */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + 3 * outDataPitch2 * enable4thCh * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * varLen * enable4thCh); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* extract the half of the output vectors + * dvecOut1, dvecOut2, dvecOut3, dvecOut4 + * and store in the next row + */ + + /* Storing the 2nd row outputs, first channel */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch1 * enable2ndRow * bytesPerPixel); + IVP_SAV2NX8_XP(IVP_SHFL2NX8I(dvecOut1L, IVP_SHFLI_8B_SWAP_32), vaOutData, pdvecOut, \ + (-typeFlag + 1) * varLen * enable2ndRow); + IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * 2 * varLen * enable2ndRow); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the 2nd row outputs, 2nd channel */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + (outDataPitch2 * enable2ndCh + \ + outDataPitch1 * enable2ndRow) * bytesPerPixel); + IVP_SAV2NX8_XP(IVP_SHFL2NX8I(dvecOut2L, IVP_SHFLI_8B_SWAP_32), vaOutData, pdvecOut, \ + (-typeFlag + 1) * varLen * enable2ndCh * enable2ndRow); + IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * 2 * \ + varLen * enable2ndCh * enable2ndRow); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the 2nd row outputs, 3rd channel */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + (2 * outDataPitch2 * enable3rdCh + \ + outDataPitch1 * enable2ndRow) * bytesPerPixel); + IVP_SAV2NX8_XP(IVP_SHFL2NX8I(dvecOut3L, IVP_SHFLI_8B_SWAP_32), vaOutData, pdvecOut, \ + (-typeFlag + 1) * varLen * enable3rdCh * enable2ndRow); + IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * 2 * \ + varLen * enable3rdCh * enable2ndRow); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the 2nd row outputs, 4th channel */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + (3 * outDataPitch2 * enable4thCh + \ + outDataPitch1 * enable2ndRow) * bytesPerPixel); + IVP_SAV2NX8_XP(IVP_SHFL2NX8I(dvecOut4L, 
IVP_SHFLI_8B_SWAP_32), vaOutData, pdvecOut, \ + (-typeFlag + 1) * varLen * enable4thCh * enable2ndRow); + IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * 2 * \ + varLen * enable4thCh * enable2ndRow); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + pOutput += 4 * outDataPitch2 * bytesPerPixel; + pCoeff += 4 * coeffPitch3; + } /* end of (outCh = 0; outCh < numOutCh; outCh += 4)*/ + } /* end of for (y = 0; y < outH; y += 2)*/ + } /* end of for (x = 0; x < outW; x += vectorizationWidth)*/ + if (x < outW) + { + for (y = 0; y < outH; y += 2) /* Loop across Output height */ + { + /* In order to handle odd height*/ + int32_t enable2ndRow = XT_SALT(y, outH - 1); + + /* initialize output data pointer */ + int8_t *pOutput = &pOutData[(y * outDataPitch1 + x) * bytesPerPixel]; + + /* initialize input data pointer */ + MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * stride * y + stride * x]; + + /* initialize coeff and Bias data pointer to */ + int8_t *pCoeff = &pCoeffData[0]; + int32_t *pBias = &pBiasData[0]; + + for (outCh = 0; outCh < numOutCh; outCh += 4) /* Loop across Output depth */ + { + /* In order to handle odd depths*/ + int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1); + int32_t enable3rdCh = XT_SALT(outCh, numOutCh - 2); + int32_t enable4thCh = XT_SALT(outCh, numOutCh - 3); + + /* loads and replicate bias data */ + xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4 * enable2ndCh); + xb_vecN_2x32v hvecBias2; IVP_LSRN_2X32_XP(hvecBias2, pBias, 4 * enable3rdCh); + xb_vecN_2x32v hvecBias3; IVP_LSRN_2X32_XP(hvecBias3, pBias, 4 * enable4thCh); + xb_vecN_2x32v hvecBias4; IVP_LSRN_2X32_XP(hvecBias4, pBias, 4); + + /* wide vectors(accumulators) initialized with bias */ + xb_vec2Nx24 dacc1, dacc2, dacc3, dacc4; + dacc1 = IVP_CVT24UNX32L(hvecBias1, hvecBias1); + IVP_CVT24UNX32H(dacc1, hvecBias1, hvecBias1); + dacc2 = IVP_CVT24UNX32L(hvecBias2, hvecBias2); + IVP_CVT24UNX32H(dacc2, hvecBias2, hvecBias2); + dacc3 = IVP_CVT24UNX32L(hvecBias3, hvecBias3); + 
IVP_CVT24UNX32H(dacc3, hvecBias3, hvecBias3); + dacc4 = IVP_CVT24UNX32L(hvecBias4, hvecBias4); + IVP_CVT24UNX32H(dacc4, hvecBias4, hvecBias4); + + /* priming of coeff load is done outside the innermost loop*/ + pdvecCoeff1 = (xb_vec2Nx8 *) (pCoeff); + valign vaCoeffData1; vaCoeffData1 = IVP_LA2NX8_PP(pdvecCoeff1); + + pdvecCoeff2 = (xb_vec2Nx8 *) (pCoeff + coeffPitch3 * enable2ndCh); + valign vaCoeffData2; vaCoeffData2 = IVP_LA2NX8_PP(pdvecCoeff2); + + pdvecCoeff3 = (xb_vec2Nx8 *) (pCoeff + 2 * coeffPitch3 * enable3rdCh); + valign vaCoeffData3; vaCoeffData3 = IVP_LA2NX8_PP(pdvecCoeff3); + + pdvecCoeff4 = (xb_vec2Nx8 *) (pCoeff + 3 * coeffPitch3 * enable4thCh); + valign vaCoeffData4; vaCoeffData4 = IVP_LA2NX8_PP(pdvecCoeff4); + + for (inCh = 0; inCh < numInCh; inCh++) /* Loop across input channels */ + { + /* variable declarations for input and coeff vectors */ + xb_vec2Nx8 dvecCoeffData1, dvecCoeffData2, dvecCoeffData3, dvecCoeffData4; + + MORPH_IDT_2Nx8 dvecInData11; + MORPH_IDT_2Nx8 dvecInData21; + + MORPH_IDT_2Nx8 dvecData1, dvecData2, dvecData3, dvecData4; + + pdvecIn1 = (MORPH_IDT_2Nx8 *) (pInput + inCh * inDataPitch2); + pdvecIn2 = (MORPH_IDT_2Nx8 *) (pInput + inCh * inDataPitch2 + \ + stride * inDataPitch1 * enable2ndRow); + + for (ky = 0; ky < kSizeY; ky++) /* Loop across kernel height */ + { + /* loads 1st input row */ + valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1); + MORPH_OP_LOAD_2Nx8(dvecInData11, vaInData, pdvecIn1, inDataPitch1); + + /* loads 5th(corresponding to the 2nd output row) input row */ + vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn2); + MORPH_OP_LOAD_2Nx8(dvecInData21, vaInData, pdvecIn2, inDataPitch1); + + + /* 32 elements from 1st row and 32 elements from 2nd row are concatenated here + * If 1st input row is 0,1,2,3,4,5,6,7,8,9,...127, and the 2nd input row is + * 128,129,130,131.........252,253,254,255, Data should be arranged as + * + * dvecData1 : 0, 4, 8,...120,124,128,132,136,...248,252 + * dvecData2 : 1, 5, 
9,...121,125,129,133,137,...249,253 + * dvecData3 : 2, 6,10,...122,126,130,134,138,...250,254 + * dvecData4 : 3, 7,11,...123,127,131,135,139,...251,255 + * + * Lower half of the vectors contain data from 1st input row and + * upper half of the vectors contain data from 2nd output row. + * + */ + + IVP_DSEL2NX8I(dvecData2, dvecData1, + IVP_SEL2NX8I(dvecInData21, dvecInData21, IVP_SELI_8B_EXTRACT_2_OF_4_OFF_0), + IVP_SEL2NX8I(dvecInData11, dvecInData11, IVP_SELI_8B_EXTRACT_2_OF_4_OFF_0), + IVP_DSELI_8B_DEINTERLEAVE_1); + IVP_DSEL2NX8I(dvecData4, dvecData3, + IVP_SEL2NX8I(dvecInData21, dvecInData21, IVP_SELI_8B_EXTRACT_2_OF_4_OFF_2), + IVP_SEL2NX8I(dvecInData11, dvecInData11, IVP_SELI_8B_EXTRACT_2_OF_4_OFF_2), + IVP_DSELI_8B_DEINTERLEAVE_1); + + /* load 1 row of coeff for all the the 4 output channels */ + IVP_LAV2NX8_XP(dvecCoeffData1, vaCoeffData1, pdvecCoeff1, coeffPitch1); + IVP_LAV2NX8_XP(dvecCoeffData2, vaCoeffData2, pdvecCoeff2, coeffPitch1); + IVP_LAV2NX8_XP(dvecCoeffData3, vaCoeffData3, pdvecCoeff3, coeffPitch1); + IVP_LAV2NX8_XP(dvecCoeffData4, vaCoeffData4, pdvecCoeff4, coeffPitch1); + + /* shuffles the coeff in desired manner */ + dvecCoeffData1 = IVP_SHFL2NX8(dvecCoeffData1, dvecIdx); + dvecCoeffData2 = IVP_SHFL2NX8(dvecCoeffData2, dvecIdx); + dvecCoeffData3 = IVP_SHFL2NX8(dvecCoeffData3, dvecIdx); + dvecCoeffData4 = IVP_SHFL2NX8(dvecCoeffData4, dvecIdx); + + /* mulitples coeff c0,c4,c8,c12 with input data */ + MORPH_OP_MUL4TA(dacc1, 0, dvecData1, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 0)); + MORPH_OP_MUL4TA(dacc2, 0, dvecData1, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 0)); + MORPH_OP_MUL4TA(dacc3, 0, dvecData1, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 0)); + MORPH_OP_MUL4TA(dacc4, 0, dvecData1, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData4)), 0)); + + /* mulitples coeff c1,c5,c9,c13 with input data 
*/ + MORPH_OP_MUL4TA(dacc1, 0, dvecData2, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 1)); + MORPH_OP_MUL4TA(dacc2, 0, dvecData2, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 1)); + MORPH_OP_MUL4TA(dacc3, 0, dvecData2, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 1)); + MORPH_OP_MUL4TA(dacc4, 0, dvecData2, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData4)), 1)); + + /* mulitples coeff c2,c6,c10,c14 with input data */ + MORPH_OP_MUL4TA(dacc1, 0, dvecData3, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 2)); + MORPH_OP_MUL4TA(dacc2, 0, dvecData3, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 2)); + MORPH_OP_MUL4TA(dacc3, 0, dvecData3, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 2)); + MORPH_OP_MUL4TA(dacc4, 0, dvecData3, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData4)), 2)); + + /* mulitples coeff c3,c7,c11,0 with input data */ + MORPH_OP_MUL4TA(dacc1, 0, dvecData4, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData1)), 3)); + MORPH_OP_MUL4TA(dacc2, 0, dvecData4, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData2)), 3)); + MORPH_OP_MUL4TA(dacc3, 0, dvecData4, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData3)), 3)); + MORPH_OP_MUL4TA(dacc4, 0, dvecData4, IVP_EXTRN_2X32( \ + IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData4)), 3)); + } /* end of for (ky = 0; ky < kSizeY; ky++)*/ + } /* end of for (inCh = 0; inCh < numInCh; inCh++)*/ + + /* Pack, Output Scale, Output Shift and clamping */ + xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L; + xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H; +#if DILATED_VQ_CONV == VQ_TRUE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, 
packShiftAccU, \ + pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \ + pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc3, packShiftAccU, \ + pOutScaleData[outCh + 2 * enable3rdCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc4, packShiftAccU, \ + pOutScaleData[outCh + 3 * enable4thCh], outShiftU, minLim, maxLim, typeFlag); +#elif DILATED_VQ_CONV == VQ_FALSE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc1, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc2, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc3, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc4, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); +#endif + /* variable store count */ + varLen = XT_MIN(outW - x, vectorizationWidth); + + /* store the first half of the output vectors + * dvecOut1, dvecOut2, dvecOut3, dvecOut4 + */ + + /* Storing the first row outputs, first channel */ + pdvecOut = (xb_vec2Nx8 *) (pOutput); + valign vaOutData = IVP_ZALIGN(); + IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * varLen); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the first row outputs, 2nd channel */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + enable2ndCh * outDataPitch2 * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * varLen * enable2ndCh); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the first row outputs, 3rd channel */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + 2 * outDataPitch2 * enable3rdCh * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * 
varLen * enable3rdCh); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the first row outputs, 4th channel */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + 3 * outDataPitch2 * enable4thCh * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * varLen * enable4thCh); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* extract the half of the output vectors + * dvecOut1, dvecOut2, dvecOut3, dvecOut4 + * and store in the next row + */ + + /* Storing the 2nd row outputs, first channel */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch1 * enable2ndRow * bytesPerPixel); + IVP_SAV2NX8_XP(IVP_SHFL2NX8I(dvecOut1L, IVP_SHFLI_8B_SWAP_32), vaOutData, pdvecOut, \ + (-typeFlag + 1) * varLen * enable2ndRow); + IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * 2 * varLen * enable2ndRow); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the 2nd row outputs, 2nd channel */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + (outDataPitch2 * enable2ndCh + \ + outDataPitch1 * enable2ndRow) * bytesPerPixel); + IVP_SAV2NX8_XP(IVP_SHFL2NX8I(dvecOut2L, IVP_SHFLI_8B_SWAP_32), vaOutData, pdvecOut, \ + (-typeFlag + 1) * varLen * enable2ndCh * enable2ndRow); + IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * 2 * \ + varLen * enable2ndCh * enable2ndRow); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the 2nd row outputs, 3rd channel */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + (2 * outDataPitch2 * enable3rdCh + \ + outDataPitch1 * enable2ndRow) * bytesPerPixel); + IVP_SAV2NX8_XP(IVP_SHFL2NX8I(dvecOut3L, IVP_SHFLI_8B_SWAP_32), vaOutData, pdvecOut, \ + (-typeFlag + 1) * varLen * enable3rdCh * enable2ndRow); + IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * 2 * \ + varLen * enable3rdCh * enable2ndRow); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the 2nd row outputs, 4th channel */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + (3 * outDataPitch2 * enable4thCh + \ + outDataPitch1 * enable2ndRow) * bytesPerPixel); + 
IVP_SAV2NX8_XP(IVP_SHFL2NX8I(dvecOut4L, IVP_SHFLI_8B_SWAP_32), vaOutData, pdvecOut, \ + (-typeFlag + 1) * varLen * enable4thCh * enable2ndRow); + IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * 2 * \ + varLen * enable4thCh * enable2ndRow); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + pOutput += 4 * outDataPitch2 * bytesPerPixel; + pCoeff += 4 * coeffPitch3; + } /* end of (outCh = 0; outCh < numOutCh; outCh += 4)*/ + } /* end of for (y = 0; y < outH; y += 2)*/ + } + return(XAI_ERROR_STATUS()); +} + +/****************************************************************************************** +* xaiConvolved(VQ)3D_S_MxNj1d2I8S8IX_MOW_WHD +* ***************************************************************************************/ +/******************************************************************************/ +/* Description : P6 optimized generic implementation for MxN 3D convolution */ +/* with dilation = 2. Based on MORPH pre-processor specifiers, */ +/* code implementation is generated during preprocessing stage. */ +/* This method can be used to generate MxN 3D dilated */ +/* convolution function and MxN 3D VQ dilated convolution */ +/* function for U8 bit and S8 bit input data with input stride */ +/* equal to 1.
*/ +/* Inputs : Input Data Tile, Coeff Data Tile, Bias Array, */ +/* Output scale array, CNN convolution params structure */ +/* Outputs : XI Error Code */ +/* InOuts : Output Tile */ +/* Assumptions : CoeffData is S8 */ +/* biasArray is signed 32b, value not exceeding signed 24b */ +/* Output scale array is U16 */ +/* OutData is S8 / U8 / S16 */ +/* Kernel Size is MxNxDxN */ +/* Input and Output are in WHD format */ +/* Coeff is in WHDN format */ +/******************************************************************************/ + +/****************** xaiConvolvedVQ3D_S_MxNj1d2_S8S8IX_MOW_WHD ******************/ +/****************** xaiConvolvedVQ3D_S_MxNj1d2_U8S8IX_MOW_WHD ******************/ +/******************* xaiConvolved3D_S_MxNj1d2_S8S8IX_MOW_WHD *******************/ +/******************* xaiConvolved3D_S_MxNj1d2_U8S8IX_MOW_WHD *******************/ + +XAI_ERR_TYPE MAKE_NAME(MAKE_NAME_VQ(xaiConvolved, 3D_S_MxNj1d2), S8IX_MOW_WHD) MAKE_ARGUMENTS(inTile, coeffTile, biasArray, outTile, param) +{ + /* Error Checks */ + XAI_ERROR_CHECKS() + { + MORPH_IDT_CHECK(inTile); + XAI_CHECK_CONV_OUTPUT_TILE3D(outTile); + XAI_CHECK_TILE4D_S8(coeffTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile); + XAI_CHECK_TILE4D_IN_DRAM_BOUNDARY(coeffTile); + XAI_CHECK_POINTER(param); + XAI_CHECK_ARRAY_S32(biasArray); + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTile); + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(coeffTile, outTile); + XAI_CHECK_ERROR(XAI_TILE4D_GET_DIM1(coeffTile) <= 16, XAI_ERR_KSIZE, \ + "Kernel width = %d, which should be less than or equal to 16", \ + XAI_TILE4D_GET_DIM1(coeffTile)); + XAI_CHECK_ERROR(XAI_TILE4D_GET_DIM2(coeffTile) <= 16, XAI_ERR_KSIZE, \ + "\nKernel height = %d, which should be less than or equal to 16", \ + XAI_TILE4D_GET_DIM2(coeffTile)); + XAI_CHECK_EDGES_MOW_WHD(inTile, coeffTile, param); + XAI_CHECK_TILE3D_DATA_ORDER(inTile, XAI_WHD); + XAI_CHECK_TILE3D_DATA_ORDER(outTile, XAI_WHD); + 
XAI_CHECK_TILE4D_DATA_ORDER(coeffTile, XAI_WHDN); + XAI_CHECK_STRIDE(param, 1); + XAI_CHECK_ERROR((XAI_CNN_CONV_GET_STRIDEX(param) == XAI_CNN_CONV_GET_STRIDEY(param)), \ + XAI_ERR_BADARG, "\nStride along width = %hhu and height = %hhu\nStride along width and height should be equal", \ + XAI_CNN_CONV_GET_STRIDEX(param), XAI_CNN_CONV_GET_STRIDEY(param)); + XAI_CHECK_DILATION(param, 2); + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_DILATIONX(param) == XAI_CNN_CONV_GET_DILATIONY(param), \ + XAI_ERR_BADARG, "\nDilation along width = %hhu and height = %hhu\nDilation along width and height should be equal", \ + XAI_CNN_CONV_GET_DILATIONX(param), XAI_CNN_CONV_GET_DILATIONY(param)); + XAI_CHECK_CONSISTENCY_MOW_WHD(inTile, coeffTile, biasArray, outTile, param); + XAI_CHECK_COEFFTILE_CONTIGUOUS(coeffTile, param); + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_ACCUM_SHIFT(param) < 24, \ + XAI_ERR_NORM, "\nThe accumulator shift = %hhu, value should be less than 24", \ + XAI_CNN_CONV_GET_ACCUM_SHIFT(param)); + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_OUTPUT_SHIFT(param) < 32, \ + XAI_ERR_NORM, "\nThe output shift = %hhu, value should be less than 32", \ + XAI_CNN_CONV_GET_OUTPUT_SHIFT(param)); + XAI_CHECK_CONV_RELU_LIMITS_IX(param, outTile); +#if DILATED_VQ_CONV == VQ_TRUE + XAI_CHECK_ARRAY_U16(outputScaleArray); + XAI_CHECK_ERROR(XAI_ARRAY_GET_WIDTH(outputScaleArray) >= XAI_TILE4D_GET_DIM4(coeffTile), \ + XAI_ERR_DATASIZE, "\nWidth of Output Scale Array = %d, Number of Kernels = %d\nWidth of Output Scale Array should be greater than or equal to Number of Kernels", \ + XAI_ARRAY_GET_WIDTH(outputScaleArray), XAI_TILE4D_GET_DIM4(coeffTile)); + XAI_CHECK_ERROR((((uintptr_t) (XAI_ARRAY_GET_DATA_PTR(outputScaleArray)) & \ + 0x1) == 0), XAI_ERR_NORM, "The output scale array is not aligned to 2 byte boundary"); +#endif + } +#if DILATED_VQ_CONV == VQ_FALSE + if (XAI_CNN_CONV_GET_OUTPUT_SCALE(param) == 0) + { + int32_t fillValue; + int32_t reluFlag = XAI_CNN_CONV_GET_FLAG_RELU(param); + fillValue = reluFlag ? 
(CLAMP(0, XAI_CNN_CONV_GET_RELU_MIN(param), XAI_CNN_CONV_GET_RELU_MAX(param))) : 0; + return(xaiFillTile3D(outTile, fillValue, 0)); + } +#endif + /* Getting parameters from the tile structures */ + const int32_t inW = XAI_TILE3D_GET_DIM1(inTile) + \ + XAI_TILE3D_GET_DIM1_EDGE1(inTile) + XAI_TILE3D_GET_DIM1_EDGE2(inTile); + const int32_t outW = XAI_TILE3D_GET_DIM1(outTile); + const int32_t outH = XAI_TILE3D_GET_DIM2(outTile); + const int32_t numInCh = XAI_TILE3D_GET_DIM3(inTile); + const int32_t numOutCh = XAI_TILE3D_GET_DIM3(outTile); + + /* Kernel Size (WHDN)*/ + const int32_t kSizeX = XAI_TILE4D_GET_DIM1(coeffTile); + const int32_t kSizeY = XAI_TILE4D_GET_DIM2(coeffTile); + + /* CNN convolution parameters */ + const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param); + const uint8_t outShiftU = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param); + const uint8_t enableReLu = XAI_CNN_CONV_GET_FLAG_RELU(param); + const uint8_t dilationU = XAI_CNN_CONV_GET_DILATION(param); + + /* Pitches of Coefficient Data (WHDN) */ + const int32_t coeffPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTile); + const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile); + + /* Pitches of Input Data (WHD) in dim1 and dim2 */ + const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile); + const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile); + + /* Pitch of Output Data (WHD) in dim1 and dim2 */ + const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile); + const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile); + + /* Data Pointers of input, output, coefficient and bias data */ + MORPH_IDT_SCALAR* pInData = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(inTile); + int8_t* pOutData = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile); + int32_t* pBiasData = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray); + int8_t* pCoeffData = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile); +#if DILATED_VQ_CONV == VQ_TRUE + uint16_t* restrict pOutScaleData = (uint16_t *) 
XAI_ARRAY_GET_DATA_PTR(outputScaleArray); +#elif DILATED_VQ_CONV == VQ_FALSE + const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param); +#endif + /* Since the dilation value > 1 , */ + /* Effective Kernel size = dilation(KernelSize - 1) + 1 */ + /* Effective kernel size is used for calculating the min required edge */ + int32_t dilatedKSizeX = dilationU * (kSizeX - 1) + 1; + int32_t dilatedKSizeY = dilationU * (kSizeY - 1) + 1; + + /* For dilation equal to 2 dilated width and height will always be odd */ + /* Condition check to evaluate left or right alignment of kernel based */ + /* on the edge flag is not required. */ + + /* Move pointer to the start of the active data (including edge) */ + pInData = &pInData[-((dilatedKSizeY / 2) * inDataPitch1 + (dilatedKSizeX / 2))]; + + /* Setting the limits for output data according to ReLu Flag and outTileType */ + int32_t minLim, maxLim; + if (enableReLu) + { + minLim = XAI_CNN_CONV_GET_RELU_MIN(param); + maxLim = XAI_CNN_CONV_GET_RELU_MAX(param); + } + else + { + minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \ + SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0); + maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \ + : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX); + } + const int8_t typeFlag = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0; + const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile); + + MORPH_IDT_2Nx8 * restrict pdvecIn1; + xb_vec2Nx8* restrict pdvecOut; + xb_vec2Nx8* restrict pdvecCoeff1, * restrict pdvecCoeff2; + + /* Generating two select interleave pattern to apply on accumulator values just before storing + * For 8 bit output + * Pattern1 = 0 64 1 65 2 66 .... 31 95 + * Pattern2 = 32 96 33 97 34 98 ... 63 127 + * For 16 bit output + * Pattern1 = 0 1 64 65 2 3 66 67 .... 30 31 94 95 + * Pattern2 = 32 33 96 97 34 35 98 99 ... 62 63 126 127 + */ + /* 0 1 2 3 .. 
62 63*/ + xb_vec2Nx8 dvecPattern1 = IVP_SEQ2NX8(); + /* 64 65 66 67 ...126 127*/ + xb_vec2Nx8 dvecPattern2 = IVP_ADD2NX8(dvecPattern1, 64); + + if (!typeFlag) + { + MORPH_OP_DSELI(dvecPattern2, dvecPattern1, \ + dvecPattern2, dvecPattern1, \ + IVP_DSELI_8B_INTERLEAVE_1); + } + else + { + MORPH_OP_DSELI(dvecPattern2, dvecPattern1, \ + dvecPattern2, dvecPattern1, \ + IVP_DSELI_INTERLEAVE_1); + } + + /* Variable Declarations */ + int32_t inCh, outCh, x, y, ky; + const int32_t vectorizationWidth = 4 * XCHAL_IVPN_SIMD_WIDTH - dilatedKSizeX + 1; + int32_t varLen; + + /* 4 * XCHAL_IVPN_SIMD_WIDTH bytes of input are loaded at a time + * into two vectors. Also loop across output channels is unrolled twice, + * thereby producing four output vectors in 1 iteration. + * + * Load 128 input bytes from row corresponding to each ky + * dvecInData11 = a0 a1 a2 a3.... a63 + * dvecInData12 = a64 a65 a66 .... a127 + * + * Separate odd and even indices + * dvecInData11 = a0 a2 a4 a6.... a126 + * dvecInData12 = a1 a3 a5 a7.... a127 + * + * Let the coefficients be + * C11 C12 C13 ... C1kW + * C21 C22 C23 ... C2kW + * . + * . + * CkH1 CkH2 CkH3 ... CkHkW + * + * acc11 = [a0 a2 a4 a6.... a126] * C11 + + * [a2 a4 a6.... a126 X ] * C12 + + * [a4 a6.... a126 X X ] * C13 + + * . + * . + * [ ] * C1kW + * + * acc12 = [a1 a3 a5 a7.... a127] * C11 + + * [a3 a5 a7.... a127 X ] * C12 + + * [a5 a7.... a127 X X ] * C13 + + * . + * . + * [ ] * C1kW + * + * Continue the same multiplication steps for ky = 1 to kHeight -1 . + * acc11 and acc12 contains convolved output corresponding to even and odd indices + * respectively at the end of inchannel loop iterations. + * + * acc11 and acc12 are interleaved to obtain the outputs in correct order. + * + */ + + if (kSizeX > 12) + { + /* 4 * XCHAL_IVPN_SIMD_WIDTH bytes of input are loaded at a time + * into two vectors. 
Also loop across output channels is unrolled twice, + * thereby producing four output vectors in 1 iteration + */ + for (x = 0; x < outW; x += vectorizationWidth) /* Loop across Output width */ + { + /* out of bound flag */ + int32_t flag = XT_SALT(64, inW - x); + + for (y = 0; y < outH; y++) /* Loop across Output height */ + { + /* initialize output data pointer */ + int8_t *pOutput = &pOutData[(y * outDataPitch1 + x) * bytesPerPixel]; + + /* initialize input data pointer */ + MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * (y) + (x)]; + + /* initialize coeff and bias data pointer*/ + int8_t *pCoeff = &pCoeffData[0]; + int32_t *pBias = &pBiasData[0]; + + for (outCh = 0; outCh < numOutCh; outCh += 2) /* Loop across Output depth */ + { + /* handles odd output channel */ + int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1); + + xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4 * enable2ndCh); + xb_vecN_2x32v hvecBias2; IVP_LSRN_2X32_XP(hvecBias2, pBias, 4); + + /* wide vectors(accumulators) initialized with bias */ + xb_vec2Nx24 dacc11, dacc12, dacc21, dacc22; + + dacc12 = dacc11 = IVP_CVT24UNX32L(hvecBias1, hvecBias1); + IVP_CVT24UNX32H(dacc11, hvecBias1, hvecBias1); + IVP_CVT24UNX32H(dacc12, hvecBias1, hvecBias1); + + dacc22 = dacc21 = IVP_CVT24UNX32L(hvecBias2, hvecBias2); + IVP_CVT24UNX32H(dacc21, hvecBias2, hvecBias2); + IVP_CVT24UNX32H(dacc22, hvecBias2, hvecBias2); + + /* priming of coeff load is done outside the innermost loop*/ + pdvecCoeff1 = (xb_vec2Nx8 *) (pCoeff); + valign vaCoeffData1; vaCoeffData1 = IVP_LA2NX8_PP(pdvecCoeff1); + + pdvecCoeff2 = (xb_vec2Nx8 *) (pCoeff + coeffPitch3 * enable2ndCh); + valign vaCoeffData2; vaCoeffData2 = IVP_LA2NX8_PP(pdvecCoeff2); + + for (inCh = 0; inCh < numInCh; inCh++) /* Loop across input channels */ + { + /* variable declarations for input and coeff vectors */ + + xb_vec2Nx8 dvecCoeffData11; + xb_vec2Nx8 dvecCoeffData21; + + xb_vec2Nx8 dvecInData11, dvecInData12; + + pdvecIn1 = (MORPH_IDT_2Nx8 *) 
(pInput + inCh * inDataPitch2); + + for (ky = 0; ky < kSizeY; ky++) /* Loop across kernel height */ + { + /* loads 128 bytes of input row */ + valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1); + MORPH_OP_LOAD_2Nx8(dvecInData11, vaInData, pdvecIn1, 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + MORPH_OP_LOAD_2Nx8(dvecInData12, vaInData, pdvecIn1, \ + dilationU * inDataPitch1 - 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + + MORPH_OP_DSELI(dvecInData12, dvecInData11, dvecInData12, dvecInData11, \ + IVP_DSELI_8B_DEINTERLEAVE_1); + + /* load 1 row of coeff for 1st output channel */ + IVP_LAV2NX8_XP(dvecCoeffData11, vaCoeffData1, pdvecCoeff1, coeffPitch1); + + /* load 1 row of coeff for 2nd output channel */ + IVP_LAV2NX8_XP(dvecCoeffData21, vaCoeffData2, pdvecCoeff2, coeffPitch1); + + /* multiples loaded input data with first four coeff */ + MORPH_OP_MUL4TA(dacc11, dvecInData11, dvecInData11, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \ + 0)); + MORPH_OP_MUL4TA(dacc12, dvecInData12, dvecInData12, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \ + 0)); + + MORPH_OP_MUL4TA(dacc21, dvecInData11, dvecInData11, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \ + 0)); + MORPH_OP_MUL4TA(dacc22, dvecInData12, dvecInData12, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \ + 0)); + + /* right rotate the input vectors by 4 + * in order to multiply with next column of + * coeff in the next iteration + */ + dvecInData11 = IVP_SEL2NX8I(dvecInData12, dvecInData11, IVP_SELI_8B_ROTATE_RIGHT_4); + dvecInData12 = IVP_SEL2NX8I((xb_vec2Nx8) 0, dvecInData12, IVP_SELI_8B_ROTATE_RIGHT_4); + + /* multiples input data with next four coeffs from the same row */ + MORPH_OP_MUL4TA(dacc11, dvecInData11, dvecInData11, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \ + 1)); + MORPH_OP_MUL4TA(dacc12, dvecInData12, dvecInData12, 
IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \ + 1)); + + MORPH_OP_MUL4TA(dacc21, dvecInData11, dvecInData11, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \ + 1)); + MORPH_OP_MUL4TA(dacc22, dvecInData12, dvecInData12, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \ + 1)); + + /* right rotate the input vectors by 4 + * in order to multiply with next column of + * coeff in the next iteration + */ + dvecInData11 = IVP_SEL2NX8I(dvecInData12, dvecInData11, IVP_SELI_8B_ROTATE_RIGHT_4); + dvecInData12 = IVP_SEL2NX8I((xb_vec2Nx8) 0, dvecInData12, IVP_SELI_8B_ROTATE_RIGHT_4); + + /* multiples input data with next four coeffs from the same row */ + MORPH_OP_MUL4TA(dacc11, dvecInData11, dvecInData11, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \ + 2)); + MORPH_OP_MUL4TA(dacc12, dvecInData12, dvecInData12, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \ + 2)); + + MORPH_OP_MUL4TA(dacc21, dvecInData11, dvecInData11, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \ + 2)); + MORPH_OP_MUL4TA(dacc22, dvecInData12, dvecInData12, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \ + 2)); + + /* right rotate the input vectors by 4 + * in order to multiply with next column of + * coeff in the next iteration + */ + dvecInData11 = IVP_SEL2NX8I(dvecInData12, dvecInData11, IVP_SELI_8B_ROTATE_RIGHT_4); + dvecInData12 = IVP_SEL2NX8I((xb_vec2Nx8) 0, dvecInData12, IVP_SELI_8B_ROTATE_RIGHT_4); + + /* multiples input data with next four coeffs from the same row */ + MORPH_OP_MUL4TA(dacc11, dvecInData11, dvecInData11, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \ + 3)); + MORPH_OP_MUL4TA(dacc12, dvecInData12, dvecInData12, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), 
\ + 3)); + + MORPH_OP_MUL4TA(dacc21, dvecInData11, dvecInData11, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \ + 3)); + MORPH_OP_MUL4TA(dacc22, dvecInData12, dvecInData12, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \ + 3)); + } /* end of for (ky = 0; ky < kSizeY; ky++)*/ + } /* end of for (inCh = 0; inCh < numInCh; inCh++)*/ + + /* Pack, Output Scale, Output Shift and clamping */ + xb_vec2Nx8 dvec1L, dvec2L, dvec3L, dvec4L; + xb_vec2Nx8 dvec1H, dvec2H, dvec3H, dvec4H; +#if DILATED_VQ_CONV == VQ_TRUE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvec1L, dvec1H, dacc11, packShiftAccU, \ + pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvec2L, dvec2H, dacc12, packShiftAccU, \ + pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvec3L, dvec3H, dacc21, packShiftAccU, \ + pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvec4L, dvec4H, dacc22, packShiftAccU, \ + pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag); +#elif DILATED_VQ_CONV == VQ_FALSE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvec1L, dvec1H, dacc11, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvec2L, dvec2H, dacc12, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvec3L, dvec3H, dacc21, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvec4L, dvec4H, dacc22, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); +#endif + /* Interleave odd and even indices */ + xb_vec2Nx8 dvecOut1L = MORPH_OP_SEL(dvec2L, dvec1L, dvecPattern1); + xb_vec2Nx8 dvecOut2L = MORPH_OP_SEL(dvec2L, dvec1L, dvecPattern2); + xb_vec2Nx8 dvecOut1H = MORPH_OP_SEL(dvec2H, dvec1H, dvecPattern1); + xb_vec2Nx8 dvecOut2H = MORPH_OP_SEL(dvec2H, dvec1H, 
dvecPattern2); + xb_vec2Nx8 dvecOut3L = MORPH_OP_SEL(dvec4L, dvec3L, dvecPattern1); + xb_vec2Nx8 dvecOut4L = MORPH_OP_SEL(dvec4L, dvec3L, dvecPattern2); + xb_vec2Nx8 dvecOut3H = MORPH_OP_SEL(dvec4H, dvec3H, dvecPattern1); + xb_vec2Nx8 dvecOut4H = MORPH_OP_SEL(dvec4H, dvec3H, dvecPattern2); + + /* variable store count */ + varLen = XT_MIN(outW - x, vectorizationWidth); + + /* Storing the first output depth, first row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput); + valign vaOutData = IVP_ZALIGN(); + IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * varLen); + IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * varLen - \ + 2 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * \ + (2 * varLen - 4 * XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * \ + (2 * varLen - 6 * XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the second output depth, first row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch2 * enable2ndCh * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * varLen * enable2ndCh); + IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, ((bytesPerPixel * varLen) - \ + 2 * XCHAL_IVPN_SIMD_WIDTH) * enable2ndCh); + IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * \ + (2 * varLen - 4 * XCHAL_IVPN_SIMD_WIDTH) * enable2ndCh); + IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * \ + (2 * varLen - 6 * XCHAL_IVPN_SIMD_WIDTH) * enable2ndCh); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + pOutput += 2 * outDataPitch2 * bytesPerPixel; + pCoeff += 2 * coeffPitch3; + } /* end of (outCh = 0; outCh < numOutCh; outCh += 2)*/ + } /* end of for (y = 0; y < outH; y ++)*/ + } /* end of for (x = 0; x < outW; x += vectorizationWidth)*/ + } + else if (kSizeX > 8) + { + /* 4 * XCHAL_IVPN_SIMD_WIDTH bytes of input are loaded at a time + * into two vectors. 
Also loop across output channels is unrolled twice, + * thereby producing four output vectors in 1 iteration + */ + for (x = 0; x < outW; x += vectorizationWidth) /* Loop across Output width */ + { + /* out of bound flag */ + int32_t flag = XT_SALT(64, inW - x); + + for (y = 0; y < outH; y++) /* Loop across Output height */ + { + /* initialize output data pointer */ + int8_t *pOutput = &pOutData[(y * outDataPitch1 + x) * bytesPerPixel]; + + /* initialize input data pointer */ + MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * (y) + (x)]; + + /* initialize coeff and bias data pointer*/ + int8_t *pCoeff = &pCoeffData[0]; + int32_t *pBias = &pBiasData[0]; + + for (outCh = 0; outCh < numOutCh; outCh += 2) /* Loop across Output depth */ + { + /* handles odd output channel*/ + int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1); + + xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4 * enable2ndCh); + xb_vecN_2x32v hvecBias2; IVP_LSRN_2X32_XP(hvecBias2, pBias, 4); + + /* wide vectors(accumulators) initialized with bias */ + xb_vec2Nx24 dacc11, dacc12, dacc21, dacc22; + + dacc12 = dacc11 = IVP_CVT24UNX32L(hvecBias1, hvecBias1); + IVP_CVT24UNX32H(dacc11, hvecBias1, hvecBias1); + IVP_CVT24UNX32H(dacc12, hvecBias1, hvecBias1); + + dacc22 = dacc21 = IVP_CVT24UNX32L(hvecBias2, hvecBias2); + IVP_CVT24UNX32H(dacc21, hvecBias2, hvecBias2); + IVP_CVT24UNX32H(dacc22, hvecBias2, hvecBias2); + + /* priming of coeff load is done outside the innermost loop*/ + pdvecCoeff1 = (xb_vec2Nx8 *) (pCoeff); + valign vaCoeffData1; vaCoeffData1 = IVP_LA2NX8_PP(pdvecCoeff1); + + pdvecCoeff2 = (xb_vec2Nx8 *) (pCoeff + coeffPitch3 * enable2ndCh); + valign vaCoeffData2; vaCoeffData2 = IVP_LA2NX8_PP(pdvecCoeff2); + + for (inCh = 0; inCh < numInCh; inCh++) /* Loop across input channels */ + { + /* variable declarations for input and coeff vectors */ + + xb_vec2Nx8 dvecCoeffData11; + xb_vec2Nx8 dvecCoeffData21; + + xb_vec2Nx8 dvecInData11, dvecInData12; + + pdvecIn1 = (MORPH_IDT_2Nx8 *) 
(pInput + inCh * inDataPitch2); + + for (ky = 0; ky < kSizeY; ky++) /* Loop across kernel height */ + { + /* loads 128 bytes of input row */ + valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1); + MORPH_OP_LOAD_2Nx8(dvecInData11, vaInData, pdvecIn1, 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + MORPH_OP_LOAD_2Nx8(dvecInData12, vaInData, pdvecIn1, \ + dilationU * inDataPitch1 - 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + + MORPH_OP_DSELI(dvecInData12, dvecInData11, dvecInData12, dvecInData11, \ + IVP_DSELI_8B_DEINTERLEAVE_1); + + /* load 1 row of coeff for 1st output channel */ + IVP_LAV2NX8_XP(dvecCoeffData11, vaCoeffData1, pdvecCoeff1, coeffPitch1); + + /* load 1 row of coeff for 2nd output channel */ + IVP_LAV2NX8_XP(dvecCoeffData21, vaCoeffData2, pdvecCoeff2, coeffPitch1); + + /* multiples loaded input data with first four coeff */ + MORPH_OP_MUL4TA(dacc11, dvecInData11, dvecInData11, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \ + 0)); + MORPH_OP_MUL4TA(dacc12, dvecInData12, dvecInData12, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \ + 0)); + + MORPH_OP_MUL4TA(dacc21, dvecInData11, dvecInData11, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \ + 0)); + MORPH_OP_MUL4TA(dacc22, dvecInData12, dvecInData12, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \ + 0)); + + /* right rotate the input vectors by 4 + * in order to multiply with next column of + * coeff in the next iteration + */ + dvecInData11 = IVP_SEL2NX8I(dvecInData12, dvecInData11, IVP_SELI_8B_ROTATE_RIGHT_4); + dvecInData12 = IVP_SEL2NX8I((xb_vec2Nx8) 0, dvecInData12, IVP_SELI_8B_ROTATE_RIGHT_4); + + /* multiples input data with next four coeffs from the same row */ + MORPH_OP_MUL4TA(dacc11, dvecInData11, dvecInData11, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \ + 1)); + MORPH_OP_MUL4TA(dacc12, dvecInData12, dvecInData12, 
IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \ + 1)); + + MORPH_OP_MUL4TA(dacc21, dvecInData11, dvecInData11, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \ + 1)); + MORPH_OP_MUL4TA(dacc22, dvecInData12, dvecInData12, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \ + 1)); + + /* right rotate the input vectors by 4 + * in order to multiply with next column of + * coeff in the next iteration + */ + dvecInData11 = IVP_SEL2NX8I(dvecInData12, dvecInData11, IVP_SELI_8B_ROTATE_RIGHT_4); + dvecInData12 = IVP_SEL2NX8I((xb_vec2Nx8) 0, dvecInData12, IVP_SELI_8B_ROTATE_RIGHT_4); + + /* multiples input data with next four coeffs from the same row */ + MORPH_OP_MUL4TA(dacc11, dvecInData11, dvecInData11, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \ + 2)); + MORPH_OP_MUL4TA(dacc12, dvecInData12, dvecInData12, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \ + 2)); + + MORPH_OP_MUL4TA(dacc21, dvecInData11, dvecInData11, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \ + 2)); + MORPH_OP_MUL4TA(dacc22, dvecInData12, dvecInData12, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \ + 2)); + } /* end of for (ky = 0; ky < kSizeY; ky++)*/ + } /* end of for (inCh = 0; inCh < numInCh; inCh++)*/ + + /* Pack, Output Scale, Output Shift and clamping */ + xb_vec2Nx8 dvec1L, dvec2L, dvec3L, dvec4L; + xb_vec2Nx8 dvec1H, dvec2H, dvec3H, dvec4H; +#if DILATED_VQ_CONV == VQ_TRUE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvec1L, dvec1H, dacc11, packShiftAccU, \ + pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvec2L, dvec2H, dacc12, packShiftAccU, \ + pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvec3L, dvec3H, dacc21, packShiftAccU, \ + pOutScaleData[outCh + 
enable2ndCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvec4L, dvec4H, dacc22, packShiftAccU, \ + pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag); +#elif DILATED_VQ_CONV == VQ_FALSE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvec1L, dvec1H, dacc11, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvec2L, dvec2H, dacc12, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvec3L, dvec3H, dacc21, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvec4L, dvec4H, dacc22, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); +#endif + /* Interleave odd and even indices */ + xb_vec2Nx8 dvecOut1L = MORPH_OP_SEL(dvec2L, dvec1L, dvecPattern1); + xb_vec2Nx8 dvecOut2L = MORPH_OP_SEL(dvec2L, dvec1L, dvecPattern2); + xb_vec2Nx8 dvecOut1H = MORPH_OP_SEL(dvec2H, dvec1H, dvecPattern1); + xb_vec2Nx8 dvecOut2H = MORPH_OP_SEL(dvec2H, dvec1H, dvecPattern2); + xb_vec2Nx8 dvecOut3L = MORPH_OP_SEL(dvec4L, dvec3L, dvecPattern1); + xb_vec2Nx8 dvecOut4L = MORPH_OP_SEL(dvec4L, dvec3L, dvecPattern2); + xb_vec2Nx8 dvecOut3H = MORPH_OP_SEL(dvec4H, dvec3H, dvecPattern1); + xb_vec2Nx8 dvecOut4H = MORPH_OP_SEL(dvec4H, dvec3H, dvecPattern2); + + /* variable store count */ + varLen = XT_MIN(outW - x, vectorizationWidth); + + /* Storing the first output depth, first row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput); + valign vaOutData = IVP_ZALIGN(); + IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * varLen); + IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * varLen - \ + 2 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * \ + (2 * varLen - 4 * XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * \ + (2 * varLen - 6 * XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the second 
output depth, first row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch2 * enable2ndCh * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * varLen * enable2ndCh); + IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, ((bytesPerPixel * varLen) - \ + 2 * XCHAL_IVPN_SIMD_WIDTH) * enable2ndCh); + IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * \ + (2 * varLen - 4 * XCHAL_IVPN_SIMD_WIDTH) * enable2ndCh); + IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * \ + (2 * varLen - 6 * XCHAL_IVPN_SIMD_WIDTH) * enable2ndCh); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + pOutput += 2 * outDataPitch2 * bytesPerPixel; + pCoeff += 2 * coeffPitch3; + } /* end of (outCh = 0; outCh < numOutCh; outCh += 2)*/ + } /* end of for (y = 0; y < outH; y ++)*/ + } /* end of for (x = 0; x < outW; x += vectorizationWidth)*/ + } + else if (kSizeX > 4) + { + /* 4 * XCHAL_IVPN_SIMD_WIDTH bytes of input are loaded at a time + * into two vectors. Also loop across output channels is unrolled twice, + * thereby producing four output vectors in 1 iteration + */ + for (x = 0; x < outW; x += vectorizationWidth) /* Loop across Output width */ + { + /* out of bound flag */ + int32_t flag = XT_SALT(64, inW - x); + + for (y = 0; y < outH; y++) /* Loop across Output height */ + { + /* initialize output data pointer */ + int8_t *pOutput = &pOutData[(y * outDataPitch1 + x) * bytesPerPixel]; + + /* initialize input data pointer */ + MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * (y) + (x)]; + + /* initialize coeff and bias data pointer*/ + int8_t *pCoeff = &pCoeffData[0]; + int32_t *pBias = &pBiasData[0]; + + for (outCh = 0; outCh < numOutCh; outCh += 2) /* Loop across Output depth */ + { + /* handles odd output channel */ + int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1); + + xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4 * enable2ndCh); + xb_vecN_2x32v hvecBias2; IVP_LSRN_2X32_XP(hvecBias2, pBias, 4); + + /* wide vectors(accumulators) 
initialized with bias */ + xb_vec2Nx24 dacc11, dacc12, dacc21, dacc22; + + dacc12 = dacc11 = IVP_CVT24UNX32L(hvecBias1, hvecBias1); + IVP_CVT24UNX32H(dacc11, hvecBias1, hvecBias1); + IVP_CVT24UNX32H(dacc12, hvecBias1, hvecBias1); + + dacc22 = dacc21 = IVP_CVT24UNX32L(hvecBias2, hvecBias2); + IVP_CVT24UNX32H(dacc21, hvecBias2, hvecBias2); + IVP_CVT24UNX32H(dacc22, hvecBias2, hvecBias2); + + /* priming of coeff load is done outside the innermost loop*/ + pdvecCoeff1 = (xb_vec2Nx8 *) (pCoeff); + valign vaCoeffData1; vaCoeffData1 = IVP_LA2NX8_PP(pdvecCoeff1); + + pdvecCoeff2 = (xb_vec2Nx8 *) (pCoeff + coeffPitch3 * enable2ndCh); + valign vaCoeffData2; vaCoeffData2 = IVP_LA2NX8_PP(pdvecCoeff2); + + for (inCh = 0; inCh < numInCh; inCh++) /* Loop across input channels */ + { + /* variable declarations for input and coeff vectors */ + + xb_vec2Nx8 dvecCoeffData11; + xb_vec2Nx8 dvecCoeffData21; + + xb_vec2Nx8 dvecInData11, dvecInData12; + + pdvecIn1 = (MORPH_IDT_2Nx8 *) (pInput + inCh * inDataPitch2); + + for (ky = 0; ky < kSizeY; ky++) /* Loop across kernel height */ + { + /* loads 128 bytes of input row */ + valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1); + MORPH_OP_LOAD_2Nx8(dvecInData11, vaInData, pdvecIn1, 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + MORPH_OP_LOAD_2Nx8(dvecInData12, vaInData, pdvecIn1, \ + dilationU * inDataPitch1 - 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + + MORPH_OP_DSELI(dvecInData12, dvecInData11, dvecInData12, dvecInData11, \ + IVP_DSELI_8B_DEINTERLEAVE_1); + + /* load 1 row of coeff for 1st output channel */ + IVP_LAV2NX8_XP(dvecCoeffData11, vaCoeffData1, pdvecCoeff1, coeffPitch1); + + /* load 1 row of coeff for 2nd output channel */ + IVP_LAV2NX8_XP(dvecCoeffData21, vaCoeffData2, pdvecCoeff2, coeffPitch1); + + /* multiples loaded input data with first four coeff */ + MORPH_OP_MUL4TA(dacc11, dvecInData11, dvecInData11, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \ + 0)); + MORPH_OP_MUL4TA(dacc12, dvecInData12, 
dvecInData12, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \ + 0)); + + MORPH_OP_MUL4TA(dacc21, dvecInData11, dvecInData11, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \ + 0)); + MORPH_OP_MUL4TA(dacc22, dvecInData12, dvecInData12, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \ + 0)); + + /* right rotate the input vectors by 4 + * in order to multiply with next column of + * coeff in the next iteration + */ + dvecInData11 = IVP_SEL2NX8I(dvecInData12, dvecInData11, IVP_SELI_8B_ROTATE_RIGHT_4); + dvecInData12 = IVP_SEL2NX8I((xb_vec2Nx8) 0, dvecInData12, IVP_SELI_8B_ROTATE_RIGHT_4); + + /* multiples input data with next four coeffs from the same row */ + MORPH_OP_MUL4TA(dacc11, dvecInData11, dvecInData11, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \ + 1)); + MORPH_OP_MUL4TA(dacc12, dvecInData12, dvecInData12, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \ + 1)); + + MORPH_OP_MUL4TA(dacc21, dvecInData11, dvecInData11, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \ + 1)); + MORPH_OP_MUL4TA(dacc22, dvecInData12, dvecInData12, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \ + 1)); + } /* end of for (ky = 0; ky < kSizeY; ky++)*/ + } /* end of for (inCh = 0; inCh < numInCh; inCh++)*/ + + /* Pack, Output Scale, Output Shift and clamping */ + xb_vec2Nx8 dvec1L, dvec2L, dvec3L, dvec4L; + xb_vec2Nx8 dvec1H, dvec2H, dvec3H, dvec4H; +#if DILATED_VQ_CONV == VQ_TRUE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvec1L, dvec1H, dacc11, packShiftAccU, \ + pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvec2L, dvec2H, dacc12, packShiftAccU, \ + pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvec3L, dvec3H, dacc21, packShiftAccU, \ + 
pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvec4L, dvec4H, dacc22, packShiftAccU, \ + pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag); +#elif DILATED_VQ_CONV == VQ_FALSE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvec1L, dvec1H, dacc11, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvec2L, dvec2H, dacc12, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvec3L, dvec3H, dacc21, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvec4L, dvec4H, dacc22, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); +#endif + /* Interleave odd and even indices */ + xb_vec2Nx8 dvecOut1L = MORPH_OP_SEL(dvec2L, dvec1L, dvecPattern1); + xb_vec2Nx8 dvecOut2L = MORPH_OP_SEL(dvec2L, dvec1L, dvecPattern2); + xb_vec2Nx8 dvecOut1H = MORPH_OP_SEL(dvec2H, dvec1H, dvecPattern1); + xb_vec2Nx8 dvecOut2H = MORPH_OP_SEL(dvec2H, dvec1H, dvecPattern2); + xb_vec2Nx8 dvecOut3L = MORPH_OP_SEL(dvec4L, dvec3L, dvecPattern1); + xb_vec2Nx8 dvecOut4L = MORPH_OP_SEL(dvec4L, dvec3L, dvecPattern2); + xb_vec2Nx8 dvecOut3H = MORPH_OP_SEL(dvec4H, dvec3H, dvecPattern1); + xb_vec2Nx8 dvecOut4H = MORPH_OP_SEL(dvec4H, dvec3H, dvecPattern2); + + /* variable store count */ + varLen = XT_MIN(outW - x, vectorizationWidth); + + /* Storing the first output depth, first row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput); + valign vaOutData = IVP_ZALIGN(); + IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * varLen); + IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * varLen - \ + 2 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * \ + (2 * varLen - 4 * XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * \ + (2 * varLen - 6 * XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* 
Storing the second output depth, first row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch2 * enable2ndCh * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * varLen * enable2ndCh); + IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, ((bytesPerPixel * varLen) - \ + 2 * XCHAL_IVPN_SIMD_WIDTH) * enable2ndCh); + IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * \ + (2 * varLen - 4 * XCHAL_IVPN_SIMD_WIDTH) * enable2ndCh); + IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * \ + (2 * varLen - 6 * XCHAL_IVPN_SIMD_WIDTH) * enable2ndCh); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + pOutput += 2 * outDataPitch2 * bytesPerPixel; + pCoeff += 2 * coeffPitch3; + } /* end of (outCh = 0; outCh < numOutCh; outCh += 2)*/ + } /* end of for (y = 0; y < outH; y ++)*/ + } /* end of for (x = 0; x < outW; x += vectorizationWidth)*/ + } + else + { + /* 4 * XCHAL_IVPN_SIMD_WIDTH bytes of input are loaded at a time + * into two vectors. Also loop across output channels is unrolled twice, + * thereby producing four output vectors in 1 iteration + */ + + for (x = 0; x < outW; x += vectorizationWidth) /* Loop across Output width */ + { + /* out of bound flag */ + int32_t flag = XT_SALT(64, inW - x); + + for (y = 0; y < outH; y++) /* Loop across Output height */ + { + /* initialize output data pointer */ + int8_t *pOutput = &pOutData[(y * outDataPitch1 + x) * bytesPerPixel]; + + /* initialize input data pointer */ + MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * (y) + (x)]; + + /* initialize coeff and bias data pointer*/ + int8_t *pCoeff = &pCoeffData[0]; + int32_t *pBias = &pBiasData[0]; + + for (outCh = 0; outCh < numOutCh; outCh += 2) /* Loop across Output depth */ + { + /* handles odd output channel */ + int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1); + + xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4 * enable2ndCh); + xb_vecN_2x32v hvecBias2; IVP_LSRN_2X32_XP(hvecBias2, pBias, 4); + + + /* wide vectors(accumulators) 
initialized with bias */ + xb_vec2Nx24 dacc11, dacc12, dacc21, dacc22; + + dacc12 = dacc11 = IVP_CVT24UNX32L(hvecBias1, hvecBias1); + IVP_CVT24UNX32H(dacc11, hvecBias1, hvecBias1); + IVP_CVT24UNX32H(dacc12, hvecBias1, hvecBias1); + + dacc22 = dacc21 = IVP_CVT24UNX32L(hvecBias2, hvecBias2); + IVP_CVT24UNX32H(dacc21, hvecBias2, hvecBias2); + IVP_CVT24UNX32H(dacc22, hvecBias2, hvecBias2); + + /* priming of coeff load is done outside the innermost loop*/ + pdvecCoeff1 = (xb_vec2Nx8 *) (pCoeff); + valign vaCoeffData1; vaCoeffData1 = IVP_LA2NX8_PP(pdvecCoeff1); + + pdvecCoeff2 = (xb_vec2Nx8 *) (pCoeff + coeffPitch3 * enable2ndCh); + valign vaCoeffData2; vaCoeffData2 = IVP_LA2NX8_PP(pdvecCoeff2); + + + for (inCh = 0; inCh < numInCh; inCh++) /* Loop across input channels */ + { + /* variable declarations for input and coeff vectors */ + + xb_vec2Nx8 dvecCoeffData11; + xb_vec2Nx8 dvecCoeffData21; + + xb_vec2Nx8 dvecInData11, dvecInData12; + + pdvecIn1 = (MORPH_IDT_2Nx8 *) (pInput + inCh * inDataPitch2); + + for (ky = 0; ky < kSizeY; ky++) /* Loop across kernel height */ + { + /* loads 128 bytes of input row */ + valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1); + MORPH_OP_LOAD_2Nx8(dvecInData11, vaInData, pdvecIn1, 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + MORPH_OP_LOAD_2Nx8(dvecInData12, vaInData, pdvecIn1, \ + dilationU * inDataPitch1 - 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + + MORPH_OP_DSELI(dvecInData12, dvecInData11, dvecInData12, dvecInData11, \ + IVP_DSELI_8B_DEINTERLEAVE_1); + + /* load 1 row of coeff for 1st output channel */ + IVP_LAV2NX8_XP(dvecCoeffData11, vaCoeffData1, pdvecCoeff1, coeffPitch1); + + /* load 1 row of coeff for 2nd output channel */ + IVP_LAV2NX8_XP(dvecCoeffData21, vaCoeffData2, pdvecCoeff2, coeffPitch1); + + /* multiples loaded input data with first four coeff */ + MORPH_OP_MUL4TA(dacc11, dvecInData11, dvecInData11, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \ + 0)); + MORPH_OP_MUL4TA(dacc12, dvecInData12, 
dvecInData12, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \ + 0)); + + MORPH_OP_MUL4TA(dacc21, dvecInData11, dvecInData11, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \ + 0)); + MORPH_OP_MUL4TA(dacc22, dvecInData12, dvecInData12, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \ + 0)); + } /* end of for (ky = 0; ky < kSizeY; ky++)*/ + } /* end of for (inCh = 0; inCh < numInCh; inCh++)*/ + + /* Pack, Output Scale, Output Shift and clamping */ + xb_vec2Nx8 dvec1L, dvec2L, dvec3L, dvec4L; + xb_vec2Nx8 dvec1H, dvec2H, dvec3H, dvec4H; +#if DILATED_VQ_CONV == VQ_TRUE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvec1L, dvec1H, dacc11, packShiftAccU, \ + pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvec2L, dvec2H, dacc12, packShiftAccU, \ + pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvec3L, dvec3H, dacc21, packShiftAccU, \ + pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvec4L, dvec4H, dacc22, packShiftAccU, \ + pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag); +#elif DILATED_VQ_CONV == VQ_FALSE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvec1L, dvec1H, dacc11, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvec2L, dvec2H, dacc12, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvec3L, dvec3H, dacc21, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvec4L, dvec4H, dacc22, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); +#endif + /* Interleave odd and even indices */ + xb_vec2Nx8 dvecOut1L = MORPH_OP_SEL(dvec2L, dvec1L, dvecPattern1); + xb_vec2Nx8 dvecOut2L = MORPH_OP_SEL(dvec2L, dvec1L, dvecPattern2); + xb_vec2Nx8 dvecOut1H = 
MORPH_OP_SEL(dvec2H, dvec1H, dvecPattern1); + xb_vec2Nx8 dvecOut2H = MORPH_OP_SEL(dvec2H, dvec1H, dvecPattern2); + xb_vec2Nx8 dvecOut3L = MORPH_OP_SEL(dvec4L, dvec3L, dvecPattern1); + xb_vec2Nx8 dvecOut4L = MORPH_OP_SEL(dvec4L, dvec3L, dvecPattern2); + xb_vec2Nx8 dvecOut3H = MORPH_OP_SEL(dvec4H, dvec3H, dvecPattern1); + xb_vec2Nx8 dvecOut4H = MORPH_OP_SEL(dvec4H, dvec3H, dvecPattern2); + + /* variable store count */ + varLen = XT_MIN(outW - x, vectorizationWidth); + + /* Storing the first output depth, first row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput); + valign vaOutData = IVP_ZALIGN(); + IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * varLen); + IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * varLen - \ + 2 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * \ + (2 * varLen - 4 * XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * \ + (2 * varLen - 6 * XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the second output depth, first row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch2 * enable2ndCh * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * varLen * enable2ndCh); + IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, ((bytesPerPixel * varLen) - \ + 2 * XCHAL_IVPN_SIMD_WIDTH) * enable2ndCh); + IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * \ + (2 * varLen - 4 * XCHAL_IVPN_SIMD_WIDTH) * enable2ndCh); + IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * \ + (2 * varLen - 6 * XCHAL_IVPN_SIMD_WIDTH) * enable2ndCh); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + pOutput += 2 * outDataPitch2 * bytesPerPixel; + pCoeff += 2 * coeffPitch3; + } /* end of (outCh = 0; outCh < numOutCh; outCh += 2)*/ + } /* end of for (y = 0; y < outH; y ++)*/ + } /* end of for (x = 0; x < outW; x += vectorizationWidth)*/ + } + + return(XAI_ERROR_STATUS()); +} + 
+/****************************************************************************************** +* xaiConvolved(VQ)3D_S_MxNj1d4I8S8IX_MOW_WHD +* ***************************************************************************************/ +/******************************************************************************/ +/* Description : P6 optimized generic implementation for MxN 3D convolution */ +/* with dilation = 4. Based on MORPH pre-processor specifiers, */ +/* code implementation is generated during preprocessing stage. */ +/* This method can be used to generate MxN 3D dilated */ +/* convolution function and MxN 3D VQ dilated convolution */ +/* function for U8 bit and S8 bit input data with input stride */ +/* equal to 1. */ +/* Inputs : Input Data Tile, Coeff Data Tile, Bias Array, */ +/* Output scale array, CNN convolution params structure */ +/* Outputs : XI Error Code */ +/* InOuts : Output Tile */ +/* Assumptions : CoeffData is S8 */ +/* biasArray is signed 32b, value not exceeding signed 24b */ +/* Output scale array is U16 */ +/* OutData is S8 / U8 / S16 */ +/* Kernel Size is MxNxDxN */ +/* Input and Output are in WHD format */ +/* Coeff is in WHDN format */ +/******************************************************************************/ + +/****************** xaiConvolvedVQ3D_S_MxNj1d4_S8S8IX_MOW_WHD ******************/ +/****************** xaiConvolvedVQ3D_S_MxNj1d4_U8S8IX_MOW_WHD ******************/ +/******************* xaiConvolved3D_S_MxNj1d4_S8S8IX_MOW_WHD *******************/ +/******************* xaiConvolved3D_S_MxNj1d4_U8S8IX_MOW_WHD *******************/ + +XAI_ERR_TYPE MAKE_NAME(MAKE_NAME_VQ(xaiConvolved, 3D_S_MxNj1d4), S8IX_MOW_WHD) MAKE_ARGUMENTS(inTile, coeffTile, biasArray, outTile, param) +{ + /* Error Checks */ + XAI_ERROR_CHECKS() + { + MORPH_IDT_CHECK(inTile); + XAI_CHECK_CONV_OUTPUT_TILE3D(outTile); + XAI_CHECK_TILE4D_S8(coeffTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile); + 
XAI_CHECK_TILE4D_IN_DRAM_BOUNDARY(coeffTile); + XAI_CHECK_POINTER(param); + XAI_CHECK_ARRAY_S32(biasArray); + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTile); + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(coeffTile, outTile); + XAI_CHECK_ERROR(XAI_TILE4D_GET_DIM1(coeffTile) <= 16, XAI_ERR_KSIZE, \ + "Kernel width = %d, which should be less than or equal to 16", \ + XAI_TILE4D_GET_DIM1(coeffTile)); + XAI_CHECK_ERROR(XAI_TILE4D_GET_DIM2(coeffTile) <= 16, XAI_ERR_KSIZE, \ + "\nKernel height = %d, which should be less than or equal to 16", \ + XAI_TILE4D_GET_DIM2(coeffTile)); + XAI_CHECK_EDGES_MOW_WHD(inTile, coeffTile, param); + XAI_CHECK_TILE3D_DATA_ORDER(inTile, XAI_WHD); + XAI_CHECK_TILE3D_DATA_ORDER(outTile, XAI_WHD); + XAI_CHECK_TILE4D_DATA_ORDER(coeffTile, XAI_WHDN); + XAI_CHECK_STRIDE(param, 1); + XAI_CHECK_ERROR((XAI_CNN_CONV_GET_STRIDEX(param) == XAI_CNN_CONV_GET_STRIDEY(param)), \ + XAI_ERR_BADARG, "\nStride along width = %hhu and height = %hhu\nStride along width and height should be equal", \ + XAI_CNN_CONV_GET_STRIDEX(param), XAI_CNN_CONV_GET_STRIDEY(param)); + XAI_CHECK_DILATION(param, 4); + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_DILATIONX(param) == XAI_CNN_CONV_GET_DILATIONY(param), \ + XAI_ERR_BADARG, "\nDilation along width = %hhu and height = %hhu\nDilation along width and height should be equal", \ + XAI_CNN_CONV_GET_DILATIONX(param), XAI_CNN_CONV_GET_DILATIONY(param)); + XAI_CHECK_CONSISTENCY_MOW_WHD(inTile, coeffTile, biasArray, outTile, param); + XAI_CHECK_COEFFTILE_CONTIGUOUS(coeffTile, param); + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_ACCUM_SHIFT(param) < 24, \ + XAI_ERR_NORM, "\nThe accumulator shift = %hhu, value should be less than 24", \ + XAI_CNN_CONV_GET_ACCUM_SHIFT(param)); + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_OUTPUT_SHIFT(param) < 32, \ + XAI_ERR_NORM, "\nThe output shift = %hhu, value should be less than 32", \ + XAI_CNN_CONV_GET_OUTPUT_SHIFT(param)); + XAI_CHECK_CONV_RELU_LIMITS_IX(param, outTile); +#if DILATED_VQ_CONV == VQ_TRUE + 
XAI_CHECK_ARRAY_U16(outputScaleArray); + XAI_CHECK_ERROR(XAI_ARRAY_GET_WIDTH(outputScaleArray) >= XAI_TILE4D_GET_DIM4(coeffTile), \ + XAI_ERR_DATASIZE, "\nWidth of Output Scale Array = %d, Number of Kernels = %d\nWidth of Output Scale Array should be greater than or equal to Number of Kernels", \ + XAI_ARRAY_GET_WIDTH(outputScaleArray), XAI_TILE4D_GET_DIM4(coeffTile)); + XAI_CHECK_ERROR((((uintptr_t) (XAI_ARRAY_GET_DATA_PTR(outputScaleArray)) & \ + 0x1) == 0), XAI_ERR_NORM, "The output scale array is not aligned to 2 byte boundary"); +#endif + } +#if DILATED_VQ_CONV == VQ_FALSE + if (XAI_CNN_CONV_GET_OUTPUT_SCALE(param) == 0) + { + int32_t fillValue; + int32_t reluFlag = XAI_CNN_CONV_GET_FLAG_RELU(param); + fillValue = reluFlag ? (CLAMP(0, XAI_CNN_CONV_GET_RELU_MIN(param), XAI_CNN_CONV_GET_RELU_MAX(param))) : 0; + return(xaiFillTile3D(outTile, fillValue, 0)); + } +#endif + /* Getting parameters from the tile structures */ + const int32_t inW = XAI_TILE3D_GET_DIM1(inTile) + \ + XAI_TILE3D_GET_DIM1_EDGE1(inTile) + XAI_TILE3D_GET_DIM1_EDGE2(inTile); + const int32_t outW = XAI_TILE3D_GET_DIM1(outTile); + const int32_t outH = XAI_TILE3D_GET_DIM2(outTile); + const int32_t numInCh = XAI_TILE3D_GET_DIM3(inTile); + const int32_t numOutCh = XAI_TILE3D_GET_DIM3(outTile); + + /* Kernel Size (WHDN)*/ + const int32_t kSizeX = XAI_TILE4D_GET_DIM1(coeffTile); + const int32_t kSizeY = XAI_TILE4D_GET_DIM2(coeffTile); + + /* CNN convolution parameters */ + const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param); + const uint8_t outShiftU = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param); + const uint8_t enableReLu = XAI_CNN_CONV_GET_FLAG_RELU(param); + const uint8_t dilationU = XAI_CNN_CONV_GET_DILATION(param); + + /* Pitches of Coefficient Data (WHDN) */ + const int32_t coeffPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTile); + const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile); + + /* Pitches of Input Data (WHD) in dim1 and dim2 */ + const int32_t inDataPitch1 = 
XAI_TILE3D_GET_DIM1_PITCH(inTile); + const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile); + + /* Pitch of Output Data (WHD) in dim1 and dim2 */ + const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile); + const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile); + + /* Data Pointers of input, output, coefficient and bias data */ + MORPH_IDT_SCALAR* pInData = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(inTile); + int8_t* pOutData = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile); + int32_t* pBiasData = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray); + int8_t* pCoeffData = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile); +#if DILATED_VQ_CONV == VQ_TRUE + uint16_t* restrict pOutScaleData = (uint16_t *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray); +#elif DILATED_VQ_CONV == VQ_FALSE + const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param); +#endif + /* Since the dilation value > 1 , */ + /* Effective Kernel size = dilation(KernelSize - 1) + 1 */ + /* Effective kernel size is used for calculating the min required edge */ + int32_t dilatedKSizeX = dilationU * (kSizeX - 1) + 1; + int32_t dilatedKSizeY = dilationU * (kSizeY - 1) + 1; + + /* For dilation equal to 4 dilated width and height will always be odd */ + /* Condition check to evaluate left or right alignment of kernel based */ + /* on the edge flag is not required. */ + + /* Move pointer to the start of the active data (including edge) */ + pInData = &pInData[-((dilatedKSizeY / 2) * inDataPitch1 + (dilatedKSizeX / 2))]; + + /* Setting the limits for output data according to ReLu Flag and outTileType */ + int32_t minLim, maxLim; + if (enableReLu) + { + minLim = XAI_CNN_CONV_GET_RELU_MIN(param); + maxLim = XAI_CNN_CONV_GET_RELU_MAX(param); + } + else + { + minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \ + SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0); + maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \ + : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? 
SCHAR_MAX : UCHAR_MAX); + } + const int8_t typeFlag = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0; + const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile); + + MORPH_IDT_2Nx8 * restrict pdvecIn1; + xb_vec2Nx8* restrict pdvecOut; + xb_vec2Nx8* restrict pdvecCoeff1, * restrict pdvecCoeff2; + + + /* Variable Declarations */ + int32_t inCh, outCh, x, y, ky; + const int32_t vectorizationWidth = 4 * XCHAL_IVPN_SIMD_WIDTH - dilatedKSizeX + 1; + int32_t varLen; + + /* 4 * XCHAL_IVPN_SIMD_WIDTH bytes of input are loaded at a time + * into two vectors. Also loop across output channels is unrolled twice, + * thereby producing four output vectors in 1 iteration. + * + * Load 128 bytes from row corresponding to each ky + * dvecInData11 = a0 a1 a2 a3 ... a63 + * dvecInData12 = a64 a65 a66 ... a127 + * + * Deinterleave the indices + * dvecInData11 = a0 a2 a4 a6 ... a126 + * dvecInData12 = a1 a3 a5 a7 ... a127 + * + * Deinterleave the indices + * dvecInData11 = a0 a4 a8 ... a124 ... a1 a5 ... a125 + * dvecInData12 = a2 a6 a10 ... a126 ... a3 a7 ... a127 + * + * Let the coefficients be + * C11 C12 C13 ... C1kW + * C21 C22 C23 ... C2kW + * . + * . + * CkH1 CkH2 CkH3 ... CkHkW + * + * dacc11 = [a0 a4 a8 ... a124 ... a1 a5 ... a125] * C11 + + * [a4 a8 ... a124 ... a1 a5 ... a125 X ] * C12 + + * [a8 ... a124 ... a1 a5 ... a125 X X ] * C13 + * . + * . + * [ ] * C1kW + * + * dacc12 = [a2 a6 a10 ... a126 ... a3 a7 ... a127] * C0 + + * [a6 a10 ... a126 ... a3 a7 ... a127 X ] * C1 + + * [a10 ... a126 ... a3 a7 ... a127 X X ] * C2 + + * . + * . + * [ ] * C1kW + * + * + * Continue the same multiplication steps for ky = 1 to kHeight -1 . + * dacc11 and dacc12 contains convolved output corresponding to even and odd indices + * respectively at the end of inchannel loop iterations. + * + * acc11 and acc12 are interleaved to obtain the outputs in correct order. 
+ * Pack, Shift scale and clamp dacc11 and dacc12 to obtain dvecOut1L , dvecOut1H, dvecOut2L and dvecOut2H + * + * For 8bit output, dvecOutL contains the required output elements + * dvecOut1L = [A0 A4 A8 ... A116 X X A1 A5 ... A117 X X] - 64 elements + * dvecOut2L = [A2 A6 A10 ...A118 X X A3 A7 ... A119 X X] - 64 elements + * Interleave the elements + * dvecOut1L = [A0 A2 A4 ... A116 A117 X X X X ] - 64 elements + * dvecOut2L = [A1 A3 A7 ... A118 A119 X X X X ] - 64 elements + * Interleave the elements + * dvecOut1L = [A0 A1 A2 A3 ... ]- 64 elements + * dvecOut2L = [ ... A116 A117 A118 A119 X X X X X X X X ]- 64 elements + * + * + * For 16bit output + * dvecOut1L = [A0 A4 A8 ... A116 X X] - 32 16b elements + * dvecOut1H = [A1 A5 A9 ... A117 X X] - 32 16b elements + * dvecOut2L = [A2 A6 A10 ... A118 X X] - 32 16b elements + * dvecOut2H = [A3 A7 A11 ... A119 X X] - 32 16b elements + * Interleave the elements of dvecOut1L and dvecOut1H + * dvecOut1L = [A0 A1 A4 A5 ... ] + * dvecOut1H = [ ... A116 A117 X X] + * Interleave the elements of dvecOut2L and dvecOut2H + * dvecOut2L = [A2 A3 A6 A7 ...] + * dvecOut2H = [ ... A118 A119 X X] + * Interleave2 the elements of dvecOut2L and dvecOut1L + * dvecOut1L = [A0 A1 A2 A3 ... ] + * dvecOut2L = [A32 A33 A34 A35 ... ] + * Interleave2 the elements of dvecOut2H and dvecOut1H + * dvecOut1H = [A64 A65 A66 A67 ... ] + * dvecOut2H = [ ... A116 A117 A118 A119 X X X X X X X X] + * + */ + + if (kSizeX > 12) + { + /* 4 * XCHAL_IVPN_SIMD_WIDTH bytes of input are loaded at a time + * into two vectors. 
Also loop across output channels is unrolled twice, + * thereby producing four output vectors in 1 iteration + */ + for (x = 0; x < outW; x += vectorizationWidth) /* Loop across Output width */ + { + /* out of bound flag */ + int32_t flag = XT_SALT(64, inW - x); + + for (y = 0; y < outH; y++) /* Loop across Output height */ + { + /* initialize output data pointer */ + int8_t *pOutput = &pOutData[(y * outDataPitch1 + x) * bytesPerPixel]; + + /* initialize input data pointer */ + MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * (y) + (x)]; + + /* initialize coeff and bias data pointer*/ + int8_t *pCoeff = &pCoeffData[0]; + int32_t *pBias = &pBiasData[0]; + + for (outCh = 0; outCh < numOutCh; outCh += 2) /* Loop across Output depth */ + { + /* handles odd output channel */ + int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1); + + xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4 * enable2ndCh); + xb_vecN_2x32v hvecBias2; IVP_LSRN_2X32_XP(hvecBias2, pBias, 4); + + /* wide vectors(accumulators) initialized with bias */ + xb_vec2Nx24 dacc11, dacc12, dacc21, dacc22; + + dacc12 = dacc11 = IVP_CVT24UNX32L(hvecBias1, hvecBias1); + IVP_CVT24UNX32H(dacc11, hvecBias1, hvecBias1); + IVP_CVT24UNX32H(dacc12, hvecBias1, hvecBias1); + + dacc22 = dacc21 = IVP_CVT24UNX32L(hvecBias2, hvecBias2); + IVP_CVT24UNX32H(dacc21, hvecBias2, hvecBias2); + IVP_CVT24UNX32H(dacc22, hvecBias2, hvecBias2); + + /* priming of coeff load is done outside the innermost loop*/ + pdvecCoeff1 = (xb_vec2Nx8 *) (pCoeff); + valign vaCoeffData1; vaCoeffData1 = IVP_LA2NX8_PP(pdvecCoeff1); + + pdvecCoeff2 = (xb_vec2Nx8 *) (pCoeff + coeffPitch3 * enable2ndCh); + valign vaCoeffData2; vaCoeffData2 = IVP_LA2NX8_PP(pdvecCoeff2); + + for (inCh = 0; inCh < numInCh; inCh++) /* Loop across input channels */ + { + /* variable declarations for input and coeff vectors */ + + xb_vec2Nx8 dvecCoeffData11; + xb_vec2Nx8 dvecCoeffData21; + + xb_vec2Nx8 dvecInData11, dvecInData12; + + pdvecIn1 = (MORPH_IDT_2Nx8 *) 
(pInput + inCh * inDataPitch2); + + for (ky = 0; ky < kSizeY; ky++) /* Loop across kernel height */ + { + /* loads 128 bytes of input row */ + valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1); + MORPH_OP_LOAD_2Nx8(dvecInData11, vaInData, pdvecIn1, 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + MORPH_OP_LOAD_2Nx8(dvecInData12, vaInData, pdvecIn1, \ + dilationU * inDataPitch1 - 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + + MORPH_OP_DSELI(dvecInData12, dvecInData11, dvecInData12, dvecInData11, \ + IVP_DSELI_8B_DEINTERLEAVE_1); + MORPH_OP_DSELI(dvecInData12, dvecInData11, dvecInData12, dvecInData11, \ + IVP_DSELI_8B_DEINTERLEAVE_1); + + /* load 1 row of coeff for 1st output channel */ + IVP_LAV2NX8_XP(dvecCoeffData11, vaCoeffData1, pdvecCoeff1, coeffPitch1); + + /* load 1 row of coeff for 2nd output channel */ + IVP_LAV2NX8_XP(dvecCoeffData21, vaCoeffData2, pdvecCoeff2, coeffPitch1); + + /* multiples loaded input data with first four coeff */ + MORPH_OP_MUL4TA(dacc11, dvecInData11, dvecInData11, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \ + 0)); + MORPH_OP_MUL4TA(dacc12, dvecInData12, dvecInData12, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \ + 0)); + + MORPH_OP_MUL4TA(dacc21, dvecInData11, dvecInData11, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \ + 0)); + MORPH_OP_MUL4TA(dacc22, dvecInData12, dvecInData12, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \ + 0)); + + /* right rotate the input vectors by 4 + * in order to multiply with next column of + * coeff in the next iteration + */ + dvecInData11 = IVP_SEL2NX8I(dvecInData12, dvecInData11, IVP_SELI_8B_ROTATE_RIGHT_4); + dvecInData12 = IVP_SEL2NX8I((xb_vec2Nx8) 0, dvecInData12, IVP_SELI_8B_ROTATE_RIGHT_4); + + /* multiples input data with next four coeffs from the same row */ + MORPH_OP_MUL4TA(dacc11, dvecInData11, dvecInData11, IVP_EXTRN_2X32 \ + 
(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \ + 1)); + MORPH_OP_MUL4TA(dacc12, dvecInData12, dvecInData12, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \ + 1)); + + MORPH_OP_MUL4TA(dacc21, dvecInData11, dvecInData11, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \ + 1)); + MORPH_OP_MUL4TA(dacc22, dvecInData12, dvecInData12, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \ + 1)); + + /* right rotate the input vectors by 4 + * in order to multiply with next column of + * coeff in the next iteration + */ + dvecInData11 = IVP_SEL2NX8I(dvecInData12, dvecInData11, IVP_SELI_8B_ROTATE_RIGHT_4); + dvecInData12 = IVP_SEL2NX8I((xb_vec2Nx8) 0, dvecInData12, IVP_SELI_8B_ROTATE_RIGHT_4); + + /* multiples input data with next four coeffs from the same row */ + MORPH_OP_MUL4TA(dacc11, dvecInData11, dvecInData11, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \ + 2)); + MORPH_OP_MUL4TA(dacc12, dvecInData12, dvecInData12, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \ + 2)); + + MORPH_OP_MUL4TA(dacc21, dvecInData11, dvecInData11, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \ + 2)); + MORPH_OP_MUL4TA(dacc22, dvecInData12, dvecInData12, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \ + 2)); + + /* right rotate the input vectors by 4 + * in order to multiply with next column of + * coeff in the next iteration + */ + dvecInData11 = IVP_SEL2NX8I(dvecInData12, dvecInData11, IVP_SELI_8B_ROTATE_RIGHT_4); + dvecInData12 = IVP_SEL2NX8I((xb_vec2Nx8) 0, dvecInData12, IVP_SELI_8B_ROTATE_RIGHT_4); + + /* multiples input data with next four coeffs from the same row */ + MORPH_OP_MUL4TA(dacc11, dvecInData11, dvecInData11, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \ + 3)); + 
MORPH_OP_MUL4TA(dacc12, dvecInData12, dvecInData12, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \ + 3)); + + MORPH_OP_MUL4TA(dacc21, dvecInData11, dvecInData11, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \ + 3)); + MORPH_OP_MUL4TA(dacc22, dvecInData12, dvecInData12, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \ + 3)); + } /* end of for (ky = 0; ky < kSizeY; ky++)*/ + } /* end of for (inCh = 0; inCh < numInCh; inCh++)*/ + + /* Pack, Output Scale, Output Shift and clamping */ + xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L; + xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H; +#if DILATED_VQ_CONV == VQ_TRUE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc11, packShiftAccU, \ + pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc12, packShiftAccU, \ + pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc21, packShiftAccU, \ + pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc22, packShiftAccU, \ + pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag); +#elif DILATED_VQ_CONV == VQ_FALSE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc11, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc12, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc21, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc22, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); +#endif + /* 8 bit output */ + if (!typeFlag) + { + MORPH_OP_DSELI(dvecOut2L, dvecOut1L, dvecOut2L, dvecOut1L, \ 
+ IVP_DSELI_8B_INTERLEAVE_1); + MORPH_OP_DSELI(dvecOut2L, dvecOut1L, dvecOut2L, dvecOut1L, \ + IVP_DSELI_8B_INTERLEAVE_1); + MORPH_OP_DSELI(dvecOut4L, dvecOut3L, dvecOut4L, dvecOut3L, \ + IVP_DSELI_8B_INTERLEAVE_1); + MORPH_OP_DSELI(dvecOut4L, dvecOut3L, dvecOut4L, dvecOut3L, \ + IVP_DSELI_8B_INTERLEAVE_1); + } + else /* 16bit output */ + { + MORPH_OP_DSELI(dvecOut1H, dvecOut1L, dvecOut1H, dvecOut1L, \ + IVP_DSELI_INTERLEAVE_1); + MORPH_OP_DSELI(dvecOut2H, dvecOut2L, dvecOut2H, dvecOut2L, \ + IVP_DSELI_INTERLEAVE_1); + MORPH_OP_DSELI(dvecOut2L, dvecOut1L, dvecOut2L, dvecOut1L, \ + IVP_DSELI_INTERLEAVE_2); + MORPH_OP_DSELI(dvecOut2H, dvecOut1H, dvecOut2H, dvecOut1H, \ + IVP_DSELI_INTERLEAVE_2); + MORPH_OP_DSELI(dvecOut3H, dvecOut3L, dvecOut3H, dvecOut3L, \ + IVP_DSELI_INTERLEAVE_1); + MORPH_OP_DSELI(dvecOut4H, dvecOut4L, dvecOut4H, dvecOut4L, \ + IVP_DSELI_INTERLEAVE_1); + MORPH_OP_DSELI(dvecOut4L, dvecOut3L, dvecOut4L, dvecOut3L, \ + IVP_DSELI_INTERLEAVE_2); + MORPH_OP_DSELI(dvecOut4H, dvecOut3H, dvecOut4H, dvecOut3H, \ + IVP_DSELI_INTERLEAVE_2); + } + + /* variable store count */ + varLen = XT_MIN(outW - x, vectorizationWidth); + + /* Storing the first output depth, first row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput); + valign vaOutData = IVP_ZALIGN(); + IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * varLen); + IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * varLen - \ + 2 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * \ + (2 * varLen - 4 * XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * \ + (2 * varLen - 6 * XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the second output depth, first row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch2 * enable2ndCh * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * varLen * enable2ndCh); + IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, ((bytesPerPixel 
* varLen) - \ + 2 * XCHAL_IVPN_SIMD_WIDTH) * enable2ndCh); + IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * \ + (2 * varLen - 4 * XCHAL_IVPN_SIMD_WIDTH) * enable2ndCh); + IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * \ + (2 * varLen - 6 * XCHAL_IVPN_SIMD_WIDTH) * enable2ndCh); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + pOutput += 2 * outDataPitch2 * bytesPerPixel; + pCoeff += 2 * coeffPitch3; + } /* end of (outCh = 0; outCh < numOutCh; outCh += 2)*/ + } /* end of for (y = 0; y < outH; y ++)*/ + } /* end of for (x = 0; x < outW; x += vectorizationWidth)*/ + } + else if (kSizeX > 8) + { + /* 4 * XCHAL_IVPN_SIMD_WIDTH bytes of input are loaded at a time + * into two vectors. Also loop across output channels is unrolled twice, + * thereby producing four output vectors in 1 iteration + */ + for (x = 0; x < outW; x += vectorizationWidth) /* Loop across Output width */ + { + /* out of bound flag */ + int32_t flag = XT_SALT(64, inW - x); + + for (y = 0; y < outH; y++) /* Loop across Output height */ + { + /* initialize output data pointer */ + int8_t *pOutput = &pOutData[(y * outDataPitch1 + x) * bytesPerPixel]; + + /* initialize input data pointer */ + MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * (y) + (x)]; + + /* initialize coeff and bias data pointer*/ + int8_t *pCoeff = &pCoeffData[0]; + int32_t *pBias = &pBiasData[0]; + + for (outCh = 0; outCh < numOutCh; outCh += 2) /* Loop across Output depth */ + { + /* handles odd output channel*/ + int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1); + + xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4 * enable2ndCh); + xb_vecN_2x32v hvecBias2; IVP_LSRN_2X32_XP(hvecBias2, pBias, 4); + + /* wide vectors(accumulators) initialized with bias */ + xb_vec2Nx24 dacc11, dacc12, dacc21, dacc22; + + dacc12 = dacc11 = IVP_CVT24UNX32L(hvecBias1, hvecBias1); + IVP_CVT24UNX32H(dacc11, hvecBias1, hvecBias1); + IVP_CVT24UNX32H(dacc12, hvecBias1, hvecBias1); + + dacc22 = dacc21 = 
IVP_CVT24UNX32L(hvecBias2, hvecBias2); + IVP_CVT24UNX32H(dacc21, hvecBias2, hvecBias2); + IVP_CVT24UNX32H(dacc22, hvecBias2, hvecBias2); + + /* priming of coeff load is done outside the innermost loop*/ + pdvecCoeff1 = (xb_vec2Nx8 *) (pCoeff); + valign vaCoeffData1; vaCoeffData1 = IVP_LA2NX8_PP(pdvecCoeff1); + + pdvecCoeff2 = (xb_vec2Nx8 *) (pCoeff + coeffPitch3 * enable2ndCh); + valign vaCoeffData2; vaCoeffData2 = IVP_LA2NX8_PP(pdvecCoeff2); + + for (inCh = 0; inCh < numInCh; inCh++) /* Loop across input channels */ + { + /* variable declarations for input and coeff vectors */ + + xb_vec2Nx8 dvecCoeffData11; + xb_vec2Nx8 dvecCoeffData21; + + xb_vec2Nx8 dvecInData11, dvecInData12; + + pdvecIn1 = (MORPH_IDT_2Nx8 *) (pInput + inCh * inDataPitch2); + + for (ky = 0; ky < kSizeY; ky++) /* Loop across kernel height */ + { + /* loads 128 bytes of input row */ + valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1); + MORPH_OP_LOAD_2Nx8(dvecInData11, vaInData, pdvecIn1, 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + MORPH_OP_LOAD_2Nx8(dvecInData12, vaInData, pdvecIn1, \ + dilationU * inDataPitch1 - 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + + MORPH_OP_DSELI(dvecInData12, dvecInData11, dvecInData12, dvecInData11, \ + IVP_DSELI_8B_DEINTERLEAVE_1); + MORPH_OP_DSELI(dvecInData12, dvecInData11, dvecInData12, dvecInData11, \ + IVP_DSELI_8B_DEINTERLEAVE_1); + + /* load 1 row of coeff for 1st output channel */ + IVP_LAV2NX8_XP(dvecCoeffData11, vaCoeffData1, pdvecCoeff1, coeffPitch1); + + /* load 1 row of coeff for 2nd output channel */ + IVP_LAV2NX8_XP(dvecCoeffData21, vaCoeffData2, pdvecCoeff2, coeffPitch1); + + /* multiples loaded input data with first four coeff */ + MORPH_OP_MUL4TA(dacc11, dvecInData11, dvecInData11, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \ + 0)); + MORPH_OP_MUL4TA(dacc12, dvecInData12, dvecInData12, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \ + 0)); + + MORPH_OP_MUL4TA(dacc21, dvecInData11, 
dvecInData11, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \ + 0)); + MORPH_OP_MUL4TA(dacc22, dvecInData12, dvecInData12, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \ + 0)); + + /* right rotate the input vectors by 4 + * in order to multiply with next column of + * coeff in the next iteration + */ + dvecInData11 = IVP_SEL2NX8I(dvecInData12, dvecInData11, IVP_SELI_8B_ROTATE_RIGHT_4); + dvecInData12 = IVP_SEL2NX8I((xb_vec2Nx8) 0, dvecInData12, IVP_SELI_8B_ROTATE_RIGHT_4); + + /* multiples input data with next four coeffs from the same row */ + MORPH_OP_MUL4TA(dacc11, dvecInData11, dvecInData11, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \ + 1)); + MORPH_OP_MUL4TA(dacc12, dvecInData12, dvecInData12, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \ + 1)); + + MORPH_OP_MUL4TA(dacc21, dvecInData11, dvecInData11, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \ + 1)); + MORPH_OP_MUL4TA(dacc22, dvecInData12, dvecInData12, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \ + 1)); + + /* right rotate the input vectors by 4 + * in order to multiply with next column of + * coeff in the next iteration + */ + dvecInData11 = IVP_SEL2NX8I(dvecInData12, dvecInData11, IVP_SELI_8B_ROTATE_RIGHT_4); + dvecInData12 = IVP_SEL2NX8I((xb_vec2Nx8) 0, dvecInData12, IVP_SELI_8B_ROTATE_RIGHT_4); + + /* multiples input data with next four coeffs from the same row */ + MORPH_OP_MUL4TA(dacc11, dvecInData11, dvecInData11, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \ + 2)); + MORPH_OP_MUL4TA(dacc12, dvecInData12, dvecInData12, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \ + 2)); + + MORPH_OP_MUL4TA(dacc21, dvecInData11, dvecInData11, IVP_EXTRN_2X32 \ + 
(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \ + 2)); + MORPH_OP_MUL4TA(dacc22, dvecInData12, dvecInData12, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \ + 2)); + } /* end of for (ky = 0; ky < kSizeY; ky++)*/ + } /* end of for (inCh = 0; inCh < numInCh; inCh++)*/ + + /* Pack, Output Scale, Output Shift and clamping */ + xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L; + xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H; +#if DILATED_VQ_CONV == VQ_TRUE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc11, packShiftAccU, \ + pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc12, packShiftAccU, \ + pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc21, packShiftAccU, \ + pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc22, packShiftAccU, \ + pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag); +#elif DILATED_VQ_CONV == VQ_FALSE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc11, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc12, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc21, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc22, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); +#endif + /* 8 bit output */ + if (!typeFlag) + { + MORPH_OP_DSELI(dvecOut2L, dvecOut1L, dvecOut2L, dvecOut1L, \ + IVP_DSELI_8B_INTERLEAVE_1); + MORPH_OP_DSELI(dvecOut2L, dvecOut1L, dvecOut2L, dvecOut1L, \ + IVP_DSELI_8B_INTERLEAVE_1); + MORPH_OP_DSELI(dvecOut4L, dvecOut3L, dvecOut4L, dvecOut3L, \ + IVP_DSELI_8B_INTERLEAVE_1); + 
MORPH_OP_DSELI(dvecOut4L, dvecOut3L, dvecOut4L, dvecOut3L, \ + IVP_DSELI_8B_INTERLEAVE_1); + } + else /* 16bit output */ + { + MORPH_OP_DSELI(dvecOut1H, dvecOut1L, dvecOut1H, dvecOut1L, \ + IVP_DSELI_INTERLEAVE_1); + MORPH_OP_DSELI(dvecOut2H, dvecOut2L, dvecOut2H, dvecOut2L, \ + IVP_DSELI_INTERLEAVE_1); + MORPH_OP_DSELI(dvecOut2L, dvecOut1L, dvecOut2L, dvecOut1L, \ + IVP_DSELI_INTERLEAVE_2); + MORPH_OP_DSELI(dvecOut2H, dvecOut1H, dvecOut2H, dvecOut1H, \ + IVP_DSELI_INTERLEAVE_2); + MORPH_OP_DSELI(dvecOut3H, dvecOut3L, dvecOut3H, dvecOut3L, \ + IVP_DSELI_INTERLEAVE_1); + MORPH_OP_DSELI(dvecOut4H, dvecOut4L, dvecOut4H, dvecOut4L, \ + IVP_DSELI_INTERLEAVE_1); + MORPH_OP_DSELI(dvecOut4L, dvecOut3L, dvecOut4L, dvecOut3L, \ + IVP_DSELI_INTERLEAVE_2); + MORPH_OP_DSELI(dvecOut4H, dvecOut3H, dvecOut4H, dvecOut3H, \ + IVP_DSELI_INTERLEAVE_2); + } + + /* variable store count */ + varLen = XT_MIN(outW - x, vectorizationWidth); + + /* Storing the first output depth, first row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput); + valign vaOutData = IVP_ZALIGN(); + IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * varLen); + IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * varLen - \ + 2 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * \ + (2 * varLen - 4 * XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * \ + (2 * varLen - 6 * XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the second output depth, first row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch2 * enable2ndCh * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * varLen * enable2ndCh); + IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, ((bytesPerPixel * varLen) - \ + 2 * XCHAL_IVPN_SIMD_WIDTH) * enable2ndCh); + IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * \ + (2 * varLen - 4 * XCHAL_IVPN_SIMD_WIDTH) * enable2ndCh); + IVP_SAV2NX8_XP(dvecOut4H, vaOutData, 
pdvecOut, typeFlag * \ + (2 * varLen - 6 * XCHAL_IVPN_SIMD_WIDTH) * enable2ndCh); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + pOutput += 2 * outDataPitch2 * bytesPerPixel; + pCoeff += 2 * coeffPitch3; + } /* end of (outCh = 0; outCh < numOutCh; outCh += 2)*/ + } /* end of for (y = 0; y < outH; y ++)*/ + } /* end of for (x = 0; x < outW; x += vectorizationWidth)*/ + } + else if (kSizeX > 4) + { + /* 4 * XCHAL_IVPN_SIMD_WIDTH bytes of input are loaded at a time + * into two vectors. Also loop across output channels is unrolled twice, + * thereby producing four output vectors in 1 iteration + */ + for (x = 0; x < outW; x += vectorizationWidth) /* Loop across Output width */ + { + /* out of bound flag */ + int32_t flag = XT_SALT(64, inW - x); + + for (y = 0; y < outH; y++) /* Loop across Output height */ + { + /* initialize output data pointer */ + int8_t *pOutput = &pOutData[(y * outDataPitch1 + x) * bytesPerPixel]; + + /* initialize input data pointer */ + MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * (y) + (x)]; + + /* initialize coeff and bias data pointer*/ + int8_t *pCoeff = &pCoeffData[0]; + int32_t *pBias = &pBiasData[0]; + + for (outCh = 0; outCh < numOutCh; outCh += 2) /* Loop across Output depth */ + { + /* handles odd output channel */ + int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1); + + xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4 * enable2ndCh); + xb_vecN_2x32v hvecBias2; IVP_LSRN_2X32_XP(hvecBias2, pBias, 4); + + /* wide vectors(accumulators) initialized with bias */ + xb_vec2Nx24 dacc11, dacc12, dacc21, dacc22; + + dacc12 = dacc11 = IVP_CVT24UNX32L(hvecBias1, hvecBias1); + IVP_CVT24UNX32H(dacc11, hvecBias1, hvecBias1); + IVP_CVT24UNX32H(dacc12, hvecBias1, hvecBias1); + + dacc22 = dacc21 = IVP_CVT24UNX32L(hvecBias2, hvecBias2); + IVP_CVT24UNX32H(dacc21, hvecBias2, hvecBias2); + IVP_CVT24UNX32H(dacc22, hvecBias2, hvecBias2); + + /* priming of coeff load is done outside the innermost loop*/ + pdvecCoeff1 = (xb_vec2Nx8 *) 
(pCoeff); + valign vaCoeffData1; vaCoeffData1 = IVP_LA2NX8_PP(pdvecCoeff1); + + pdvecCoeff2 = (xb_vec2Nx8 *) (pCoeff + coeffPitch3 * enable2ndCh); + valign vaCoeffData2; vaCoeffData2 = IVP_LA2NX8_PP(pdvecCoeff2); + + for (inCh = 0; inCh < numInCh; inCh++) /* Loop across input channels */ + { + /* variable declarations for input and coeff vectors */ + + xb_vec2Nx8 dvecCoeffData11; + xb_vec2Nx8 dvecCoeffData21; + + xb_vec2Nx8 dvecInData11, dvecInData12; + + pdvecIn1 = (MORPH_IDT_2Nx8 *) (pInput + inCh * inDataPitch2); + + for (ky = 0; ky < kSizeY; ky++) /* Loop across kernel height */ + { + /* loads 128 bytes of input row */ + valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1); + MORPH_OP_LOAD_2Nx8(dvecInData11, vaInData, pdvecIn1, 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + MORPH_OP_LOAD_2Nx8(dvecInData12, vaInData, pdvecIn1, \ + dilationU * inDataPitch1 - 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + + MORPH_OP_DSELI(dvecInData12, dvecInData11, dvecInData12, dvecInData11, \ + IVP_DSELI_8B_DEINTERLEAVE_1); + MORPH_OP_DSELI(dvecInData12, dvecInData11, dvecInData12, dvecInData11, \ + IVP_DSELI_8B_DEINTERLEAVE_1); + + /* load 1 row of coeff for 1st output channel */ + IVP_LAV2NX8_XP(dvecCoeffData11, vaCoeffData1, pdvecCoeff1, coeffPitch1); + + /* load 1 row of coeff for 2nd output channel */ + IVP_LAV2NX8_XP(dvecCoeffData21, vaCoeffData2, pdvecCoeff2, coeffPitch1); + + /* multiples loaded input data with first four coeff */ + MORPH_OP_MUL4TA(dacc11, dvecInData11, dvecInData11, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \ + 0)); + MORPH_OP_MUL4TA(dacc12, dvecInData12, dvecInData12, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \ + 0)); + + MORPH_OP_MUL4TA(dacc21, dvecInData11, dvecInData11, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \ + 0)); + MORPH_OP_MUL4TA(dacc22, dvecInData12, dvecInData12, IVP_EXTRN_2X32 \ + 
(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \ + 0)); + + /* right rotate the input vectors by 4 + * in order to multiply with next column of + * coeff in the next iteration + */ + dvecInData11 = IVP_SEL2NX8I(dvecInData12, dvecInData11, IVP_SELI_8B_ROTATE_RIGHT_4); + dvecInData12 = IVP_SEL2NX8I((xb_vec2Nx8) 0, dvecInData12, IVP_SELI_8B_ROTATE_RIGHT_4); + + /* multiples input data with next four coeffs from the same row */ + MORPH_OP_MUL4TA(dacc11, dvecInData11, dvecInData11, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \ + 1)); + MORPH_OP_MUL4TA(dacc12, dvecInData12, dvecInData12, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \ + 1)); + + MORPH_OP_MUL4TA(dacc21, dvecInData11, dvecInData11, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \ + 1)); + MORPH_OP_MUL4TA(dacc22, dvecInData12, dvecInData12, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \ + 1)); + } /* end of for (ky = 0; ky < kSizeY; ky++)*/ + } /* end of for (inCh = 0; inCh < numInCh; inCh++)*/ + + /* Pack, Output Scale, Output Shift and clamping */ + xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L; + xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H; +#if DILATED_VQ_CONV == VQ_TRUE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc11, packShiftAccU, \ + pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc12, packShiftAccU, \ + pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc21, packShiftAccU, \ + pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc22, packShiftAccU, \ + pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag); +#elif DILATED_VQ_CONV == VQ_FALSE + 
PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc11, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc12, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc21, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc22, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); +#endif + /* 8 bit output */ + if (!typeFlag) + { + MORPH_OP_DSELI(dvecOut2L, dvecOut1L, dvecOut2L, dvecOut1L, \ + IVP_DSELI_8B_INTERLEAVE_1); + MORPH_OP_DSELI(dvecOut2L, dvecOut1L, dvecOut2L, dvecOut1L, \ + IVP_DSELI_8B_INTERLEAVE_1); + MORPH_OP_DSELI(dvecOut4L, dvecOut3L, dvecOut4L, dvecOut3L, \ + IVP_DSELI_8B_INTERLEAVE_1); + MORPH_OP_DSELI(dvecOut4L, dvecOut3L, dvecOut4L, dvecOut3L, \ + IVP_DSELI_8B_INTERLEAVE_1); + } + else /* 16bit output */ + { + MORPH_OP_DSELI(dvecOut1H, dvecOut1L, dvecOut1H, dvecOut1L, \ + IVP_DSELI_INTERLEAVE_1); + MORPH_OP_DSELI(dvecOut2H, dvecOut2L, dvecOut2H, dvecOut2L, \ + IVP_DSELI_INTERLEAVE_1); + MORPH_OP_DSELI(dvecOut2L, dvecOut1L, dvecOut2L, dvecOut1L, \ + IVP_DSELI_INTERLEAVE_2); + MORPH_OP_DSELI(dvecOut2H, dvecOut1H, dvecOut2H, dvecOut1H, \ + IVP_DSELI_INTERLEAVE_2); + MORPH_OP_DSELI(dvecOut3H, dvecOut3L, dvecOut3H, dvecOut3L, \ + IVP_DSELI_INTERLEAVE_1); + MORPH_OP_DSELI(dvecOut4H, dvecOut4L, dvecOut4H, dvecOut4L, \ + IVP_DSELI_INTERLEAVE_1); + MORPH_OP_DSELI(dvecOut4L, dvecOut3L, dvecOut4L, dvecOut3L, \ + IVP_DSELI_INTERLEAVE_2); + MORPH_OP_DSELI(dvecOut4H, dvecOut3H, dvecOut4H, dvecOut3H, \ + IVP_DSELI_INTERLEAVE_2); + } + + /* variable store count */ + varLen = XT_MIN(outW - x, vectorizationWidth); + + /* Storing the first output depth, first row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput); + valign vaOutData = IVP_ZALIGN(); + IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * varLen); + 
IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * varLen - \ + 2 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * \ + (2 * varLen - 4 * XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * \ + (2 * varLen - 6 * XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the second output depth, first row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch2 * enable2ndCh * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * varLen * enable2ndCh); + IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, ((bytesPerPixel * varLen) - \ + 2 * XCHAL_IVPN_SIMD_WIDTH) * enable2ndCh); + IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * \ + (2 * varLen - 4 * XCHAL_IVPN_SIMD_WIDTH) * enable2ndCh); + IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * \ + (2 * varLen - 6 * XCHAL_IVPN_SIMD_WIDTH) * enable2ndCh); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + pOutput += 2 * outDataPitch2 * bytesPerPixel; + pCoeff += 2 * coeffPitch3; + } /* end of (outCh = 0; outCh < numOutCh; outCh += 2)*/ + } /* end of for (y = 0; y < outH; y ++)*/ + } /* end of for (x = 0; x < outW; x += vectorizationWidth)*/ + } + else + { + /* 4 * XCHAL_IVPN_SIMD_WIDTH bytes of input are loaded at a time + * into two vectors. 
Also loop across output channels is unrolled twice, + * thereby producing four output vectors in 1 iteration + */ + + for (x = 0; x < outW; x += vectorizationWidth) /* Loop across Output width */ + { + /* out of bound flag */ + int32_t flag = XT_SALT(64, inW - x); + + for (y = 0; y < outH; y++) /* Loop across Output height */ + { + /* initialize output data pointer */ + int8_t *pOutput = &pOutData[(y * outDataPitch1 + x) * bytesPerPixel]; + + /* initialize input data pointer */ + MORPH_IDT_SCALAR *pInput = &pInData[inDataPitch1 * (y) + (x)]; + + /* initialize coeff and bias data pointer*/ + int8_t *pCoeff = &pCoeffData[0]; + int32_t *pBias = &pBiasData[0]; + + for (outCh = 0; outCh < numOutCh; outCh += 2) /* Loop across Output depth */ + { + /* handles odd output channel */ + int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1); + + xb_vecN_2x32v hvecBias1; IVP_LSRN_2X32_XP(hvecBias1, pBias, 4 * enable2ndCh); + xb_vecN_2x32v hvecBias2; IVP_LSRN_2X32_XP(hvecBias2, pBias, 4); + + + /* wide vectors(accumulators) initialized with bias */ + xb_vec2Nx24 dacc11, dacc12, dacc21, dacc22; + + dacc12 = dacc11 = IVP_CVT24UNX32L(hvecBias1, hvecBias1); + IVP_CVT24UNX32H(dacc11, hvecBias1, hvecBias1); + IVP_CVT24UNX32H(dacc12, hvecBias1, hvecBias1); + + dacc22 = dacc21 = IVP_CVT24UNX32L(hvecBias2, hvecBias2); + IVP_CVT24UNX32H(dacc21, hvecBias2, hvecBias2); + IVP_CVT24UNX32H(dacc22, hvecBias2, hvecBias2); + + /* priming of coeff load is done outside the innermost loop*/ + pdvecCoeff1 = (xb_vec2Nx8 *) (pCoeff); + valign vaCoeffData1; vaCoeffData1 = IVP_LA2NX8_PP(pdvecCoeff1); + + pdvecCoeff2 = (xb_vec2Nx8 *) (pCoeff + coeffPitch3 * enable2ndCh); + valign vaCoeffData2; vaCoeffData2 = IVP_LA2NX8_PP(pdvecCoeff2); + + + for (inCh = 0; inCh < numInCh; inCh++) /* Loop across input channels */ + { + /* variable declarations for input and coeff vectors */ + + xb_vec2Nx8 dvecCoeffData11; + xb_vec2Nx8 dvecCoeffData21; + + xb_vec2Nx8 dvecInData11, dvecInData12; + + pdvecIn1 = (MORPH_IDT_2Nx8 
*) (pInput + inCh * inDataPitch2); + + for (ky = 0; ky < kSizeY; ky++) /* Loop across kernel height */ + { + /* loads 128 bytes of input row */ + valign vaInData = MORPH_OP_PRIME_2Nx8(pdvecIn1); + MORPH_OP_LOAD_2Nx8(dvecInData11, vaInData, pdvecIn1, 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + MORPH_OP_LOAD_2Nx8(dvecInData12, vaInData, pdvecIn1, \ + dilationU * inDataPitch1 - 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + + MORPH_OP_DSELI(dvecInData12, dvecInData11, dvecInData12, dvecInData11, \ + IVP_DSELI_8B_DEINTERLEAVE_1); + MORPH_OP_DSELI(dvecInData12, dvecInData11, dvecInData12, dvecInData11, \ + IVP_DSELI_8B_DEINTERLEAVE_1); + + /* load 1 row of coeff for 1st output channel */ + IVP_LAV2NX8_XP(dvecCoeffData11, vaCoeffData1, pdvecCoeff1, coeffPitch1); + + /* load 1 row of coeff for 2nd output channel */ + IVP_LAV2NX8_XP(dvecCoeffData21, vaCoeffData2, pdvecCoeff2, coeffPitch1); + + /* multiples loaded input data with first four coeff */ + MORPH_OP_MUL4TA(dacc11, dvecInData11, dvecInData11, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \ + 0)); + MORPH_OP_MUL4TA(dacc12, dvecInData12, dvecInData12, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData11)), \ + 0)); + + MORPH_OP_MUL4TA(dacc21, dvecInData11, dvecInData11, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \ + 0)); + MORPH_OP_MUL4TA(dacc22, dvecInData12, dvecInData12, IVP_EXTRN_2X32 \ + (IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(dvecCoeffData21)), \ + 0)); + } /* end of for (ky = 0; ky < kSizeY; ky++)*/ + } /* end of for (inCh = 0; inCh < numInCh; inCh++)*/ + + /* Pack, Output Scale, Output Shift and clamping */ + xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L; + xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H; +#if DILATED_VQ_CONV == VQ_TRUE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc11, packShiftAccU, \ + pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag); + 
PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc12, packShiftAccU, \ + pOutScaleData[outCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc21, packShiftAccU, \ + pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc22, packShiftAccU, \ + pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim, typeFlag); +#elif DILATED_VQ_CONV == VQ_FALSE + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, dacc11, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, dacc12, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, dacc21, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, dacc22, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); +#endif + /* 8 bit output */ + if (!typeFlag) + { + MORPH_OP_DSELI(dvecOut2L, dvecOut1L, dvecOut2L, dvecOut1L, \ + IVP_DSELI_8B_INTERLEAVE_1); + MORPH_OP_DSELI(dvecOut2L, dvecOut1L, dvecOut2L, dvecOut1L, \ + IVP_DSELI_8B_INTERLEAVE_1); + MORPH_OP_DSELI(dvecOut4L, dvecOut3L, dvecOut4L, dvecOut3L, \ + IVP_DSELI_8B_INTERLEAVE_1); + MORPH_OP_DSELI(dvecOut4L, dvecOut3L, dvecOut4L, dvecOut3L, \ + IVP_DSELI_8B_INTERLEAVE_1); + } + else /* 16bit output */ + { + MORPH_OP_DSELI(dvecOut1H, dvecOut1L, dvecOut1H, dvecOut1L, \ + IVP_DSELI_INTERLEAVE_1); + MORPH_OP_DSELI(dvecOut2H, dvecOut2L, dvecOut2H, dvecOut2L, \ + IVP_DSELI_INTERLEAVE_1); + MORPH_OP_DSELI(dvecOut2L, dvecOut1L, dvecOut2L, dvecOut1L, \ + IVP_DSELI_INTERLEAVE_2); + MORPH_OP_DSELI(dvecOut2H, dvecOut1H, dvecOut2H, dvecOut1H, \ + IVP_DSELI_INTERLEAVE_2); + MORPH_OP_DSELI(dvecOut3H, dvecOut3L, dvecOut3H, dvecOut3L, \ + IVP_DSELI_INTERLEAVE_1); + MORPH_OP_DSELI(dvecOut4H, dvecOut4L, dvecOut4H, dvecOut4L, \ + IVP_DSELI_INTERLEAVE_1); + 
MORPH_OP_DSELI(dvecOut4L, dvecOut3L, dvecOut4L, dvecOut3L, \ + IVP_DSELI_INTERLEAVE_2); + MORPH_OP_DSELI(dvecOut4H, dvecOut3H, dvecOut4H, dvecOut3H, \ + IVP_DSELI_INTERLEAVE_2); + } + + /* variable store count */ + varLen = XT_MIN(outW - x, vectorizationWidth); + + /* Storing the first output depth, first row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput); + valign vaOutData = IVP_ZALIGN(); + IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * varLen); + IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * varLen - \ + 2 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * \ + (2 * varLen - 4 * XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * \ + (2 * varLen - 6 * XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Storing the second output depth, first row */ + pdvecOut = (xb_vec2Nx8 *) (pOutput + outDataPitch2 * enable2ndCh * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * varLen * enable2ndCh); + IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, ((bytesPerPixel * varLen) - \ + 2 * XCHAL_IVPN_SIMD_WIDTH) * enable2ndCh); + IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * \ + (2 * varLen - 4 * XCHAL_IVPN_SIMD_WIDTH) * enable2ndCh); + IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * \ + (2 * varLen - 6 * XCHAL_IVPN_SIMD_WIDTH) * enable2ndCh); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + pOutput += 2 * outDataPitch2 * bytesPerPixel; + pCoeff += 2 * coeffPitch3; + } /* end of (outCh = 0; outCh < numOutCh; outCh += 2)*/ + } /* end of for (y = 0; y < outH; y ++)*/ + } /* end of for (x = 0; x < outW; x += vectorizationWidth)*/ + } + + return(XAI_ERROR_STATUS()); +} + +//#endif +#endif /*if ((XCHAL_VISION_TYPE >= 6))*/ + + diff --git a/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_MOW_S16.c b/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_MOW_S16.c new file mode 100644 index 
00000000000..7f722359d57 --- /dev/null +++ b/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_MOW_S16.c @@ -0,0 +1,23 @@ +/* + * Copyright (c) 2018 by Cadence Design Systems, Inc. ALL RIGHTS RESERVED. + * These coded instructions, statements, and computer programs are the + * copyrighted works and confidential proprietary information of + * Cadence Design Systems Inc. They may be adapted and modified by bona fide + * purchasers for internal use, but neither the original nor any adapted + * or modified version may be disclosed or distributed to third parties + * in any manner, medium, or form, in whole or in part, without the prior + * written consent of Cadence Design Systems Inc. This software and its + * derivatives are to be executed solely on products incorporating a Cadence + * Design Systems processor. + */ + +#include "xai_cnn.h" +#include "xai_intrin.h" + +#if ((XCHAL_VISION_TYPE >= 6)) + +#define DILATED_VQ_CONV_S16 VQ_FALSE + +#include "cnn_dilated_conv_MOW_S16.h" +#endif //if ((XCHAL_VISION_TYPE >= 6)) + diff --git a/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_MOW_S16.h b/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_MOW_S16.h new file mode 100644 index 00000000000..9a4ee1a4dc1 --- /dev/null +++ b/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_MOW_S16.h @@ -0,0 +1,2948 @@ +/* + * Copyright (c) 2018 by Cadence Design Systems, Inc. ALL RIGHTS RESERVED. + * These coded instructions, statements, and computer programs are the + * copyrighted works and confidential proprietary information of + * Cadence Design Systems Inc. They may be adapted and modified by bona fide + * purchasers for internal use, but neither the original nor any adapted + * or modified version may be disclosed or distributed to third parties + * in any manner, medium, or form, in whole or in part, without the prior + * written consent of Cadence Design Systems Inc. 
This software and its + * derivatives are to be executed solely on products incorporating a Cadence + * Design Systems processor. + */ + +#if ((XCHAL_VISION_TYPE >= 6)) + +#define VQ_TRUE 1 /* DILATED_VQ_CONV_S16 value that selects the VQ variant (extra outputScaleArray argument) */ +#define VQ_FALSE 0 /* DILATED_VQ_CONV_S16 value that selects the non-VQ variant */ + +#undef MAKE_NAME_VQ /* drop any earlier definitions; these three macros are redefined just below */ +#undef MAKE_ARGUMENTS +#undef MAKE_PARAMS + +#if DILATED_VQ_CONV_S16 == VQ_TRUE + +#define MAKE_NAME_VQ(a, b) a ## VQ ## b /* pastes the literal VQ between a and b */ +#define MAKE_ARGUMENTS(a, b, c, d, e) (const xai_pTile3D a, const xai_pTile4D b, const xai_pArray c, const xai_pArray outputScaleArray, xai_pTile3D d, const xai_cnn_conv_params * e) /* typed parameter list with outputScaleArray inserted after c */ +#define MAKE_PARAMS(a, b, c, d, e) (a, b, c, outputScaleArray, d, e) /* same names without types, in the same order */ + +#elif DILATED_VQ_CONV_S16 == VQ_FALSE + +#define MAKE_NAME_VQ(a, b) a ## b /* pastes a and b with nothing in between */ +#define MAKE_ARGUMENTS(a, b, c, d, e) (const xai_pTile3D a, const xai_pTile4D b, const xai_pArray c, xai_pTile3D d, const xai_cnn_conv_params * e) /* typed parameter list without outputScaleArray */ +#define MAKE_PARAMS(a, b, c, d, e) (a, b, c, d, e) /* same names without types, in the same order */ +#endif + +#define MAKE_NAME_IMPL(name, MORPH_FNAME_SPECIFIER_IDT, suffix) name ## _ ## MORPH_FNAME_SPECIFIER_IDT ## suffix /* pastes name, an underscore, the specifier and suffix into one identifier */ + +#define MAKE_NAME(name, suffix) MAKE_NAME_IMPL(name, S16, suffix) /* MAKE_NAME_IMPL with the specifier fixed to S16 */ + +/********************************************************************************* + ************** xaiConvolved(VQ)3D_S_MxNj1d1_S16S16I16_MOW_WHD ******************* + **********************************************************************************/ +/*********************************************************************************/ +/* Description : P6 optimized generic implementation for MxN 3D convolution. */ +/* Code implementation is generated during preprocessing stage. 
*/ +/* This method can be used to generate MxN 3D dilated convolution */ +/* function and MxN 3D VQ dilated convolution function for S16 bit */ +/* input data with input stride equal to 1 */ +/* Inputs : Input Data Tile, Coeff Data Tile, Bias Array, */ +/* Output scale array, CNN convolution params structure */ +/* Outputs : XI Error Code */ +/* InOuts : Output Tile */ +/* Assumptions : CoeffData is S16 */ +/* biasArray is signed 64b, value not exceeding signed 48b */ +/* Output scale array is U16 */ +/* OutData is S16 / U16 */ +/* Kernel Size is MxNxDxN */ +/* Input and Output are in WHD format */ +/* Coeff is in WHDN format */ +/*********************************************************************************/ + +/****************** xaiConvolved3D_S_MxNj1d1_S16S16I16_MOW_WHD *********************/ +/****************** xaiConvolvedVQ3D_S_MxNj1d1_S16S16I16_MOW_WHD *******************/ + +XAI_ERR_TYPE MAKE_NAME(MAKE_NAME_VQ(xaiConvolved, 3D_S_MxNj1d1), S16I16_MOW_WHD) MAKE_ARGUMENTS(inTile, coeffTile, biasArray, outTile, param) +{ + /* Error Checks */ + XAI_ERROR_CHECKS() + { + XAI_CHECK_TILE3D_S16(inTile); + XAI_CHECK_TILE3D_I16(outTile); + XAI_CHECK_TILE4D_S16(coeffTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile); + XAI_CHECK_TILE4D_IN_DRAM_BOUNDARY(coeffTile); + XAI_CHECK_POINTER(param); + XAI_CHECK_ARRAY_S64(biasArray); + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTile); + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(coeffTile, outTile); + XAI_CHECK_ERROR((XAI_TILE4D_GET_DIM1(coeffTile) <= 16) && \ + (XAI_TILE4D_GET_DIM2(coeffTile) <= 16), \ + XAI_ERR_KSIZE, "\nKernel Width = %d and Kernel Height = %d\n \ + Kernel Width or Height should be less than or equal to 16", \ + XAI_TILE4D_GET_DIM1(coeffTile), XAI_TILE4D_GET_DIM2(coeffTile)); + XAI_CHECK_ERROR((XAI_CNN_CONV_GET_STRIDEX(param) == XAI_CNN_CONV_GET_STRIDEY(param)), \ + XAI_ERR_BADARG, "\nStride along width = %hhu and Stride along height = %hhu\n \ + Stride along width 
should be equal to stride along height", \ + XAI_CNN_CONV_GET_STRIDEX(param), XAI_CNN_CONV_GET_STRIDEY(param)); + XAI_CHECK_STRIDE(param, 1); + XAI_CHECK_DILATION(param, 1); + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_DILATIONX(param) == XAI_CNN_CONV_GET_DILATIONY(param), \ + XAI_ERR_BADARG, "\nDilation along width = %hhu and Dilation along height = %hhu\n \ + Dilation along width should be equal to dilation along height", + XAI_CNN_CONV_GET_DILATIONX(param), XAI_CNN_CONV_GET_DILATIONY(param)); + XAI_CHECK_EDGES_MOW_WHD(inTile, coeffTile, param); + XAI_CHECK_TILE3D_DATA_ORDER(inTile, XAI_WHD); + XAI_CHECK_TILE3D_DATA_ORDER(outTile, XAI_WHD); + XAI_CHECK_TILE4D_DATA_ORDER(coeffTile, XAI_WHDN); + XAI_CHECK_CONSISTENCY_MOW_WHD(inTile, coeffTile, biasArray, outTile, param); + XAI_CHECK_COEFFTILE_CONTIGUOUS(coeffTile, param); + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_ACCUM_SHIFT(param) < 32, \ + XAI_ERR_NORM, "Accumulator shift value = %hhu\nThe accumulator shift value should be less than 32", + XAI_CNN_CONV_GET_ACCUM_SHIFT(param)); + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_OUTPUT_SHIFT(param) < 32, \ + XAI_ERR_NORM, "Output shift = %hhu\nThe output shift value should be less than 32", \ + XAI_CNN_CONV_GET_OUTPUT_SHIFT(param)); + XAI_CHECK_CONV_RELU_LIMITS_IX(param, outTile); +#if DILATED_VQ_CONV_S16 == VQ_TRUE + XAI_CHECK_ARRAY_U16(outputScaleArray); + XAI_CHECK_ERROR(XAI_ARRAY_GET_WIDTH(outputScaleArray) >= XAI_TILE4D_GET_DIM4(coeffTile), XAI_ERR_DATASIZE, \ + "\nWidth of Output Scale Array = %d, Number of Kernels = %d\nWidth of Output Scale Array should be greater than or equal to Number of Kernels", \ + XAI_ARRAY_GET_WIDTH(outputScaleArray), XAI_TILE4D_GET_DIM4(coeffTile)); + XAI_CHECK_ERROR((((uintptr_t) (XAI_ARRAY_GET_DATA_PTR(outputScaleArray)) & \ + 0x1) == 0), XAI_ERR_NORM, "The output scale array is not aligned to 2 byte boundary"); +#endif + } + +#if DILATED_VQ_CONV_S16 == VQ_FALSE + if (XAI_CNN_CONV_GET_OUTPUT_SCALE(param) == 0) + { + int32_t fillValue; + int32_t reluFlag = 
XAI_CNN_CONV_GET_FLAG_RELU(param); + fillValue = reluFlag ? (CLAMP(0, XAI_CNN_CONV_GET_RELU_MIN(param), XAI_CNN_CONV_GET_RELU_MAX(param))) : 0; + return(xaiFillTile3D(outTile, fillValue, 0)); + } +#endif + + /* Getting parameters from the tile structures */ + const int32_t inW = XAI_TILE3D_GET_DIM1(inTile) + \ + XAI_TILE3D_GET_DIM1_EDGE1(inTile) + XAI_TILE3D_GET_DIM1_EDGE2(inTile); + const int32_t outW = XAI_TILE3D_GET_DIM1(outTile); + const int32_t outH = XAI_TILE3D_GET_DIM2(outTile); + const int32_t numInCh = XAI_TILE3D_GET_DIM3(inTile); + const int32_t numOutCh = XAI_TILE3D_GET_DIM3(outTile); + + /* Kernel Size (WHDN)*/ + const int32_t kWidthU = XAI_TILE4D_GET_DIM1(coeffTile); + const int32_t kHeightU = XAI_TILE4D_GET_DIM2(coeffTile); + + /* CNN convolution parameters */ + const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param); + const uint8_t outShiftU = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param); + const uint8_t enableReLu = XAI_CNN_CONV_GET_FLAG_RELU(param); + const uint8_t leftEdgeFlag = XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param); + const uint8_t topEdgeFlag = XAI_CNN_CONV_GET_FLAG_TOPEDGE(param); + + /* Pitches of Coefficient Data (WHDN) */ + const int32_t coeffPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTile); + const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile); + + /* Pitches of Input Data (WHD) in dim1 and dim2 */ + const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile); + const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile); + + /* Pitch of Output Data (WHD) in dim1 and dim2 */ + const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile); + const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile); + + /* Data Pointers of input, output, coefficient and bias data */ + int16_t* pInData = (int16_t *) XAI_TILE3D_GET_DATA_PTR(inTile); + int16_t* pOutData = (int16_t *) XAI_TILE3D_GET_DATA_PTR(outTile); + int64_t* pBiasData64 = (int64_t *) XAI_ARRAY_GET_DATA_PTR(biasArray); + int16_t* pCoeffData = (int16_t *) 
XAI_TILE4D_GET_DATA_PTR(coeffTile); +#if DILATED_VQ_CONV_S16 == VQ_TRUE + uint16_t* restrict pOutScaleData = (uint16_t *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray); +#elif DILATED_VQ_CONV_S16 == VQ_FALSE + const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param); +#endif + + int32_t leftEdge, topEdge; + int32_t minLim, maxLim; + + if ((kWidthU % 2) != 0) + { + leftEdge = kWidthU / 2; + } + else + { + leftEdge = leftEdgeFlag ? (kWidthU / 2) : ((kWidthU / 2) - 1); + } + + if ((kHeightU % 2) != 0) + { + topEdge = kHeightU / 2; + } + else + { + topEdge = topEdgeFlag ? (kHeightU / 2) : ((kHeightU / 2) - 1); + } + + + /* Move pointer to the start of the active data (including edge) */ + pInData = &pInData[-(topEdge * inDataPitch1 + leftEdge)]; + + /* Setting the limits for output data according to ReLu Flag and outTileType */ + if (enableReLu) + { + minLim = XAI_CNN_CONV_GET_RELU_MIN(param); + maxLim = XAI_CNN_CONV_GET_RELU_MAX(param); + } + else + { + minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MIN : 0; + maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX : USHRT_MAX; + } + + /* Variable Declarations */ + int32_t outCh, x, y, ky, inCh; + + xb_vecNx16 * restrict pvecIn1; + xb_vecNx16 * restrict pvecIn2; + xb_vecNx16* restrict pvecOut; + xb_vecN_2x32v* restrict phvecCoeff1, *restrict phvecCoeff2; + xb_vec2Nx8 *restrict pdvecBias64; + + xb_vec2Nx8 seq1 = IVP_ADD2NX8(IVP_SEQ2NX8(), 2); + xb_vec2Nx8 seq2 = IVP_ADD2NX8(IVP_SEQ2NX8(), 34); + seq2 = IVP_MIN2NX8(seq2, 64); + xb_vec2Nx8 dvecSel = IVP_SEL2NX8I(seq2, seq1, IVP_SELI_8B_INTERLEAVE_1_LO); + /* Variable Declarations */ + const int32_t vectorizationWidth = XCHAL_IVPN_SIMD_WIDTH; + int32_t varLen; + if (kWidthU > 12) + { + /* loop across output channels is unrolled twice + * to produce two output channels in 1 iteration. + * The loop across output height is not unrolled, so + * 2 output vectors are produced per iteration. 
+ */ + for (x = 0; x < outW; x += vectorizationWidth) /* Loop across Output width */ + { + /* out of bound flag */ + int32_t flag = XT_SALT(32, inW - x); + + for (y = 0; y < outH; y++) /* Loop across Output height */ + { + /* initialize output data pointer */ + int16_t *pOutput = &pOutData[(y * outDataPitch1 + x)]; + + /* initialize input data pointer */ + int16_t *pInput = &pInData[inDataPitch1 * (y) + (x)]; + + /* initialize coeff and bias data pointer*/ + int16_t *pCoeff = &pCoeffData[0]; + pdvecBias64 = (xb_vec2Nx8 *) pBiasData64; + valign vaBias = IVP_LA2NX8_PP(pdvecBias64); + + for (outCh = 0; outCh < numOutCh; outCh += 2) /* Loop across Output depth */ + { + /* handles odd output channel */ + int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1); + + /* wide vectors(accumulators) initialized with bias */ + xb_vecNx48 accSum11, accSum21; + ACC_INIT_BIAS64_MOW_ONEACC(pdvecBias64, vaBias, accSum11, 1); + ACC_INIT_BIAS64_MOW_ONEACC(pdvecBias64, vaBias, accSum21, enable2ndCh); + + /* priming of coeff load is done outside the innermost loop*/ + phvecCoeff1 = (xb_vecN_2x32v *) (pCoeff); + valign vaCoeffData1; vaCoeffData1 = IVP_LAN_2X32_PP(phvecCoeff1); + + phvecCoeff2 = (xb_vecN_2x32v *) (pCoeff + coeffPitch3 * enable2ndCh); + valign vaCoeffData2; vaCoeffData2 = IVP_LAN_2X32_PP(phvecCoeff2); + + for (inCh = 0; inCh < numInCh; inCh++) /* Loop across input channels */ + { + /* variable declarations for input and coeff vectors */ + + xb_vecN_2x32v hvecCoeffData11; + xb_vecN_2x32v hvecCoeffData21; + + /* vecInData11 refers to 1st input row, first 32(or lesser) elements + * and vecInData12 refers to next few left out elements of the same row + * required to compute one 32 way output vector(To compute one 32 way + * output vector, we require 32 + edge1 + edge2 number of input elements) + */ + xb_vecNx16 vecInData11, vecInData12, vecInData11A; + + pvecIn1 = (xb_vecNx16 *) (pInput + inCh * inDataPitch2); + + for (ky = 0; ky < kHeightU; ky++) /* Loop across kernel height */ + 
{ + /* loads 1st input row */ + valign vaInData = IVP_LANX16_PP(pvecIn1); + IVP_LANX16_XP(vecInData11, vaInData, pvecIn1, 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + IVP_LANX16_XP(vecInData12, vaInData, pvecIn1, 2 * (inDataPitch1 - XCHAL_IVPN_SIMD_WIDTH * flag)); + + /* load 1 row of coeff for 1st output channel */ + IVP_LAVN_2X32_XP(hvecCoeffData11, vaCoeffData1, phvecCoeff1, coeffPitch1 * 2); + + /* load 1 row of coeff for 2nd output channel */ + IVP_LAVN_2X32_XP(hvecCoeffData21, vaCoeffData2, phvecCoeff2, coeffPitch1 * 2); + + + vecInData11A = IVP_SELNX16I(vecInData12, vecInData11, IVP_SELI_8B_ROTATE_RIGHT_2); + /* multiples loaded input data with first two coeff */ + IVP_MULPAN16XR16(accSum11, vecInData11A, vecInData11, IVP_EXTRN_2X32(hvecCoeffData11, 0)); + IVP_MULPAN16XR16(accSum21, vecInData11A, vecInData11, IVP_EXTRN_2X32(hvecCoeffData21, 0)); + + /* right rotate the input vectors by 2 + * in order to multiply with next column of + * coeff in the next iteration + */ + IVP_DSELNX16(vecInData12, vecInData11, vecInData12, vecInData11, dvecSel); + + vecInData11A = IVP_SELNX16I(vecInData12, vecInData11, IVP_SELI_8B_ROTATE_RIGHT_2); + /* multiples loaded input data with first two coeff */ + IVP_MULPAN16XR16(accSum11, vecInData11A, vecInData11, IVP_EXTRN_2X32(hvecCoeffData11, 1)); + IVP_MULPAN16XR16(accSum21, vecInData11A, vecInData11, IVP_EXTRN_2X32(hvecCoeffData21, 1)); + + /* right rotate the input vectors by 2 + * in order to multiply with next column of + * coeff in the next iteration + */ + IVP_DSELNX16(vecInData12, vecInData11, vecInData12, vecInData11, dvecSel); + + vecInData11A = IVP_SELNX16I(vecInData12, vecInData11, IVP_SELI_8B_ROTATE_RIGHT_2); + /* multiples loaded input data with first two coeff */ + IVP_MULPAN16XR16(accSum11, vecInData11A, vecInData11, IVP_EXTRN_2X32(hvecCoeffData11, 2)); + IVP_MULPAN16XR16(accSum21, vecInData11A, vecInData11, IVP_EXTRN_2X32(hvecCoeffData21, 2)); + + /* right rotate the input vectors by 2 + * in order to multiply with next 
column of + * coeff in the next iteration + */ + IVP_DSELNX16(vecInData12, vecInData11, vecInData12, vecInData11, dvecSel); + + + vecInData11A = IVP_SELNX16I(vecInData12, vecInData11, IVP_SELI_8B_ROTATE_RIGHT_2); + /* multiples loaded input data with first two coeff */ + IVP_MULPAN16XR16(accSum11, vecInData11A, vecInData11, IVP_EXTRN_2X32(hvecCoeffData11, 3)); + IVP_MULPAN16XR16(accSum21, vecInData11A, vecInData11, IVP_EXTRN_2X32(hvecCoeffData21, 3)); + + /* right rotate the input vectors by 2 + * in order to multiply with next column of + * coeff in the next iteration + */ + IVP_DSELNX16(vecInData12, vecInData11, vecInData12, vecInData11, dvecSel); + + vecInData11A = IVP_SELNX16I(vecInData12, vecInData11, IVP_SELI_8B_ROTATE_RIGHT_2); + /* multiples loaded input data with first two coeff */ + IVP_MULPAN16XR16(accSum11, vecInData11A, vecInData11, IVP_EXTRN_2X32(hvecCoeffData11, 4)); + IVP_MULPAN16XR16(accSum21, vecInData11A, vecInData11, IVP_EXTRN_2X32(hvecCoeffData21, 4)); + + /* right rotate the input vectors by 2 + * in order to multiply with next column of + * coeff in the next iteration + */ + IVP_DSELNX16(vecInData12, vecInData11, vecInData12, vecInData11, dvecSel); + + vecInData11A = IVP_SELNX16I(vecInData12, vecInData11, IVP_SELI_8B_ROTATE_RIGHT_2); + /* multiples loaded input data with first two coeff */ + IVP_MULPAN16XR16(accSum11, vecInData11A, vecInData11, IVP_EXTRN_2X32(hvecCoeffData11, 5)); + IVP_MULPAN16XR16(accSum21, vecInData11A, vecInData11, IVP_EXTRN_2X32(hvecCoeffData21, 5)); + + /* right rotate the input vectors by 2 + * in order to multiply with next column of + * coeff in the next iteration + */ + IVP_DSELNX16(vecInData12, vecInData11, vecInData12, vecInData11, dvecSel); + + vecInData11A = IVP_SELNX16I(vecInData12, vecInData11, IVP_SELI_8B_ROTATE_RIGHT_2); + /* multiples loaded input data with first two coeff */ + IVP_MULPAN16XR16(accSum11, vecInData11A, vecInData11, IVP_EXTRN_2X32(hvecCoeffData11, 6)); + IVP_MULPAN16XR16(accSum21, 
vecInData11A, vecInData11, IVP_EXTRN_2X32(hvecCoeffData21, 6)); + + /* right rotate the input vectors by 2 + * in order to multiply with next column of + * coeff in the next iteration + */ + IVP_DSELNX16(vecInData12, vecInData11, vecInData12, vecInData11, dvecSel); + + vecInData11A = IVP_SELNX16I(vecInData12, vecInData11, IVP_SELI_8B_ROTATE_RIGHT_2); + /* multiples loaded input data with first two coeff */ + IVP_MULPAN16XR16(accSum11, vecInData11A, vecInData11, IVP_EXTRN_2X32(hvecCoeffData11, 7)); + IVP_MULPAN16XR16(accSum21, vecInData11A, vecInData11, IVP_EXTRN_2X32(hvecCoeffData21, 7)); + } /* end of for (ky = 0; ky < kHeightU; ky++)*/ + } /* end of for (inCh = 0; inCh < numInCh; inCh++)*/ + + /* Pack, Output Scale, Output Shift and clamping */ + xb_vecNx16 vecOut1L, vecOut3L; +#if DILATED_VQ_CONV_S16 == VQ_TRUE + PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut1L, accSum11, packShiftAccU, \ + pOutScaleData[outCh], outShiftU, minLim, maxLim); + PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut3L, accSum21, packShiftAccU, \ + pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim); +#elif DILATED_VQ_CONV_S16 == VQ_FALSE + PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut1L, accSum11, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim); + PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut3L, accSum21, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim); +#endif + /* variable store count */ + varLen = XT_MIN(outW - x, vectorizationWidth); + + /* Storing the first row , first depth output */ + pvecOut = (xb_vecNx16 *) (pOutput); + valign vaOutData = IVP_ZALIGN(); + IVP_SAVNX16_XP(vecOut1L, vaOutData, pvecOut, 2 * varLen); + IVP_SAPOSNX16_FP(vaOutData, pvecOut); + + /* Storing the first row , 2nd depth output */ + pvecOut = (xb_vecNx16 *) (pOutput + enable2ndCh * outDataPitch2); + IVP_SAVNX16_XP(vecOut3L, vaOutData, pvecOut, 2 * enable2ndCh * varLen); + IVP_SAPOSNX16_FP(vaOutData, pvecOut); + + + pOutput += 2 * outDataPitch2; + pCoeff += 2 * coeffPitch3; + } /* end of (outCh = 0; outCh < 
numOutCh; outCh += 2)*/ + } /* end of for (y = 0; y < outH; y++)*/ + } /* end of for (x = 0; x < outW; x += vectorizationWidth)*/ + } + else if (kWidthU > 8) + { + /* loop across output channels is unrolled twice + * to produce two output channels in 1 iteration. + * Also loop across output height by 2 , thereby + * producing 4 output vectors simultaneously. + */ + for (x = 0; x < outW; x += vectorizationWidth) /* Loop across Output width */ + { + /* out of bound flag */ + int32_t flag = XT_SALT(32, inW - x); + + for (y = 0; y < outH; y += 2) /* Loop across Output height */ + { + /* handles odd output row */ + int32_t enable2ndRow = XT_SALT(y, outH - 1); + /* initialize output data pointer */ + int16_t *pOutput = &pOutData[(y * outDataPitch1 + x)]; + + /* initialize input data pointer */ + int16_t *pInput = &pInData[inDataPitch1 * (y) + (x)]; + + /* initialize coeff and bias data pointer*/ + int16_t *pCoeff = &pCoeffData[0]; + pdvecBias64 = (xb_vec2Nx8 *) pBiasData64; + valign vaBias = IVP_LA2NX8_PP(pdvecBias64); + + for (outCh = 0; outCh < numOutCh; outCh += 2) /* Loop across Output depth */ + { + /* handles odd output channel*/ + int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1); + + /* wide vectors(accumulators) initialized with bias */ + xb_vecNx48 accSum11, accSum12, accSum21, accSum22; + ACC_INIT_BIAS64_MOW_ONEACC(pdvecBias64, vaBias, accSum11, 1); + ACC_INIT_BIAS64_MOW_ONEACC(pdvecBias64, vaBias, accSum21, enable2ndCh); + accSum12 = accSum11; accSum22 = accSum21; + + /* priming of coeff load is done outside the innermost loop*/ + phvecCoeff1 = (xb_vecN_2x32v *) (pCoeff); + valign vaCoeffData1; vaCoeffData1 = IVP_LAN_2X32_PP(phvecCoeff1); + + phvecCoeff2 = (xb_vecN_2x32v *) (pCoeff + coeffPitch3 * enable2ndCh); + valign vaCoeffData2; vaCoeffData2 = IVP_LAN_2X32_PP(phvecCoeff2); + + for (inCh = 0; inCh < numInCh; inCh++) /* Loop across input channels */ + { + /* variable declarations for input and coeff vectors */ + + xb_vecN_2x32v hvecCoeffData11; + 
xb_vecN_2x32v hvecCoeffData21; + + /* vecInData11 refers to 1st input row, first 32(or lesser) elements + * and vecInData12 refers to next few left out elements of the same row + * required to compute one 32 way output vector(To compute one 32 way + * output vector, we require 32 + edge1 + edge2 number of input elements) + */ + xb_vecNx16 vecInData11, vecInData12, vecInData11A; + xb_vecNx16 vecInData21, vecInData22, vecInData21A; + + pvecIn1 = (xb_vecNx16 *) (pInput + inCh * inDataPitch2); + pvecIn2 = (xb_vecNx16 *) (pInput + inCh * inDataPitch2 + inDataPitch1 * enable2ndRow); + + for (ky = 0; ky < kHeightU; ky++) /* Loop across kernel height */ + { + /* loads 1st input row */ + valign vaInData = IVP_LANX16_PP(pvecIn1); + IVP_LANX16_XP(vecInData11, vaInData, pvecIn1, 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + IVP_LANX16_XP(vecInData12, vaInData, pvecIn1, 2 * (inDataPitch1 - XCHAL_IVPN_SIMD_WIDTH * flag)); + + /* loads 2nd input row */ + vaInData = IVP_LANX16_PP(pvecIn2); + IVP_LANX16_XP(vecInData21, vaInData, pvecIn2, 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + IVP_LANX16_XP(vecInData22, vaInData, pvecIn2, 2 * (inDataPitch1 - XCHAL_IVPN_SIMD_WIDTH * flag)); + + /* load 1 row of coeff for 1st output channel */ + IVP_LAVN_2X32_XP(hvecCoeffData11, vaCoeffData1, phvecCoeff1, coeffPitch1 * 2); + + /* load 1 row of coeff for 2nd output channel */ + IVP_LAVN_2X32_XP(hvecCoeffData21, vaCoeffData2, phvecCoeff2, coeffPitch1 * 2); + + vecInData11A = IVP_SELNX16I(vecInData12, vecInData11, IVP_SELI_8B_ROTATE_RIGHT_2); + vecInData21A = IVP_SELNX16I(vecInData22, vecInData21, IVP_SELI_8B_ROTATE_RIGHT_2); + /* multiples loaded input data with first two coeff */ + IVP_MULPAN16XR16(accSum11, vecInData11A, vecInData11, IVP_EXTRN_2X32(hvecCoeffData11, 0)); + IVP_MULPAN16XR16(accSum21, vecInData11A, vecInData11, IVP_EXTRN_2X32(hvecCoeffData21, 0)); + IVP_MULPAN16XR16(accSum12, vecInData21A, vecInData21, IVP_EXTRN_2X32(hvecCoeffData11, 0)); + IVP_MULPAN16XR16(accSum22, vecInData21A, vecInData21, 
IVP_EXTRN_2X32(hvecCoeffData21, 0)); + + /* right rotate the input vectors by 2 + * in order to multiply with next column of + * coeff in the next iteration + */ + IVP_DSELNX16(vecInData12, vecInData11, vecInData12, vecInData11, dvecSel); + IVP_DSELNX16(vecInData22, vecInData21, vecInData22, vecInData21, dvecSel); + + vecInData11A = IVP_SELNX16I(vecInData12, vecInData11, IVP_SELI_8B_ROTATE_RIGHT_2); + vecInData21A = IVP_SELNX16I(vecInData22, vecInData21, IVP_SELI_8B_ROTATE_RIGHT_2); + /* multiples loaded input data with first two coeff */ + IVP_MULPAN16XR16(accSum11, vecInData11A, vecInData11, IVP_EXTRN_2X32(hvecCoeffData11, 1)); + IVP_MULPAN16XR16(accSum21, vecInData11A, vecInData11, IVP_EXTRN_2X32(hvecCoeffData21, 1)); + IVP_MULPAN16XR16(accSum12, vecInData21A, vecInData21, IVP_EXTRN_2X32(hvecCoeffData11, 1)); + IVP_MULPAN16XR16(accSum22, vecInData21A, vecInData21, IVP_EXTRN_2X32(hvecCoeffData21, 1)); + + /* right rotate the input vectors by 2 + * in order to multiply with next column of + * coeff in the next iteration + */ + IVP_DSELNX16(vecInData12, vecInData11, vecInData12, vecInData11, dvecSel); + IVP_DSELNX16(vecInData22, vecInData21, vecInData22, vecInData21, dvecSel); + + vecInData11A = IVP_SELNX16I(vecInData12, vecInData11, IVP_SELI_8B_ROTATE_RIGHT_2); + vecInData21A = IVP_SELNX16I(vecInData22, vecInData21, IVP_SELI_8B_ROTATE_RIGHT_2); + /* multiples loaded input data with first two coeff */ + IVP_MULPAN16XR16(accSum11, vecInData11A, vecInData11, IVP_EXTRN_2X32(hvecCoeffData11, 2)); + IVP_MULPAN16XR16(accSum21, vecInData11A, vecInData11, IVP_EXTRN_2X32(hvecCoeffData21, 2)); + IVP_MULPAN16XR16(accSum12, vecInData21A, vecInData21, IVP_EXTRN_2X32(hvecCoeffData11, 2)); + IVP_MULPAN16XR16(accSum22, vecInData21A, vecInData21, IVP_EXTRN_2X32(hvecCoeffData21, 2)); + + /* right rotate the input vectors by 2 + * in order to multiply with next column of + * coeff in the next iteration + */ + IVP_DSELNX16(vecInData12, vecInData11, vecInData12, vecInData11, dvecSel); 
+ IVP_DSELNX16(vecInData22, vecInData21, vecInData22, vecInData21, dvecSel); + + vecInData11A = IVP_SELNX16I(vecInData12, vecInData11, IVP_SELI_8B_ROTATE_RIGHT_2); + vecInData21A = IVP_SELNX16I(vecInData22, vecInData21, IVP_SELI_8B_ROTATE_RIGHT_2); + /* multiples loaded input data with first two coeff */ + IVP_MULPAN16XR16(accSum11, vecInData11A, vecInData11, IVP_EXTRN_2X32(hvecCoeffData11, 3)); + IVP_MULPAN16XR16(accSum21, vecInData11A, vecInData11, IVP_EXTRN_2X32(hvecCoeffData21, 3)); + IVP_MULPAN16XR16(accSum12, vecInData21A, vecInData21, IVP_EXTRN_2X32(hvecCoeffData11, 3)); + IVP_MULPAN16XR16(accSum22, vecInData21A, vecInData21, IVP_EXTRN_2X32(hvecCoeffData21, 3)); + + /* right rotate the input vectors by 2 + * in order to multiply with next column of + * coeff in the next iteration + */ + IVP_DSELNX16(vecInData12, vecInData11, vecInData12, vecInData11, dvecSel); + IVP_DSELNX16(vecInData22, vecInData21, vecInData22, vecInData21, dvecSel); + + vecInData11A = IVP_SELNX16I(vecInData12, vecInData11, IVP_SELI_8B_ROTATE_RIGHT_2); + vecInData21A = IVP_SELNX16I(vecInData22, vecInData21, IVP_SELI_8B_ROTATE_RIGHT_2); + /* multiples loaded input data with first two coeff */ + IVP_MULPAN16XR16(accSum11, vecInData11A, vecInData11, IVP_EXTRN_2X32(hvecCoeffData11, 4)); + IVP_MULPAN16XR16(accSum21, vecInData11A, vecInData11, IVP_EXTRN_2X32(hvecCoeffData21, 4)); + IVP_MULPAN16XR16(accSum12, vecInData21A, vecInData21, IVP_EXTRN_2X32(hvecCoeffData11, 4)); + IVP_MULPAN16XR16(accSum22, vecInData21A, vecInData21, IVP_EXTRN_2X32(hvecCoeffData21, 4)); + + /* right rotate the input vectors by 2 + * in order to multiply with next column of + * coeff in the next iteration + */ + IVP_DSELNX16(vecInData12, vecInData11, vecInData12, vecInData11, dvecSel); + IVP_DSELNX16(vecInData22, vecInData21, vecInData22, vecInData21, dvecSel); + + vecInData11A = IVP_SELNX16I(vecInData12, vecInData11, IVP_SELI_8B_ROTATE_RIGHT_2); + vecInData21A = IVP_SELNX16I(vecInData22, vecInData21, 
IVP_SELI_8B_ROTATE_RIGHT_2); + /* multiples loaded input data with first two coeff */ + IVP_MULPAN16XR16(accSum11, vecInData11A, vecInData11, IVP_EXTRN_2X32(hvecCoeffData11, 5)); + IVP_MULPAN16XR16(accSum21, vecInData11A, vecInData11, IVP_EXTRN_2X32(hvecCoeffData21, 5)); + IVP_MULPAN16XR16(accSum12, vecInData21A, vecInData21, IVP_EXTRN_2X32(hvecCoeffData11, 5)); + IVP_MULPAN16XR16(accSum22, vecInData21A, vecInData21, IVP_EXTRN_2X32(hvecCoeffData21, 5)); + } /* end of for (ky = 0; ky < kHeightU; ky++)*/ + } /* end of for (inCh = 0; inCh < numInCh; inCh++)*/ + + /* Pack, Output Scale, Output Shift and clamping */ + xb_vecNx16 vecOut1L, vecOut2L, vecOut3L, vecOut4L; +#if DILATED_VQ_CONV_S16 == VQ_TRUE + PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut1L, accSum11, packShiftAccU, \ + pOutScaleData[outCh], outShiftU, minLim, maxLim); + PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut2L, accSum12, packShiftAccU, \ + pOutScaleData[outCh], outShiftU, minLim, maxLim); + PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut3L, accSum21, packShiftAccU, \ + pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim); + PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut4L, accSum22, packShiftAccU, \ + pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim); +#elif DILATED_VQ_CONV_S16 == VQ_FALSE + PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut1L, accSum11, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim); + PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut2L, accSum12, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim); + PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut3L, accSum21, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim); + PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut4L, accSum22, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim); +#endif + /* variable store count */ + varLen = XT_MIN(outW - x, vectorizationWidth); + + /* Storing the first row , first depth output */ + pvecOut = (xb_vecNx16 *) (pOutput); + valign vaOutData = IVP_ZALIGN(); + IVP_SAVNX16_XP(vecOut1L, vaOutData, pvecOut, 2 * varLen); 
+ IVP_SAPOSNX16_FP(vaOutData, pvecOut); + + /* Storing the first row , 2nd depth output */ + pvecOut = (xb_vecNx16 *) (pOutput + enable2ndCh * outDataPitch2); + IVP_SAVNX16_XP(vecOut3L, vaOutData, pvecOut, 2 * enable2ndCh * varLen); + IVP_SAPOSNX16_FP(vaOutData, pvecOut); + + /* Storing the 2nd row , 1st depth output */ + pvecOut = (xb_vecNx16 *) (pOutput + enable2ndRow * outDataPitch1); + IVP_SAVNX16_XP(vecOut2L, vaOutData, pvecOut, 2 * enable2ndRow * varLen); + IVP_SAPOSNX16_FP(vaOutData, pvecOut); + + /* Storing the 2nd row , 2nd depth output */ + pvecOut = (xb_vecNx16 *) (pOutput + (enable2ndCh * outDataPitch2 + \ + enable2ndRow * outDataPitch1)); + IVP_SAVNX16_XP(vecOut4L, vaOutData, pvecOut, 2 * \ + enable2ndRow * enable2ndCh * varLen); + IVP_SAPOSNX16_FP(vaOutData, pvecOut); + + pOutput += 2 * outDataPitch2; + pCoeff += 2 * coeffPitch3; + } /* end of (outCh = 0; outCh < numOutCh; outCh += 2)*/ + } /* end of for (y = 0; y < outH; y += 2)*/ + } /* end of for (x = 0; x < outW; x += vectorizationWidth)*/ + } + else if (kWidthU > 4) + { + /* loop across output channels is unrolled twice + * to produce two output channels in 1 iteration. + * Also loop across output height by 2 , thereby + * producing 4 output vectors simultaneously. 
+ */ + for (x = 0; x < outW; x += vectorizationWidth) /* Loop across Output width */ + { + /* out of bound flag */ + int32_t flag = XT_SALT(32, inW - x); + + for (y = 0; y < outH; y += 2) /* Loop across Output height */ + { + /* handles odd output row */ + int32_t enable2ndRow = XT_SALT(y, outH - 1); + + /* initialize output data pointer */ + int16_t *pOutput = &pOutData[(y * outDataPitch1 + x)]; + + /* initialize input data pointer */ + int16_t *pInput = &pInData[inDataPitch1 * (y) + (x)]; + + /* initialize coeff and bias data pointer*/ + int16_t *pCoeff = &pCoeffData[0]; + pdvecBias64 = (xb_vec2Nx8 *) pBiasData64; + valign vaBias = IVP_LA2NX8_PP(pdvecBias64); + + for (outCh = 0; outCh < numOutCh; outCh += 2) /* Loop across Output depth */ + { + /* handles odd output channel */ + int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1); + + /* wide vectors(accumulators) initialized with bias */ + xb_vecNx48 accSum11, accSum12, accSum21, accSum22; + ACC_INIT_BIAS64_MOW_ONEACC(pdvecBias64, vaBias, accSum11, 1); + ACC_INIT_BIAS64_MOW_ONEACC(pdvecBias64, vaBias, accSum21, enable2ndCh); + accSum12 = accSum11; accSum22 = accSum21; + + /* priming of coeff load is done outside the innermost loop*/ + phvecCoeff1 = (xb_vecN_2x32v *) (pCoeff); + valign vaCoeffData1; vaCoeffData1 = IVP_LAN_2X32_PP(phvecCoeff1); + + phvecCoeff2 = (xb_vecN_2x32v *) (pCoeff + coeffPitch3 * enable2ndCh); + valign vaCoeffData2; vaCoeffData2 = IVP_LAN_2X32_PP(phvecCoeff2); + + for (inCh = 0; inCh < numInCh; inCh++) /* Loop across input channels */ + { + /* variable declarations for input and coeff vectors */ + + xb_vecN_2x32v hvecCoeffData11; + xb_vecN_2x32v hvecCoeffData21; + + /* vecInData11 refers to 1st input row, first 32(or lesser) elements + * and vecInData12 refers to next few left out elements of the same row + * required to compute one 32 way output vector(To compute one 32 way + * output vector, we require 32 + edge1 + edge2 number of input elements) + */ + xb_vecNx16 vecInData11, vecInData12, 
vecInData11A; + xb_vecNx16 vecInData21, vecInData22, vecInData21A; + + pvecIn1 = (xb_vecNx16 *) (pInput + inCh * inDataPitch2); + pvecIn2 = (xb_vecNx16 *) (pInput + inCh * inDataPitch2 + inDataPitch1 * enable2ndRow); + + for (ky = 0; ky < kHeightU; ky++) /* Loop across kernel height */ + { + /* loads 1st input row */ + valign vaInData = IVP_LANX16_PP(pvecIn1); + IVP_LANX16_XP(vecInData11, vaInData, pvecIn1, 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + IVP_LANX16_XP(vecInData12, vaInData, pvecIn1, 2 * (inDataPitch1 - XCHAL_IVPN_SIMD_WIDTH * flag)); + + /* loads 2nd input row */ + vaInData = IVP_LANX16_PP(pvecIn2); + IVP_LANX16_XP(vecInData21, vaInData, pvecIn2, 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + IVP_LANX16_XP(vecInData22, vaInData, pvecIn2, 2 * (inDataPitch1 - XCHAL_IVPN_SIMD_WIDTH * flag)); + + /* load 1 row of coeff for 1st output channel */ + IVP_LAVN_2X32_XP(hvecCoeffData11, vaCoeffData1, phvecCoeff1, coeffPitch1 * 2); + + /* load 1 row of coeff for 2nd output channel */ + IVP_LAVN_2X32_XP(hvecCoeffData21, vaCoeffData2, phvecCoeff2, coeffPitch1 * 2); + + vecInData11A = IVP_SELNX16I(vecInData12, vecInData11, IVP_SELI_8B_ROTATE_RIGHT_2); + vecInData21A = IVP_SELNX16I(vecInData22, vecInData21, IVP_SELI_8B_ROTATE_RIGHT_2); + /* multiples loaded input data with first two coeff */ + IVP_MULPAN16XR16(accSum11, vecInData11A, vecInData11, IVP_EXTRN_2X32(hvecCoeffData11, 0)); + IVP_MULPAN16XR16(accSum21, vecInData11A, vecInData11, IVP_EXTRN_2X32(hvecCoeffData21, 0)); + IVP_MULPAN16XR16(accSum12, vecInData21A, vecInData21, IVP_EXTRN_2X32(hvecCoeffData11, 0)); + IVP_MULPAN16XR16(accSum22, vecInData21A, vecInData21, IVP_EXTRN_2X32(hvecCoeffData21, 0)); + /* multiples loaded input data with first four coeff */ + + /* right rotate the input vectors by 2 elements + * in order to multiply with next column of + * coeff in the next iteration + */ + IVP_DSELNX16(vecInData12, vecInData11, vecInData12, vecInData11, dvecSel); + IVP_DSELNX16(vecInData22, vecInData21, vecInData22, 
vecInData21, dvecSel); + + vecInData11A = IVP_SELNX16I(vecInData12, vecInData11, IVP_SELI_8B_ROTATE_RIGHT_2); + vecInData21A = IVP_SELNX16I(vecInData22, vecInData21, IVP_SELI_8B_ROTATE_RIGHT_2); + /* multiples loaded input data with first two coeff */ + IVP_MULPAN16XR16(accSum11, vecInData11A, vecInData11, IVP_EXTRN_2X32(hvecCoeffData11, 1)); + IVP_MULPAN16XR16(accSum21, vecInData11A, vecInData11, IVP_EXTRN_2X32(hvecCoeffData21, 1)); + IVP_MULPAN16XR16(accSum12, vecInData21A, vecInData21, IVP_EXTRN_2X32(hvecCoeffData11, 1)); + IVP_MULPAN16XR16(accSum22, vecInData21A, vecInData21, IVP_EXTRN_2X32(hvecCoeffData21, 1)); + + /* right rotate the input vectors by 2 elements + * in order to multiply with next column of + * coeff in the next iteration + */ + IVP_DSELNX16(vecInData12, vecInData11, vecInData12, vecInData11, dvecSel); + IVP_DSELNX16(vecInData22, vecInData21, vecInData22, vecInData21, dvecSel); + + vecInData11A = IVP_SELNX16I(vecInData12, vecInData11, IVP_SELI_8B_ROTATE_RIGHT_2); + vecInData21A = IVP_SELNX16I(vecInData22, vecInData21, IVP_SELI_8B_ROTATE_RIGHT_2); + /* multiples loaded input data with first two coeff */ + IVP_MULPAN16XR16(accSum11, vecInData11A, vecInData11, IVP_EXTRN_2X32(hvecCoeffData11, 2)); + IVP_MULPAN16XR16(accSum21, vecInData11A, vecInData11, IVP_EXTRN_2X32(hvecCoeffData21, 2)); + IVP_MULPAN16XR16(accSum12, vecInData21A, vecInData21, IVP_EXTRN_2X32(hvecCoeffData11, 2)); + IVP_MULPAN16XR16(accSum22, vecInData21A, vecInData21, IVP_EXTRN_2X32(hvecCoeffData21, 2)); + + /* right rotate the input vectors by 2 elements + * in order to multiply with next column of + * coeff in the next iteration + */ + IVP_DSELNX16(vecInData12, vecInData11, vecInData12, vecInData11, dvecSel); + IVP_DSELNX16(vecInData22, vecInData21, vecInData22, vecInData21, dvecSel); + + vecInData11A = IVP_SELNX16I(vecInData12, vecInData11, IVP_SELI_8B_ROTATE_RIGHT_2); + vecInData21A = IVP_SELNX16I(vecInData22, vecInData21, IVP_SELI_8B_ROTATE_RIGHT_2); + /* multiples loaded 
input data with first two coeff */ + IVP_MULPAN16XR16(accSum11, vecInData11A, vecInData11, IVP_EXTRN_2X32(hvecCoeffData11, 3)); + IVP_MULPAN16XR16(accSum21, vecInData11A, vecInData11, IVP_EXTRN_2X32(hvecCoeffData21, 3)); + IVP_MULPAN16XR16(accSum12, vecInData21A, vecInData21, IVP_EXTRN_2X32(hvecCoeffData11, 3)); + IVP_MULPAN16XR16(accSum22, vecInData21A, vecInData21, IVP_EXTRN_2X32(hvecCoeffData21, 3)); + } /* end of for (ky = 0; ky < kHeightU; ky++)*/ + } /* end of for (inCh = 0; inCh < numInCh; inCh++)*/ + + /* Pack, Output Scale, Output Shift and clamping */ + xb_vecNx16 vecOut1L, vecOut2L, vecOut3L, vecOut4L; +#if DILATED_VQ_CONV_S16 == VQ_TRUE + PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut1L, accSum11, packShiftAccU, \ + pOutScaleData[outCh], outShiftU, minLim, maxLim); + PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut2L, accSum12, packShiftAccU, \ + pOutScaleData[outCh], outShiftU, minLim, maxLim); + PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut3L, accSum21, packShiftAccU, \ + pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim); + PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut4L, accSum22, packShiftAccU, \ + pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim); +#elif DILATED_VQ_CONV_S16 == VQ_FALSE + PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut1L, accSum11, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim); + PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut2L, accSum12, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim); + PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut3L, accSum21, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim); + PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut4L, accSum22, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim); +#endif + /* variable store count */ + varLen = XT_MIN(outW - x, vectorizationWidth); + + /* Storing the first row , first depth output */ + pvecOut = (xb_vecNx16 *) (pOutput); + valign vaOutData = IVP_ZALIGN(); + IVP_SAVNX16_XP(vecOut1L, vaOutData, pvecOut, 2 * varLen); + IVP_SAPOSNX16_FP(vaOutData, pvecOut); + + /* 
Storing the first row , 2nd depth output */ + pvecOut = (xb_vecNx16 *) (pOutput + enable2ndCh * outDataPitch2); + IVP_SAVNX16_XP(vecOut3L, vaOutData, pvecOut, 2 * enable2ndCh * varLen); + IVP_SAPOSNX16_FP(vaOutData, pvecOut); + + /* Storing the 2nd row , 1st depth output */ + pvecOut = (xb_vecNx16 *) (pOutput + enable2ndRow * outDataPitch1); + IVP_SAVNX16_XP(vecOut2L, vaOutData, pvecOut, 2 * enable2ndRow * varLen); + IVP_SAPOSNX16_FP(vaOutData, pvecOut); + + /* Storing the 2nd row , 2nd depth output */ + pvecOut = (xb_vecNx16 *) (pOutput + (enable2ndCh * outDataPitch2 + \ + enable2ndRow * outDataPitch1)); + IVP_SAVNX16_XP(vecOut4L, vaOutData, pvecOut, 2 * \ + enable2ndRow * enable2ndCh * varLen); + IVP_SAPOSNX16_FP(vaOutData, pvecOut); + + pOutput += 2 * outDataPitch2; + pCoeff += 2 * coeffPitch3; + } /* end of (outCh = 0; outCh < numOutCh; outCh += 2)*/ + } /* end of for (y = 0; y < outH; y += 2)*/ + } /* end of for (x = 0; x < outW; x += vectorizationWidth)*/ + } + else + { + /* loop across output channels is unrolled twice + * to produce two output channels in 1 iteration. + * Also loop across output height by 2 , thereby + * producing 4 output vectors simultaneously. 
+ */ + for (x = 0; x < outW; x += vectorizationWidth) /* Loop across Output width */ + { + /* out of bound flag */ + int32_t flag = XT_SALT(32, inW - x); + + for (y = 0; y < outH; y += 2) /* Loop across Output height */ + { + /* handles odd output row */ + int32_t enable2ndRow = XT_SALT(y, outH - 1); + + /* initialize output data pointer */ + int16_t *pOutput = &pOutData[(y * outDataPitch1 + x)]; + + /* initialize input data pointer */ + int16_t *pInput = &pInData[inDataPitch1 * (y) + (x)]; + + /* initialize coeff and bias data pointer*/ + int16_t *pCoeff = &pCoeffData[0]; + pdvecBias64 = (xb_vec2Nx8 *) pBiasData64; + valign vaBias = IVP_LA2NX8_PP(pdvecBias64); + + for (outCh = 0; outCh < numOutCh; outCh += 2) /* Loop across Output depth */ + { + /* handles odd output channel */ + int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1); + + /* wide vectors(accumulators) initialized with bias */ + xb_vecNx48 accSum11, accSum12, accSum21, accSum22; + ACC_INIT_BIAS64_MOW_ONEACC(pdvecBias64, vaBias, accSum11, 1); + ACC_INIT_BIAS64_MOW_ONEACC(pdvecBias64, vaBias, accSum21, enable2ndCh); + accSum12 = accSum11; accSum22 = accSum21; + + /* priming of coeff load is done outside the innermost loop*/ + phvecCoeff1 = (xb_vecN_2x32v *) (pCoeff); + valign vaCoeffData1; vaCoeffData1 = IVP_LAN_2X32_PP(phvecCoeff1); + + phvecCoeff2 = (xb_vecN_2x32v *) (pCoeff + coeffPitch3 * enable2ndCh); + valign vaCoeffData2; vaCoeffData2 = IVP_LAN_2X32_PP(phvecCoeff2); + + for (inCh = 0; inCh < numInCh; inCh++) /* Loop across input channels */ + { + /* variable declarations for input and coeff vectors */ + xb_vecN_2x32v hvecCoeffData11; + xb_vecN_2x32v hvecCoeffData21; + + /* vecInData11 refers to 1st input row, first 32(or lesser) elements + * and vecInData12 refers to next few left out elements of the same row + * required to compute one 32 way output vector(To compute one 32 way + * output vector, we require 32 + edge1 + edge2 number of input elements) + */ + xb_vecNx16 vecInData11, vecInData12, 
vecInData11A; + xb_vecNx16 vecInData21, vecInData22, vecInData21A; + + pvecIn1 = (xb_vecNx16 *) (pInput + inCh * inDataPitch2); + pvecIn2 = (xb_vecNx16 *) (pInput + inCh * inDataPitch2 + inDataPitch1 * enable2ndRow); + +#ifdef IS_VISION_130 + for (ky = 0; ky < kHeightU; ky++) /* Loop across kernel height */ + { + /* loads 1st input row */ + IVP_L2UNX16_XP(vecInData11, pvecIn1, 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + IVP_L2UNX16_XP(vecInData12, pvecIn1, 2 * (inDataPitch1 - XCHAL_IVPN_SIMD_WIDTH * flag)); + + /* loads 2nd input row */ + IVP_L2UNX16_XP(vecInData21, pvecIn2, 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + IVP_L2UNX16_XP(vecInData22, pvecIn2, 2 * (inDataPitch1 - XCHAL_IVPN_SIMD_WIDTH * flag)); + + /* load 1 row of coeff for 1st output channel */ + IVP_LAVN_2X32_XP(hvecCoeffData11, vaCoeffData1, phvecCoeff1, coeffPitch1 * 2); + + /* load 1 row of coeff for 2nd output channel */ + IVP_LAVN_2X32_XP(hvecCoeffData21, vaCoeffData2, phvecCoeff2, coeffPitch1 * 2); + + vecInData11A = IVP_SELNX16I(vecInData12, vecInData11, IVP_SELI_8B_ROTATE_RIGHT_2); + vecInData21A = IVP_SELNX16I(vecInData22, vecInData21, IVP_SELI_8B_ROTATE_RIGHT_2); + /* multiples loaded input data with first two coeff */ + IVP_MULPAN16XR16(accSum11, vecInData11A, vecInData11, IVP_EXTRN_2X32(hvecCoeffData11, 0)); + IVP_MULPAN16XR16(accSum21, vecInData11A, vecInData11, IVP_EXTRN_2X32(hvecCoeffData21, 0)); + IVP_MULPAN16XR16(accSum12, vecInData21A, vecInData21, IVP_EXTRN_2X32(hvecCoeffData11, 0)); + IVP_MULPAN16XR16(accSum22, vecInData21A, vecInData21, IVP_EXTRN_2X32(hvecCoeffData21, 0)); + + IVP_DSELNX16(vecInData12, vecInData11, vecInData12, vecInData11, dvecSel); + IVP_DSELNX16(vecInData22, vecInData21, vecInData22, vecInData21, dvecSel); + + vecInData11A = IVP_SELNX16I(vecInData12, vecInData11, IVP_SELI_8B_ROTATE_RIGHT_2); + vecInData21A = IVP_SELNX16I(vecInData22, vecInData21, IVP_SELI_8B_ROTATE_RIGHT_2); + /* multiples loaded input data with first two coeff */ + IVP_MULPAN16XR16(accSum11, vecInData11A, 
vecInData11, IVP_EXTRN_2X32(hvecCoeffData11, 1)); + IVP_MULPAN16XR16(accSum21, vecInData11A, vecInData11, IVP_EXTRN_2X32(hvecCoeffData21, 1)); + IVP_MULPAN16XR16(accSum12, vecInData21A, vecInData21, IVP_EXTRN_2X32(hvecCoeffData11, 1)); + IVP_MULPAN16XR16(accSum22, vecInData21A, vecInData21, IVP_EXTRN_2X32(hvecCoeffData21, 1)); + } /* for (ky = 0; ky < kHeightU; ky++)*/ + +#else + for (ky = 0; ky < kHeightU; ky++) /* Loop across kernel height */ + { + /* loads 1st input row */ + valign vaInData = IVP_LANX16_PP(pvecIn1); + IVP_LANX16_XP(vecInData11, vaInData, pvecIn1, 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + IVP_LANX16_XP(vecInData12, vaInData, pvecIn1, 2 * (inDataPitch1 - XCHAL_IVPN_SIMD_WIDTH * flag)); + + /* loads 2nd input row */ + vaInData = IVP_LANX16_PP(pvecIn2); + IVP_LANX16_XP(vecInData21, vaInData, pvecIn2, 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + IVP_LANX16_XP(vecInData22, vaInData, pvecIn2, 2 * (inDataPitch1 - XCHAL_IVPN_SIMD_WIDTH * flag)); + + /* load 1 row of coeff for 1st output channel */ + IVP_LAVN_2X32_XP(hvecCoeffData11, vaCoeffData1, phvecCoeff1, coeffPitch1 * 2); + + /* load 1 row of coeff for 2nd output channel */ + IVP_LAVN_2X32_XP(hvecCoeffData21, vaCoeffData2, phvecCoeff2, coeffPitch1 * 2); + + vecInData11A = IVP_SELNX16I(vecInData12, vecInData11, IVP_SELI_8B_ROTATE_RIGHT_2); + vecInData21A = IVP_SELNX16I(vecInData22, vecInData21, IVP_SELI_8B_ROTATE_RIGHT_2); + /* multiples loaded input data with first two coeff */ + IVP_MULPAN16XR16(accSum11, vecInData11A, vecInData11, IVP_EXTRN_2X32(hvecCoeffData11, 0)); + IVP_MULPAN16XR16(accSum21, vecInData11A, vecInData11, IVP_EXTRN_2X32(hvecCoeffData21, 0)); + IVP_MULPAN16XR16(accSum12, vecInData21A, vecInData21, IVP_EXTRN_2X32(hvecCoeffData11, 0)); + IVP_MULPAN16XR16(accSum22, vecInData21A, vecInData21, IVP_EXTRN_2X32(hvecCoeffData21, 0)); + + IVP_DSELNX16(vecInData12, vecInData11, vecInData12, vecInData11, dvecSel); + IVP_DSELNX16(vecInData22, vecInData21, vecInData22, vecInData21, dvecSel); + + vecInData11A 
= IVP_SELNX16I(vecInData12, vecInData11, IVP_SELI_8B_ROTATE_RIGHT_2); + vecInData21A = IVP_SELNX16I(vecInData22, vecInData21, IVP_SELI_8B_ROTATE_RIGHT_2); + /* multiples loaded input data with first two coeff */ + IVP_MULPAN16XR16(accSum11, vecInData11A, vecInData11, IVP_EXTRN_2X32(hvecCoeffData11, 1)); + IVP_MULPAN16XR16(accSum21, vecInData11A, vecInData11, IVP_EXTRN_2X32(hvecCoeffData21, 1)); + IVP_MULPAN16XR16(accSum12, vecInData21A, vecInData21, IVP_EXTRN_2X32(hvecCoeffData11, 1)); + IVP_MULPAN16XR16(accSum22, vecInData21A, vecInData21, IVP_EXTRN_2X32(hvecCoeffData21, 1)); + } /* for (ky = 0; ky < kHeightU; ky++)*/ +#endif + } /* end of for (inCh = 0; inCh < numInCh; inCh++)*/ + + /* Pack, Output Scale, Output Shift and clamping */ + xb_vecNx16 vecOut1L, vecOut2L, vecOut3L, vecOut4L; +#if DILATED_VQ_CONV_S16 == VQ_TRUE + PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut1L, accSum11, packShiftAccU, \ + pOutScaleData[outCh], outShiftU, minLim, maxLim); + PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut2L, accSum12, packShiftAccU, \ + pOutScaleData[outCh], outShiftU, minLim, maxLim); + PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut3L, accSum21, packShiftAccU, \ + pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim); + PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut4L, accSum22, packShiftAccU, \ + pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim); +#elif DILATED_VQ_CONV_S16 == VQ_FALSE + PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut1L, accSum11, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim); + PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut2L, accSum12, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim); + PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut3L, accSum21, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim); + PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut4L, accSum22, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim); +#endif + /* variable store count */ + varLen = XT_MIN(outW - x, vectorizationWidth); + + /* Storing the first row , first depth output */ + 
pvecOut = (xb_vecNx16 *) (pOutput); + valign vaOutData = IVP_ZALIGN(); + IVP_SAVNX16_XP(vecOut1L, vaOutData, pvecOut, 2 * varLen); + IVP_SAPOSNX16_FP(vaOutData, pvecOut); + + /* Storing the first row , 2nd depth output */ + pvecOut = (xb_vecNx16 *) (pOutput + enable2ndCh * outDataPitch2); + IVP_SAVNX16_XP(vecOut3L, vaOutData, pvecOut, 2 * enable2ndCh * varLen); + IVP_SAPOSNX16_FP(vaOutData, pvecOut); + + /* Storing the 2nd row , 1st depth output */ + pvecOut = (xb_vecNx16 *) (pOutput + enable2ndRow * outDataPitch1); + IVP_SAVNX16_XP(vecOut2L, vaOutData, pvecOut, 2 * enable2ndRow * varLen); + IVP_SAPOSNX16_FP(vaOutData, pvecOut); + + /* Storing the 2nd row , 2nd depth output */ + pvecOut = (xb_vecNx16 *) (pOutput + (enable2ndCh * outDataPitch2 + \ + enable2ndRow * outDataPitch1)); + IVP_SAVNX16_XP(vecOut4L, vaOutData, pvecOut, 2 * \ + enable2ndRow * enable2ndCh * varLen); + IVP_SAPOSNX16_FP(vaOutData, pvecOut); + + pOutput += 2 * outDataPitch2; + pCoeff += 2 * coeffPitch3; + } /* end of (outCh = 0; outCh < numOutCh; outCh += 2)*/ + } /* end of for (y = 0; y < outH; y += 2)*/ + } /* end of for (x = 0; x < outW; x += vectorizationWidth)*/ + } + return(XAI_ERROR_STATUS()); +} + +/****************************************************************************** + * xaiConvolved(VQ)3D_S_MxNj2d1_S16S16I16_MOW_WHD + * ****************************************************************************/ +/******************************************************************************/ +/* Description : P6 optimized generic implementation for MxN 3D convolution */ +/* with stride = 2. Code implementation is generated during */ +/* preprocessing stage. 
This method can be used to generate */ +/* MxN 3D dilated convolution function and MxN 3D VQ dilated */ +/* convolution function for S16 bit input data with input stride*/ +/* equal to 2 */ +/* Inputs : Input Data Tile, Coeff Data Tile, Bias Array, */ +/* Output scale array, CNN convolution params structure */ +/* Outputs : XI Error Code */ +/* InOuts : Output Tile */ +/* Assumptions : CoeffData is S16 */ +/* biasArray is signed 64, value not exceeding signed 48b */ +/* Output scale array is U16 */ +/* OutData is S16 / U16 */ +/* Kernel Size is MxNxDxN */ +/* Input and Output are in WHD format */ +/* Coeff is in WHDN format */ +/******************************************************************************/ + +/****************** xaiConvolved3D_S_MxNj2d1_S16S16I16_MOW_WHD ******************/ +/****************** xaiConvolvedVQ3D_S_MxNj2d1_S16S16I16_MOW_WHD ****************/ + +XAI_ERR_TYPE MAKE_NAME(MAKE_NAME_VQ(xaiConvolved, 3D_S_MxNj2d1), S16I16_MOW_WHD) MAKE_ARGUMENTS(inTile, coeffTile, biasArray, outTile, param) +{ + /* Error Checks */ + XAI_ERROR_CHECKS() + { + XAI_CHECK_TILE3D_S16(inTile); + XAI_CHECK_TILE3D_I16(outTile); + XAI_CHECK_TILE4D_S16(coeffTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile); + XAI_CHECK_TILE4D_IN_DRAM_BOUNDARY(coeffTile); + XAI_CHECK_POINTER(param); + XAI_CHECK_ARRAY_S64(biasArray); + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTile); + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(coeffTile, outTile); + XAI_CHECK_ERROR((XAI_TILE4D_GET_DIM1(coeffTile) <= 16) && \ + (XAI_TILE4D_GET_DIM2(coeffTile) <= 16), \ + XAI_ERR_KSIZE, "Kernel Width = %u and Kernel Height = %u\n \ + Kernel Width or Height should be less than or equal to 16", \ + XAI_TILE4D_GET_DIM1(coeffTile), XAI_TILE4D_GET_DIM2(coeffTile)); + XAI_CHECK_ERROR((XAI_CNN_CONV_GET_STRIDEX(param) == XAI_CNN_CONV_GET_STRIDEY(param)), \ + XAI_ERR_BADARG, "Stride along width = %u and Stride along height = %u\n \ + Stride along width should be equal to 
stride along height.", \ + XAI_CNN_CONV_GET_STRIDEX(param), XAI_CNN_CONV_GET_STRIDEY(param)); + XAI_CHECK_STRIDE(param, 2); + XAI_CHECK_DILATION(param, 1); + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_DILATIONX(param) == XAI_CNN_CONV_GET_DILATIONY(param), \ + XAI_ERR_BADARG, "Dilation along width = %u and Dilation along height = %u\n \ + Dilation along width should be equal to dilation along height.", \ + XAI_CNN_CONV_GET_DILATIONX(param), XAI_CNN_CONV_GET_DILATIONY(param)); + XAI_CHECK_EDGES_MOW_WHD(inTile, coeffTile, param); + XAI_CHECK_TILE3D_DATA_ORDER(inTile, XAI_WHD); + XAI_CHECK_TILE3D_DATA_ORDER(outTile, XAI_WHD); + XAI_CHECK_TILE4D_DATA_ORDER(coeffTile, XAI_WHDN); + XAI_CHECK_CONSISTENCY_MOW_WHD(inTile, coeffTile, biasArray, outTile, param); + XAI_CHECK_COEFFTILE_CONTIGUOUS(coeffTile, param); + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_ACCUM_SHIFT(param) < 32, \ + XAI_ERR_NORM, "Accumulator shift value = %u\nThe accumulator shift value should be less than 32", \ + XAI_CNN_CONV_GET_ACCUM_SHIFT(param)); + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_OUTPUT_SHIFT(param) < 32, \ + XAI_ERR_NORM, "Output shift = %u\nThe output shift value should be less than 32", \ + XAI_CNN_CONV_GET_OUTPUT_SHIFT(param)); + XAI_CHECK_CONV_RELU_LIMITS_IX(param, outTile); +#if DILATED_VQ_CONV_S16 == VQ_TRUE + XAI_CHECK_ARRAY_U16(outputScaleArray); + XAI_CHECK_ERROR(XAI_ARRAY_GET_WIDTH(outputScaleArray) >= XAI_TILE4D_GET_DIM4(coeffTile), \ + XAI_ERR_DATASIZE, "Width of Output Scale Array = %u and Number of Kernels = %u\n \ + Width of Output Scale Array should be greater than or equal to number of kernels.", \ + XAI_ARRAY_GET_WIDTH(outputScaleArray), XAI_TILE4D_GET_DIM4(coeffTile)); + XAI_CHECK_ERROR((((uintptr_t) (XAI_ARRAY_GET_DATA_PTR(outputScaleArray)) & \ + 0x1) == 0), XAI_ERR_NORM, "The output scale array is not aligned to 2 byte boundary"); +#endif + } +#if DILATED_VQ_CONV_S16 == VQ_FALSE + if (XAI_CNN_CONV_GET_OUTPUT_SCALE(param) == 0) + { + int32_t fillValue; + int32_t reluFlag = 
XAI_CNN_CONV_GET_FLAG_RELU(param); + fillValue = reluFlag ? (CLAMP(0, XAI_CNN_CONV_GET_RELU_MIN(param), XAI_CNN_CONV_GET_RELU_MAX(param))) : 0; + return(xaiFillTile3D(outTile, fillValue, 0)); + } +#endif + + /* Getting parameters from the tile structures */ + const int32_t inW = XAI_TILE3D_GET_DIM1(inTile) + \ + XAI_TILE3D_GET_DIM1_EDGE1(inTile) + XAI_TILE3D_GET_DIM1_EDGE2(inTile); + const int32_t outW = XAI_TILE3D_GET_DIM1(outTile); + const int32_t outH = XAI_TILE3D_GET_DIM2(outTile); + const int32_t numInCh = XAI_TILE3D_GET_DIM3(inTile); + const int32_t numOutCh = XAI_TILE3D_GET_DIM3(outTile); + + /* Kernel Size (WHDN)*/ + const int32_t kWidthU = XAI_TILE4D_GET_DIM1(coeffTile); + const int32_t kHeightU = XAI_TILE4D_GET_DIM2(coeffTile); + + /* CNN convolution parameters */ + const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param); + const uint8_t outShiftU = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param); + const uint8_t enableReLu = XAI_CNN_CONV_GET_FLAG_RELU(param); + const uint8_t stride = XAI_CNN_CONV_GET_STRIDE(param); + const uint8_t leftEdgeFlag = XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param); + const uint8_t topEdgeFlag = XAI_CNN_CONV_GET_FLAG_TOPEDGE(param); + + /* Pitches of Coefficient Data (WHDN) */ + const int32_t coeffPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTile); + const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile); + + /* Pitches of Input Data (WHD) in dim1 and dim2 */ + const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile); + const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile); + + /* Pitch of Output Data (WHD) in dim1 and dim2 */ + const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile); + const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile); + + /* Data Pointers of input, output, coefficient and bias data */ + int16_t* pInData = (int16_t *) XAI_TILE3D_GET_DATA_PTR(inTile); + int16_t* pOutData = (int16_t *) XAI_TILE3D_GET_DATA_PTR(outTile); + int64_t* pBiasData64 = (int64_t *) 
XAI_ARRAY_GET_DATA_PTR(biasArray); + int16_t* pCoeffData = (int16_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile); + +#if DILATED_VQ_CONV_S16 == VQ_TRUE + uint16_t* restrict pOutScaleData = (uint16_t *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray); +#elif DILATED_VQ_CONV_S16 == VQ_FALSE + const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param); +#endif + + int32_t leftEdge, topEdge; + int32_t minLim, maxLim; + + if ((kWidthU % 2) != 0) + { + leftEdge = kWidthU / 2; + } + else + { + leftEdge = leftEdgeFlag ? (kWidthU / 2) : ((kWidthU / 2) - 1); + } + + if ((kHeightU % 2) != 0) + { + topEdge = kHeightU / 2; + } + else + { + topEdge = topEdgeFlag ? (kHeightU / 2) : ((kHeightU / 2) - 1); + } + + /* Move pointer to the start of the active data (including edge) */ + pInData = &pInData[-(topEdge * inDataPitch1 + leftEdge)]; + + /* Setting the limits for output data according to ReLu Flag and outTileType */ + if (enableReLu) + { + minLim = XAI_CNN_CONV_GET_RELU_MIN(param); + maxLim = XAI_CNN_CONV_GET_RELU_MAX(param); + } + else + { + minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MIN : 0; + maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX : USHRT_MAX; + } + + /* Variable Declarations */ + int32_t inCh, outCh, x, y, ky; + int32_t varLen; + + xb_vecNx16 * restrict pvecIn1; + xb_vecNx16 * restrict pvecIn2; + xb_vecNx16* restrict pvecOut; + xb_vecN_2x32v* restrict phvecCoeff1, *restrict phvecCoeff2; + xb_vec2Nx8 *restrict pdvecBias64; + + xb_vec2Nx8 seq1 = IVP_ADD2NX8(IVP_SEQ2NX8(), 1); + xb_vec2Nx8 seq2 = IVP_ADD2NX8(IVP_SEQ2NX8(), 33); + seq2 = IVP_MIN2NX8(seq2, 64); + xb_vec2Nx8 dvecSel = IVP_SEL2NX8I(seq2, seq1, IVP_SELI_8B_INTERLEAVE_1_LO); + + /* Number of output elements that can be generated + * with 2 input vector loads(32 way).*/ + const int32_t vectorizationWidth = (((2 * XCHAL_IVPN_SIMD_WIDTH) - kWidthU) / stride) + 1; + + if (kWidthU > 12) + { + /* loop across output channels is unrolled twice + * to produce two output channels in 1 iteration. 
+ * Also loop across output height by 2 , thereby + * producing 4 output vectors simultaneously. + */ + for (x = 0; x < outW; x += vectorizationWidth) /* Loop across Output width */ + { + /* out of bound flag */ + int32_t flag = XT_SALT(XCHAL_IVPN_SIMD_WIDTH, inW - stride * x); + + for (y = 0; y < outH; y += 2) /* Loop across Output height */ + { + /* In order to handle odd output height */ + int32_t enable2ndRow = XT_SALT(y, outH - 1); + + /* initialize output data pointer */ + int16_t *pOutput = &pOutData[(y * outDataPitch1 + x)]; + + /* initialize input data pointer */ + int16_t *pInput = &pInData[inDataPitch1 * stride * (y) + stride * (x)]; + + /* initialize coeff and bias data pointer*/ + int16_t *pCoeff = &pCoeffData[0]; + pdvecBias64 = (xb_vec2Nx8 *) pBiasData64; + valign vaBias = IVP_LA2NX8_PP(pdvecBias64); + + for (outCh = 0; outCh < numOutCh; outCh += 2) /* Loop across Output depth */ + { + /* handles odd output channel */ + int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1); + + /* wide vectors(accumulators) initialized with bias */ + xb_vecNx48 accSum11, accSum12, accSum21, accSum22; + ACC_INIT_BIAS64_MOW_ONEACC(pdvecBias64, vaBias, accSum11, 1); + ACC_INIT_BIAS64_MOW_ONEACC(pdvecBias64, vaBias, accSum21, enable2ndCh); + accSum12 = accSum11; accSum22 = accSum21; + + /* priming of coeff load is done outside the innermost loop*/ + phvecCoeff1 = (xb_vecN_2x32v *) (pCoeff); + valign vaCoeffData1; vaCoeffData1 = IVP_LAN_2X32_PP(phvecCoeff1); + + phvecCoeff2 = (xb_vecN_2x32v *) (pCoeff + coeffPitch3 * enable2ndCh); + valign vaCoeffData2; vaCoeffData2 = IVP_LAN_2X32_PP(phvecCoeff2); + + for (inCh = 0; inCh < numInCh; inCh++) /* Loop across input channels */ + { + /* variable declarations for input and coeff vectors */ + + xb_vecN_2x32v hvecCoeffData11; + xb_vecN_2x32v hvecCoeffData21; + + /* vecInData11 refers to 1st input row, first 32(or lesser) elements + * and vecInData12 refers to next few left out elements of the same row + * required to compute one 32 
way output vector(To compute one 32 way + * output vector, we require 32 + edge1 + edge2 number of input elements) + */ + xb_vecNx16 vecInData11, vecInData12; + xb_vecNx16 vecInData21, vecInData22; + + pvecIn1 = (xb_vecNx16 *) (pInput + inCh * inDataPitch2); + pvecIn2 = (xb_vecNx16 *) (pInput + inCh * inDataPitch2 + \ + stride * inDataPitch1 * enable2ndRow); + + for (ky = 0; ky < kHeightU; ky++) /* Loop across kernel height */ + { + /* loads 1st input row */ + valign vaInData = IVP_LANX16_PP(pvecIn1); + IVP_LANX16_XP(vecInData11, vaInData, pvecIn1, 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + IVP_LANX16_XP(vecInData12, vaInData, pvecIn1, 2 * (inDataPitch1 - XCHAL_IVPN_SIMD_WIDTH * flag)); + + /* loads Next(3rd) input row, corresponding to 2nd output row */ + + vaInData = IVP_LANX16_PP(pvecIn2); + IVP_LANX16_XP(vecInData21, vaInData, pvecIn2, 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + IVP_LANX16_XP(vecInData22, vaInData, pvecIn2, 2 * (inDataPitch1 - XCHAL_IVPN_SIMD_WIDTH * flag)); + + /* Re-arrange the data in the desired format */ + /* Assume input as 1,2,3,4,5,6,7...64 */ + /* After re-arrangement using DSEL operation, updated vectors would be */ + /* vecInData1 : 1, 3, 5,...61 */ + /* vecInData2 : 2, 4, 6,...62 */ + IVP_DSELNX16(vecInData12, vecInData11, vecInData12, vecInData11, IVP_SEQ2NX8()); + IVP_DSELNX16(vecInData22, vecInData21, vecInData22, vecInData21, IVP_SEQ2NX8()); + + /* load 1 row of coeff for 1st output channel */ + IVP_LAVN_2X32_XP(hvecCoeffData11, vaCoeffData1, phvecCoeff1, coeffPitch1 * 2); + + /* load 1 row of coeff for 2nd output channel */ + IVP_LAVN_2X32_XP(hvecCoeffData21, vaCoeffData2, phvecCoeff2, coeffPitch1 * 2); + + /* multiples loaded input data with first two coeff */ + IVP_MULPAN16XR16(accSum11, vecInData12, vecInData11, IVP_EXTRN_2X32(hvecCoeffData11, 0)); + IVP_MULPAN16XR16(accSum21, vecInData12, vecInData11, IVP_EXTRN_2X32(hvecCoeffData21, 0)); + IVP_MULPAN16XR16(accSum12, vecInData22, vecInData21, IVP_EXTRN_2X32(hvecCoeffData11, 0)); + 
IVP_MULPAN16XR16(accSum22, vecInData22, vecInData21, IVP_EXTRN_2X32(hvecCoeffData21, 0)); + + IVP_DSELNX16(vecInData12, vecInData11, vecInData12, vecInData11, dvecSel); + IVP_DSELNX16(vecInData22, vecInData21, vecInData22, vecInData21, dvecSel); + + /* multiples loaded input data with first two coeff */ + IVP_MULPAN16XR16(accSum11, vecInData12, vecInData11, IVP_EXTRN_2X32(hvecCoeffData11, 1)); + IVP_MULPAN16XR16(accSum21, vecInData12, vecInData11, IVP_EXTRN_2X32(hvecCoeffData21, 1)); + IVP_MULPAN16XR16(accSum12, vecInData22, vecInData21, IVP_EXTRN_2X32(hvecCoeffData11, 1)); + IVP_MULPAN16XR16(accSum22, vecInData22, vecInData21, IVP_EXTRN_2X32(hvecCoeffData21, 1)); + + /* right rotate the input vectors by 2 elements + * in order to multiply with next column of + * coeff in the next iteration + */ + IVP_DSELNX16(vecInData12, vecInData11, vecInData12, vecInData11, dvecSel); + IVP_DSELNX16(vecInData22, vecInData21, vecInData22, vecInData21, dvecSel); + + /* multiples loaded input data with first two coeff */ + IVP_MULPAN16XR16(accSum11, vecInData12, vecInData11, IVP_EXTRN_2X32(hvecCoeffData11, 2)); + IVP_MULPAN16XR16(accSum21, vecInData12, vecInData11, IVP_EXTRN_2X32(hvecCoeffData21, 2)); + IVP_MULPAN16XR16(accSum12, vecInData22, vecInData21, IVP_EXTRN_2X32(hvecCoeffData11, 2)); + IVP_MULPAN16XR16(accSum22, vecInData22, vecInData21, IVP_EXTRN_2X32(hvecCoeffData21, 2)); + + /* right rotate the input vectors by 2 elements + * in order to multiply with next column of + * coeff in the next iteration + */ + IVP_DSELNX16(vecInData12, vecInData11, vecInData12, vecInData11, dvecSel); + IVP_DSELNX16(vecInData22, vecInData21, vecInData22, vecInData21, dvecSel); + + /* multiples loaded input data with first two coeff */ + IVP_MULPAN16XR16(accSum11, vecInData12, vecInData11, IVP_EXTRN_2X32(hvecCoeffData11, 3)); + IVP_MULPAN16XR16(accSum21, vecInData12, vecInData11, IVP_EXTRN_2X32(hvecCoeffData21, 3)); + IVP_MULPAN16XR16(accSum12, vecInData22, vecInData21, 
IVP_EXTRN_2X32(hvecCoeffData11, 3)); + IVP_MULPAN16XR16(accSum22, vecInData22, vecInData21, IVP_EXTRN_2X32(hvecCoeffData21, 3)); + + /* right rotate the input vectors by 2 + * in order to multiply with next column of + * coeff in the next iteration + */ + IVP_DSELNX16(vecInData12, vecInData11, vecInData12, vecInData11, dvecSel); + IVP_DSELNX16(vecInData22, vecInData21, vecInData22, vecInData21, dvecSel); + + /* multiples loaded input data with first two coeff */ + IVP_MULPAN16XR16(accSum11, vecInData12, vecInData11, IVP_EXTRN_2X32(hvecCoeffData11, 4)); + IVP_MULPAN16XR16(accSum21, vecInData12, vecInData11, IVP_EXTRN_2X32(hvecCoeffData21, 4)); + IVP_MULPAN16XR16(accSum12, vecInData22, vecInData21, IVP_EXTRN_2X32(hvecCoeffData11, 4)); + IVP_MULPAN16XR16(accSum22, vecInData22, vecInData21, IVP_EXTRN_2X32(hvecCoeffData21, 4)); + + /* right rotate the input vectors by 2 + * in order to multiply with next column of + * coeff in the next iteration + */ + IVP_DSELNX16(vecInData12, vecInData11, vecInData12, vecInData11, dvecSel); + IVP_DSELNX16(vecInData22, vecInData21, vecInData22, vecInData21, dvecSel); + + /* multiples loaded input data with first two coeff */ + IVP_MULPAN16XR16(accSum11, vecInData12, vecInData11, IVP_EXTRN_2X32(hvecCoeffData11, 5)); + IVP_MULPAN16XR16(accSum21, vecInData12, vecInData11, IVP_EXTRN_2X32(hvecCoeffData21, 5)); + IVP_MULPAN16XR16(accSum12, vecInData22, vecInData21, IVP_EXTRN_2X32(hvecCoeffData11, 5)); + IVP_MULPAN16XR16(accSum22, vecInData22, vecInData21, IVP_EXTRN_2X32(hvecCoeffData21, 5)); + + /* right rotate the input vectors by 2 + * in order to multiply with next column of + * coeff in the next iteration + */ + IVP_DSELNX16(vecInData12, vecInData11, vecInData12, vecInData11, dvecSel); + IVP_DSELNX16(vecInData22, vecInData21, vecInData22, vecInData21, dvecSel); + + /* multiples loaded input data with first two coeff */ + IVP_MULPAN16XR16(accSum11, vecInData12, vecInData11, IVP_EXTRN_2X32(hvecCoeffData11, 6)); + IVP_MULPAN16XR16(accSum21, 
vecInData12, vecInData11, IVP_EXTRN_2X32(hvecCoeffData21, 6)); + IVP_MULPAN16XR16(accSum12, vecInData22, vecInData21, IVP_EXTRN_2X32(hvecCoeffData11, 6)); + IVP_MULPAN16XR16(accSum22, vecInData22, vecInData21, IVP_EXTRN_2X32(hvecCoeffData21, 6)); + + /* right rotate the input vectors by 2 + * in order to multiply with next column of + * coeff in the next iteration + */ + IVP_DSELNX16(vecInData12, vecInData11, vecInData12, vecInData11, dvecSel); + IVP_DSELNX16(vecInData22, vecInData21, vecInData22, vecInData21, dvecSel); + + /* multiples loaded input data with first two coeff */ + IVP_MULPAN16XR16(accSum11, vecInData12, vecInData11, IVP_EXTRN_2X32(hvecCoeffData11, 7)); + IVP_MULPAN16XR16(accSum21, vecInData12, vecInData11, IVP_EXTRN_2X32(hvecCoeffData21, 7)); + IVP_MULPAN16XR16(accSum12, vecInData22, vecInData21, IVP_EXTRN_2X32(hvecCoeffData11, 7)); + IVP_MULPAN16XR16(accSum22, vecInData22, vecInData21, IVP_EXTRN_2X32(hvecCoeffData21, 7)); + } /* end of for (ky = 0; ky < kHeightU; ky++)*/ + } /* end of for (inCh = 0; inCh < numInCh; inCh++)*/ + + /* Pack, Output Scale, Output Shift and clamping */ + xb_vecNx16 vecOut1L, vecOut2L, vecOut3L, vecOut4L; +#if DILATED_VQ_CONV_S16 == VQ_TRUE + PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut1L, accSum11, packShiftAccU, \ + pOutScaleData[outCh], outShiftU, minLim, maxLim); + PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut2L, accSum12, packShiftAccU, \ + pOutScaleData[outCh], outShiftU, minLim, maxLim); + PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut3L, accSum21, packShiftAccU, \ + pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim); + PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut4L, accSum22, packShiftAccU, \ + pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim); +#elif DILATED_VQ_CONV_S16 == VQ_FALSE + PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut1L, accSum11, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim); + PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut2L, accSum12, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim); + 
PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut3L, accSum21, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim); + PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut4L, accSum22, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim); +#endif + /* variable store count */ + varLen = XT_MIN(outW - x, vectorizationWidth); + + /* Storing the first row , first depth output */ + pvecOut = (xb_vecNx16 *) (pOutput); + valign vaOutData = IVP_ZALIGN(); + IVP_SAVNX16_XP(vecOut1L, vaOutData, pvecOut, 2 * varLen); + IVP_SAPOSNX16_FP(vaOutData, pvecOut); + + /* Storing the first row , 2nd depth output */ + pvecOut = (xb_vecNx16 *) (pOutput + enable2ndCh * outDataPitch2); + IVP_SAVNX16_XP(vecOut3L, vaOutData, pvecOut, 2 * enable2ndCh * varLen); + IVP_SAPOSNX16_FP(vaOutData, pvecOut); + + /* Storing the 2nd row , 1st depth output */ + pvecOut = (xb_vecNx16 *) (pOutput + enable2ndRow * outDataPitch1); + IVP_SAVNX16_XP(vecOut2L, vaOutData, pvecOut, 2 * enable2ndRow * varLen); + IVP_SAPOSNX16_FP(vaOutData, pvecOut); + + /* Storing the 2nd row , 2nd depth output */ + pvecOut = (xb_vecNx16 *) (pOutput + (enable2ndCh * outDataPitch2 + \ + enable2ndRow * outDataPitch1)); + IVP_SAVNX16_XP(vecOut4L, vaOutData, pvecOut, 2 * \ + enable2ndRow * enable2ndCh * varLen); + IVP_SAPOSNX16_FP(vaOutData, pvecOut); + + pOutput += 2 * outDataPitch2; + pCoeff += 2 * coeffPitch3; + } /* end of (outCh = 0; outCh < numOutCh; outCh += 2)*/ + } /* end of for (y = 0; y < outH; y += 2)*/ + } /* end of for (x = 0; x < outW; x += vectorizationWidth)*/ + } + else if (kWidthU > 8) + { + /* loop across output channels is unrolled twice + * to produce two output channels in 1 iteration. + * Also loop across output height by 2 , thereby + * producing 4 output vectors simultaneously. 
+ */ + for (x = 0; x < outW; x += vectorizationWidth) /* Loop across Output width */ + { + /* out of bound flag */ + int32_t flag = XT_SALT(XCHAL_IVPN_SIMD_WIDTH, inW - stride * x); + + for (y = 0; y < outH; y += 2) /* Loop across Output height */ + { + /* In order to handle odd output height */ + int32_t enable2ndRow = XT_SALT(y, outH - 1); + /* initialize output data pointer */ + int16_t *pOutput = &pOutData[(y * outDataPitch1 + x)]; + + /* initialize input data pointer */ + int16_t *pInput = &pInData[inDataPitch1 * stride * (y) + stride * (x)]; + + /* initialize coeff and bias data pointer*/ + int16_t *pCoeff = &pCoeffData[0]; + pdvecBias64 = (xb_vec2Nx8 *) pBiasData64; + valign vaBias = IVP_LA2NX8_PP(pdvecBias64); + + for (outCh = 0; outCh < numOutCh; outCh += 2) /* Loop across Output depth */ + { + /* handles odd output channel*/ + int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1); + + /* wide vectors(accumulators) initialized with bias */ + xb_vecNx48 accSum11, accSum12, accSum21, accSum22; + ACC_INIT_BIAS64_MOW_ONEACC(pdvecBias64, vaBias, accSum11, 1); + ACC_INIT_BIAS64_MOW_ONEACC(pdvecBias64, vaBias, accSum21, enable2ndCh); + accSum12 = accSum11; accSum22 = accSum21; + + /* priming of coeff load is done outside the innermost loop*/ + phvecCoeff1 = (xb_vecN_2x32v *) (pCoeff); + valign vaCoeffData1; vaCoeffData1 = IVP_LAN_2X32_PP(phvecCoeff1); + + phvecCoeff2 = (xb_vecN_2x32v *) (pCoeff + coeffPitch3 * enable2ndCh); + valign vaCoeffData2; vaCoeffData2 = IVP_LAN_2X32_PP(phvecCoeff2); + + for (inCh = 0; inCh < numInCh; inCh++) /* Loop across input channels */ + { + /* variable declarations for input and coeff vectors */ + + xb_vecN_2x32v hvecCoeffData11; + xb_vecN_2x32v hvecCoeffData21; + + /* vecInData11 refers to 1st input row, first 32(or lesser) elements + * and vecInData12 refers to next few left out elements of the same row + * required to compute one 32 way output vector(To compute one 32 way + * output vector, we require 32 + edge1 + edge2 number of 
input elements) + */ + xb_vecNx16 vecInData11, vecInData12; + xb_vecNx16 vecInData21, vecInData22; + + pvecIn1 = (xb_vecNx16 *) (pInput + inCh * inDataPitch2); + pvecIn2 = (xb_vecNx16 *) (pInput + inCh * inDataPitch2 + \ + stride * inDataPitch1 * enable2ndRow); + + for (ky = 0; ky < kHeightU; ky++) /* Loop across kernel height */ + { + /* loads 1st input row */ + valign vaInData = IVP_LANX16_PP(pvecIn1); + IVP_LANX16_XP(vecInData11, vaInData, pvecIn1, 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + IVP_LANX16_XP(vecInData12, vaInData, pvecIn1, 2 * (inDataPitch1 - XCHAL_IVPN_SIMD_WIDTH * flag)); + + /* loads Next(3rd) input row, corresponding to 2nd output row */ + vaInData = IVP_LANX16_PP(pvecIn2); + IVP_LANX16_XP(vecInData21, vaInData, pvecIn2, 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + IVP_LANX16_XP(vecInData22, vaInData, pvecIn2, 2 * (inDataPitch1 - XCHAL_IVPN_SIMD_WIDTH * flag)); + + /* Re-arrange the data in the desired format */ + /* Assume input as 1,2,3,4,5,6,7...64 */ + /* After re-arrangement using DSEL operation, updated vectors would be */ + /* vecInData1 : 1, 3, 5,...61 */ + /* vecInData2 : 2, 4, 6,...62 */ + + IVP_DSELNX16(vecInData12, vecInData11, vecInData12, vecInData11, IVP_SEQ2NX8()); + IVP_DSELNX16(vecInData22, vecInData21, vecInData22, vecInData21, IVP_SEQ2NX8()); + + /* load 1 row of coeff for 1st output channel */ + IVP_LAVN_2X32_XP(hvecCoeffData11, vaCoeffData1, phvecCoeff1, coeffPitch1 * 2); + + /* load 1 row of coeff for 2nd output channel */ + IVP_LAVN_2X32_XP(hvecCoeffData21, vaCoeffData2, phvecCoeff2, coeffPitch1 * 2); + + /* multiples loaded input data with first two coeff */ + IVP_MULPAN16XR16(accSum11, vecInData12, vecInData11, IVP_EXTRN_2X32(hvecCoeffData11, 0)); + IVP_MULPAN16XR16(accSum21, vecInData12, vecInData11, IVP_EXTRN_2X32(hvecCoeffData21, 0)); + IVP_MULPAN16XR16(accSum12, vecInData22, vecInData21, IVP_EXTRN_2X32(hvecCoeffData11, 0)); + IVP_MULPAN16XR16(accSum22, vecInData22, vecInData21, IVP_EXTRN_2X32(hvecCoeffData21, 0)); + + 
IVP_DSELNX16(vecInData12, vecInData11, vecInData12, vecInData11, dvecSel); + IVP_DSELNX16(vecInData22, vecInData21, vecInData22, vecInData21, dvecSel); + + /* multiples loaded input data with first two coeff */ + IVP_MULPAN16XR16(accSum11, vecInData12, vecInData11, IVP_EXTRN_2X32(hvecCoeffData11, 1)); + IVP_MULPAN16XR16(accSum21, vecInData12, vecInData11, IVP_EXTRN_2X32(hvecCoeffData21, 1)); + IVP_MULPAN16XR16(accSum12, vecInData22, vecInData21, IVP_EXTRN_2X32(hvecCoeffData11, 1)); + IVP_MULPAN16XR16(accSum22, vecInData22, vecInData21, IVP_EXTRN_2X32(hvecCoeffData21, 1)); + + /* right rotate the input vectors by 2 elements + * in order to multiply with next column of + * coeff in the next iteration + */ + IVP_DSELNX16(vecInData12, vecInData11, vecInData12, vecInData11, dvecSel); + IVP_DSELNX16(vecInData22, vecInData21, vecInData22, vecInData21, dvecSel); + + /* multiples loaded input data with first two coeff */ + IVP_MULPAN16XR16(accSum11, vecInData12, vecInData11, IVP_EXTRN_2X32(hvecCoeffData11, 2)); + IVP_MULPAN16XR16(accSum21, vecInData12, vecInData11, IVP_EXTRN_2X32(hvecCoeffData21, 2)); + IVP_MULPAN16XR16(accSum12, vecInData22, vecInData21, IVP_EXTRN_2X32(hvecCoeffData11, 2)); + IVP_MULPAN16XR16(accSum22, vecInData22, vecInData21, IVP_EXTRN_2X32(hvecCoeffData21, 2)); + + /* right rotate the input vectors by 2 elements + * in order to multiply with next column of + * coeff in the next iteration + */ + IVP_DSELNX16(vecInData12, vecInData11, vecInData12, vecInData11, dvecSel); + IVP_DSELNX16(vecInData22, vecInData21, vecInData22, vecInData21, dvecSel); + + /* multiples loaded input data with first two coeff */ + IVP_MULPAN16XR16(accSum11, vecInData12, vecInData11, IVP_EXTRN_2X32(hvecCoeffData11, 3)); + IVP_MULPAN16XR16(accSum21, vecInData12, vecInData11, IVP_EXTRN_2X32(hvecCoeffData21, 3)); + IVP_MULPAN16XR16(accSum12, vecInData22, vecInData21, IVP_EXTRN_2X32(hvecCoeffData11, 3)); + IVP_MULPAN16XR16(accSum22, vecInData22, vecInData21, 
IVP_EXTRN_2X32(hvecCoeffData21, 3)); + + /* right rotate the input vectors by 2 + * in order to multiply with next column of + * coeff in the next iteration + */ + IVP_DSELNX16(vecInData12, vecInData11, vecInData12, vecInData11, dvecSel); + IVP_DSELNX16(vecInData22, vecInData21, vecInData22, vecInData21, dvecSel); + + /* multiples loaded input data with first two coeff */ + IVP_MULPAN16XR16(accSum11, vecInData12, vecInData11, IVP_EXTRN_2X32(hvecCoeffData11, 4)); + IVP_MULPAN16XR16(accSum21, vecInData12, vecInData11, IVP_EXTRN_2X32(hvecCoeffData21, 4)); + IVP_MULPAN16XR16(accSum12, vecInData22, vecInData21, IVP_EXTRN_2X32(hvecCoeffData11, 4)); + IVP_MULPAN16XR16(accSum22, vecInData22, vecInData21, IVP_EXTRN_2X32(hvecCoeffData21, 4)); + + /* right rotate the input vectors by 2 + * in order to multiply with next column of + * coeff in the next iteration + */ + IVP_DSELNX16(vecInData12, vecInData11, vecInData12, vecInData11, dvecSel); + IVP_DSELNX16(vecInData22, vecInData21, vecInData22, vecInData21, dvecSel); + + /* multiples loaded input data with first two coeff */ + IVP_MULPAN16XR16(accSum11, vecInData12, vecInData11, IVP_EXTRN_2X32(hvecCoeffData11, 5)); + IVP_MULPAN16XR16(accSum21, vecInData12, vecInData11, IVP_EXTRN_2X32(hvecCoeffData21, 5)); + IVP_MULPAN16XR16(accSum12, vecInData22, vecInData21, IVP_EXTRN_2X32(hvecCoeffData11, 5)); + IVP_MULPAN16XR16(accSum22, vecInData22, vecInData21, IVP_EXTRN_2X32(hvecCoeffData21, 5)); + } /* end of for (ky = 0; ky < kHeightU; ky++)*/ + } /* end of for (inCh = 0; inCh < numInCh; inCh++)*/ + + /* Pack, Output Scale, Output Shift and clamping */ + xb_vecNx16 vecOut1L, vecOut2L, vecOut3L, vecOut4L; +#if DILATED_VQ_CONV_S16 == VQ_TRUE + PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut1L, accSum11, packShiftAccU, \ + pOutScaleData[outCh], outShiftU, minLim, maxLim); + PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut2L, accSum12, packShiftAccU, \ + pOutScaleData[outCh], outShiftU, minLim, maxLim); + PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut3L, 
accSum21, packShiftAccU, \ + pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim); + PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut4L, accSum22, packShiftAccU, \ + pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim); +#elif DILATED_VQ_CONV_S16 == VQ_FALSE + PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut1L, accSum11, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim); + PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut2L, accSum12, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim); + PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut3L, accSum21, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim); + PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut4L, accSum22, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim); +#endif + /* variable store count */ + varLen = XT_MIN(outW - x, vectorizationWidth); + + /* Storing the first row , first depth output */ + pvecOut = (xb_vecNx16 *) (pOutput); + valign vaOutData = IVP_ZALIGN(); + IVP_SAVNX16_XP(vecOut1L, vaOutData, pvecOut, 2 * varLen); + IVP_SAPOSNX16_FP(vaOutData, pvecOut); + + /* Storing the first row , 2nd depth output */ + pvecOut = (xb_vecNx16 *) (pOutput + enable2ndCh * outDataPitch2); + IVP_SAVNX16_XP(vecOut3L, vaOutData, pvecOut, 2 * enable2ndCh * varLen); + IVP_SAPOSNX16_FP(vaOutData, pvecOut); + + /* Storing the 2nd row , 1st depth output */ + pvecOut = (xb_vecNx16 *) (pOutput + enable2ndRow * outDataPitch1); + IVP_SAVNX16_XP(vecOut2L, vaOutData, pvecOut, 2 * enable2ndRow * varLen); + IVP_SAPOSNX16_FP(vaOutData, pvecOut); + + /* Storing the 2nd row , 2nd depth output */ + pvecOut = (xb_vecNx16 *) (pOutput + (enable2ndCh * outDataPitch2 + \ + enable2ndRow * outDataPitch1)); + IVP_SAVNX16_XP(vecOut4L, vaOutData, pvecOut, 2 * \ + enable2ndRow * enable2ndCh * varLen); + IVP_SAPOSNX16_FP(vaOutData, pvecOut); + + pOutput += 2 * outDataPitch2; + pCoeff += 2 * coeffPitch3; + } /* end of (outCh = 0; outCh < numOutCh; outCh += 2)*/ + } /* end of for (y = 0; y < outH; y += 2)*/ + } /* end of for (x = 0; x < outW; x += 
vectorizationWidth)*/ + } + else if (kWidthU > 4) + { + /* loop across output channels is unrolled twice + * to produce two output channels in 1 iteration. + * Also loop across output height by 2 , thereby + * producing 4 output vectors simultaneously. + */ + for (x = 0; x < outW; x += vectorizationWidth) /* Loop across Output width */ + { + /* out of bound flag */ + int32_t flag = XT_SALT(XCHAL_IVPN_SIMD_WIDTH, inW - stride * x); + + for (y = 0; y < outH; y += 2) /* Loop across Output height */ + { + /* In order to handle odd output height */ + int32_t enable2ndRow = XT_SALT(y, outH - 1); + /* initialize output data pointer */ + int16_t *pOutput = &pOutData[(y * outDataPitch1 + x)]; + + /* initialize input data pointer */ + int16_t *pInput = &pInData[inDataPitch1 * stride * (y) + stride * (x)]; + + /* initialize coeff and bias data pointer*/ + int16_t *pCoeff = &pCoeffData[0]; + pdvecBias64 = (xb_vec2Nx8 *) pBiasData64; + valign vaBias = IVP_LA2NX8_PP(pdvecBias64); + + for (outCh = 0; outCh < numOutCh; outCh += 2) /* Loop across Output depth */ + { + /* handles odd output channel */ + int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1); + + /* wide vectors(accumulators) initialized with bias */ + xb_vecNx48 accSum11, accSum12, accSum21, accSum22; + ACC_INIT_BIAS64_MOW_ONEACC(pdvecBias64, vaBias, accSum11, 1); + ACC_INIT_BIAS64_MOW_ONEACC(pdvecBias64, vaBias, accSum21, enable2ndCh); + accSum12 = accSum11; accSum22 = accSum21; + + /* priming of coeff load is done outside the innermost loop*/ + phvecCoeff1 = (xb_vecN_2x32v *) (pCoeff); + valign vaCoeffData1; vaCoeffData1 = IVP_LAN_2X32_PP(phvecCoeff1); + + phvecCoeff2 = (xb_vecN_2x32v *) (pCoeff + coeffPitch3 * enable2ndCh); + valign vaCoeffData2; vaCoeffData2 = IVP_LAN_2X32_PP(phvecCoeff2); + + for (inCh = 0; inCh < numInCh; inCh++) /* Loop across input channels */ + { + /* variable declarations for input and coeff vectors */ + + xb_vecN_2x32v hvecCoeffData11; + xb_vecN_2x32v hvecCoeffData21; + + /* vecInData11 
refers to 1st input row, first 32(or lesser) elements + * and vecInData12 refers to next few left out elements of the same row + * required to compute one 32 way output vector(To compute one 32 way + * output vector, we require 32 + edge1 + edge2 number of input elements) + */ + xb_vecNx16 vecInData11, vecInData12; + xb_vecNx16 vecInData21, vecInData22; + + pvecIn1 = (xb_vecNx16 *) (pInput + inCh * inDataPitch2); + pvecIn2 = (xb_vecNx16 *) (pInput + inCh * inDataPitch2 + \ + stride * inDataPitch1 * enable2ndRow); + + for (ky = 0; ky < kHeightU; ky++) /* Loop across kernel height */ + { + /* loads 1st input row */ + valign vaInData = IVP_LANX16_PP(pvecIn1); + IVP_LANX16_XP(vecInData11, vaInData, pvecIn1, 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + IVP_LANX16_XP(vecInData12, vaInData, pvecIn1, 2 * (inDataPitch1 - XCHAL_IVPN_SIMD_WIDTH * flag)); + + /* loads Next(3rd) input row, corresponding to 2nd output row */ + vaInData = IVP_LANX16_PP(pvecIn2); + IVP_LANX16_XP(vecInData21, vaInData, pvecIn2, 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + IVP_LANX16_XP(vecInData22, vaInData, pvecIn2, 2 * (inDataPitch1 - XCHAL_IVPN_SIMD_WIDTH * flag)); + + /* Re-arrange the data in the desired format */ + /* Assume input as 1,2,3,4,5,6,7...64 */ + /* After re-arrangement using DSEL operation, updated vectors would be */ + /* vecInData1 : 1, 3, 5,...61 */ + /* vecInData2 : 2, 4, 6,...62 */ + + IVP_DSELNX16(vecInData12, vecInData11, vecInData12, vecInData11, IVP_SEQ2NX8()); + IVP_DSELNX16(vecInData22, vecInData21, vecInData22, vecInData21, IVP_SEQ2NX8()); + + /* load 1 row of coeff for 1st output channel */ + IVP_LAVN_2X32_XP(hvecCoeffData11, vaCoeffData1, phvecCoeff1, coeffPitch1 * 2); + + /* load 1 row of coeff for 2nd output channel */ + IVP_LAVN_2X32_XP(hvecCoeffData21, vaCoeffData2, phvecCoeff2, coeffPitch1 * 2); + + /* multiples loaded input data with first two coeff */ + IVP_MULPAN16XR16(accSum11, vecInData12, vecInData11, IVP_EXTRN_2X32(hvecCoeffData11, 0)); + IVP_MULPAN16XR16(accSum21, 
vecInData12, vecInData11, IVP_EXTRN_2X32(hvecCoeffData21, 0)); + IVP_MULPAN16XR16(accSum12, vecInData22, vecInData21, IVP_EXTRN_2X32(hvecCoeffData11, 0)); + IVP_MULPAN16XR16(accSum22, vecInData22, vecInData21, IVP_EXTRN_2X32(hvecCoeffData21, 0)); + + IVP_DSELNX16(vecInData12, vecInData11, vecInData12, vecInData11, dvecSel); + IVP_DSELNX16(vecInData22, vecInData21, vecInData22, vecInData21, dvecSel); + + /* multiples loaded input data with first two coeff */ + IVP_MULPAN16XR16(accSum11, vecInData12, vecInData11, IVP_EXTRN_2X32(hvecCoeffData11, 1)); + IVP_MULPAN16XR16(accSum21, vecInData12, vecInData11, IVP_EXTRN_2X32(hvecCoeffData21, 1)); + IVP_MULPAN16XR16(accSum12, vecInData22, vecInData21, IVP_EXTRN_2X32(hvecCoeffData11, 1)); + IVP_MULPAN16XR16(accSum22, vecInData22, vecInData21, IVP_EXTRN_2X32(hvecCoeffData21, 1)); + + /* right rotate the input vectors by 2 elements + * in order to multiply with next column of + * coeff in the next iteration + */ + IVP_DSELNX16(vecInData12, vecInData11, vecInData12, vecInData11, dvecSel); + IVP_DSELNX16(vecInData22, vecInData21, vecInData22, vecInData21, dvecSel); + + /* multiples loaded input data with first two coeff */ + IVP_MULPAN16XR16(accSum11, vecInData12, vecInData11, IVP_EXTRN_2X32(hvecCoeffData11, 2)); + IVP_MULPAN16XR16(accSum21, vecInData12, vecInData11, IVP_EXTRN_2X32(hvecCoeffData21, 2)); + IVP_MULPAN16XR16(accSum12, vecInData22, vecInData21, IVP_EXTRN_2X32(hvecCoeffData11, 2)); + IVP_MULPAN16XR16(accSum22, vecInData22, vecInData21, IVP_EXTRN_2X32(hvecCoeffData21, 2)); + + /* right rotate the input vectors by 2 elements + * in order to multiply with next column of + * coeff in the next iteration + */ + IVP_DSELNX16(vecInData12, vecInData11, vecInData12, vecInData11, dvecSel); + IVP_DSELNX16(vecInData22, vecInData21, vecInData22, vecInData21, dvecSel); + + /* multiples loaded input data with first two coeff */ + IVP_MULPAN16XR16(accSum11, vecInData12, vecInData11, IVP_EXTRN_2X32(hvecCoeffData11, 3)); + 
IVP_MULPAN16XR16(accSum21, vecInData12, vecInData11, IVP_EXTRN_2X32(hvecCoeffData21, 3)); + IVP_MULPAN16XR16(accSum12, vecInData22, vecInData21, IVP_EXTRN_2X32(hvecCoeffData11, 3)); + IVP_MULPAN16XR16(accSum22, vecInData22, vecInData21, IVP_EXTRN_2X32(hvecCoeffData21, 3)); + } /* end of for (ky = 0; ky < kHeightU; ky++)*/ + } /* end of for (inCh = 0; inCh < numInCh; inCh++)*/ + + /* Pack, Output Scale, Output Shift and clamping */ + xb_vecNx16 vecOut1L, vecOut2L, vecOut3L, vecOut4L; +#if DILATED_VQ_CONV_S16 == VQ_TRUE + PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut1L, accSum11, packShiftAccU, \ + pOutScaleData[outCh], outShiftU, minLim, maxLim); + PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut2L, accSum12, packShiftAccU, \ + pOutScaleData[outCh], outShiftU, minLim, maxLim); + PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut3L, accSum21, packShiftAccU, \ + pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim); + PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut4L, accSum22, packShiftAccU, \ + pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim); +#elif DILATED_VQ_CONV_S16 == VQ_FALSE + PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut1L, accSum11, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim); + PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut2L, accSum12, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim); + PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut3L, accSum21, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim); + PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut4L, accSum22, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim); +#endif + /* variable store count */ + varLen = XT_MIN(outW - x, vectorizationWidth); + + /* Storing the first row , first depth output */ + pvecOut = (xb_vecNx16 *) (pOutput); + valign vaOutData = IVP_ZALIGN(); + IVP_SAVNX16_XP(vecOut1L, vaOutData, pvecOut, 2 * varLen); + IVP_SAPOSNX16_FP(vaOutData, pvecOut); + + /* Storing the first row , 2nd depth output */ + pvecOut = (xb_vecNx16 *) (pOutput + enable2ndCh * outDataPitch2); + 
IVP_SAVNX16_XP(vecOut3L, vaOutData, pvecOut, 2 * enable2ndCh * varLen); + IVP_SAPOSNX16_FP(vaOutData, pvecOut); + + /* Storing the 2nd row , 1st depth output */ + pvecOut = (xb_vecNx16 *) (pOutput + enable2ndRow * outDataPitch1); + IVP_SAVNX16_XP(vecOut2L, vaOutData, pvecOut, 2 * enable2ndRow * varLen); + IVP_SAPOSNX16_FP(vaOutData, pvecOut); + + /* Storing the 2nd row , 2nd depth output */ + pvecOut = (xb_vecNx16 *) (pOutput + (enable2ndCh * outDataPitch2 + \ + enable2ndRow * outDataPitch1)); + IVP_SAVNX16_XP(vecOut4L, vaOutData, pvecOut, 2 * \ + enable2ndRow * enable2ndCh * varLen); + IVP_SAPOSNX16_FP(vaOutData, pvecOut); + + pOutput += 2 * outDataPitch2; + pCoeff += 2 * coeffPitch3; + } /* end of (outCh = 0; outCh < numOutCh; outCh += 2)*/ + } /* end of for (y = 0; y < outH; y += 2)*/ + } /* end of for (x = 0; x < outW; x += vectorizationWidth)*/ + } + else + { + /* loop across output channels is unrolled twice + * to produce two output channels in 1 iteration. + * Also loop across output height by 2 , thereby + * producing 4 output vectors simultaneously. 
+ */ + for (x = 0; x < outW; x += vectorizationWidth) /* Loop across Output width */ + { + /* out of bound flag */ + int32_t flag = XT_SALT(XCHAL_IVPN_SIMD_WIDTH, inW - stride * x); + + for (y = 0; y < outH; y += 2) /* Loop across Output height */ + { + /* In order to handle odd output height */ + int32_t enable2ndRow = XT_SALT(y, outH - 1); + /* initialize output data pointer */ + int16_t *pOutput = &pOutData[(y * outDataPitch1 + x)]; + + /* initialize input data pointer */ + int16_t *pInput = &pInData[inDataPitch1 * stride * (y) + stride * (x)]; + + /* initialize coeff and bias data pointer*/ + int16_t *pCoeff = &pCoeffData[0]; + pdvecBias64 = (xb_vec2Nx8 *) pBiasData64; + valign vaBias = IVP_LA2NX8_PP(pdvecBias64); + + for (outCh = 0; outCh < numOutCh; outCh += 2) /* Loop across Output depth */ + { + /* handles odd output channel */ + int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1); + + /* wide vectors(accumulators) initialized with bias */ + xb_vecNx48 accSum11, accSum12, accSum21, accSum22; + ACC_INIT_BIAS64_MOW_ONEACC(pdvecBias64, vaBias, accSum11, 1); + ACC_INIT_BIAS64_MOW_ONEACC(pdvecBias64, vaBias, accSum21, enable2ndCh); + accSum12 = accSum11; accSum22 = accSum21; + + /* priming of coeff load is done outside the innermost loop*/ + phvecCoeff1 = (xb_vecN_2x32v *) (pCoeff); + valign vaCoeffData1; vaCoeffData1 = IVP_LAN_2X32_PP(phvecCoeff1); + + phvecCoeff2 = (xb_vecN_2x32v *) (pCoeff + coeffPitch3 * enable2ndCh); + valign vaCoeffData2; vaCoeffData2 = IVP_LAN_2X32_PP(phvecCoeff2); + + for (inCh = 0; inCh < numInCh; inCh++) /* Loop across input channels */ + { + /* variable declarations for input and coeff vectors */ + xb_vecN_2x32v hvecCoeffData11; + xb_vecN_2x32v hvecCoeffData21; + + /* vecInData11 refers to 1st input row, first 32(or lesser) elements + * and vecInData12 refers to next few left out elements of the same row + * required to compute one 32 way output vector(To compute one 32 way + * output vector, we require 32 + edge1 + edge2 number of 
input elements) + */ + xb_vecNx16 vecInData11, vecInData12; + xb_vecNx16 vecInData21, vecInData22; + + pvecIn1 = (xb_vecNx16 *) (pInput + inCh * inDataPitch2); + pvecIn2 = (xb_vecNx16 *) (pInput + inCh * inDataPitch2 + \ + stride * inDataPitch1 * enable2ndRow); + + for (ky = 0; ky < kHeightU; ky++) /* Loop across kernel height */ + { + /* loads 1st input row */ + valign vaInData = IVP_LANX16_PP(pvecIn1); + IVP_LANX16_XP(vecInData11, vaInData, pvecIn1, 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + IVP_LANX16_XP(vecInData12, vaInData, pvecIn1, 2 * (inDataPitch1 - XCHAL_IVPN_SIMD_WIDTH * flag)); + + /* loads Next(3rd) input row, corresponding to 2nd output row */ + + vaInData = IVP_LANX16_PP(pvecIn2); + IVP_LANX16_XP(vecInData21, vaInData, pvecIn2, 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + IVP_LANX16_XP(vecInData22, vaInData, pvecIn2, 2 * (inDataPitch1 - XCHAL_IVPN_SIMD_WIDTH * flag)); + + /* Re-arrange the data in the desired format */ + /* Assume input as 1,2,3,4,5,6,7...64 */ + /* After re-arrangement using DSEL operation, updated vectors would be */ + /* vecInData1 : 1, 3, 5,...61 */ + /* vecInData2 : 2, 4, 6,...62 */ + + IVP_DSELNX16(vecInData12, vecInData11, vecInData12, vecInData11, IVP_SEQ2NX8()); + IVP_DSELNX16(vecInData22, vecInData21, vecInData22, vecInData21, IVP_SEQ2NX8()); + + /* load 1 row of coeff for 1st output channel */ + IVP_LAVN_2X32_XP(hvecCoeffData11, vaCoeffData1, phvecCoeff1, coeffPitch1 * 2); + + /* load 1 row of coeff for 2nd output channel */ + IVP_LAVN_2X32_XP(hvecCoeffData21, vaCoeffData2, phvecCoeff2, coeffPitch1 * 2); + + /* multiples loaded input data with first two coeff */ + IVP_MULPAN16XR16(accSum11, vecInData12, vecInData11, IVP_EXTRN_2X32(hvecCoeffData11, 0)); + IVP_MULPAN16XR16(accSum21, vecInData12, vecInData11, IVP_EXTRN_2X32(hvecCoeffData21, 0)); + IVP_MULPAN16XR16(accSum12, vecInData22, vecInData21, IVP_EXTRN_2X32(hvecCoeffData11, 0)); + IVP_MULPAN16XR16(accSum22, vecInData22, vecInData21, IVP_EXTRN_2X32(hvecCoeffData21, 0)); + + 
IVP_DSELNX16(vecInData12, vecInData11, vecInData12, vecInData11, dvecSel); + IVP_DSELNX16(vecInData22, vecInData21, vecInData22, vecInData21, dvecSel); + + /* multiples loaded input data with first two coeff */ + IVP_MULPAN16XR16(accSum11, vecInData12, vecInData11, IVP_EXTRN_2X32(hvecCoeffData11, 1)); + IVP_MULPAN16XR16(accSum21, vecInData12, vecInData11, IVP_EXTRN_2X32(hvecCoeffData21, 1)); + IVP_MULPAN16XR16(accSum12, vecInData22, vecInData21, IVP_EXTRN_2X32(hvecCoeffData11, 1)); + IVP_MULPAN16XR16(accSum22, vecInData22, vecInData21, IVP_EXTRN_2X32(hvecCoeffData21, 1)); + } /* for (ky = 0; ky < kHeightU; ky++)*/ + } /* end of for (inCh = 0; inCh < numInCh; inCh++)*/ + + /* Pack, Output Scale, Output Shift and clamping */ + xb_vecNx16 vecOut1L, vecOut2L, vecOut3L, vecOut4L; +#if DILATED_VQ_CONV_S16 == VQ_TRUE + PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut1L, accSum11, packShiftAccU, \ + pOutScaleData[outCh], outShiftU, minLim, maxLim); + PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut2L, accSum12, packShiftAccU, \ + pOutScaleData[outCh], outShiftU, minLim, maxLim); + PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut3L, accSum21, packShiftAccU, \ + pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim); + PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut4L, accSum22, packShiftAccU, \ + pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim); +#elif DILATED_VQ_CONV_S16 == VQ_FALSE + PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut1L, accSum11, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim); + PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut2L, accSum12, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim); + PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut3L, accSum21, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim); + PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut4L, accSum22, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim); +#endif + /* variable store count */ + varLen = XT_MIN(outW - x, vectorizationWidth); + + /* Storing the first row , first depth output */ + pvecOut = 
(xb_vecNx16 *) (pOutput); + valign vaOutData = IVP_ZALIGN(); + IVP_SAVNX16_XP(vecOut1L, vaOutData, pvecOut, 2 * varLen); + IVP_SAPOSNX16_FP(vaOutData, pvecOut); + + /* Storing the first row , 2nd depth output */ + pvecOut = (xb_vecNx16 *) (pOutput + enable2ndCh * outDataPitch2); + IVP_SAVNX16_XP(vecOut3L, vaOutData, pvecOut, 2 * enable2ndCh * varLen); + IVP_SAPOSNX16_FP(vaOutData, pvecOut); + + /* Storing the 2nd row , 1st depth output */ + pvecOut = (xb_vecNx16 *) (pOutput + enable2ndRow * outDataPitch1); + IVP_SAVNX16_XP(vecOut2L, vaOutData, pvecOut, 2 * enable2ndRow * varLen); + IVP_SAPOSNX16_FP(vaOutData, pvecOut); + + /* Storing the 2nd row , 2nd depth output */ + pvecOut = (xb_vecNx16 *) (pOutput + (enable2ndCh * outDataPitch2 + \ + enable2ndRow * outDataPitch1)); + IVP_SAVNX16_XP(vecOut4L, vaOutData, pvecOut, 2 * \ + enable2ndRow * enable2ndCh * varLen); + IVP_SAPOSNX16_FP(vaOutData, pvecOut); + + pOutput += 2 * outDataPitch2; + pCoeff += 2 * coeffPitch3; + } /* end of (outCh = 0; outCh < numOutCh; outCh += 2)*/ + } /* end of for (y = 0; y < outH; y += 2)*/ + } /* end of for (x = 0; x < outW; x += vectorizationWidth)*/ + } + return(XAI_ERROR_STATUS()); +} + +/****************************************************************************** + * xaiConvolved(VQ)3D_S_MxNj4d1_S16S16I16_MOW_WHD + * ****************************************************************************/ +/******************************************************************************/ +/* Description : P6 optimized generic implementation for MxN 3D convolution */ +/* with stride = 4. Code implementation is generated during */ +/* preprocessing stage. This method can be used to generate */ +/* MxN 3D dilated convolution function and MxN 3D VQ dilated */ +/* convolution function for S16 bit input data with input stride*/ +/* equal to 4. 
*/ +/* Inputs : Input Data Tile, Coeff Data Tile, Bias Array, */ +/* Output scale array, CNN convolution params structure */ +/* Outputs : XI Error Code */ +/* InOuts : Output Tile */ +/* Assumptions : CoeffData is S16 */ +/* biasArray is signed 64, value not exceeding signed 48b */ +/* Output scale array is U16 */ +/* OutData is S16 / U16 */ +/* Kernel Size is MxNxDxN */ +/* Input and Output are in WHD format */ +/* Coeff is in WHDN format */ +/******************************************************************************/ + +/****************** xaiConvolved3D_S_MxNj4d1_S16S16I16_MOW_WHD *********************/ +/****************** xaiConvolvedVQ3D_S_MxNj4d1_S16S16I16_MOW_WHD *******************/ +XAI_ERR_TYPE MAKE_NAME(MAKE_NAME_VQ(xaiConvolved, 3D_S_MxNj4d1), S16I16_MOW_WHD) MAKE_ARGUMENTS(inTile, coeffTile, biasArray, outTile, param) +{ + /* Error Checks */ + XAI_ERROR_CHECKS() + { + XAI_CHECK_TILE3D_S16(inTile); + XAI_CHECK_TILE3D_I16(outTile); + XAI_CHECK_TILE4D_S16(coeffTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile); + XAI_CHECK_TILE4D_IN_DRAM_BOUNDARY(coeffTile); + XAI_CHECK_POINTER(param); + XAI_CHECK_ARRAY_S64(biasArray); + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTile); + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(coeffTile, outTile); + XAI_CHECK_ERROR((XAI_TILE4D_GET_DIM1(coeffTile) <= 16) && \ + (XAI_TILE4D_GET_DIM2(coeffTile) <= 16), \ + XAI_ERR_KSIZE, "Kernel Width = %u and Kernel Height = %u\n \ + Kernel Width or Height should be less than or equal to 16", \ + XAI_TILE4D_GET_DIM1(coeffTile), XAI_TILE4D_GET_DIM2(coeffTile)); + XAI_CHECK_ERROR((XAI_CNN_CONV_GET_STRIDEX(param) == XAI_CNN_CONV_GET_STRIDEY(param)), \ + XAI_ERR_BADARG, "Stride along width = %u and Stride along height = %u\n \ + Stride along width should be equal to stride along height.", \ + XAI_CNN_CONV_GET_STRIDEX(param), XAI_CNN_CONV_GET_STRIDEY(param)); + XAI_CHECK_STRIDE(param, 4); + XAI_CHECK_DILATION(param, 1); + 
XAI_CHECK_ERROR(XAI_CNN_CONV_GET_DILATIONX(param) == XAI_CNN_CONV_GET_DILATIONY(param), \ + XAI_ERR_BADARG, "Dilation along width = %u and Dilation along height = %u\n \ + Dilation along width should be equal to dilation along height.", \ + XAI_CNN_CONV_GET_DILATIONX(param), XAI_CNN_CONV_GET_DILATIONY(param)); + XAI_CHECK_EDGES_MOW_WHD(inTile, coeffTile, param); + XAI_CHECK_TILE3D_DATA_ORDER(inTile, XAI_WHD); + XAI_CHECK_TILE3D_DATA_ORDER(outTile, XAI_WHD); + XAI_CHECK_TILE4D_DATA_ORDER(coeffTile, XAI_WHDN); + XAI_CHECK_CONSISTENCY_MOW_WHD(inTile, coeffTile, biasArray, outTile, param); + XAI_CHECK_COEFFTILE_CONTIGUOUS(coeffTile, param); + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_ACCUM_SHIFT(param) < 32, \ + XAI_ERR_NORM, "Accumulator shift value = %u\nThe accumulator shift value should be less than 32", \ + XAI_CNN_CONV_GET_ACCUM_SHIFT(param)); + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_OUTPUT_SHIFT(param) < 32, \ + XAI_ERR_NORM, "Output shift = %u\nThe output shift value should be less than 32", \ + XAI_CNN_CONV_GET_OUTPUT_SHIFT(param)); + XAI_CHECK_CONV_RELU_LIMITS_IX(param, outTile); +#if DILATED_VQ_CONV_S16 == VQ_TRUE + XAI_CHECK_ARRAY_U16(outputScaleArray); + XAI_CHECK_ERROR(XAI_ARRAY_GET_WIDTH(outputScaleArray) >= XAI_TILE4D_GET_DIM4(coeffTile), \ + XAI_ERR_DATASIZE, "Width of Output Scale Array = %u and Number of Kernels = %u\n \ + Width of Output Scale Array should be greater than or equal to number of kernels.", \ + XAI_ARRAY_GET_WIDTH(outputScaleArray), XAI_TILE4D_GET_DIM4(coeffTile)); + XAI_CHECK_ERROR((((uintptr_t) (XAI_ARRAY_GET_DATA_PTR(outputScaleArray)) & \ + 0x1) == 0), XAI_ERR_NORM, "The output scale array is not aligned to 2 byte boundary"); +#endif + } +#if DILATED_VQ_CONV_S16 == VQ_FALSE + if (XAI_CNN_CONV_GET_OUTPUT_SCALE(param) == 0) + { + int32_t fillValue; + int32_t reluFlag = XAI_CNN_CONV_GET_FLAG_RELU(param); + fillValue = reluFlag ? 
(CLAMP(0, XAI_CNN_CONV_GET_RELU_MIN(param), XAI_CNN_CONV_GET_RELU_MAX(param))) : 0; + return(xaiFillTile3D(outTile, fillValue, 0)); + } +#endif + + /* Getting parameters from the tile structures */ + const int32_t inW = XAI_TILE3D_GET_DIM1(inTile) + \ + XAI_TILE3D_GET_DIM1_EDGE1(inTile) + XAI_TILE3D_GET_DIM1_EDGE2(inTile); + const int32_t outW = XAI_TILE3D_GET_DIM1(outTile); + const int32_t outH = XAI_TILE3D_GET_DIM2(outTile); + const int32_t numInCh = XAI_TILE3D_GET_DIM3(inTile); + const int32_t numOutCh = XAI_TILE3D_GET_DIM3(outTile); + + /* Kernel Size (WHDN)*/ + const int32_t kWidthU = XAI_TILE4D_GET_DIM1(coeffTile); + const int32_t kHeightU = XAI_TILE4D_GET_DIM2(coeffTile); + + /* CNN convolution parameters */ + const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param); + const uint8_t outShiftU = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param); + const uint8_t enableReLu = XAI_CNN_CONV_GET_FLAG_RELU(param); + const uint8_t stride = XAI_CNN_CONV_GET_STRIDE(param); + const uint8_t leftEdgeFlag = XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param); + const uint8_t topEdgeFlag = XAI_CNN_CONV_GET_FLAG_TOPEDGE(param); + + /* Pitches of Coefficient Data (WHDN) */ + const int32_t coeffPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTile); + const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile); + + /* Pitches of Input Data (WHD) in dim1 and dim2 */ + const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile); + const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile); + + /* Pitch of Output Data (WHD) in dim1 and dim2 */ + const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile); + const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile); + + /* Data Pointers of input, output, coefficient and bias data */ + int16_t* pInData = (int16_t *) XAI_TILE3D_GET_DATA_PTR(inTile); + int16_t* pOutData = (int16_t *) XAI_TILE3D_GET_DATA_PTR(outTile); + int64_t* pBiasData64 = (int64_t *) XAI_ARRAY_GET_DATA_PTR(biasArray); + int16_t* pCoeffData = (int16_t *) 
XAI_TILE4D_GET_DATA_PTR(coeffTile); + +#if DILATED_VQ_CONV_S16 == VQ_TRUE + uint16_t* restrict pOutScaleData = (uint16_t *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray); +#elif DILATED_VQ_CONV_S16 == VQ_FALSE + const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param); +#endif + + int32_t leftEdge, topEdge; + int32_t minLim, maxLim; + + if ((kWidthU % 2) != 0) + { + leftEdge = kWidthU / 2; + } + else + { + leftEdge = leftEdgeFlag ? (kWidthU / 2) : ((kWidthU / 2) - 1); + } + + if ((kHeightU % 2) != 0) + { + topEdge = kHeightU / 2; + } + else + { + topEdge = topEdgeFlag ? (kHeightU / 2) : ((kHeightU / 2) - 1); + } + + + /* Move pointer to the start of the active data (including edge) */ + pInData = &pInData[-(topEdge * inDataPitch1 + leftEdge)]; + + /* Setting the limits for output data according to ReLu Flag and outTileType */ + if (enableReLu) + { + minLim = XAI_CNN_CONV_GET_RELU_MIN(param); + maxLim = XAI_CNN_CONV_GET_RELU_MAX(param); + } + else + { + minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MIN : 0; + maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX : USHRT_MAX; + } + + /* Variable Declarations */ + int32_t inCh, outCh, x, y, ky; + int32_t varLen; + + xb_vecNx16 * restrict pvecIn1; + xb_vecNx16 * restrict pvecIn2; + xb_vecNx16* restrict pvecOut; + xb_vecN_2x32v* restrict phvecCoeff1, *restrict phvecCoeff2; + xb_vec2Nx8 *restrict pdvecBias64; + + /* Number of output elements that can be generated + * with 2 input vector loads(32 way).*/ + const int32_t vectorizationWidth = (((2 * XCHAL_IVPN_SIMD_WIDTH) - kWidthU) / stride) + 1; + + if (kWidthU > 12) + { + /* loop across output channels is unrolled twice + * to produce two output channels in 1 iteration. + * Also loop across output height by 2 , thereby + * producing 4 output vectors simultaneously. 
+ */ + for (x = 0; x < outW; x += vectorizationWidth) /* Loop across Output width */ + { + /* out of bound flag */ + int32_t flag = XT_SALT(XCHAL_IVPN_SIMD_WIDTH, inW - stride * x); + + for (y = 0; y < outH; y += 2) /* Loop across Output height */ + { + /* In order to handle odd output height */ + int32_t enable2ndRow = XT_SALT(y, outH - 1); + /* initialize output data pointer */ + int16_t *pOutput = &pOutData[(y * outDataPitch1 + x)]; + + /* initialize input data pointer */ + int16_t *pInput = &pInData[inDataPitch1 * stride * (y) + stride * (x)]; + + /* initialize coeff and bias data pointer*/ + int16_t *pCoeff = &pCoeffData[0]; + pdvecBias64 = (xb_vec2Nx8 *) pBiasData64; + valign vaBias = IVP_LA2NX8_PP(pdvecBias64); + + for (outCh = 0; outCh < numOutCh; outCh += 2) /* Loop across Output depth */ + { + /* handles odd output channel */ + int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1); + + /* wide vectors(accumulators) initialized with bias */ + xb_vecNx48 accSum11, accSum21; + ACC_INIT_BIAS64_MOW_ONEACC(pdvecBias64, vaBias, accSum11, 1); + ACC_INIT_BIAS64_MOW_ONEACC(pdvecBias64, vaBias, accSum21, enable2ndCh); + + /* priming of coeff load is done outside the innermost loop*/ + phvecCoeff1 = (xb_vecN_2x32v *) (pCoeff); + valign vaCoeffData1; vaCoeffData1 = IVP_LAN_2X32_PP(phvecCoeff1); + + phvecCoeff2 = (xb_vecN_2x32v *) (pCoeff + coeffPitch3 * enable2ndCh); + valign vaCoeffData2; vaCoeffData2 = IVP_LAN_2X32_PP(phvecCoeff2); + + for (inCh = 0; inCh < numInCh; inCh++) /* Loop across input channels */ + { + /* variable declarations for input and coeff vectors */ + xb_vecN_2x32v hvecCoeffData11; + xb_vecN_2x32v hvecCoeffData21; + + /* vecInData11 refers to 1st input row, first 32(or lesser) elements + * and vecInData12 refers to next few left out elements of the same row + * required to compute one 32 way output vector(To compute one 32 way + * output vector, we require 32 + edge1 + edge2 number of input elements) + */ + xb_vecNx16 vecData1, vecData2, vecData3, 
vecData4; + xb_vecNx16 vecData5, vecData6, vecData7, vecData8; + xb_vecNx16 vecData9, vecData10, vecData11, vecData12; + xb_vecNx16 vecData13, vecData14, vecData15, vecData16; + xb_vecNx16 vecInData11, vecInData12; + xb_vecNx16 vecInData21, vecInData22; + + pvecIn1 = (xb_vecNx16 *) (pInput + inCh * inDataPitch2); + pvecIn2 = (xb_vecNx16 *) (pInput + inCh * inDataPitch2 + \ + stride * inDataPitch1 * enable2ndRow); + + for (ky = 0; ky < kHeightU; ky++) /* Loop across kernel height */ + { + /* loads 1st input row */ + valign vaInData = IVP_LANX16_PP(pvecIn1); + IVP_LANX16_XP(vecInData11, vaInData, pvecIn1, 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + IVP_LANX16_XP(vecInData12, vaInData, pvecIn1, 2 * (inDataPitch1 - XCHAL_IVPN_SIMD_WIDTH * flag)); + + /* loads Next(5th) input row, corresponding to 2nd output row */ + vaInData = IVP_LANX16_PP(pvecIn2); + IVP_LANX16_XP(vecInData21, vaInData, pvecIn2, 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + IVP_LANX16_XP(vecInData22, vaInData, pvecIn2, 2 * (inDataPitch1 - XCHAL_IVPN_SIMD_WIDTH * flag)); + + /* 32 elements from 1st row and 32 elements from 2nd row are concatenated here + * If 1st input row is 0,1,2,3,...63, and the 2nd input row is + * 64,65,66,67.........127, Data should be arranged as + * + * vecData1 : 0, 4, 8,...56,60, 64,68,72,...120,124 + * vecData2 : 1, 5, 9,...57,61, 65,69,73,...121,125 + * vecData3 : 2, 6,10,...58,62, 66,70,74,...122,126 + * vecData4 : 3, 7,11,...59,63, 67,71,75,...123,127 + * + * Lower half of the vectors contain data from 1st output row and + * upper half of the vectors contain data from 2nd output row. 
+ */ + + IVP_DSELNX16(vecData2, vecData1, + IVP_SELNX16I(vecInData22, vecInData21, IVP_SELI_16B_EXTRACT_2_OF_4_OFF_0), + IVP_SELNX16I(vecInData12, vecInData11, IVP_SELI_16B_EXTRACT_2_OF_4_OFF_0), + IVP_SEQ2NX8()); + IVP_DSELNX16(vecData4, vecData3, + IVP_SELNX16I(vecInData22, vecInData21, IVP_SELI_16B_EXTRACT_2_OF_4_OFF_2), + IVP_SELNX16I(vecInData12, vecInData11, IVP_SELI_16B_EXTRACT_2_OF_4_OFF_2), + IVP_SEQ2NX8()); + + /* load 1 row of coeff for 1st output channel */ + IVP_LAVN_2X32_XP(hvecCoeffData11, vaCoeffData1, phvecCoeff1, coeffPitch1 * 2); + + /* load 1 row of coeff for 2nd output channel */ + IVP_LAVN_2X32_XP(hvecCoeffData21, vaCoeffData2, phvecCoeff2, coeffPitch1 * 2); + + /* multiples loaded input data with first two coeff */ + IVP_MULPAN16XR16(accSum11, vecData2, vecData1, IVP_EXTRN_2X32(hvecCoeffData11, 0)); + IVP_MULPAN16XR16(accSum21, vecData2, vecData1, IVP_EXTRN_2X32(hvecCoeffData21, 0)); + /* multiples loaded input data with 2nd two coeff */ + IVP_MULPAN16XR16(accSum11, vecData4, vecData3, IVP_EXTRN_2X32(hvecCoeffData11, 1)); + IVP_MULPAN16XR16(accSum21, vecData4, vecData3, IVP_EXTRN_2X32(hvecCoeffData21, 1)); + + /* right rotate the input vectors by 2 elements + * in order to multiply with next column of + * coeff in the next iteration + */ + vecData5 = IVP_SELNX16I(0, vecData1, IVP_SELI_16B_ROTATE_RIGHT_1); + vecData6 = IVP_SELNX16I(0, vecData2, IVP_SELI_16B_ROTATE_RIGHT_1); + vecData7 = IVP_SELNX16I(0, vecData3, IVP_SELI_16B_ROTATE_RIGHT_1); + vecData8 = IVP_SELNX16I(0, vecData4, IVP_SELI_16B_ROTATE_RIGHT_1); + + /* multiples loaded input data with 3rd two coeff */ + IVP_MULPAN16XR16(accSum11, vecData6, vecData5, IVP_EXTRN_2X32(hvecCoeffData11, 2)); + IVP_MULPAN16XR16(accSum21, vecData6, vecData5, IVP_EXTRN_2X32(hvecCoeffData21, 2)); + /* multiples loaded input data with 4th two coeff */ + IVP_MULPAN16XR16(accSum11, vecData8, vecData7, IVP_EXTRN_2X32(hvecCoeffData11, 3)); + IVP_MULPAN16XR16(accSum21, vecData8, vecData7, 
IVP_EXTRN_2X32(hvecCoeffData21, 3)); + + /* right rotate the input vectors by 2 + * in order to multiply with next column of + * coeff in the next iteration + */ + vecData9 = IVP_SELNX16I(0, vecData5, IVP_SELI_16B_ROTATE_RIGHT_1); + vecData10 = IVP_SELNX16I(0, vecData6, IVP_SELI_16B_ROTATE_RIGHT_1); + vecData11 = IVP_SELNX16I(0, vecData7, IVP_SELI_16B_ROTATE_RIGHT_1); + vecData12 = IVP_SELNX16I(0, vecData8, IVP_SELI_16B_ROTATE_RIGHT_1); + + /* multiples loaded input data with 5th two coeff */ + IVP_MULPAN16XR16(accSum11, vecData10, vecData9, IVP_EXTRN_2X32(hvecCoeffData11, 4)); + IVP_MULPAN16XR16(accSum21, vecData10, vecData9, IVP_EXTRN_2X32(hvecCoeffData21, 4)); + /* multiples loaded input data with 6th two coeff */ + IVP_MULPAN16XR16(accSum11, vecData12, vecData11, IVP_EXTRN_2X32(hvecCoeffData11, 5)); + IVP_MULPAN16XR16(accSum21, vecData12, vecData11, IVP_EXTRN_2X32(hvecCoeffData21, 5)); + + /* right rotate the input vectors by 2 + * in order to multiply with next column of + * coeff in the next iteration + */ + vecData13 = IVP_SELNX16I(0, vecData9, IVP_SELI_16B_ROTATE_RIGHT_1); + vecData14 = IVP_SELNX16I(0, vecData10, IVP_SELI_16B_ROTATE_RIGHT_1); + vecData15 = IVP_SELNX16I(0, vecData11, IVP_SELI_16B_ROTATE_RIGHT_1); + vecData16 = IVP_SELNX16I(0, vecData12, IVP_SELI_16B_ROTATE_RIGHT_1); + + /* multiples loaded input data with first two coeff */ + IVP_MULPAN16XR16(accSum11, vecData14, vecData13, IVP_EXTRN_2X32(hvecCoeffData11, 6)); + IVP_MULPAN16XR16(accSum21, vecData14, vecData13, IVP_EXTRN_2X32(hvecCoeffData21, 6)); + /* multiples loaded input data with first two coeff */ + IVP_MULPAN16XR16(accSum11, vecData16, vecData15, IVP_EXTRN_2X32(hvecCoeffData11, 7)); + IVP_MULPAN16XR16(accSum21, vecData16, vecData15, IVP_EXTRN_2X32(hvecCoeffData21, 7)); + } /* end of for (ky = 0; ky < kHeightU; ky++)*/ + } /* end of for (inCh = 0; inCh < numInCh; inCh++)*/ + + /* Pack, Output Scale, Output Shift and clamping */ + xb_vecNx16 vecOut1Ch, vecOut1L, vecOut1H; + xb_vecNx16 
vecOut2Ch, vecOut2L, vecOut2H; +#if DILATED_VQ_CONV_S16 == VQ_TRUE + PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut1Ch, accSum11, packShiftAccU, \ + pOutScaleData[outCh], outShiftU, minLim, maxLim); + PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut2Ch, accSum21, packShiftAccU, \ + pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim); +#elif DILATED_VQ_CONV_S16 == VQ_FALSE + PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut1Ch, accSum11, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim); + PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut2Ch, accSum21, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim); +#endif + /* variable store count */ + varLen = XT_MIN(outW - x, vectorizationWidth); + + vecOut1L = IVP_SELNX16I(0, vecOut1Ch, IVP_SELI_16B_EXTRACT_LO_HALVES); + vecOut2L = IVP_SELNX16I(0, vecOut2Ch, IVP_SELI_16B_EXTRACT_LO_HALVES); + vecOut1H = IVP_SELNX16I(0, vecOut1Ch, IVP_SELI_16B_EXTRACT_HI_HALVES); + vecOut2H = IVP_SELNX16I(0, vecOut2Ch, IVP_SELI_16B_EXTRACT_HI_HALVES); + /* Storing the first row , first depth output */ + pvecOut = (xb_vecNx16 *) (pOutput); + valign vaOutData = IVP_ZALIGN(); + IVP_SAVNX16_XP(vecOut1L, vaOutData, pvecOut, 2 * varLen); + IVP_SAPOSNX16_FP(vaOutData, pvecOut); + + /* Storing the first row , 2nd depth output */ + pvecOut = (xb_vecNx16 *) (pOutput + enable2ndCh * outDataPitch2); + IVP_SAVNX16_XP(vecOut2L, vaOutData, pvecOut, 2 * enable2ndCh * varLen); + IVP_SAPOSNX16_FP(vaOutData, pvecOut); + + /* Storing the 2nd row , 1st depth output */ + pvecOut = (xb_vecNx16 *) (pOutput + enable2ndRow * outDataPitch1); + IVP_SAVNX16_XP(vecOut1H, vaOutData, pvecOut, 2 * enable2ndRow * varLen); + IVP_SAPOSNX16_FP(vaOutData, pvecOut); + + /* Storing the 2nd row , 2nd depth output */ + pvecOut = (xb_vecNx16 *) (pOutput + (enable2ndCh * outDataPitch2 + \ + enable2ndRow * outDataPitch1)); + IVP_SAVNX16_XP(vecOut2H, vaOutData, pvecOut, 2 * \ + enable2ndRow * enable2ndCh * varLen); + IVP_SAPOSNX16_FP(vaOutData, pvecOut); + + pOutput += 2 * outDataPitch2; + pCoeff 
+= 2 * coeffPitch3; + } /* end of (outCh = 0; outCh < numOutCh; outCh += 2)*/ + } /* end of for (y = 0; y < outH; y += 2)*/ + } /* end of for (x = 0; x < outW; x += vectorizationWidth)*/ + } + else if (kWidthU > 8) + { + /* loop across output channels is unrolled twice + * to produce two output channels in 1 iteration. + * Also loop across output height by 2 , thereby + * producing 4 output vectors simultaneously. + */ + for (x = 0; x < outW; x += vectorizationWidth) /* Loop across Output width */ + { + /* out of bound flag */ + int32_t flag = XT_SALT(XCHAL_IVPN_SIMD_WIDTH, inW - stride * x); + + for (y = 0; y < outH; y += 2) /* Loop across Output height */ + { + /* In order to handle odd output height */ + int32_t enable2ndRow = XT_SALT(y, outH - 1); + /* initialize output data pointer */ + int16_t *pOutput = &pOutData[(y * outDataPitch1 + x)]; + + /* initialize input data pointer */ + int16_t *pInput = &pInData[inDataPitch1 * stride * (y) + stride * (x)]; + + /* initialize coeff and bias data pointer*/ + int16_t *pCoeff = &pCoeffData[0]; + pdvecBias64 = (xb_vec2Nx8 *) pBiasData64; + valign vaBias = IVP_LA2NX8_PP(pdvecBias64); + + for (outCh = 0; outCh < numOutCh; outCh += 2) /* Loop across Output depth */ + { + /* handles odd output channel */ + int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1); + + /* wide vectors(accumulators) initialized with bias */ + xb_vecNx48 accSum11, accSum21; + ACC_INIT_BIAS64_MOW_ONEACC(pdvecBias64, vaBias, accSum11, 1); + ACC_INIT_BIAS64_MOW_ONEACC(pdvecBias64, vaBias, accSum21, enable2ndCh); + + /* priming of coeff load is done outside the innermost loop*/ + phvecCoeff1 = (xb_vecN_2x32v *) (pCoeff); + valign vaCoeffData1; vaCoeffData1 = IVP_LAN_2X32_PP(phvecCoeff1); + + phvecCoeff2 = (xb_vecN_2x32v *) (pCoeff + coeffPitch3 * enable2ndCh); + valign vaCoeffData2; vaCoeffData2 = IVP_LAN_2X32_PP(phvecCoeff2); + + for (inCh = 0; inCh < numInCh; inCh++) /* Loop across input channels */ + { + /* variable declarations for input and coeff 
vectors */ + xb_vecN_2x32v hvecCoeffData11; + xb_vecN_2x32v hvecCoeffData21; + + /* vecInData11 refers to 1st input row, first 32(or lesser) elements + * and vecInData12 refers to next few left out elements of the same row + * required to compute one 32 way output vector(To compute one 32 way + * output vector, we require 32 + edge1 + edge2 number of input elements) + */ + xb_vecNx16 vecData1, vecData2, vecData3, vecData4; + xb_vecNx16 vecData5, vecData6, vecData7, vecData8; + xb_vecNx16 vecData9, vecData10, vecData11, vecData12; + xb_vecNx16 vecInData11, vecInData12; + xb_vecNx16 vecInData21, vecInData22; + + pvecIn1 = (xb_vecNx16 *) (pInput + inCh * inDataPitch2); + pvecIn2 = (xb_vecNx16 *) (pInput + inCh * inDataPitch2 + \ + stride * inDataPitch1 * enable2ndRow); + + for (ky = 0; ky < kHeightU; ky++) /* Loop across kernel height */ + { + /* loads 1st input row */ + valign vaInData = IVP_LANX16_PP(pvecIn1); + IVP_LANX16_XP(vecInData11, vaInData, pvecIn1, 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + IVP_LANX16_XP(vecInData12, vaInData, pvecIn1, 2 * (inDataPitch1 - XCHAL_IVPN_SIMD_WIDTH * flag)); + + /* loads Next(5th) input row, corresponding to 2nd output row */ + vaInData = IVP_LANX16_PP(pvecIn2); + IVP_LANX16_XP(vecInData21, vaInData, pvecIn2, 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + IVP_LANX16_XP(vecInData22, vaInData, pvecIn2, 2 * (inDataPitch1 - XCHAL_IVPN_SIMD_WIDTH * flag)); + + /* 32 elements from 1st row and 32 elements from 2nd row are concatenated here + * If 1st input row is 0,1,2,3,...63, and the 2nd input row is + * 64,65,66,67.........127, Data should be arranged as + * + * vecData1 : 0, 4, 8,...56,60, 64,68,72,...120,124 + * vecData2 : 1, 5, 9,...57,61, 65,69,73,...121,125 + * vecData3 : 2, 6,10,...58,62, 66,70,74,...122,126 + * vecData4 : 3, 7,11,...59,63, 67,71,75,...123,127 + * + * Lower half of the vectors contain data from 1st output row and + * upper half of the vectors contain data from 2nd output row. 
+ */ + + IVP_DSELNX16(vecData2, vecData1, + IVP_SELNX16I(vecInData22, vecInData21, IVP_SELI_16B_EXTRACT_2_OF_4_OFF_0), + IVP_SELNX16I(vecInData12, vecInData11, IVP_SELI_16B_EXTRACT_2_OF_4_OFF_0), + IVP_SEQ2NX8()); + IVP_DSELNX16(vecData4, vecData3, + IVP_SELNX16I(vecInData22, vecInData21, IVP_SELI_16B_EXTRACT_2_OF_4_OFF_2), + IVP_SELNX16I(vecInData12, vecInData11, IVP_SELI_16B_EXTRACT_2_OF_4_OFF_2), + IVP_SEQ2NX8()); + + /* load 1 row of coeff for 1st output channel */ + IVP_LAVN_2X32_XP(hvecCoeffData11, vaCoeffData1, phvecCoeff1, coeffPitch1 * 2); + + /* load 1 row of coeff for 2nd output channel */ + IVP_LAVN_2X32_XP(hvecCoeffData21, vaCoeffData2, phvecCoeff2, coeffPitch1 * 2); + + /* multiples loaded input data with first two coeff */ + IVP_MULPAN16XR16(accSum11, vecData2, vecData1, IVP_EXTRN_2X32(hvecCoeffData11, 0)); + IVP_MULPAN16XR16(accSum21, vecData2, vecData1, IVP_EXTRN_2X32(hvecCoeffData21, 0)); + /* multiples loaded input data with 2nd two coeff */ + IVP_MULPAN16XR16(accSum11, vecData4, vecData3, IVP_EXTRN_2X32(hvecCoeffData11, 1)); + IVP_MULPAN16XR16(accSum21, vecData4, vecData3, IVP_EXTRN_2X32(hvecCoeffData21, 1)); + + /* right rotate the input vectors by 2 elements + * in order to multiply with next column of + * coeff in the next iteration + */ + vecData5 = IVP_SELNX16I(0, vecData1, IVP_SELI_16B_ROTATE_RIGHT_1); + vecData6 = IVP_SELNX16I(0, vecData2, IVP_SELI_16B_ROTATE_RIGHT_1); + vecData7 = IVP_SELNX16I(0, vecData3, IVP_SELI_16B_ROTATE_RIGHT_1); + vecData8 = IVP_SELNX16I(0, vecData4, IVP_SELI_16B_ROTATE_RIGHT_1); + + /* multiples loaded input data with 3rd two coeff */ + IVP_MULPAN16XR16(accSum11, vecData6, vecData5, IVP_EXTRN_2X32(hvecCoeffData11, 2)); + IVP_MULPAN16XR16(accSum21, vecData6, vecData5, IVP_EXTRN_2X32(hvecCoeffData21, 2)); + /* multiples loaded input data with 4th two coeff */ + IVP_MULPAN16XR16(accSum11, vecData8, vecData7, IVP_EXTRN_2X32(hvecCoeffData11, 3)); + IVP_MULPAN16XR16(accSum21, vecData8, vecData7, 
IVP_EXTRN_2X32(hvecCoeffData21, 3)); + + /* right rotate the input vectors by 2 + * in order to multiply with next column of + * coeff in the next iteration + */ + vecData9 = IVP_SELNX16I(0, vecData5, IVP_SELI_16B_ROTATE_RIGHT_1); + vecData10 = IVP_SELNX16I(0, vecData6, IVP_SELI_16B_ROTATE_RIGHT_1); + vecData11 = IVP_SELNX16I(0, vecData7, IVP_SELI_16B_ROTATE_RIGHT_1); + vecData12 = IVP_SELNX16I(0, vecData8, IVP_SELI_16B_ROTATE_RIGHT_1); + + /* multiples loaded input data with 5th two coeff */ + IVP_MULPAN16XR16(accSum11, vecData10, vecData9, IVP_EXTRN_2X32(hvecCoeffData11, 4)); + IVP_MULPAN16XR16(accSum21, vecData10, vecData9, IVP_EXTRN_2X32(hvecCoeffData21, 4)); + /* multiples loaded input data with 6th two coeff */ + IVP_MULPAN16XR16(accSum11, vecData12, vecData11, IVP_EXTRN_2X32(hvecCoeffData11, 5)); + IVP_MULPAN16XR16(accSum21, vecData12, vecData11, IVP_EXTRN_2X32(hvecCoeffData21, 5)); + } /* end of for (ky = 0; ky < kHeightU; ky++)*/ + } /* end of for (inCh = 0; inCh < numInCh; inCh++)*/ + + /* Pack, Output Scale, Output Shift and clamping */ + xb_vecNx16 vecOut1Ch, vecOut1L, vecOut1H; + xb_vecNx16 vecOut2Ch, vecOut2L, vecOut2H; +#if DILATED_VQ_CONV_S16 == VQ_TRUE + PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut1Ch, accSum11, packShiftAccU, \ + pOutScaleData[outCh], outShiftU, minLim, maxLim); + PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut2Ch, accSum21, packShiftAccU, \ + pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim); +#elif DILATED_VQ_CONV_S16 == VQ_FALSE + PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut1Ch, accSum11, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim); + PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut2Ch, accSum21, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim); +#endif + /* variable store count */ + varLen = XT_MIN(outW - x, vectorizationWidth); + + vecOut1L = IVP_SELNX16I(0, vecOut1Ch, IVP_SELI_16B_EXTRACT_LO_HALVES); + vecOut2L = IVP_SELNX16I(0, vecOut2Ch, IVP_SELI_16B_EXTRACT_LO_HALVES); + vecOut1H = IVP_SELNX16I(0, vecOut1Ch, 
IVP_SELI_16B_EXTRACT_HI_HALVES); + vecOut2H = IVP_SELNX16I(0, vecOut2Ch, IVP_SELI_16B_EXTRACT_HI_HALVES); + /* Storing the first row , first depth output */ + pvecOut = (xb_vecNx16 *) (pOutput); + valign vaOutData = IVP_ZALIGN(); + IVP_SAVNX16_XP(vecOut1L, vaOutData, pvecOut, 2 * varLen); + IVP_SAPOSNX16_FP(vaOutData, pvecOut); + + /* Storing the first row , 2nd depth output */ + pvecOut = (xb_vecNx16 *) (pOutput + enable2ndCh * outDataPitch2); + IVP_SAVNX16_XP(vecOut2L, vaOutData, pvecOut, 2 * enable2ndCh * varLen); + IVP_SAPOSNX16_FP(vaOutData, pvecOut); + + /* Storing the 2nd row , 1st depth output */ + pvecOut = (xb_vecNx16 *) (pOutput + enable2ndRow * outDataPitch1); + IVP_SAVNX16_XP(vecOut1H, vaOutData, pvecOut, 2 * enable2ndRow * varLen); + IVP_SAPOSNX16_FP(vaOutData, pvecOut); + + /* Storing the 2nd row , 2nd depth output */ + pvecOut = (xb_vecNx16 *) (pOutput + (enable2ndCh * outDataPitch2 + \ + enable2ndRow * outDataPitch1)); + IVP_SAVNX16_XP(vecOut2H, vaOutData, pvecOut, 2 * \ + enable2ndRow * enable2ndCh * varLen); + IVP_SAPOSNX16_FP(vaOutData, pvecOut); + + pOutput += 2 * outDataPitch2; + pCoeff += 2 * coeffPitch3; + } /* end of (outCh = 0; outCh < numOutCh; outCh += 2)*/ + } /* end of for (y = 0; y < outH; y += 2)*/ + } /* end of for (x = 0; x < outW; x += vectorizationWidth)*/ + } + else if (kWidthU > 4) + { + /* loop across output channels is unrolled twice + * to produce two output channels in 1 iteration. + * Also loop across output height by 2 , thereby + * producing 4 output vectors simultaneously. 
+ */ + for (x = 0; x < outW; x += vectorizationWidth) /* Loop across Output width */ + { + /* out of bound flag */ + int32_t flag = XT_SALT(XCHAL_IVPN_SIMD_WIDTH, inW - stride * x); + + for (y = 0; y < outH; y += 2) /* Loop across Output height */ + { + /* In order to handle odd output height */ + int32_t enable2ndRow = XT_SALT(y, outH - 1); + /* initialize output data pointer */ + int16_t *pOutput = &pOutData[(y * outDataPitch1 + x)]; + + /* initialize input data pointer */ + int16_t *pInput = &pInData[inDataPitch1 * stride * (y) + stride * (x)]; + + /* initialize coeff and bias data pointer*/ + int16_t *pCoeff = &pCoeffData[0]; + pdvecBias64 = (xb_vec2Nx8 *) pBiasData64; + valign vaBias = IVP_LA2NX8_PP(pdvecBias64); + + for (outCh = 0; outCh < numOutCh; outCh += 2) /* Loop across Output depth */ + { + /* handles odd output channel */ + int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1); + + /* wide vectors(accumulators) initialized with bias */ + xb_vecNx48 accSum11, accSum21; + ACC_INIT_BIAS64_MOW_ONEACC(pdvecBias64, vaBias, accSum11, 1); + ACC_INIT_BIAS64_MOW_ONEACC(pdvecBias64, vaBias, accSum21, enable2ndCh); + + /* priming of coeff load is done outside the innermost loop*/ + phvecCoeff1 = (xb_vecN_2x32v *) (pCoeff); + valign vaCoeffData1; vaCoeffData1 = IVP_LAN_2X32_PP(phvecCoeff1); + + phvecCoeff2 = (xb_vecN_2x32v *) (pCoeff + coeffPitch3 * enable2ndCh); + valign vaCoeffData2; vaCoeffData2 = IVP_LAN_2X32_PP(phvecCoeff2); + + for (inCh = 0; inCh < numInCh; inCh++) /* Loop across input channels */ + { + /* variable declarations for input and coeff vectors */ + xb_vecN_2x32v hvecCoeffData11; + xb_vecN_2x32v hvecCoeffData21; + + /* vecInData11 refers to 1st input row, first 32(or lesser) elements + * and vecInData12 refers to next few left out elements of the same row + * required to compute one 32 way output vector(To compute one 32 way + * output vector, we require 32 + edge1 + edge2 number of input elements) + */ + xb_vecNx16 vecData1, vecData2, vecData3, 
vecData4; + xb_vecNx16 vecData5, vecData6, vecData7, vecData8; + xb_vecNx16 vecInData11, vecInData12; + xb_vecNx16 vecInData21, vecInData22; + + pvecIn1 = (xb_vecNx16 *) (pInput + inCh * inDataPitch2); + pvecIn2 = (xb_vecNx16 *) (pInput + inCh * inDataPitch2 + \ + stride * inDataPitch1 * enable2ndRow); + + for (ky = 0; ky < kHeightU; ky++) /* Loop across kernel height */ + { + /* loads 1st input row */ + valign vaInData = IVP_LANX16_PP(pvecIn1); + IVP_LANX16_XP(vecInData11, vaInData, pvecIn1, 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + IVP_LANX16_XP(vecInData12, vaInData, pvecIn1, 2 * (inDataPitch1 - XCHAL_IVPN_SIMD_WIDTH * flag)); + + /* loads Next(5th) input row, corresponding to 2nd output row */ + + vaInData = IVP_LANX16_PP(pvecIn2); + IVP_LANX16_XP(vecInData21, vaInData, pvecIn2, 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + IVP_LANX16_XP(vecInData22, vaInData, pvecIn2, 2 * (inDataPitch1 - XCHAL_IVPN_SIMD_WIDTH * flag)); + + /* 32 elements from 1st row and 32 elements from 2nd row are concatenated here + * If 1st input row is 0,1,2,3,...63, and the 2nd input row is + * 64,65,66,67.........127, Data should be arranged as + * + * vecData1 : 0, 4, 8,...56,60, 64,68,72,...120,124 + * vecData2 : 1, 5, 9,...57,61, 65,69,73,...121,125 + * vecData3 : 2, 6,10,...58,62, 66,70,74,...122,126 + * vecData4 : 3, 7,11,...59,63, 67,71,75,...123,127 + * + * Lower half of the vectors contain data from 1st output row and + * upper half of the vectors contain data from 2nd output row. 
+ */ + + IVP_DSELNX16(vecData2, vecData1, + IVP_SELNX16I(vecInData22, vecInData21, IVP_SELI_16B_EXTRACT_2_OF_4_OFF_0), + IVP_SELNX16I(vecInData12, vecInData11, IVP_SELI_16B_EXTRACT_2_OF_4_OFF_0), + IVP_SEQ2NX8()); + IVP_DSELNX16(vecData4, vecData3, + IVP_SELNX16I(vecInData22, vecInData21, IVP_SELI_16B_EXTRACT_2_OF_4_OFF_2), + IVP_SELNX16I(vecInData12, vecInData11, IVP_SELI_16B_EXTRACT_2_OF_4_OFF_2), + IVP_SEQ2NX8()); + /* load 1 row of coeff for 1st output channel */ + IVP_LAVN_2X32_XP(hvecCoeffData11, vaCoeffData1, phvecCoeff1, coeffPitch1 * 2); + + /* load 1 row of coeff for 2nd output channel */ + IVP_LAVN_2X32_XP(hvecCoeffData21, vaCoeffData2, phvecCoeff2, coeffPitch1 * 2); + + /* multiples loaded input data with first two coeff */ + IVP_MULPAN16XR16(accSum11, vecData2, vecData1, IVP_EXTRN_2X32(hvecCoeffData11, 0)); + IVP_MULPAN16XR16(accSum21, vecData2, vecData1, IVP_EXTRN_2X32(hvecCoeffData21, 0)); + /* multiples loaded input data with 2nd two coeff */ + IVP_MULPAN16XR16(accSum11, vecData4, vecData3, IVP_EXTRN_2X32(hvecCoeffData11, 1)); + IVP_MULPAN16XR16(accSum21, vecData4, vecData3, IVP_EXTRN_2X32(hvecCoeffData21, 1)); + + /* right rotate the input vectors by 2 elements + * in order to multiply with next column of + * coeff in the next iteration + */ + vecData5 = IVP_SELNX16I(0, vecData1, IVP_SELI_16B_ROTATE_RIGHT_1); + vecData6 = IVP_SELNX16I(0, vecData2, IVP_SELI_16B_ROTATE_RIGHT_1); + vecData7 = IVP_SELNX16I(0, vecData3, IVP_SELI_16B_ROTATE_RIGHT_1); + vecData8 = IVP_SELNX16I(0, vecData4, IVP_SELI_16B_ROTATE_RIGHT_1); + + /* multiples loaded input data with 3rd two coeff */ + IVP_MULPAN16XR16(accSum11, vecData6, vecData5, IVP_EXTRN_2X32(hvecCoeffData11, 2)); + IVP_MULPAN16XR16(accSum21, vecData6, vecData5, IVP_EXTRN_2X32(hvecCoeffData21, 2)); + /* multiples loaded input data with 4th two coeff */ + IVP_MULPAN16XR16(accSum11, vecData8, vecData7, IVP_EXTRN_2X32(hvecCoeffData11, 3)); + IVP_MULPAN16XR16(accSum21, vecData8, vecData7, 
IVP_EXTRN_2X32(hvecCoeffData21, 3)); + } /* end of for (ky = 0; ky < kHeightU; ky++)*/ + } /* end of for (inCh = 0; inCh < numInCh; inCh++)*/ + + /* Pack, Output Scale, Output Shift and clamping */ + xb_vecNx16 vecOut1Ch, vecOut1L, vecOut1H; + xb_vecNx16 vecOut2Ch, vecOut2L, vecOut2H; +#if DILATED_VQ_CONV_S16 == VQ_TRUE + PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut1Ch, accSum11, packShiftAccU, \ + pOutScaleData[outCh], outShiftU, minLim, maxLim); + PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut2Ch, accSum21, packShiftAccU, \ + pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim); +#elif DILATED_VQ_CONV_S16 == VQ_FALSE + PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut1Ch, accSum11, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim); + PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut2Ch, accSum21, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim); +#endif + /* variable store count */ + varLen = XT_MIN(outW - x, vectorizationWidth); + + vecOut1L = IVP_SELNX16I(0, vecOut1Ch, IVP_SELI_16B_EXTRACT_LO_HALVES); + vecOut2L = IVP_SELNX16I(0, vecOut2Ch, IVP_SELI_16B_EXTRACT_LO_HALVES); + vecOut1H = IVP_SELNX16I(0, vecOut1Ch, IVP_SELI_16B_EXTRACT_HI_HALVES); + vecOut2H = IVP_SELNX16I(0, vecOut2Ch, IVP_SELI_16B_EXTRACT_HI_HALVES); + /* Storing the first row , first depth output */ + pvecOut = (xb_vecNx16 *) (pOutput); + valign vaOutData = IVP_ZALIGN(); + IVP_SAVNX16_XP(vecOut1L, vaOutData, pvecOut, 2 * varLen); + IVP_SAPOSNX16_FP(vaOutData, pvecOut); + + /* Storing the first row , 2nd depth output */ + pvecOut = (xb_vecNx16 *) (pOutput + enable2ndCh * outDataPitch2); + IVP_SAVNX16_XP(vecOut2L, vaOutData, pvecOut, 2 * enable2ndCh * varLen); + IVP_SAPOSNX16_FP(vaOutData, pvecOut); + + /* Storing the 2nd row , 1st depth output */ + pvecOut = (xb_vecNx16 *) (pOutput + enable2ndRow * outDataPitch1); + IVP_SAVNX16_XP(vecOut1H, vaOutData, pvecOut, 2 * enable2ndRow * varLen); + IVP_SAPOSNX16_FP(vaOutData, pvecOut); + + /* Storing the 2nd row , 2nd depth output */ + pvecOut = (xb_vecNx16 
*) (pOutput + (enable2ndCh * outDataPitch2 + \ + enable2ndRow * outDataPitch1)); + IVP_SAVNX16_XP(vecOut2H, vaOutData, pvecOut, 2 * \ + enable2ndRow * enable2ndCh * varLen); + IVP_SAPOSNX16_FP(vaOutData, pvecOut); + + pOutput += 2 * outDataPitch2; + pCoeff += 2 * coeffPitch3; + } /* end of (outCh = 0; outCh < numOutCh; outCh += 2)*/ + } /* end of for (y = 0; y < outH; y += 2)*/ + } /* end of for (x = 0; x < outW; x += vectorizationWidth)*/ + } + else + { + /* loop across output channels is unrolled twice + * to produce two output channels in 1 iteration. + * Also loop across output height by 2 , thereby + * producing 4 output vectors simultaneously. + */ + for (x = 0; x < outW; x += vectorizationWidth) /* Loop across Output width */ + { + /* out of bound flag */ + int32_t flag = XT_SALT(XCHAL_IVPN_SIMD_WIDTH, inW - stride * x); + + for (y = 0; y < outH; y += 2) /* Loop across Output height */ + { + /* In order to handle odd output height */ + int32_t enable2ndRow = XT_SALT(y, outH - 1); + /* initialize output data pointer */ + int16_t *pOutput = &pOutData[(y * outDataPitch1 + x)]; + + /* initialize input data pointer */ + int16_t *pInput = &pInData[inDataPitch1 * stride * (y) + stride * (x)]; + + /* initialize coeff and bias data pointer*/ + int16_t *pCoeff = &pCoeffData[0]; + pdvecBias64 = (xb_vec2Nx8 *) pBiasData64; + valign vaBias = IVP_LA2NX8_PP(pdvecBias64); + + for (outCh = 0; outCh < numOutCh; outCh += 2) /* Loop across Output depth */ + { + /* handles odd output channel */ + int32_t enable2ndCh = XT_SALT(outCh, numOutCh - 1); + + /* wide vectors(accumulators) initialized with bias */ + xb_vecNx48 accSum11, accSum21; + ACC_INIT_BIAS64_MOW_ONEACC(pdvecBias64, vaBias, accSum11, 1); + ACC_INIT_BIAS64_MOW_ONEACC(pdvecBias64, vaBias, accSum21, enable2ndCh); + + /* priming of coeff load is done outside the innermost loop*/ + phvecCoeff1 = (xb_vecN_2x32v *) (pCoeff); + valign vaCoeffData1; vaCoeffData1 = IVP_LAN_2X32_PP(phvecCoeff1); + + phvecCoeff2 = 
(xb_vecN_2x32v *) (pCoeff + coeffPitch3 * enable2ndCh); + valign vaCoeffData2; vaCoeffData2 = IVP_LAN_2X32_PP(phvecCoeff2); + + for (inCh = 0; inCh < numInCh; inCh++) /* Loop across input channels */ + { + /* variable declarations for input and coeff vectors */ + xb_vecN_2x32v hvecCoeffData11; + xb_vecN_2x32v hvecCoeffData21; + + /* vecInData11 refers to 1st input row, first 32(or lesser) elements + * and vecInData12 refers to next few left out elements of the same row + * required to compute one 32 way output vector(To compute one 32 way + * output vector, we require 32 + edge1 + edge2 number of input elements) + */ + xb_vecNx16 vecData1, vecData2, vecData3, vecData4; + xb_vecNx16 vecInData11, vecInData12; + xb_vecNx16 vecInData21, vecInData22; + + pvecIn1 = (xb_vecNx16 *) (pInput + inCh * inDataPitch2); + pvecIn2 = (xb_vecNx16 *) (pInput + inCh * inDataPitch2 + \ + stride * inDataPitch1 * enable2ndRow); + + for (ky = 0; ky < kHeightU; ky++) /* Loop across kernel height */ + { + /* loads 1st input row */ + valign vaInData = IVP_LANX16_PP(pvecIn1); + IVP_LANX16_XP(vecInData11, vaInData, pvecIn1, 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + IVP_LANX16_XP(vecInData12, vaInData, pvecIn1, 2 * (inDataPitch1 - XCHAL_IVPN_SIMD_WIDTH * flag)); + + /* loads Next(5th) input row, corresponding to 2nd output row */ + + vaInData = IVP_LANX16_PP(pvecIn2); + IVP_LANX16_XP(vecInData21, vaInData, pvecIn2, 2 * XCHAL_IVPN_SIMD_WIDTH * flag); + IVP_LANX16_XP(vecInData22, vaInData, pvecIn2, 2 * (inDataPitch1 - XCHAL_IVPN_SIMD_WIDTH * flag)); + + /* 32 elements from 1st row and 32 elements from 2nd row are concatenated here + * If 1st input row is 0,1,2,3,...63, and the 2nd input row is + * 64,65,66,67.........127, Data should be arranged as + * + * vecData1 : 0, 4, 8,...56,60, 64,68,72,...120,124 + * vecData2 : 1, 5, 9,...57,61, 65,69,73,...121,125 + * vecData3 : 2, 6,10,...58,62, 66,70,74,...122,126 + * vecData4 : 3, 7,11,...59,63, 67,71,75,...123,127 + * + * Lower half of the vectors contain 
data from 1st output row and + * upper half of the vectors contain data from 2nd output row. + */ + + IVP_DSELNX16(vecData2, vecData1, + IVP_SELNX16I(vecInData22, vecInData21, IVP_SELI_16B_EXTRACT_2_OF_4_OFF_0), + IVP_SELNX16I(vecInData12, vecInData11, IVP_SELI_16B_EXTRACT_2_OF_4_OFF_0), + IVP_SEQ2NX8()); + IVP_DSELNX16(vecData4, vecData3, + IVP_SELNX16I(vecInData22, vecInData21, IVP_SELI_16B_EXTRACT_2_OF_4_OFF_2), + IVP_SELNX16I(vecInData12, vecInData11, IVP_SELI_16B_EXTRACT_2_OF_4_OFF_2), + IVP_SEQ2NX8()); + /* load 1 row of coeff for 1st output channel */ + IVP_LAVN_2X32_XP(hvecCoeffData11, vaCoeffData1, phvecCoeff1, coeffPitch1 * 2); + + /* load 1 row of coeff for 2nd output channel */ + IVP_LAVN_2X32_XP(hvecCoeffData21, vaCoeffData2, phvecCoeff2, coeffPitch1 * 2); + + /* multiples loaded input data with first two coeff */ + IVP_MULPAN16XR16(accSum11, vecData2, vecData1, IVP_EXTRN_2X32(hvecCoeffData11, 0)); + IVP_MULPAN16XR16(accSum21, vecData2, vecData1, IVP_EXTRN_2X32(hvecCoeffData21, 0)); + /* multiples loaded input data with first two coeff */ + IVP_MULPAN16XR16(accSum11, vecData4, vecData3, IVP_EXTRN_2X32(hvecCoeffData11, 1)); + IVP_MULPAN16XR16(accSum21, vecData4, vecData3, IVP_EXTRN_2X32(hvecCoeffData21, 1)); + } /* for (ky = 0; ky < kHeightU; ky++)*/ + } /* end of for (inCh = 0; inCh < numInCh; inCh++)*/ + + /* Pack, Output Scale, Output Shift and clamping */ + xb_vecNx16 vecOut1Ch, vecOut1L, vecOut1H; + xb_vecNx16 vecOut2Ch, vecOut2L, vecOut2H; +#if DILATED_VQ_CONV_S16 == VQ_TRUE + PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut1Ch, accSum11, packShiftAccU, \ + pOutScaleData[outCh], outShiftU, minLim, maxLim); + PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut2Ch, accSum21, packShiftAccU, \ + pOutScaleData[outCh + enable2ndCh], outShiftU, minLim, maxLim); +#elif DILATED_VQ_CONV_S16 == VQ_FALSE + PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut1Ch, accSum11, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim); + PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut2Ch, accSum21, 
packShiftAccU, \ + outScale, outShiftU, minLim, maxLim); +#endif + /* variable store count */ + varLen = XT_MIN(outW - x, vectorizationWidth); + + vecOut1L = IVP_SELNX16I(0, vecOut1Ch, IVP_SELI_16B_EXTRACT_LO_HALVES); + vecOut2L = IVP_SELNX16I(0, vecOut2Ch, IVP_SELI_16B_EXTRACT_LO_HALVES); + vecOut1H = IVP_SELNX16I(0, vecOut1Ch, IVP_SELI_16B_EXTRACT_HI_HALVES); + vecOut2H = IVP_SELNX16I(0, vecOut2Ch, IVP_SELI_16B_EXTRACT_HI_HALVES); + /* Storing the first row , first depth output */ + pvecOut = (xb_vecNx16 *) (pOutput); + valign vaOutData = IVP_ZALIGN(); + IVP_SAVNX16_XP(vecOut1L, vaOutData, pvecOut, 2 * varLen); + IVP_SAPOSNX16_FP(vaOutData, pvecOut); + + /* Storing the first row , 2nd depth output */ + pvecOut = (xb_vecNx16 *) (pOutput + enable2ndCh * outDataPitch2); + IVP_SAVNX16_XP(vecOut2L, vaOutData, pvecOut, 2 * enable2ndCh * varLen); + IVP_SAPOSNX16_FP(vaOutData, pvecOut); + + /* Storing the 2nd row , 1st depth output */ + pvecOut = (xb_vecNx16 *) (pOutput + enable2ndRow * outDataPitch1); + IVP_SAVNX16_XP(vecOut1H, vaOutData, pvecOut, 2 * enable2ndRow * varLen); + IVP_SAPOSNX16_FP(vaOutData, pvecOut); + + /* Storing the 2nd row , 2nd depth output */ + pvecOut = (xb_vecNx16 *) (pOutput + (enable2ndCh * outDataPitch2 + \ + enable2ndRow * outDataPitch1)); + IVP_SAVNX16_XP(vecOut2H, vaOutData, pvecOut, 2 * \ + enable2ndRow * enable2ndCh * varLen); + IVP_SAPOSNX16_FP(vaOutData, pvecOut); + + pOutput += 2 * outDataPitch2; + pCoeff += 2 * coeffPitch3; + } /* end of (outCh = 0; outCh < numOutCh; outCh += 2)*/ + } /* end of for (y = 0; y < outH; y += 2)*/ + } /* end of for (x = 0; x < outW; x += vectorizationWidth)*/ + } + return(XAI_ERROR_STATUS()); +} +#endif /*if ((XCHAL_VISION_TYPE >= 6))*/ diff --git a/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_SO.c b/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_SO.c new file mode 100644 index 00000000000..9915e6649d6 --- /dev/null +++ 
b/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_SO.c @@ -0,0 +1,27 @@ +/* + * Copyright (c) 2018 by Cadence Design Systems, Inc. ALL RIGHTS RESERVED. + * These coded instructions, statements, and computer programs are the + * copyrighted works and confidential proprietary information of + * Cadence Design Systems Inc. They may be adapted and modified by bona fide + * purchasers for internal use, but neither the original nor any adapted + * or modified version may be disclosed or distributed to third parties + * in any manner, medium, or form, in whole or in part, without the prior + * written consent of Cadence Design Systems Inc. This software and its + * derivatives are to be executed solely on products incorporating a Cadence + * Design Systems processor. + */ + +#include "xai_cnn.h" +#include "xai_intrin.h" + +#if ((XCHAL_VISION_TYPE >= 6)) /* this translation unit contains no code of its own: it only includes cnn_dilated_conv_SO.h, once per input data type */ + +#undef DILATED_SO_VQ_CONV /* leave the VQ switch undefined so the header's #ifdef DILATED_SO_VQ_CONV sections take their #else (non-VQ) path */ + +#define INPUT_DATA_TYPE UNSIGNED8BIT /* first inclusion: the header binds MAKE_NAME and MORPH_* to their U8 definitions */ +#include "cnn_dilated_conv_SO.h" +#undef INPUT_DATA_TYPE + +#define INPUT_DATA_TYPE SIGNED8BIT /* second inclusion: the header #undefs the U8 bindings and rebinds MAKE_NAME and MORPH_* to their S8 definitions */ +#include "cnn_dilated_conv_SO.h" +#endif //if ((XCHAL_VISION_TYPE >= 6)) diff --git a/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_SO.h b/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_SO.h new file mode 100644 index 00000000000..cc937dc2171 --- /dev/null +++ b/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_SO.h @@ -0,0 +1,1027 @@ +/* + * Copyright (c) 2018 by Cadence Design Systems, Inc. ALL RIGHTS RESERVED. + * These coded instructions, statements, and computer programs are the + * copyrighted works and confidential proprietary information of + * Cadence Design Systems Inc.
They may be adapted and modified by bona fide + * purchasers for internal use, but neither the original nor any adapted + * or modified version may be disclosed or distributed to third parties + * in any manner, medium, or form, in whole or in part, without the prior + * written consent of Cadence Design Systems Inc. This software and its + * derivatives are to be executed solely on products incorporating a Cadence + * Design Systems processor. + */ + +#if ((XCHAL_VISION_TYPE >= 6)) + +#define MAKE_NAME_IMPL(name, MORPH_FNAME_SPECIFIER_IDT, suffix) name ## _ ## MORPH_FNAME_SPECIFIER_IDT ## suffix /* token-pastes name, an underscore, the type tag and suffix into a single identifier */ + +#if INPUT_DATA_TYPE == UNSIGNED8BIT /* U8 input: bind the MORPH_* names to uint8_t, xb_vec2Nx8U and the U-suffixed load / MULUS multiply intrinsics */ + +#define MAKE_NAME(name, suffix) MAKE_NAME_IMPL(name, U8, suffix) /* MAKE_NAME(foo, Bar) expands to foo_U8Bar */ +#define MORPH_IDT_CHECK XAI_CHECK_TILE3D_U8 +#define MORPH_IDT_SCALAR uint8_t +#define MORPH_IDT_2Nx8 xb_vec2Nx8U +#define MORPH_OP_PRIME_2Nx8 IVP_LA2NX8U_PP +#define MORPH_OP_ALIGN_LOAD_2Nx8 IVP_LV2NX8U_XP +#define MORPH_OP_LOAD_2Nx8 IVP_LA2NX8U_XP +#define MORPH_OP_LOAD_2Nx8_IP IVP_LA2NX8U_IP +#define MORPH_OP_LOAD_2Nx8_VARIABLE IVP_LAV2NX8U_XP +#define MORPH_OP_MULA IVP_MULUSA2N8XR16 +#define MORPH_OP_MULPA IVP_MULUSPA2NX8 + + +#elif INPUT_DATA_TYPE == SIGNED8BIT /* S8 input: drop any bindings left by an earlier inclusion, then bind the signed forms. NOTE(review): the U8 branch has no matching #undef list, so this header appears to expect the U8 inclusion to come first - confirm against the part of the header not visible here */ + +#undef MAKE_NAME +#undef MORPH_IDT_CHECK +#undef MORPH_IDT_SCALAR +#undef MORPH_IDT_2Nx8 +#undef MORPH_OP_PRIME_2Nx8 +#undef MORPH_OP_ALIGN_LOAD_2Nx8 +#undef MORPH_OP_LOAD_2Nx8_IP +#undef MORPH_OP_LOAD_2Nx8_VARIABLE +#undef MORPH_OP_LOAD_2Nx8 +#undef MORPH_OP_MULA +#undef MORPH_OP_MULPA + + +#define MAKE_NAME(name, suffix) MAKE_NAME_IMPL(name, S8, suffix) /* MAKE_NAME(foo, Bar) expands to foo_S8Bar */ +#define MORPH_IDT_CHECK XAI_CHECK_TILE3D_S8 +#define MORPH_IDT_SCALAR int8_t +#define MORPH_IDT_2Nx8 xb_vec2Nx8 +#define MORPH_OP_PRIME_2Nx8 IVP_LA2NX8_PP +#define MORPH_OP_ALIGN_LOAD_2Nx8 IVP_LV2NX8_XP +#define MORPH_OP_LOAD_2Nx8 IVP_LA2NX8_XP +#define MORPH_OP_LOAD_2Nx8_IP IVP_LA2NX8_IP +#define MORPH_OP_LOAD_2Nx8_VARIABLE IVP_LAV2NX8_XP +#define MORPH_OP_MULA IVP_MULA2N8XR16 +#define MORPH_OP_MULPA IVP_MULPA2NX8 +#endif +
+/****************************************************************************************** +* SO(Single output) variants +******************************************************************************************/ +/* convolved3D_S_MxN_S8S8IXCa2_SO_DWH_INPUTNOEDGE */ +/* convolved3D_S_MxN_U8S8IXCa2_SO_DWH_INPUTNOEDGE */ +/***********************************************************************/ +/* Description : P6 Optimized implementation of 3D convolution in SO */ +/* for cases where */ +/* . there are no edges along depth for input tile */ +/* and coeff tile */ +/* . dilation = 1 */ +/* . dim2pitch of coeff tile is a multiple of 64 */ +/* Inputs : Input Data Tile, Coeff Data Tile, Bias Array, */ +/* CNN convolution params structure */ +/* Outputs : None (the function returns void) */ +/* InOuts : Output Tile */ +/* Assumptions : InData is S8/U8 */ +/* CoeffData is S8 */ +/* OutData is S8 / U8 / S16 */ +/* Kernel Size is close to that of Input Size. */ +/* Input and Output are in DWH format. */ +/* Coeff is in DWHN format. */ +/* dim1Size of Input Tile is equal to dim1Pitch of Input */ +/* Tile.
*/ +/***********************************************************************/ +#ifdef DILATED_SO_VQ_CONV +static _XAI_INLINE_ void MAKE_NAME(convolvedVQ3D_S_MxN, S8IXCa2_SO_DWH_INPUTNOEDGE) ( + const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params * param + ) +#else +static _XAI_INLINE_ void MAKE_NAME(convolved3D_S_MxN, S8IXCa2_SO_DWH_INPUTNOEDGE) ( + const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params * param + ) +#endif +{ + /* Getting parameters from the tile structures */ + const int32_t outW = XAI_TILE3D_GET_DIM2(outTile); + const int32_t outH = XAI_TILE3D_GET_DIM3(outTile); + const int32_t numInCh = XAI_TILE3D_GET_DIM1(inTile); + const int32_t numOutCh = XAI_TILE3D_GET_DIM1(outTile); + const int32_t kWidthU = XAI_TILE4D_GET_DIM2(coeffTile); + const int32_t kHeightU = XAI_TILE4D_GET_DIM3(coeffTile); + + /* CNN convolution parameters */ + const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param); +#ifdef DILATED_SO_VQ_CONV + xb_vecNx16U* restrict pOutScaleData = (xb_vecNx16U *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray); +#else + const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param); +#endif + const uint8_t outShiftU = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param); + const uint8_t enableReLu = XAI_CNN_CONV_GET_FLAG_RELU(param); + const uint8_t leftEdgeFlag = XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param); + const uint8_t topEdgeFlag = XAI_CNN_CONV_GET_FLAG_TOPEDGE(param); + const uint8_t dilation = XAI_CNN_CONV_GET_DILATION(param); + const uint8_t strideX = XAI_CNN_CONV_GET_STRIDEX(param); + const uint8_t strideY = XAI_CNN_CONV_GET_STRIDEY(param); + + /* Data Pointers of input, output, coefficient and bias data */ + MORPH_IDT_SCALAR *pInData = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(inTile); + int8_t *pOutData = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile); 
+ int8_t *pCoeffData = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile); + int32_t *pBiasData = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray); + + /* Pitches of Coefficient Data (DWHN) in dim2 and dim3 */ + const int32_t coeffPitch2 = XAI_TILE4D_GET_DIM2_PITCH(coeffTile); + const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile); + + /* Pitches of Input Data (DWH) in dim1 and dim2 */ + const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile); + const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile); + + /* Pitch of Output Data (DWH) in dim1 and dim2 */ + const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile); + const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile); + + int32_t dilatedKWidthU = dilation * (kWidthU - 1) + 1; + int32_t dilatedKHeightU = dilation * (kHeightU - 1) + 1; + int32_t leftEdge, topEdge; + if ((dilatedKWidthU % 2) != 0) + { + leftEdge = dilatedKWidthU / 2; + } + else + { + leftEdge = leftEdgeFlag ? (dilatedKWidthU / 2) : ((dilatedKWidthU / 2) - 1); + } + + if ((dilatedKHeightU % 2) != 0) + { + topEdge = dilatedKHeightU / 2; + } + else + { + topEdge = topEdgeFlag ? (dilatedKHeightU / 2) : ((dilatedKHeightU / 2) - 1); + } + + /* Move pointer to the start of the active data (including edge) */ + pInData = &pInData[-(topEdge * inDataPitch2 + leftEdge * inDataPitch1)]; + + /* Setting the limits for output data according to ReLu Flag and outTileType */ + int32_t minLim, maxLim; + if (enableReLu) + { + minLim = XAI_CNN_CONV_GET_RELU_MIN(param); + maxLim = XAI_CNN_CONV_GET_RELU_MAX(param); + } + else + { + minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \ + SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0); + maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \ + : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX); + } + const int8_t typeFlag = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 
1 : 0; + const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile); + + int32_t outCh, k, x, y, ky; + + MORPH_IDT_2Nx8* restrict pdvecData; + MORPH_IDT_2Nx8* restrict pdvecData1; + MORPH_IDT_2Nx8* restrict pdvecData2; + xb_vec2Nx8* restrict pdvecCoeff1; + xb_vec2Nx8* restrict pdvecCoeff2; + xb_vec2Nx8* restrict pdvecCoeff3; + xb_vec2Nx8* restrict pdvecCoeff4; + xb_vec2Nx8* restrict pdvecOut; + xb_vecN_2x32v* restrict phvecBias; + + valign vaOutData = IVP_ZALIGN(); + if (numOutCh * outW * outH == 1 && kHeightU * kWidthU == 1 && (numInCh & (4 * XCHAL_IVPN_SIMD_WIDTH - 1)) == 0) + { +#ifdef DILATED_SO_VQ_CONV + const uint16_t outScale = ((int16_t *) pOutScaleData)[0]; +#endif + + /* Initialize Accumulator */ + xb_vec2Nx24 daccSum1 = 0; + + /* Input, Output and Coefficient Pointers */ + int8_t *pOut = pOutData; + MORPH_IDT_SCALAR * pIn = pInData; + int8_t *pCoeff1 = pCoeffData; + + pdvecData = (MORPH_IDT_2Nx8 *) (pIn); + pdvecCoeff1 = (xb_vec2Nx8 *) (pCoeff1); + + /* Priming Load for Input Data */ + valign vaData = MORPH_OP_PRIME_2Nx8(pdvecData); + + /* Multiplying and Accumulating 4 * XCHAL_IVPN_SIMD_WIDTH bytes at a time using PMULs */ + for (k = 0; k < numInCh; k += 4 * XCHAL_IVPN_SIMD_WIDTH) + { + /* Input Data Load */ + MORPH_IDT_2Nx8 dvecData1; MORPH_OP_LOAD_2Nx8_IP(dvecData1, vaData, pdvecData); + MORPH_IDT_2Nx8 dvecData2; MORPH_OP_LOAD_2Nx8_IP(dvecData2, vaData, pdvecData); + + /* Coefficient Data Load */ + xb_vec2Nx8 dvecCoeff11; IVP_LV2NX8_IP(dvecCoeff11, pdvecCoeff1, 2 * XCHAL_IVPN_SIMD_WIDTH); + xb_vec2Nx8 dvecCoeff12; IVP_LV2NX8_IP(dvecCoeff12, pdvecCoeff1, 2 * XCHAL_IVPN_SIMD_WIDTH); + + /* Pair Multiply and Accumulates */ + MORPH_OP_MULPA(daccSum1, dvecData2, dvecCoeff12, dvecData1, dvecCoeff11); + } + + /* Reduction Addition and Bias Addition */ + xb_vecN_2x32v hvecSumUpper = IVP_ADDN_2X32(IVP_CVT32S2NX24HH(daccSum1), \ + IVP_CVT32S2NX24HL(daccSum1)); + xb_vecN_2x32v hvecSumLower = IVP_ADDN_2X32(IVP_CVT32S2NX24LH(daccSum1), \ + 
IVP_CVT32S2NX24LL(daccSum1)); + int32_t sum1 = IVP_RADDN_2X32(IVP_ADDN_2X32(hvecSumUpper, hvecSumLower)); + + + sum1 += pBiasData[0]; + xb_vecN_2x32v hvecOut = (xb_vecN_2x32v) sum1; + + /* Truncate to 24-bit values */ + daccSum1 = IVP_CVT24UNX32L(hvecOut, hvecOut); + + xb_vecNx16 outData = IVP_PACKVR2NX24_0(daccSum1, packShiftAccU); + xb_vecNx48 m_wvec = IVP_MULUSNX16((xb_vecNx16U) outScale, outData); + outData = IVP_PACKVRNX48(m_wvec, outShiftU); + outData = IVP_MAXNX16(IVP_MINNX16(outData, (xb_vecNx16) maxLim), (xb_vecNx16) minLim); + + /* Save the output values */ + pdvecOut = (xb_vec2Nx8 *) (pOut); + IVP_SAV2NX8_XP(IVP_MOV2NX8_FROMNX16(outData), vaOutData, pdvecOut, bytesPerPixel); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + } + else + { + /* Output Channels Loop is unrolled by 4 */ + for (outCh = 0; outCh < numOutCh - 3; outCh += 4) /* Output Channels Loop */ + { +#ifdef DILATED_SO_VQ_CONV + xb_vecNx16U outScaleData, outScaleDataEven, outScaleDataOdd; + valign vascale; + //Load output scale values + vascale = IVP_LANX16U_PP(pOutScaleData); + IVP_LAVNX16_XP(outScaleData, vascale, pOutScaleData, 8); + outScaleDataEven = IVP_SELNX16UI(outScaleData, + outScaleData, + IVP_SELI_16B_EXTRACT_1_OF_2_OFF_0); + outScaleDataOdd = IVP_SELNX16UI(outScaleData, + outScaleData, + IVP_SELI_16B_EXTRACT_1_OF_2_OFF_1); +#endif + for (y = 0; y < outH; y++) /* Output Height Loop */ + { + for (x = 0; x < outW; x++) /* Output Width Loop */ + { + /* Initialize Accumulator */ + xb_vec2Nx24 daccSum1 = 0; + xb_vec2Nx24 daccSum2 = 0; + xb_vec2Nx24 daccSum3 = 0; + xb_vec2Nx24 daccSum4 = 0; + + int8_t *pOut = pOutData + (outCh + x * outDataPitch1 + y * outDataPitch2) * bytesPerPixel; + + /* Input and Coefficient Pointers */ + MORPH_IDT_SCALAR * pIn = (pInData + x * strideX * inDataPitch1 + (y * strideY) * inDataPitch2); + int8_t *pCoeff1 = (pCoeffData + outCh * coeffPitch3); + int8_t *pCoeff2 = (pCoeffData + (outCh + 1) * coeffPitch3); + int8_t *pCoeff3 = (pCoeffData + (outCh + 2) * 
coeffPitch3); + int8_t *pCoeff4 = (pCoeffData + (outCh + 3) * coeffPitch3); + + for (ky = 0; ky < kHeightU; ky++) /* Kernel Height Loop */ + { + pdvecData1 = (MORPH_IDT_2Nx8 *) (pIn); + pdvecData2 = (MORPH_IDT_2Nx8 *) (pIn + 2 * XCHAL_IVPN_SIMD_WIDTH); + pdvecCoeff1 = (xb_vec2Nx8 *) (pCoeff1); + pdvecCoeff2 = (xb_vec2Nx8 *) (pCoeff2); + pdvecCoeff3 = (xb_vec2Nx8 *) (pCoeff3); + pdvecCoeff4 = (xb_vec2Nx8 *) (pCoeff4); + + + /* Multiplying and Accumulating 4 * XCHAL_IVPN_SIMD_WIDTH bytes at a time using PMULs */ + for (k = 0; k < kWidthU * numInCh - 4 * XCHAL_IVPN_SIMD_WIDTH; k += 4 * XCHAL_IVPN_SIMD_WIDTH) + { + /* Input Data Load */ + valign vaData1 = MORPH_OP_PRIME_2Nx8(pdvecData1); + valign vaData2 = MORPH_OP_PRIME_2Nx8(pdvecData2); + MORPH_IDT_2Nx8 dvecData1; MORPH_OP_LOAD_2Nx8(dvecData1, vaData1, pdvecData1, 4 * XCHAL_IVPN_SIMD_WIDTH); + MORPH_IDT_2Nx8 dvecData2; MORPH_OP_LOAD_2Nx8(dvecData2, vaData2, pdvecData2, 4 * XCHAL_IVPN_SIMD_WIDTH); + + /* Coefficient Data Load */ + xb_vec2Nx8 dvecCoeff11; IVP_LV2NX8_IP(dvecCoeff11, pdvecCoeff1, 2 * XCHAL_IVPN_SIMD_WIDTH); + xb_vec2Nx8 dvecCoeff12; IVP_LV2NX8_IP(dvecCoeff12, pdvecCoeff1, 2 * XCHAL_IVPN_SIMD_WIDTH); + xb_vec2Nx8 dvecCoeff21; IVP_LV2NX8_IP(dvecCoeff21, pdvecCoeff2, 2 * XCHAL_IVPN_SIMD_WIDTH); + xb_vec2Nx8 dvecCoeff22; IVP_LV2NX8_IP(dvecCoeff22, pdvecCoeff2, 2 * XCHAL_IVPN_SIMD_WIDTH); + xb_vec2Nx8 dvecCoeff31; IVP_LV2NX8_IP(dvecCoeff31, pdvecCoeff3, 2 * XCHAL_IVPN_SIMD_WIDTH); + xb_vec2Nx8 dvecCoeff32; IVP_LV2NX8_IP(dvecCoeff32, pdvecCoeff3, 2 * XCHAL_IVPN_SIMD_WIDTH); + xb_vec2Nx8 dvecCoeff41; IVP_LV2NX8_IP(dvecCoeff41, pdvecCoeff4, 2 * XCHAL_IVPN_SIMD_WIDTH); + xb_vec2Nx8 dvecCoeff42; IVP_LV2NX8_IP(dvecCoeff42, pdvecCoeff4, 2 * XCHAL_IVPN_SIMD_WIDTH); + + /* Pair Multiply and Accumulates */ + MORPH_OP_MULPA(daccSum1, dvecData2, dvecCoeff12, dvecData1, dvecCoeff11); + MORPH_OP_MULPA(daccSum2, dvecData2, dvecCoeff22, dvecData1, dvecCoeff21); + MORPH_OP_MULPA(daccSum3, dvecData2, dvecCoeff32, dvecData1, 
dvecCoeff31); + MORPH_OP_MULPA(daccSum4, dvecData2, dvecCoeff42, dvecData1, dvecCoeff41); + } + /* Corner case handling if numInCh is not a multiple of 4 * XCHAL_IVPN_SIMD_WIDTH */ + + int32_t remK = kWidthU * numInCh - k; + /* remLoad is set to 1 if kWidthU * numInCh - k is greater than 64*/ + int32_t remLoad = XT_SALT(2 * XCHAL_IVPN_SIMD_WIDTH, kWidthU * numInCh - k); + + /* Input Data Load */ + valign vaData1 = MORPH_OP_PRIME_2Nx8(pdvecData1); + xb_vec2Nx8U dvecData1; IVP_LAV2NX8U_XP(dvecData1, vaData1, pdvecData1, remK); + xb_vec2Nx8U dvecData2; IVP_LAV2NX8U_XP(dvecData2, vaData1, pdvecData1, remK - 2 * XCHAL_IVPN_SIMD_WIDTH); + + /* Coefficient Data Load */ + xb_vec2Nx8 dvecCoeff11; IVP_LV2NX8_XP(dvecCoeff11, pdvecCoeff1, remLoad * 2 * XCHAL_IVPN_SIMD_WIDTH); + xb_vec2Nx8 dvecCoeff12; IVP_LV2NX8_XP(dvecCoeff12, pdvecCoeff1, remLoad * 2 * XCHAL_IVPN_SIMD_WIDTH); + xb_vec2Nx8 dvecCoeff21; IVP_LV2NX8_XP(dvecCoeff21, pdvecCoeff2, remLoad * 2 * XCHAL_IVPN_SIMD_WIDTH); + xb_vec2Nx8 dvecCoeff22; IVP_LV2NX8_XP(dvecCoeff22, pdvecCoeff2, remLoad * 2 * XCHAL_IVPN_SIMD_WIDTH); + xb_vec2Nx8 dvecCoeff31; IVP_LV2NX8_XP(dvecCoeff31, pdvecCoeff3, remLoad * 2 * XCHAL_IVPN_SIMD_WIDTH); + xb_vec2Nx8 dvecCoeff32; IVP_LV2NX8_XP(dvecCoeff32, pdvecCoeff3, remLoad * 2 * XCHAL_IVPN_SIMD_WIDTH); + xb_vec2Nx8 dvecCoeff41; IVP_LV2NX8_XP(dvecCoeff41, pdvecCoeff4, remLoad * 2 * XCHAL_IVPN_SIMD_WIDTH); + xb_vec2Nx8 dvecCoeff42; IVP_LV2NX8_XP(dvecCoeff42, pdvecCoeff4, remLoad * 2 * XCHAL_IVPN_SIMD_WIDTH); + + /* Pair Multiply and Accumulates */ + MORPH_OP_MULPA(daccSum1, dvecData2, dvecCoeff12, dvecData1, dvecCoeff11); + MORPH_OP_MULPA(daccSum2, dvecData2, dvecCoeff22, dvecData1, dvecCoeff21); + MORPH_OP_MULPA(daccSum3, dvecData2, dvecCoeff32, dvecData1, dvecCoeff31); + MORPH_OP_MULPA(daccSum4, dvecData2, dvecCoeff42, dvecData1, dvecCoeff41); + + /* Update Pointer*/ + pIn += inDataPitch2; + pCoeff1 += coeffPitch2; + pCoeff2 += coeffPitch2; + pCoeff3 += coeffPitch2; + pCoeff4 += coeffPitch2; + 
} /* End Kernel Height Loop */ + + /* Reduction Addition and Bias Addition */ + xb_vecN_2x32v hvecSumUpper = IVP_ADDN_2X32(IVP_CVT32S2NX24HH(daccSum1), \ + IVP_CVT32S2NX24HL(daccSum1)); + xb_vecN_2x32v hvecSumLower = IVP_ADDN_2X32(IVP_CVT32S2NX24LH(daccSum1), \ + IVP_CVT32S2NX24LL(daccSum1)); + int32_t sum1 = IVP_RADDN_2X32(IVP_ADDN_2X32(hvecSumUpper, hvecSumLower)); + + /* Reduction Addition and Bias Addition */ + hvecSumUpper = IVP_ADDN_2X32(IVP_CVT32S2NX24HH(daccSum2), IVP_CVT32S2NX24HL(daccSum2)); + hvecSumLower = IVP_ADDN_2X32(IVP_CVT32S2NX24LH(daccSum2), IVP_CVT32S2NX24LL(daccSum2)); + int32_t sum2 = IVP_RADDN_2X32(IVP_ADDN_2X32(hvecSumUpper, hvecSumLower)); + + /* Reduction Addition and Bias Addition */ + hvecSumUpper = IVP_ADDN_2X32(IVP_CVT32S2NX24HH(daccSum3), IVP_CVT32S2NX24HL(daccSum3)); + hvecSumLower = IVP_ADDN_2X32(IVP_CVT32S2NX24LH(daccSum3), IVP_CVT32S2NX24LL(daccSum3)); + int32_t sum3 = IVP_RADDN_2X32(IVP_ADDN_2X32(hvecSumUpper, hvecSumLower)); + + /* Reduction Addition and Bias Addition */ + hvecSumUpper = IVP_ADDN_2X32(IVP_CVT32S2NX24HH(daccSum4), IVP_CVT32S2NX24HL(daccSum4)); + hvecSumLower = IVP_ADDN_2X32(IVP_CVT32S2NX24LH(daccSum4), IVP_CVT32S2NX24LL(daccSum4)); + int32_t sum4 = IVP_RADDN_2X32(IVP_ADDN_2X32(hvecSumUpper, hvecSumLower)); + + /* Moving all the scalar sums to a 32-bit vector */ + xb_vecN_2x32v hvecOut = (xb_vecN_2x32v) sum4; + hvecOut = IVP_MOVN_2X32T((xb_vecN_2x32v) sum3, hvecOut, IVP_LTRN_2I(3)); + hvecOut = IVP_MOVN_2X32T((xb_vecN_2x32v) sum2, hvecOut, IVP_LTRN_2I(2)); + hvecOut = IVP_MOVN_2X32T((xb_vecN_2x32v) sum1, hvecOut, IVP_LTRN_2I(1)); + + /* Load bias values corresponding to two outChannels */ + phvecBias = (xb_vecN_2x32v *) (pBiasData + outCh); + valign vaBias = IVP_LAN_2X32_PP(phvecBias); + xb_vecN_2x32v hvecBias; IVP_LAVN_2X32_XP(hvecBias, vaBias, phvecBias, 16); + hvecOut = IVP_ADDN_2X32(hvecOut, hvecBias); + + /* Truncate to 24-bit values */ + daccSum1 = IVP_CVT24UNX32L(hvecOut, hvecOut); + + /* Pack, Scale, Shift 
and Clamp the accumulator output */ + xb_vec2Nx8 dvecOutData0L, dvecOutData0H; +#ifdef DILATED_SO_VQ_CONV + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOutData0L, dvecOutData0H, daccSum1, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); +#else + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOutData0L, dvecOutData0H, daccSum1, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); +#endif + /* Save the output values */ + pdvecOut = (xb_vec2Nx8 *) (pOut); + IVP_SAV2NX8_XP(dvecOutData0L, vaOutData, pdvecOut, 4 * bytesPerPixel); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + } /* End Output Width Loop */ + } /* End Output Height Loop */ + } /* End Output Channels Loop */ + + /* Corner case handling if Number of Output Channels is not a multiple of 4 */ + if (outCh < numOutCh) + { +#ifdef DILATED_SO_VQ_CONV + xb_vecNx16U outScaleData, outScaleDataEven, outScaleDataOdd; + valign vascale; + //Load output scale values + vascale = IVP_LANX16U_PP(pOutScaleData); + IVP_LAVNX16_XP(outScaleData, vascale, pOutScaleData, 6); + outScaleDataEven = IVP_SELNX16UI(outScaleData, + outScaleData, + IVP_SELI_16B_EXTRACT_1_OF_2_OFF_0); + outScaleDataOdd = IVP_SELNX16UI(outScaleData, + outScaleData, + IVP_SELI_16B_EXTRACT_1_OF_2_OFF_1); +#endif + + int32_t remOutCh = numOutCh - outCh; + for (y = 0; y < outH; y++) + { + for (x = 0; x < outW; x++) + { + /* Initialize Accumulator */ + xb_vec2Nx24 daccSum1 = 0; + xb_vec2Nx24 daccSum2 = 0; + xb_vec2Nx24 daccSum3 = 0; + + /* Input, Output and Coefficient Pointers */ + int8_t *pOut = pOutData + (outCh + x * outDataPitch1 + y * outDataPitch2) * bytesPerPixel; + MORPH_IDT_SCALAR * pIn = (pInData + x * strideX * inDataPitch1 + \ + (y * strideY) * inDataPitch2); + int8_t *pCoeff1 = (pCoeffData + outCh * coeffPitch3); + int8_t *pCoeff2 = (pCoeffData + (outCh + XT_MIN(1, remOutCh - 1)) * coeffPitch3); + int8_t *pCoeff3 = (pCoeffData + (outCh + XT_MIN(2, remOutCh - 1)) * coeffPitch3); + + for (ky = 0; ky < kHeightU; ky++) 
/* Kernel Height Loop */ + { + pdvecData = (MORPH_IDT_2Nx8 *) (pIn); + pdvecCoeff1 = (xb_vec2Nx8 *) (pCoeff1); + pdvecCoeff2 = (xb_vec2Nx8 *) (pCoeff2); + pdvecCoeff3 = (xb_vec2Nx8 *) (pCoeff3); + + /* Priming Load for Input Data */ + valign vaData = MORPH_OP_PRIME_2Nx8(pdvecData); + + /* Multiplying and Accumulating 128 bytes at a time using PMULs */ + for (k = 0; k < kWidthU * numInCh - 4 * XCHAL_IVPN_SIMD_WIDTH; k += 4 * XCHAL_IVPN_SIMD_WIDTH) + { + /* Input Data Load */ + MORPH_IDT_2Nx8 dvecData1; MORPH_OP_LOAD_2Nx8_IP(dvecData1, vaData, pdvecData); + MORPH_IDT_2Nx8 dvecData2; MORPH_OP_LOAD_2Nx8_IP(dvecData2, vaData, pdvecData); + + /* Coefficient Data Load */ + xb_vec2Nx8 dvecCoeff11; IVP_LV2NX8_IP(dvecCoeff11, pdvecCoeff1, 2 * XCHAL_IVPN_SIMD_WIDTH); + xb_vec2Nx8 dvecCoeff12; IVP_LV2NX8_IP(dvecCoeff12, pdvecCoeff1, 2 * XCHAL_IVPN_SIMD_WIDTH); + xb_vec2Nx8 dvecCoeff21; IVP_LV2NX8_IP(dvecCoeff21, pdvecCoeff2, 2 * XCHAL_IVPN_SIMD_WIDTH); + xb_vec2Nx8 dvecCoeff22; IVP_LV2NX8_IP(dvecCoeff22, pdvecCoeff2, 2 * XCHAL_IVPN_SIMD_WIDTH); + xb_vec2Nx8 dvecCoeff31; IVP_LV2NX8_IP(dvecCoeff31, pdvecCoeff3, 2 * XCHAL_IVPN_SIMD_WIDTH); + xb_vec2Nx8 dvecCoeff32; IVP_LV2NX8_IP(dvecCoeff32, pdvecCoeff3, 2 * XCHAL_IVPN_SIMD_WIDTH); + + /* Pair Multiply and Accumulates */ + MORPH_OP_MULPA(daccSum1, dvecData2, dvecCoeff12, dvecData1, dvecCoeff11); + MORPH_OP_MULPA(daccSum2, dvecData2, dvecCoeff22, dvecData1, dvecCoeff21); + MORPH_OP_MULPA(daccSum3, dvecData2, dvecCoeff32, dvecData1, dvecCoeff31); + } + int32_t remK = kWidthU * numInCh - k; + /* remLoad is set to 1 if kWidthU * numInCh - k is greater than 64*/ + int32_t remLoad = XT_SALT(2 * XCHAL_IVPN_SIMD_WIDTH, kWidthU * numInCh - k); + + /* Input Data Load */ + xb_vec2Nx8U dvecData1; IVP_LAV2NX8U_XP(dvecData1, vaData, pdvecData, remK); + xb_vec2Nx8U dvecData2; IVP_LAV2NX8U_XP(dvecData2, vaData, pdvecData, remK - 2 * XCHAL_IVPN_SIMD_WIDTH); + + /* Coefficient Data Load */ + xb_vec2Nx8 dvecCoeff11; IVP_LV2NX8_XP(dvecCoeff11, 
pdvecCoeff1, remLoad * 2 * XCHAL_IVPN_SIMD_WIDTH); + xb_vec2Nx8 dvecCoeff12; IVP_LV2NX8_XP(dvecCoeff12, pdvecCoeff1, remLoad * 2 * XCHAL_IVPN_SIMD_WIDTH); + xb_vec2Nx8 dvecCoeff21; IVP_LV2NX8_XP(dvecCoeff21, pdvecCoeff2, remLoad * 2 * XCHAL_IVPN_SIMD_WIDTH); + xb_vec2Nx8 dvecCoeff22; IVP_LV2NX8_XP(dvecCoeff22, pdvecCoeff2, remLoad * 2 * XCHAL_IVPN_SIMD_WIDTH); + xb_vec2Nx8 dvecCoeff31; IVP_LV2NX8_XP(dvecCoeff31, pdvecCoeff3, remLoad * 2 * XCHAL_IVPN_SIMD_WIDTH); + xb_vec2Nx8 dvecCoeff32; IVP_LV2NX8_XP(dvecCoeff32, pdvecCoeff3, remLoad * 2 * XCHAL_IVPN_SIMD_WIDTH); + + /* Pair Multiply and Accumulates */ + MORPH_OP_MULPA(daccSum1, dvecData2, dvecCoeff12, dvecData1, dvecCoeff11); + MORPH_OP_MULPA(daccSum2, dvecData2, dvecCoeff22, dvecData1, dvecCoeff21); + MORPH_OP_MULPA(daccSum3, dvecData2, dvecCoeff32, dvecData1, dvecCoeff31); + + /* Update Pointer*/ + pIn += inDataPitch2; + pCoeff1 += coeffPitch2; + pCoeff2 += coeffPitch2; + pCoeff3 += coeffPitch2; + } /* End Kernel Height Loop */ + /* Reduction Addition and Bias Addition */ + xb_vecN_2x32v hvecSumUpper = IVP_ADDN_2X32(IVP_CVT32S2NX24HH(daccSum1), \ + IVP_CVT32S2NX24HL(daccSum1)); + xb_vecN_2x32v hvecSumLower = IVP_ADDN_2X32(IVP_CVT32S2NX24LH(daccSum1), \ + IVP_CVT32S2NX24LL(daccSum1)); + int32_t sum1 = IVP_RADDN_2X32(IVP_ADDN_2X32(hvecSumUpper, hvecSumLower)); + + /* Reduction Addition */ + hvecSumUpper = IVP_ADDN_2X32(IVP_CVT32S2NX24HH(daccSum2), IVP_CVT32S2NX24HL(daccSum2)); + hvecSumLower = IVP_ADDN_2X32(IVP_CVT32S2NX24LH(daccSum2), IVP_CVT32S2NX24LL(daccSum2)); + int32_t sum2 = IVP_RADDN_2X32(IVP_ADDN_2X32(hvecSumUpper, hvecSumLower)); + + /* Reduction Addition */ + hvecSumUpper = IVP_ADDN_2X32(IVP_CVT32S2NX24HH(daccSum3), IVP_CVT32S2NX24HL(daccSum3)); + hvecSumLower = IVP_ADDN_2X32(IVP_CVT32S2NX24LH(daccSum3), IVP_CVT32S2NX24LL(daccSum3)); + int32_t sum3 = IVP_RADDN_2X32(IVP_ADDN_2X32(hvecSumUpper, hvecSumLower)); + + /* Moving all the scalar sums to a 32-bit vector */ + xb_vecN_2x32v hvecOut = 
(xb_vecN_2x32v) sum3; + hvecOut = IVP_MOVN_2X32T((xb_vecN_2x32v) sum2, hvecOut, IVP_LTRN_2I(2)); + hvecOut = IVP_MOVN_2X32T((xb_vecN_2x32v) sum1, hvecOut, IVP_LTRN_2I(1)); + + /* Load bias values corresponding to two outChannels */ + phvecBias = (xb_vecN_2x32v *) (pBiasData + outCh); + valign vaBias = IVP_LAN_2X32_PP(phvecBias); + xb_vecN_2x32v hvecBias; IVP_LAVN_2X32_XP(hvecBias, vaBias, phvecBias, 4 * remOutCh); + + /* Add bias to the accumulated value*/ + hvecOut = IVP_ADDN_2X32(hvecOut, hvecBias); + + /* Truncate to 24-bit values */ + daccSum1 = IVP_CVT24UNX32L(hvecOut, hvecOut); + + /* Pack, Scale, Shift and Clamp the accumulator output */ + xb_vec2Nx8 dvecOutData0L, dvecOutData0H; +#ifdef DILATED_SO_VQ_CONV + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOutData0L, dvecOutData0H, daccSum1, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); +#else + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOutData0L, dvecOutData0H, daccSum1, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); +#endif + /* Save the output values */ + pdvecOut = (xb_vec2Nx8 *) (pOut); + IVP_SAV2NX8_XP(dvecOutData0L, vaOutData, pdvecOut, remOutCh * bytesPerPixel); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + } /* End Output Width Loop */ + } /* End Output Height Loop */ + } /* End of if (outCh < numOutCh) */ + } /*End else*/ +} + + +/***************************************************************************/ +/* xaiConvolved(VQ)3D_S_MxN_S8_SO_DWH/xaiConvolve(VQ)3D_S_MxN_U8_SO_DWH */ +/***************************************************************************/ + +/***********************************************************************/ +/* Description : P6 Optimized implementation of 3D convolution in SO */ +/* Vectorization Approach. 
*/ +/* Inputs : Input Data Tile, Coeff Data Tile, Bias Array, */ +/* Output Scale Array (VQ build only), CNN convolution params structure */ +/* Outputs : XAI_ERR_TYPE status code */ +/* InOuts : Output Tile */ +/* Assumptions : InData is S8/U8 */ +/* CoeffData is S8 */ +/* OutData is S8 / U8 / S16 */ +/* Kernel Size is close to that of Input Size. */ +/* Input and Output are in DWH format. */ +/* Coeff is in DWHN format. */ +/***********************************************************************/ + +/***************** xaiConvolvedVQ3D_S_MxN_S8S8IX_SO_DWH *****************/ +/***************** xaiConvolvedVQ3D_S_MxN_U8S8IX_SO_DWH *****************/ +/****************** xaiConvolved3D_S_MxN_S8S8IX_SO_DWH ******************/ +/****************** xaiConvolved3D_S_MxN_U8S8IX_SO_DWH ******************/ + +#ifdef DILATED_SO_VQ_CONV +XAI_ERR_TYPE MAKE_NAME(xaiConvolvedVQ3D_S_MxN, S8IX_SO_DWH) ( + const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params * param + ) +#else +XAI_ERR_TYPE MAKE_NAME(xaiConvolved3D_S_MxN, S8IX_SO_DWH) ( + const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params * param + ) +#endif +{ + /* Error Checks: argument validation through the XAI_CHECK_* macros */ + XAI_ERROR_CHECKS() + { + MORPH_IDT_CHECK(inTile); + XAI_CHECK_CONV_OUTPUT_TILE3D(outTile); + XAI_CHECK_TILE4D_S8(coeffTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile); + XAI_CHECK_TILE4D_IN_DRAM_BOUNDARY(coeffTile); + XAI_CHECK_POINTER(param); + XAI_CHECK_ARRAY_S32(biasArray); + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTile); + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(coeffTile, outTile); + XAI_CHECK_ERROR(((XAI_CNN_CONV_GET_STRIDEX(param) > 0) && (XAI_CNN_CONV_GET_STRIDEY(param) > 0)) && \ + ((XAI_CNN_CONV_GET_STRIDEX(param) <= 64) && (XAI_CNN_CONV_GET_STRIDEY(param) <= 64)), XAI_ERR_BADARG, \ + "\nStrideX = %hhu, StrideY = %hhu\nStride along width
and height should be greater than 0 and less than or equal to 64", \ + XAI_CNN_CONV_GET_STRIDEX(param), XAI_CNN_CONV_GET_STRIDEY(param)); + XAI_CHECK_ERROR((XAI_CNN_CONV_GET_DILATIONX(param) == 1) || \ + ((XAI_CNN_CONV_GET_DILATIONX(param) >= 1) && \ + (XAI_CNN_CONV_GET_STRIDEX(param) == 1)), XAI_ERR_BADARG, \ + "\nDilationX = %hhu\nDilationX should be 1. It can be greater than 1 only when strideX is equal to 1", \ + XAI_CNN_CONV_GET_DILATIONX(param)); + XAI_CHECK_ERROR((XAI_CNN_CONV_GET_DILATIONY(param) == 1) || \ + ((XAI_CNN_CONV_GET_DILATIONY(param) >= 1) && \ + (XAI_CNN_CONV_GET_STRIDEY(param) == 1)), XAI_ERR_BADARG, \ + "\nDilationY = %hhu\nDilationY should be 1. It can be greater than 1 only when strideY is equal to 1", \ + XAI_CNN_CONV_GET_DILATIONY(param)); + XAI_CHECK_EDGES_SO(inTile, coeffTile, param); + XAI_CHECK_TILE3D_DATA_ORDER(inTile, XAI_DWH); + XAI_CHECK_TILE3D_DATA_ORDER(outTile, XAI_DWH); + XAI_CHECK_TILE4D_DATA_ORDER(coeffTile, XAI_DWHN); + XAI_CHECK_CONSISTENCY_SO_DWH(inTile, coeffTile, biasArray, outTile, param); + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_ACCUM_SHIFT(param) < 24, \ + XAI_ERR_NORM, "\nThe accumulator shift = %hhu, value should be less than 24", \ + XAI_CNN_CONV_GET_ACCUM_SHIFT(param)); + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_OUTPUT_SHIFT(param) < 32, \ + XAI_ERR_NORM, "\nThe output shift = %hhu, value should be less than 32", \ + XAI_CNN_CONV_GET_OUTPUT_SHIFT(param)); + XAI_CHECK_CONV_RELU_LIMITS_IX(param, outTile); +#ifdef DILATED_SO_VQ_CONV + XAI_CHECK_ARRAY_U16(outputScaleArray); + XAI_CHECK_ERROR(XAI_ARRAY_GET_WIDTH(outputScaleArray) >= XAI_TILE4D_GET_DIM4(coeffTile), XAI_ERR_DATASIZE, \ + "\nWidth of Output Scale Array = %d, Number of Kernels = %d\nWidth of Output Scale Array should be greater than or equal to Number of Kernels", \ + XAI_ARRAY_GET_WIDTH(outputScaleArray), XAI_TILE4D_GET_DIM4(coeffTile)); +#endif + } +#ifndef DILATED_SO_VQ_CONV /* Non-VQ build only: when the output scale is 0 the output tile is filled with 0 (clamped to the ReLU limits if ReLU is enabled) and the function returns early */ + if (XAI_CNN_CONV_GET_OUTPUT_SCALE(param) == 0) + { + int32_t fillValue; + int32_t reluFlag
= XAI_CNN_CONV_GET_FLAG_RELU(param); + fillValue = reluFlag ? (CLAMP(0, XAI_CNN_CONV_GET_RELU_MIN(param), XAI_CNN_CONV_GET_RELU_MAX(param))) : 0; + return(xaiFillTile3D(outTile, fillValue, 0)); + } +#endif + /* Fast path. If + * 1) there are no edges along depth (dim1) for input and coeff (dim1 pitch equals dim1) and dilation = 1 + * 2) the coeff pointer is aligned to (XCHAL_IVPN_SIMD_WIDTH << 1) and dim2pitch is a multiple of (XCHAL_IVPN_SIMD_WIDTH << 1) + * Call MAKE_NAME(convolved3D_S_MxN, S8IXCa2_SO_DWH_INPUTNOEDGE), or its convolvedVQ3D_S_MxN counterpart in the VQ build, and return + */ + if ((XAI_TILE3D_GET_DIM1_PITCH(inTile) == XAI_TILE3D_GET_DIM1(inTile)) && + (XAI_TILE4D_GET_DIM1_PITCH(coeffTile) == XAI_TILE4D_GET_DIM1(coeffTile)) && \ + (XAI_CNN_CONV_GET_DILATIONX(param) == 1) && (XAI_CNN_CONV_GET_DILATIONY(param) == 1)) + { + if ((XAI_TILE4D_IS_PTR_ALIGNED_2NX8(coeffTile) && \ + (XAI_TILE4D_GET_DIM2_PITCH(coeffTile) & (2 * XCHAL_IVPN_SIMD_WIDTH - 1)) == 0)) + { +#ifdef DILATED_SO_VQ_CONV + MAKE_NAME(convolvedVQ3D_S_MxN, S8IXCa2_SO_DWH_INPUTNOEDGE) (inTile, + coeffTile, + biasArray, + outputScaleArray, + outTile, + param); +#else + MAKE_NAME(convolved3D_S_MxN, S8IXCa2_SO_DWH_INPUTNOEDGE) (inTile, + coeffTile, + biasArray, + outTile, + param); +#endif + return(XAI_ERROR_STATUS()); + } + } + + /* General path: getting parameters from the tile structures */ + const int32_t outW = XAI_TILE3D_GET_DIM2(outTile); + const int32_t outH = XAI_TILE3D_GET_DIM3(outTile); + const int32_t numInCh = XAI_TILE3D_GET_DIM1(inTile); + const int32_t numOutCh = XAI_TILE3D_GET_DIM1(outTile); + const int32_t kWidthU = XAI_TILE4D_GET_DIM2(coeffTile); + const int32_t kHeightU = XAI_TILE4D_GET_DIM3(coeffTile); + + /* CNN convolution parameters (per-channel scale array in the VQ build, single scale otherwise) */ + const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param); +#ifdef DILATED_SO_VQ_CONV + xb_vecNx16U* restrict pOutScaleData = (xb_vecNx16U *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray); +#else + const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param); +#endif + const uint8_t outShiftU = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param); + const uint8_t
enableReLu = XAI_CNN_CONV_GET_FLAG_RELU(param); + const uint8_t leftEdgeFlag = XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param); + const uint8_t topEdgeFlag = XAI_CNN_CONV_GET_FLAG_TOPEDGE(param); + const uint8_t dilationX = XAI_CNN_CONV_GET_DILATIONX(param); + const uint8_t dilationY = XAI_CNN_CONV_GET_DILATIONY(param); + const uint8_t strideX = XAI_CNN_CONV_GET_STRIDEX(param); + const uint8_t strideY = XAI_CNN_CONV_GET_STRIDEY(param); + + /* Data Pointers of input, output, coefficient and bias data */ + MORPH_IDT_SCALAR *pInData = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(inTile); + int8_t *pOutData = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile); + int8_t *pCoeffData = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile); + int32_t *pBiasData = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray); + + /* Pitches of Coefficient Data (DWHN) in dim1, dim2 and dim3 */ + const int32_t coeffPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTile); + const int32_t coeffPitch2 = XAI_TILE4D_GET_DIM2_PITCH(coeffTile); + const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile); + + /* Pitches of Input Data (DWH) in dim1 and dim2 */ + const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile); + const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile); + + /* Pitches of Output Data (DWH) in dim1 and dim2 */ + const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile); + const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile); + /* Effective kernel extent after dilation. Odd extents use extent / 2 as the left/top edge; even extents use extent / 2 when the corresponding edge flag is set, else extent / 2 - 1 */ + int32_t dilatedKWidthU = dilationX * (kWidthU - 1) + 1; + int32_t dilatedKHeightU = dilationY * (kHeightU - 1) + 1; + int32_t leftEdge, topEdge; + if ((dilatedKWidthU % 2) != 0) + { + leftEdge = dilatedKWidthU / 2; + } + else + { + leftEdge = leftEdgeFlag ? (dilatedKWidthU / 2) : ((dilatedKWidthU / 2) - 1); + } + + if ((dilatedKHeightU % 2) != 0) + { + topEdge = dilatedKHeightU / 2; + } + else + { + topEdge = topEdgeFlag ?
(dilatedKHeightU / 2) : ((dilatedKHeightU / 2) - 1); + } + + /* Move the input pointer back by topEdge rows and leftEdge columns, to the start of the active data (including edge) */ + pInData = &pInData[-(topEdge * inDataPitch2 + leftEdge * inDataPitch1)]; + + /* Setting the limits for output data: ReLU min/max when ReLU is enabled, otherwise the full range of the output tile type */ + int32_t minLim, maxLim; + if (enableReLu) + { + minLim = XAI_CNN_CONV_GET_RELU_MIN(param); + maxLim = XAI_CNN_CONV_GET_RELU_MAX(param); + } + else + { + minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \ + SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0); + maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \ + : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX); + } + const int8_t typeFlag = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0; + const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile); + + int32_t outCh, inCh, x, y, ky, kx; + + MORPH_IDT_2Nx8* restrict pdvecData; + xb_vec2Nx8* restrict pdvecCoeff1; + xb_vec2Nx8* restrict pdvecCoeff2; + xb_vec2Nx8* restrict pdvecOut; + + valign vaOutData = IVP_ZALIGN(); + + /* Output Channels Loop is unrolled by 2: each iteration accumulates output channels outCh and outCh + 1 */ + for (outCh = 0; outCh < numOutCh - 1; outCh += 2) /* Output Channels Loop */ + { +#ifdef DILATED_SO_VQ_CONV + xb_vecNx16U outScaleData, outScaleDataEven, outScaleDataOdd; + valign vascale; + //Load output scale values + vascale = IVP_LANX16U_PP(pOutScaleData); + IVP_LAVNX16_XP(outScaleData, vascale, pOutScaleData, 4); + outScaleDataEven = IVP_SELNX16UI(outScaleData, + outScaleData, + IVP_SELI_16B_EXTRACT_1_OF_2_OFF_0); + outScaleDataOdd = IVP_SELNX16UI(outScaleData, + outScaleData, + IVP_SELI_16B_EXTRACT_1_OF_2_OFF_1); +#endif + + for (y = 0; y < outH; y++) /* Output Height Loop */ + { + for (x = 0; x < outW; x++) /* Output Width Loop */ + { + /* Initialize Accumulators, one per output channel of the pair */ + xb_vec2Nx24 daccSum1 = 0; + xb_vec2Nx24 daccSum2 = 0; + + int8_t *pOut = pOutData + (outCh + x * outDataPitch1 + y * outDataPitch2) * bytesPerPixel; + + for (ky = 0; ky < kHeightU; ky++) /*
Kernel Height Loop */ + { + /* Input and Coefficient Pointers for kernel row ky (input row offset is ky * dilationY) */ + MORPH_IDT_SCALAR * pIn = (pInData + x * strideX * inDataPitch1 + \ + (y * strideY + ky * dilationY) * inDataPitch2); + int8_t *pCoeff1 = (pCoeffData + outCh * coeffPitch3 + ky * coeffPitch2); + int8_t *pCoeff2 = (pCoeffData + (outCh + 1) * coeffPitch3 + ky * coeffPitch2); + + for (kx = 0; kx < kWidthU; kx++) /* Kernel Width Loop */ + { + pdvecData = (MORPH_IDT_2Nx8 *) (pIn); + pdvecCoeff1 = (xb_vec2Nx8 *) (pCoeff1); + pdvecCoeff2 = (xb_vec2Nx8 *) (pCoeff2); + + /* Priming Loads for Input and Coefficient Data */ + valign vaData = MORPH_OP_PRIME_2Nx8(pdvecData); + valign vaCoeff1 = IVP_LA2NX8_PP(pdvecCoeff1); + valign vaCoeff2 = IVP_LA2NX8_PP(pdvecCoeff2); + + /* Multiplying and Accumulating 4 * XCHAL_IVPN_SIMD_WIDTH bytes at a time using PMULs */ + for (inCh = 0; inCh < numInCh - 4 * XCHAL_IVPN_SIMD_WIDTH; inCh += 4 * XCHAL_IVPN_SIMD_WIDTH) + { + /* Input Data Load */ + MORPH_IDT_2Nx8 dvecData1; MORPH_OP_LOAD_2Nx8_IP(dvecData1, vaData, pdvecData); + MORPH_IDT_2Nx8 dvecData2; MORPH_OP_LOAD_2Nx8_IP(dvecData2, vaData, pdvecData); + + /* Coefficient Data Load */ + xb_vec2Nx8 dvecCoeff11; IVP_LA2NX8_IP(dvecCoeff11, vaCoeff1, pdvecCoeff1); + xb_vec2Nx8 dvecCoeff12; IVP_LA2NX8_IP(dvecCoeff12, vaCoeff1, pdvecCoeff1); + xb_vec2Nx8 dvecCoeff21; IVP_LA2NX8_IP(dvecCoeff21, vaCoeff2, pdvecCoeff2); + xb_vec2Nx8 dvecCoeff22; IVP_LA2NX8_IP(dvecCoeff22, vaCoeff2, pdvecCoeff2); + + /* Pair Multiply and Accumulates */ + MORPH_OP_MULPA(daccSum1, dvecData2, dvecCoeff12, dvecData1, dvecCoeff11); + MORPH_OP_MULPA(daccSum2, dvecData2, dvecCoeff22, dvecData1, dvecCoeff21); + } + /* Tail, executed on every pass: the loop above leaves 1 .. 4 * XCHAL_IVPN_SIMD_WIDTH input channels (for numInCh > 0), handled here with variable-length loads */ + int32_t remLength = numInCh - inCh; + + /* Input Data Load */ + MORPH_IDT_2Nx8 dvecData1; + MORPH_OP_LOAD_2Nx8_VARIABLE(dvecData1, vaData, pdvecData, remLength); + MORPH_IDT_2Nx8 dvecData2; + MORPH_OP_LOAD_2Nx8_VARIABLE(dvecData2, vaData, pdvecData, \ + remLength -
2 * XCHAL_IVPN_SIMD_WIDTH); + + /* Coefficient Data Load */ + xb_vec2Nx8 dvecCoeff11, dvecCoeff12, dvecCoeff21, dvecCoeff22; + IVP_LAV2NX8_XP(dvecCoeff11, vaCoeff1, pdvecCoeff1, remLength); + IVP_LAV2NX8_XP(dvecCoeff12, vaCoeff1, pdvecCoeff1, remLength - 2 * XCHAL_IVPN_SIMD_WIDTH); + IVP_LAV2NX8_XP(dvecCoeff21, vaCoeff2, pdvecCoeff2, remLength); + IVP_LAV2NX8_XP(dvecCoeff22, vaCoeff2, pdvecCoeff2, remLength - 2 * XCHAL_IVPN_SIMD_WIDTH); + /* Pair Multiply and Accumulates */ + MORPH_OP_MULPA(daccSum1, dvecData2, dvecCoeff12, dvecData1, dvecCoeff11); + MORPH_OP_MULPA(daccSum2, dvecData2, dvecCoeff22, dvecData1, dvecCoeff21); + /* Advance to the next kernel column: dilationX input pixels, one coefficient pixel */ + pIn += dilationX * inDataPitch1; + pCoeff1 += coeffPitch1; + pCoeff2 += coeffPitch1; + } /* End Kernel Width Loop */ + } /* End Kernel Height Loop */ + + /* Reduction Addition and Bias Addition for output channel outCh */ + xb_vecN_2x32v hvecSumUpper = IVP_ADDN_2X32(IVP_CVT32S2NX24HH(daccSum1), \ + IVP_CVT32S2NX24HL(daccSum1)); + xb_vecN_2x32v hvecSumLower = IVP_ADDN_2X32(IVP_CVT32S2NX24LH(daccSum1), \ + IVP_CVT32S2NX24LL(daccSum1)); + int32_t sum1 = IVP_RADDN_2X32(IVP_ADDN_2X32(hvecSumUpper, hvecSumLower)); + sum1 += pBiasData[outCh]; + + /* Reduction Addition and Bias Addition for output channel outCh + 1 */ + hvecSumUpper = IVP_ADDN_2X32(IVP_CVT32S2NX24HH(daccSum2), IVP_CVT32S2NX24HL(daccSum2)); + hvecSumLower = IVP_ADDN_2X32(IVP_CVT32S2NX24LH(daccSum2), IVP_CVT32S2NX24LL(daccSum2)); + int32_t sum2 = IVP_RADDN_2X32(IVP_ADDN_2X32(hvecSumUpper, hvecSumLower)); + sum2 += pBiasData[outCh + 1]; + + /* Moving both scalar sums to a 32-bit vector */ + xb_vecN_2x32v hvecOut = 0; + hvecOut = IVP_MOVN_2X32T((xb_vecN_2x32v) sum2, hvecOut, IVP_LTRN_2I(2)); + hvecOut = IVP_MOVN_2X32T((xb_vecN_2x32v) sum1, hvecOut, IVP_LTRN_2I(1)); + + /* Truncate to 24-bit values */ + daccSum1 = IVP_CVT24UNX32L(hvecOut, hvecOut); + + /* Pack, Scale, Shift and Clamp the accumulator output */ + xb_vec2Nx8 dvecOutData0L, dvecOutData0H; +#ifdef DILATED_SO_VQ_CONV + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOutData0L, dvecOutData0H, daccSum1, packShiftAccU, \ +
outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); +#else + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOutData0L, dvecOutData0H, daccSum1, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); +#endif + /* Save the output values for the two output channels of this pixel */ + pdvecOut = (xb_vec2Nx8 *) (pOut); + IVP_SAV2NX8_XP(dvecOutData0L, vaOutData, pdvecOut, 2 * bytesPerPixel); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + } /* End Output Width Loop */ + } /* End Output Height Loop */ + } /* End Output Channels Loop */ + + /* Corner case handling if Number of Output Channels is odd: one last output channel remains */ + if (outCh < numOutCh) + { +#ifdef DILATED_SO_VQ_CONV + xb_vecNx16U outScaleData, outScaleDataEven, outScaleDataOdd; + valign vascale; + //Load output scale values + vascale = IVP_LANX16U_PP(pOutScaleData); + IVP_LAVNX16_XP(outScaleData, vascale, pOutScaleData, 2); + outScaleDataEven = IVP_SELNX16UI(outScaleData, + outScaleData, + IVP_SELI_16B_EXTRACT_1_OF_2_OFF_0); + outScaleDataOdd = IVP_SELNX16UI(outScaleData, + outScaleData, + IVP_SELI_16B_EXTRACT_1_OF_2_OFF_1); +#endif + for (y = 0; y < outH; y++) /* Output Height Loop */ + { + for (x = 0; x < outW; x++) /* Output Width Loop */ + { + /* Initialize Accumulator */ + xb_vec2Nx24 daccSum1 = 0; + + int8_t *pOut = pOutData + (outCh + x * outDataPitch1 + y * outDataPitch2) * bytesPerPixel; + + for (ky = 0; ky < kHeightU; ky++) /* Kernel Height Loop */ + { + /* Input and Coefficient Pointers for kernel row ky (input row offset is ky * dilationY) */ + MORPH_IDT_SCALAR * pIn = (pInData + x * strideX * inDataPitch1 + \ + (y * strideY + ky * dilationY) * inDataPitch2); + int8_t *pCoeff1 = (pCoeffData + outCh * coeffPitch3 + ky * coeffPitch2); + + for (kx = 0; kx < kWidthU; kx++) /* Kernel Width Loop */ + { + pdvecData = (MORPH_IDT_2Nx8 *) (pIn); + pdvecCoeff1 = (xb_vec2Nx8 *) (pCoeff1); + + /* Priming Loads for Input and Coefficient Data */ + valign vaData = MORPH_OP_PRIME_2Nx8(pdvecData); + valign vaCoeff1 = IVP_LA2NX8_PP(pdvecCoeff1); + + /* Multiplying and Accumulating 4 * XCHAL_IVPN_SIMD_WIDTH bytes at a time using PMULs */ + for (inCh = 0; inCh < numInCh
- 4 * XCHAL_IVPN_SIMD_WIDTH; inCh += 4 * XCHAL_IVPN_SIMD_WIDTH) + { + /* Input Data Load */ + MORPH_IDT_2Nx8 dvecData1; MORPH_OP_LOAD_2Nx8_IP(dvecData1, vaData, pdvecData); + MORPH_IDT_2Nx8 dvecData2; MORPH_OP_LOAD_2Nx8_IP(dvecData2, vaData, pdvecData); + + /* Coefficient Data Load */ + xb_vec2Nx8 dvecCoeff11; IVP_LA2NX8_IP(dvecCoeff11, vaCoeff1, pdvecCoeff1); + xb_vec2Nx8 dvecCoeff12; IVP_LA2NX8_IP(dvecCoeff12, vaCoeff1, pdvecCoeff1); + + /* Pair Multiply and Accumulates */ + MORPH_OP_MULPA(daccSum1, dvecData2, dvecCoeff12, dvecData1, dvecCoeff11); + } + /* Tail, executed on every pass: the loop above leaves 1 .. 4 * XCHAL_IVPN_SIMD_WIDTH input channels (for numInCh > 0), handled here with variable-length loads */ + int32_t remLength = numInCh - inCh; + + /* Input Data Load */ + MORPH_IDT_2Nx8 dvecData1; + MORPH_OP_LOAD_2Nx8_VARIABLE(dvecData1, vaData, pdvecData, remLength); + MORPH_IDT_2Nx8 dvecData2; + MORPH_OP_LOAD_2Nx8_VARIABLE(dvecData2, vaData, pdvecData, \ + remLength - 2 * XCHAL_IVPN_SIMD_WIDTH); + + /* Coefficient Data Load */ + xb_vec2Nx8 dvecCoeff11, dvecCoeff12; + IVP_LAV2NX8_XP(dvecCoeff11, vaCoeff1, pdvecCoeff1, remLength); + IVP_LAV2NX8_XP(dvecCoeff12, vaCoeff1, pdvecCoeff1, remLength - 2 * XCHAL_IVPN_SIMD_WIDTH); + /* Pair Multiply and Accumulate */ + MORPH_OP_MULPA(daccSum1, dvecData2, dvecCoeff12, dvecData1, dvecCoeff11); + /* Advance to the next kernel column: dilationX input pixels, one coefficient pixel */ + pIn += dilationX * inDataPitch1; + pCoeff1 += coeffPitch1; + } /* End Kernel Width Loop */ + } /* End Kernel Height Loop */ + /* Reduction Addition and Bias Addition */ + xb_vecN_2x32v hvecSumUpper = IVP_ADDN_2X32(IVP_CVT32S2NX24HH(daccSum1), \ + IVP_CVT32S2NX24HL(daccSum1)); + xb_vecN_2x32v hvecSumLower = IVP_ADDN_2X32(IVP_CVT32S2NX24LH(daccSum1), \ + IVP_CVT32S2NX24LL(daccSum1)); + int32_t sum1 = IVP_RADDN_2X32(IVP_ADDN_2X32(hvecSumUpper, hvecSumLower)); + sum1 += pBiasData[outCh]; + + /* Moving the single scalar sum to a 32-bit vector */ + xb_vecN_2x32v hvecOut = (xb_vecN_2x32v) sum1; + + /* Truncate to 24-bit values */ + daccSum1 = IVP_CVT24UNX32L(hvecOut, hvecOut); + + /* Pack, Scale, Shift and Clamp the accumulator output */ +
xb_vec2Nx8 dvecOutData0L, dvecOutData0H; +#ifdef DILATED_SO_VQ_CONV + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOutData0L, dvecOutData0H, daccSum1, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); +#else + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOutData0L, dvecOutData0H, daccSum1, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); +#endif + /* Save the output value of the last output channel */ + pdvecOut = (xb_vec2Nx8 *) (pOut); + IVP_SAV2NX8_XP(dvecOutData0L, vaOutData, pdvecOut, bytesPerPixel); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + } /* End Output Width Loop */ + } /* End Output Height Loop */ + } /* End of if (outCh < numOutCh) */ + + return(XAI_ERROR_STATUS()); +} + + +/****************************** end of SO variants *****************************************/ +/*******************************************************************************************/ +#endif //if ((XCHAL_VISION_TYPE >= 6)) diff --git a/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_VQ_MOD.c b/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_VQ_MOD.c new file mode 100644 index 00000000000..26cdd5dbddf --- /dev/null +++ b/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_VQ_MOD.c @@ -0,0 +1,25 @@ +/* + * Copyright (c) 2018 by Cadence Design Systems, Inc. ALL RIGHTS RESERVED. + * These coded instructions, statements, and computer programs are the + * copyrighted works and confidential proprietary information of + * Cadence Design Systems Inc. They may be adapted and modified by bona fide + * purchasers for internal use, but neither the original nor any adapted + * or modified version may be disclosed or distributed to third parties + * in any manner, medium, or form, in whole or in part, without the prior + * written consent of Cadence Design Systems Inc. This software and its + * derivatives are to be executed solely on products incorporating a Cadence + * Design Systems processor.
+ */ +#include "xai_cnn.h" +#include "xai_intrin.h" +#include "limits.h" + +#if ((XCHAL_VISION_TYPE >= 6)) + +#define DILATED_VQ_CONV +#include "cnn_dilated_conv_MOD.h" + +/******************************* end of MOD variants ***************************************/ +/*******************************************************************************************/ +#endif /*#if ((XCHAL_VISION_TYPE >= 6))*/ + diff --git a/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_VQ_MOD_S16.c b/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_VQ_MOD_S16.c new file mode 100644 index 00000000000..5a8e430f81f --- /dev/null +++ b/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_VQ_MOD_S16.c @@ -0,0 +1,25 @@ +/* + * Copyright (c) 2018 by Cadence Design Systems, Inc. ALL RIGHTS RESERVED. + * These coded instructions, statements, and computer programs are the + * copyrighted works and confidential proprietary information of + * Cadence Design Systems Inc. They may be adapted and modified by bona fide + * purchasers for internal use, but neither the original nor any adapted + * or modified version may be disclosed or distributed to third parties + * in any manner, medium, or form, in whole or in part, without the prior + * written consent of Cadence Design Systems Inc. This software and its + * derivatives are to be executed solely on products incorporating a Cadence + * Design Systems processor. 
+ */ +#include "xai_cnn.h" +#include "xai_intrin.h" +#include "limits.h" + +#if ((XCHAL_VISION_TYPE >= 6)) + +#define DILATED_VQ_CONV_S16 + +#include "cnn_dilated_conv_MOD_S16.h" +/******************************* end of MOD variants ***************************************/ +/*******************************************************************************************/ +#endif /*#if ((XCHAL_VISION_TYPE >= 6))*/ + diff --git a/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_VQ_MOW.c b/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_VQ_MOW.c new file mode 100644 index 00000000000..35a3c00cc59 --- /dev/null +++ b/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_VQ_MOW.c @@ -0,0 +1,30 @@ +/* + * Copyright (c) 2018 by Cadence Design Systems, Inc. ALL RIGHTS RESERVED. + * These coded instructions, statements, and computer programs are the + * copyrighted works and confidential proprietary information of + * Cadence Design Systems Inc. They may be adapted and modified by bona fide + * purchasers for internal use, but neither the original nor any adapted + * or modified version may be disclosed or distributed to third parties + * in any manner, medium, or form, in whole or in part, without the prior + * written consent of Cadence Design Systems Inc. This software and its + * derivatives are to be executed solely on products incorporating a Cadence + * Design Systems processor. 
+ */ + +#include "xai_cnn.h" +#include "xai_intrin.h" + +#if ((XCHAL_VISION_TYPE >= 6)) + +#define DILATED_VQ_CONV VQ_TRUE + +#define INPUT_DATA_TYPE UNSIGNED8BIT +#include "cnn_dilated_conv_MOW.h" +#undef INPUT_DATA_TYPE + +#define INPUT_DATA_TYPE SIGNED8BIT +#include "cnn_dilated_conv_MOW.h" + +#undef INPUT_DATA_TYPE +#endif //if ((XCHAL_VISION_TYPE >= 6)) + diff --git a/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_VQ_MOW_S16.c b/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_VQ_MOW_S16.c new file mode 100644 index 00000000000..3559031cb6e --- /dev/null +++ b/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_VQ_MOW_S16.c @@ -0,0 +1,23 @@ +/* + * Copyright (c) 2018 by Cadence Design Systems, Inc. ALL RIGHTS RESERVED. + * These coded instructions, statements, and computer programs are the + * copyrighted works and confidential proprietary information of + * Cadence Design Systems Inc. They may be adapted and modified by bona fide + * purchasers for internal use, but neither the original nor any adapted + * or modified version may be disclosed or distributed to third parties + * in any manner, medium, or form, in whole or in part, without the prior + * written consent of Cadence Design Systems Inc. This software and its + * derivatives are to be executed solely on products incorporating a Cadence + * Design Systems processor. 
+ */ + +#include "xai_cnn.h" +#include "xai_intrin.h" + +#if ((XCHAL_VISION_TYPE >= 6)) + +#define DILATED_VQ_CONV_S16 VQ_TRUE + +#include "cnn_dilated_conv_MOW_S16.h" +#endif //if ((XCHAL_VISION_TYPE >= 6)) + diff --git a/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_VQ_SO.c b/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_VQ_SO.c new file mode 100644 index 00000000000..6944b553462 --- /dev/null +++ b/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_VQ_SO.c @@ -0,0 +1,30 @@ +/* + * Copyright (c) 2018 by Cadence Design Systems, Inc. ALL RIGHTS RESERVED. + * These coded instructions, statements, and computer programs are the + * copyrighted works and confidential proprietary information of + * Cadence Design Systems Inc. They may be adapted and modified by bona fide + * purchasers for internal use, but neither the original nor any adapted + * or modified version may be disclosed or distributed to third parties + * in any manner, medium, or form, in whole or in part, without the prior + * written consent of Cadence Design Systems Inc. This software and its + * derivatives are to be executed solely on products incorporating a Cadence + * Design Systems processor. 
+ */ + +#include "xai_cnn.h" +#include "xai_intrin.h" + +#if ((XCHAL_VISION_TYPE >= 6)) +#define DILATED_SO_VQ_CONV + +#define INPUT_DATA_TYPE UNSIGNED8BIT +#include "cnn_dilated_conv_SO.h" +#undef INPUT_DATA_TYPE + +#define INPUT_DATA_TYPE SIGNED8BIT +#include "cnn_dilated_conv_SO.h" + +#undef INPUT_DATA_TYPE +#endif //if ((XCHAL_VISION_TYPE >= 6)) + + diff --git a/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_VQ_partial_MOD.c b/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_VQ_partial_MOD.c new file mode 100644 index 00000000000..58390cbc189 --- /dev/null +++ b/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_VQ_partial_MOD.c @@ -0,0 +1,22 @@ +/* + * Copyright (c) 2018 by Cadence Design Systems, Inc. ALL RIGHTS RESERVED. + * These coded instructions, statements, and computer programs are the + * copyrighted works and confidential proprietary information of + * Cadence Design Systems Inc. They may be adapted and modified by bona fide + * purchasers for internal use, but neither the original nor any adapted + * or modified version may be disclosed or distributed to third parties + * in any manner, medium, or form, in whole or in part, without the prior + * written consent of Cadence Design Systems Inc. This software and its + * derivatives are to be executed solely on products incorporating a Cadence + * Design Systems processor. 
+ */ +#include "xai_cnn.h" +#include "xai_intrin.h" +#include "limits.h" + + +#if ((XCHAL_VISION_TYPE >= 6)) + +#define DILATED_VQ_CONV_PARTIAL +#include "cnn_dilated_conv_partial_MOD.h" +#endif /*#if ((XCHAL_VISION_TYPE >= 6))*/ diff --git a/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_VQ_partial_MOD_S16.c b/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_VQ_partial_MOD_S16.c new file mode 100644 index 00000000000..de61d850b56 --- /dev/null +++ b/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_VQ_partial_MOD_S16.c @@ -0,0 +1,22 @@ +/* + * Copyright (c) 2023 by Cadence Design Systems, Inc. ALL RIGHTS RESERVED. + * These coded instructions, statements, and computer programs are the + * copyrighted works and confidential proprietary information of + * Cadence Design Systems Inc. They may be adapted and modified by bona fide + * purchasers for internal use, but neither the original nor any adapted + * or modified version may be disclosed or distributed to third parties + * in any manner, medium, or form, in whole or in part, without the prior + * written consent of Cadence Design Systems Inc. This software and its + * derivatives are to be executed solely on products incorporating a Cadence + * Design Systems processor. + */ + +#include "xai_cnn.h" +#include "xai_intrin.h" +#include "limits.h" + +#if ((XCHAL_VISION_TYPE >= 6)) + +#define DILATED_VQ_CONV_PARTIAL +#include "cnn_dilated_conv_partial_MOD_S16.h" +#endif /*#if ((XCHAL_VISION_TYPE >= 6))*/ diff --git a/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_partial_MOD.c b/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_partial_MOD.c new file mode 100644 index 00000000000..3031ade2233 --- /dev/null +++ b/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_partial_MOD.c @@ -0,0 +1,22 @@ +/* + * Copyright (c) 2018 by Cadence Design Systems, Inc. ALL RIGHTS RESERVED. 
+ * These coded instructions, statements, and computer programs are the + * copyrighted works and confidential proprietary information of + * Cadence Design Systems Inc. They may be adapted and modified by bona fide + * purchasers for internal use, but neither the original nor any adapted + * or modified version may be disclosed or distributed to third parties + * in any manner, medium, or form, in whole or in part, without the prior + * written consent of Cadence Design Systems Inc. This software and its + * derivatives are to be executed solely on products incorporating a Cadence + * Design Systems processor. + */ +#include "xai_cnn.h" +#include "xai_intrin.h" +#include "limits.h" + + +#if ((XCHAL_VISION_TYPE >= 6)) + +#undef DILATED_VQ_CONV_PARTIAL +#include "cnn_dilated_conv_partial_MOD.h" +#endif /*#if ((XCHAL_VISION_TYPE >= 6))*/ diff --git a/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_partial_MOD.h b/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_partial_MOD.h new file mode 100644 index 00000000000..fed76eb2064 --- /dev/null +++ b/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_partial_MOD.h @@ -0,0 +1,7858 @@ +/* + * Copyright (c) 2018 by Cadence Design Systems, Inc. ALL RIGHTS RESERVED. + * These coded instructions, statements, and computer programs are the + * copyrighted works and confidential proprietary information of + * Cadence Design Systems Inc. They may be adapted and modified by bona fide + * purchasers for internal use, but neither the original nor any adapted + * or modified version may be disclosed or distributed to third parties + * in any manner, medium, or form, in whole or in part, without the prior + * written consent of Cadence Design Systems Inc. This software and its + * derivatives are to be executed solely on products incorporating a Cadence + * Design Systems processor. 
+ */ +#include "xai_cnn.h" +#include "xai_intrin.h" +#include "limits.h" + + +#if ((XCHAL_VISION_TYPE >= 6)) + + +/****************************************************************************/ +/* Description : P6 optimized implementation of 3D partial convolution */ +/* Inputs : Input Data Tile, Coeff Data Tile, Bias Array, */ +/* CNN convolution params structure, Output Scale Array (VQ build only) */ +/* InOuts : Output Tile, Accumulator Tile */ +/* Assumptions : InData, CoeffData are S8 */ +/* biasArray is signed 32b, value not exceeding signed 24b */ +/* OutData is S8 / U8 / S16 */ +/* Kernel Size is MxNxDxNk. M and N sizes are less than or */ +/* equal to 16. */ +/* Input and Output are in DWH format */ +/* Coeff is in NDWH format */ +/* CoeffDim1Pitch is aligned to 2N (Ca2) */ +/* Edges along Depth dimension in inTile and coeffTile */ +/* are zero. */ +/****************************************************************************/ + +#ifdef DILATED_VQ_CONV_PARTIAL +static _XAI_INLINE_ void partialConvolvedVQ3D_S_MxNd1_S8S8IXCa2_MOD_DWH_contiguous_depth_x4( + const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D accTile, + xai_pTile3D outTile, + const xai_cnn_conv_params *param + ) +#else +static _XAI_INLINE_ void partialConvolved3D_S_MxNd1_S8S8IXCa2_MOD_DWH_contiguous_depth_x4( + const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D accTile, + xai_pTile3D outTile, + const xai_cnn_conv_params *param + ) +#endif +{ /* Overview: every innermost iteration handles a 2x2 patch of output pixels for one group of up to 2 * XCHAL_IVPN_SIMD_WIDTH output channels. The four accumulators start from the bias (inputFlag set) or from accTile, and are written to outTile after pack / scale / shift / clamp (outputFlag set) or back to accTile. */ + /* Getting parameters from the tile structures */ + const int32_t outW = XAI_TILE3D_GET_DIM2(outTile); + const int32_t outH = XAI_TILE3D_GET_DIM3(outTile); + const int32_t numInCh = XAI_TILE3D_GET_DIM1(inTile); + const int32_t numOutCh = XAI_TILE3D_GET_DIM1(outTile); + + /* Kernel Size (NDWH) */ + const int32_t kWidthU = XAI_TILE4D_GET_DIM3(coeffTile); + const int32_t kHeightU = XAI_TILE4D_GET_DIM4(coeffTile); + + /* CNN convolution parameters */ + const uint8_t
packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param); + const uint8_t outShiftU = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param); + const uint8_t enableReLu = XAI_CNN_CONV_GET_FLAG_RELU(param); + const uint8_t strideX = XAI_CNN_CONV_GET_STRIDEX(param); + const uint8_t strideY = XAI_CNN_CONV_GET_STRIDEY(param); + const uint8_t dilationX = 1; /* X dilation is hard-coded to 1 here; only the Y dilation is read from param */ + const uint8_t dilationY = XAI_CNN_CONV_GET_DILATIONY(param); + const uint8_t leftEdgeFlag = XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param); + const uint8_t topEdgeFlag = XAI_CNN_CONV_GET_FLAG_TOPEDGE(param); + const uint8_t inputFlag = XAI_CNN_CONV_GET_FLAG_INPUT(param); /* non-zero: accumulators are initialised from the bias instead of accTile */ + const uint8_t outputFlag = XAI_CNN_CONV_GET_FLAG_OUTPUT(param); /* non-zero: results go to outTile instead of accTile */ + + /* Data Pointers of input, output, coefficient and bias data */ + int8_t *pInData = (int8_t *) XAI_TILE3D_GET_DATA_PTR(inTile); + int8_t *pOutData = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile); + int8_t *pCoeffData = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile); + int32_t *pBiasData = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray); + + int32_t * pAccData = NULL; /* stays NULL when both FLAG_INPUT and FLAG_OUTPUT are set: accTile is then neither loaded nor stored below */ + if (!(XAI_CNN_CONV_GET_FLAG_INPUT(param) && XAI_CNN_CONV_GET_FLAG_OUTPUT(param))) + { + pAccData = (int32_t *) XAI_TILE3D_GET_DATA_PTR(accTile); + } + +#ifdef DILATED_VQ_CONV_PARTIAL + xb_vecNx16U* restrict pOutScaleData = (xb_vecNx16U *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray); +#else + const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param); +#endif + + /* Pitches of Coefficient Data (NDWH) in dim1 and dim3 */ + const int32_t coeffPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTile); + const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile); + + /* Pitches of Input Data (DWH) in dim1 and dim2 */ + const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile); + const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile); + + /* Pitch of Output Data (DWH) in dim1 and dim2 */ + const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile); + const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile); + + /* Pitch of
AccTile Data (DWH) in dim1 and dim2; left at 0 when accTile is not used */ + int32_t accDataPitch1 = 0; + int32_t accDataPitch2 = 0; + if (!(XAI_CNN_CONV_GET_FLAG_INPUT(param) && XAI_CNN_CONV_GET_FLAG_OUTPUT(param))) + { + accDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(accTile); + accDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(accTile); + } + + int32_t numIter = kWidthU * numInCh; /* trip count of the merged (kWidth * input channel) loop, consumed 4 at a time below */ + + int32_t dilatedKWidthU = dilationX * (kWidthU - 1) + 1; + int32_t dilatedKHeightU = dilationY * (kHeightU - 1) + 1; + int32_t leftEdge, topEdge; /* odd dilated kernel size: edge is size / 2; even size: size / 2 when the edge flag is set, otherwise size / 2 - 1 */ + if ((dilatedKWidthU % 2) != 0) + { + leftEdge = dilatedKWidthU / 2; + } + else + { + leftEdge = leftEdgeFlag ? (dilatedKWidthU / 2) : ((dilatedKWidthU / 2) - 1); + } + + if ((dilatedKHeightU % 2) != 0) + { + topEdge = dilatedKHeightU / 2; + } + else + { + topEdge = topEdgeFlag ? (dilatedKHeightU / 2) : ((dilatedKHeightU / 2) - 1); + } + + + /* Move pointer to the start of the data (including edge) */ + pInData = &pInData[-((leftEdge) * inDataPitch1 + (topEdge) * inDataPitch2)]; + + + /* Setting the limits for output data according to ReLu Flag and outTileType */ + int32_t minLim, maxLim; + if (enableReLu) + { + minLim = XAI_CNN_CONV_GET_RELU_MIN(param); + maxLim = XAI_CNN_CONV_GET_RELU_MAX(param); + } + else + { + minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \ + SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0); + maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \ + : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX); + } + const int8_t typeFlag = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ?
1 : 0; /* 1 only for S16 output; it multiplies the byte count of the second (high half) output store */ + const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile); + + /* Variable Declarations */ + int32_t outCh, x, y, ky, k; + valign vaOutData = IVP_ZALIGN(); + + xb_vecN_2x32v* restrict phvecBias; + xb_vec2Nx8* restrict pdvecCoeff; + xb_vec2Nx8* restrict pdvecData1; + xb_vec2Nx8* restrict pdvecData2; + xb_vec2Nx8* restrict pdvecData3; + xb_vec2Nx8* restrict pdvecData4; + xb_vec2Nx8* restrict pdvecOut; + xb_vecN_2x32v* restrict phvecAcc; + + /* + * inCh and kWidth loops are combined. Assumed that the + * edges along Depth dimension of input data is zero and also + * edges along depth dimension of coefficient data is zero. + */ + + /* Loops Start */ + for (outCh = 0; outCh < numOutCh; outCh += 2 * XCHAL_IVPN_SIMD_WIDTH) + { /* walk across the kernels */ + /* To handle corner case when number of output channels + * is not a multiple of 2 * XCHAL_IVPN_SIMD_WIDTH*/ + int32_t remainingOutCh = numOutCh - outCh; +#ifdef DILATED_VQ_CONV_PARTIAL + xb_vecNx16U outScaleDataEven, outScaleDataOdd; + /*Load output scale values*/ + VQ_INIT_OUTSCALE(pOutScaleData, remainingOutCh, outScaleDataEven, outScaleDataOdd); +#endif + for (y = 0; y < outH; y += 2) /* Image Height */ + { /* walk down the rows */ + /* Variable to handle corner case when height is odd: 0 when row y + 1 does not exist, otherwise 1 */ + int32_t numY = XT_MIN(1, outH - y - 1); + + for (x = 0; x < outW; x += 2) /* Image Width */ + { /* walk across the columns */ + /* Variable to handle corner case when width is odd: 0 when column x + 1 does not exist, otherwise 1 */ + int32_t numX = XT_MIN(1, outW - x - 1); + + /* Output Data pointer */ + int8_t *pOut = pOutData + (x * outDataPitch1 + y * outDataPitch2) * bytesPerPixel; + int32_t *pAcc = pAccData + (x * accDataPitch1 + y * accDataPitch2); + + /* Initialize accumulators */ + xb_vec2Nx24 daccSum1, daccSum2, daccSum3, daccSum4; + if (inputFlag) /* Bias Values */ + { + phvecBias = (xb_vecN_2x32v *) (pBiasData + outCh); + ACC_INIT_BIAS(phvecBias, remainingOutCh, daccSum1, daccSum2, daccSum3, daccSum4); + } + else /* Accumulator tile*/ + { +
xb_vecN_2x32v hvecAcc1LL, hvecAcc1LH, hvecAcc1HL, hvecAcc1HH; + xb_vecN_2x32v hvecAcc2LL, hvecAcc2LH, hvecAcc2HL, hvecAcc2HH; + xb_vecN_2x32v hvecAcc3LL, hvecAcc3LH, hvecAcc3HL, hvecAcc3HH; + xb_vecN_2x32v hvecAcc4LL, hvecAcc4LH, hvecAcc4HL, hvecAcc4HH; + + phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh); + valign vaAcc = IVP_LAN_2X32_PP(phvecAcc); + IVP_LAVN_2X32_XP(hvecAcc1LL, vaAcc, phvecAcc, 4 * remainingOutCh); + IVP_LAVN_2X32_XP(hvecAcc1LH, vaAcc, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH); + IVP_LAVN_2X32_XP(hvecAcc1HL, vaAcc, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH); + IVP_LAVN_2X32_XP(hvecAcc1HH, vaAcc, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH); + daccSum1 = IVP_CVT24UNX32L(hvecAcc1LH, hvecAcc1LL); + IVP_CVT24UNX32H(daccSum1, hvecAcc1HH, hvecAcc1HL); + + phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh + accDataPitch1 * numX); /* with numX == 0 this is the same address as the first pixel */ + vaAcc = IVP_LAN_2X32_PP(phvecAcc); + IVP_LAVN_2X32_XP(hvecAcc2LL, vaAcc, phvecAcc, 4 * remainingOutCh); + IVP_LAVN_2X32_XP(hvecAcc2LH, vaAcc, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH); + IVP_LAVN_2X32_XP(hvecAcc2HL, vaAcc, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH); + IVP_LAVN_2X32_XP(hvecAcc2HH, vaAcc, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH); + daccSum2 = IVP_CVT24UNX32L(hvecAcc2LH, hvecAcc2LL); + IVP_CVT24UNX32H(daccSum2, hvecAcc2HH, hvecAcc2HL); + + phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh + accDataPitch2 * numY); /* with numY == 0 this is the same address as the first pixel */ + vaAcc = IVP_LAN_2X32_PP(phvecAcc); + IVP_LAVN_2X32_XP(hvecAcc3LL, vaAcc, phvecAcc, 4 * remainingOutCh); + IVP_LAVN_2X32_XP(hvecAcc3LH, vaAcc, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH); + IVP_LAVN_2X32_XP(hvecAcc3HL, vaAcc, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH); + IVP_LAVN_2X32_XP(hvecAcc3HH, vaAcc, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH); + daccSum3 = IVP_CVT24UNX32L(hvecAcc3LH, hvecAcc3LL); + IVP_CVT24UNX32H(daccSum3, hvecAcc3HH, hvecAcc3HL); + + phvecAcc = (xb_vecN_2x32v *)
(pAcc + outCh + accDataPitch1 * numX + accDataPitch2 * numY); + vaAcc = IVP_LAN_2X32_PP(phvecAcc); + IVP_LAVN_2X32_XP(hvecAcc4LL, vaAcc, phvecAcc, 4 * remainingOutCh); + IVP_LAVN_2X32_XP(hvecAcc4LH, vaAcc, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH); + IVP_LAVN_2X32_XP(hvecAcc4HL, vaAcc, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH); + IVP_LAVN_2X32_XP(hvecAcc4HH, vaAcc, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH); + daccSum4 = IVP_CVT24UNX32L(hvecAcc4LH, hvecAcc4LL); + IVP_CVT24UNX32H(daccSum4, hvecAcc4HH, hvecAcc4HL); + } + + /* Input Data and Coeff Data Pointers */ + int8_t *pData = pInData + x * strideX * inDataPitch1 + y * strideY * inDataPitch2; + int8_t *pCoeff = pCoeffData + outCh; + + + for (ky = 0; ky < kHeightU; ky++) /* Kernel Height */ + { + /* Pointers for Input Data Loads: one per pixel of the 2x2 output patch */ + pdvecData1 = (xb_vec2Nx8 *) (pData + ky * inDataPitch2 * dilationY); + pdvecData2 = (xb_vec2Nx8 *) (pData + ky * inDataPitch2 * dilationY + strideX * inDataPitch1 * numX); + pdvecData3 = (xb_vec2Nx8 *) (pData + ky * inDataPitch2 * dilationY + strideY * inDataPitch2 * numY); + pdvecData4 = (xb_vec2Nx8 *) (pData + ky * inDataPitch2 * dilationY + (strideX * inDataPitch1 + strideY * inDataPitch2) * numX * numY); /* extra offset is 0 unless both numX and numY are 1; the byte counts of the fourth result's stores are scaled by numX * numY as well */ + + /* Pointer for Coefficient Load */ + pdvecCoeff = (xb_vec2Nx8 *) (pCoeff + ky * coeffPitch3); + + /* Primes for Aligning Load */ + valign vaData1 = IVP_LA2NX8_PP(pdvecData1); + valign vaData2 = IVP_LA2NX8_PP(pdvecData2); + valign vaData3 = IVP_LA2NX8_PP(pdvecData3); + valign vaData4 = IVP_LA2NX8_PP(pdvecData4); + +#ifdef __XCC__ +#pragma loop_count min=1 +#endif + for (k = 0; k < numIter; k += 4) /* (Input Channels * kWidth) loops combined */ + { + /* Aligning variable vector load of pixels */ + xb_vec2Nx8 dvecData1; IVP_LAV2NX8_XP(dvecData1, vaData1, pdvecData1, 4); + xb_vec2Nx8 dvecData2; IVP_LAV2NX8_XP(dvecData2, vaData2, pdvecData2, 4); + xb_vec2Nx8 dvecData3; IVP_LAV2NX8_XP(dvecData3, vaData3, pdvecData3, 4); + xb_vec2Nx8
dvecData4; IVP_LAV2NX8_XP(dvecData4, vaData4, pdvecData4, 4); + + /* Extracting first 4 bytes of vector into address register */ + /* Scalar integers to be used for QMUL */ + int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData1)), 0); + int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData2)), 0); + int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData3)), 0); + int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData4)), 0); + + /* Aligned Vector Loads of coefficients: four vectors, each coeffPitch1 apart */ + xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1); + xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1); + xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1); + xb_vec2Nx8 dvecCoeff4; IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch1); + + + IVP_MULQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1); + IVP_MULQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2); + IVP_MULQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3); + IVP_MULQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4); + } /* End (Input Channels * kWidth) */ + } /* End Kernel Height */ + + if (outputFlag) /* Store to output Tile */ + { + /* Pack, Output Scale, Output Shift and clamping */ + xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L; + xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H; +#ifdef DILATED_VQ_CONV_PARTIAL + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut3L,
dvecOut3H, daccSum3, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); +#else + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); +#endif + /* Store the output dvecOut1 along the output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOut + outCh * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh); + IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH)); /* high half: byte count is 0 unless typeFlag is 1 (S16 output) */ + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Store the output dvecOut2 along the output depth; with numX == 0 the pointer offset and both byte counts are 0 */ + pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1) * numX * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numX); + IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numX); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Store the output dvecOut3 along the output depth; with numY == 0 the pointer offset and both byte counts are 0 */ + pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch2) * numY * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numY); + IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numY); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Store the output dvecOut4 along the output
depth; both byte counts are scaled by numX * numY */ + pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1 * numX + outDataPitch2 * numY) * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * \ + remainingOutCh * numX * numY); + IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numX * numY); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + } + else /* Store to accumulator tile*/ + { + xb_vecN_2x32v hvecAcc1LL = IVP_CVT32S2NX24LL(daccSum1); + xb_vecN_2x32v hvecAcc1LH = IVP_CVT32S2NX24LH(daccSum1); + xb_vecN_2x32v hvecAcc1HL = IVP_CVT32S2NX24HL(daccSum1); + xb_vecN_2x32v hvecAcc1HH = IVP_CVT32S2NX24HH(daccSum1); + + xb_vecN_2x32v hvecAcc2LL = IVP_CVT32S2NX24LL(daccSum2); + xb_vecN_2x32v hvecAcc2LH = IVP_CVT32S2NX24LH(daccSum2); + xb_vecN_2x32v hvecAcc2HL = IVP_CVT32S2NX24HL(daccSum2); + xb_vecN_2x32v hvecAcc2HH = IVP_CVT32S2NX24HH(daccSum2); + + xb_vecN_2x32v hvecAcc3LL = IVP_CVT32S2NX24LL(daccSum3); + xb_vecN_2x32v hvecAcc3LH = IVP_CVT32S2NX24LH(daccSum3); + xb_vecN_2x32v hvecAcc3HL = IVP_CVT32S2NX24HL(daccSum3); + xb_vecN_2x32v hvecAcc3HH = IVP_CVT32S2NX24HH(daccSum3); + + xb_vecN_2x32v hvecAcc4LL = IVP_CVT32S2NX24LL(daccSum4); + xb_vecN_2x32v hvecAcc4LH = IVP_CVT32S2NX24LH(daccSum4); + xb_vecN_2x32v hvecAcc4HL = IVP_CVT32S2NX24HL(daccSum4); + xb_vecN_2x32v hvecAcc4HH = IVP_CVT32S2NX24HH(daccSum4); + + + /* Store the hvecAcc1 along the accTile depth */ + phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh); + IVP_SAVN_2X32_XP(hvecAcc1LL, vaOutData, phvecAcc, 4 * remainingOutCh); + IVP_SAVN_2X32_XP(hvecAcc1LH, vaOutData, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAVN_2X32_XP(hvecAcc1HL, vaOutData, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAVN_2X32_XP(hvecAcc1HH, vaOutData, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAPOSN_2X32_FP(vaOutData, phvecAcc); + + /* Store the hvecAcc2 along the accTile depth; with numX == 0 the pointer offset is 0 and every byte count is <= 0 */ + phvecAcc = (xb_vecN_2x32v *) (pAcc + (outCh + accDataPitch1)
* numX); + IVP_SAVN_2X32_XP(hvecAcc2LL, vaOutData, phvecAcc, 4 * remainingOutCh * numX); + IVP_SAVN_2X32_XP(hvecAcc2LH, vaOutData, phvecAcc, 4 * remainingOutCh * numX - 2 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAVN_2X32_XP(hvecAcc2HL, vaOutData, phvecAcc, 4 * remainingOutCh * numX - 4 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAVN_2X32_XP(hvecAcc2HH, vaOutData, phvecAcc, 4 * remainingOutCh * numX - 6 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAPOSN_2X32_FP(vaOutData, phvecAcc); + + /* Store the hvecAcc3 along the accTile depth; with numY == 0 the pointer offset is 0 and every byte count is <= 0 */ + phvecAcc = (xb_vecN_2x32v *) (pAcc + (outCh + accDataPitch2) * numY); + IVP_SAVN_2X32_XP(hvecAcc3LL, vaOutData, phvecAcc, 4 * remainingOutCh * numY); + IVP_SAVN_2X32_XP(hvecAcc3LH, vaOutData, phvecAcc, 4 * remainingOutCh * numY - 2 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAVN_2X32_XP(hvecAcc3HL, vaOutData, phvecAcc, 4 * remainingOutCh * numY - 4 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAVN_2X32_XP(hvecAcc3HH, vaOutData, phvecAcc, 4 * remainingOutCh * numY - 6 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAPOSN_2X32_FP(vaOutData, phvecAcc); + + /* Store the hvecAcc4 along the accTile depth; the remainingOutCh term of every byte count is scaled by numX * numY */ + phvecAcc = (xb_vecN_2x32v *) (pAcc + (outCh + accDataPitch1 * numX + accDataPitch2 * numY)); + IVP_SAVN_2X32_XP(hvecAcc4LL, vaOutData, phvecAcc, 4 * remainingOutCh * numX * numY); + IVP_SAVN_2X32_XP(hvecAcc4LH, vaOutData, phvecAcc, 4 * remainingOutCh * numX * numY - 2 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAVN_2X32_XP(hvecAcc4HL, vaOutData, phvecAcc, 4 * remainingOutCh * numX * numY - 4 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAVN_2X32_XP(hvecAcc4HH, vaOutData, phvecAcc, 4 * remainingOutCh * numX * numY - 6 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAPOSN_2X32_FP(vaOutData, phvecAcc); + } + } /* End image width */ + } /* End image height */ + } /* End Output Channels */ +} + +/****************************************************************************/ +/* Description : P6 optimized implementation of 3D partial convolution */ +/* Inputs : Input Data Tile, Coeff Data Tile, Bias Array, */ +/* CNN convolution params structure */
+/* InOuts : Output Tile */ +/* Assumptions : InData is U8, CoeffData are S8 */ +/* biasArray is signed 32b, value not exceeding signed 24b */ +/* OutData is S8 / U8 / S16 */ +/* Kernel Size is MxNxDxNk. M and N sizes are less than or */ +/* equal to 16. */ +/* Input and Output are in DWH format */ +/* Coeff is in NDWH format */ +/* CoeffDim1Pitch is aligned to 2N (Ca2) */ +/* Edges along Depth dimension in inTile and coeffTile */ +/* are zero. */ +/****************************************************************************/ + +#ifdef DILATED_VQ_CONV_PARTIAL +static _XAI_INLINE_ void partialConvolvedVQ3D_S_MxNd1_U8S8IXCa2_MOD_DWH_contiguous_depth_x4( + const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D accTile, + xai_pTile3D outTile, + const xai_cnn_conv_params *param + ) +#else +static _XAI_INLINE_ void partialConvolved3D_S_MxNd1_U8S8IXCa2_MOD_DWH_contiguous_depth_x4( + const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D accTile, + xai_pTile3D outTile, + const xai_cnn_conv_params *param + ) +#endif +{ + /* Getting parameters from the tile structures */ + const int32_t outW = XAI_TILE3D_GET_DIM2(outTile); + const int32_t outH = XAI_TILE3D_GET_DIM3(outTile); + const int32_t numInCh = XAI_TILE3D_GET_DIM1(inTile); + const int32_t numOutCh = XAI_TILE3D_GET_DIM1(outTile); + + /* Kernel Size (NDWH) */ + const int32_t kWidthU = XAI_TILE4D_GET_DIM3(coeffTile); + const int32_t kHeightU = XAI_TILE4D_GET_DIM4(coeffTile); + + /* CNN convolution parameters */ + const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param); + const uint8_t outShiftU = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param); + const uint8_t enableReLu = XAI_CNN_CONV_GET_FLAG_RELU(param); + const uint8_t strideX = XAI_CNN_CONV_GET_STRIDEX(param); + const uint8_t strideY = XAI_CNN_CONV_GET_STRIDEY(param); + const uint8_t dilationX = 1; + const uint8_t dilationY = 
XAI_CNN_CONV_GET_DILATIONY(param); + const uint8_t leftEdgeFlag = XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param); + const uint8_t topEdgeFlag = XAI_CNN_CONV_GET_FLAG_TOPEDGE(param); + const uint8_t inputFlag = XAI_CNN_CONV_GET_FLAG_INPUT(param); + const uint8_t outputFlag = XAI_CNN_CONV_GET_FLAG_OUTPUT(param); + + /* Data Pointers of input, output, coefficient and bias data */ + uint8_t *pInData = (uint8_t *) XAI_TILE3D_GET_DATA_PTR(inTile); + int8_t *pOutData = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile); + int8_t *pCoeffData = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile); + int32_t *pBiasData = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray); + + int32_t * pAccData = NULL; + if (!(XAI_CNN_CONV_GET_FLAG_INPUT(param) && XAI_CNN_CONV_GET_FLAG_OUTPUT(param))) + { + pAccData = (int32_t *) XAI_TILE3D_GET_DATA_PTR(accTile); + } + +#ifdef DILATED_VQ_CONV_PARTIAL + xb_vecNx16U* restrict pOutScaleData = (xb_vecNx16U *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray); +#else + const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param); +#endif + + /* Pitches of Coefficient Data (NDWH) in dim1, dim2 and dim3 */ + const int32_t coeffPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTile); + const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile); + + /* Pitches of Input Data (DWH) in dim1 and dim2 */ + const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile); + const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile); + + /* Pitch of Output Data (DWH) in dim1 and dim2 */ + const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile); + const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile); + + /* Pitch of AccTile Data (DWH) in dim1 and dim2 */ + int32_t accDataPitch1 = 0; + int32_t accDataPitch2 = 0; + if (!(XAI_CNN_CONV_GET_FLAG_INPUT(param) && XAI_CNN_CONV_GET_FLAG_OUTPUT(param))) + { + accDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(accTile); + accDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(accTile); + } + + int32_t numIter = kWidthU * numInCh; + + int32_t dilatedKWidthU 
= dilationX * (kWidthU - 1) + 1; + int32_t dilatedKHeightU = dilationY * (kHeightU - 1) + 1; + int32_t leftEdge, topEdge; + if ((dilatedKWidthU % 2) != 0) + { + leftEdge = dilatedKWidthU / 2; + } + else + { + leftEdge = leftEdgeFlag ? (dilatedKWidthU / 2) : ((dilatedKWidthU / 2) - 1); + } + + if ((dilatedKHeightU % 2) != 0) + { + topEdge = dilatedKHeightU / 2; + } + else + { + topEdge = topEdgeFlag ? (dilatedKHeightU / 2) : ((dilatedKHeightU / 2) - 1); + } + + + /* Move pointer to the start of the data (including edge) */ + pInData = &pInData[-((leftEdge) * inDataPitch1 + (topEdge) * inDataPitch2)]; + + + /* Setting the limits for output data according to ReLu Flag and outTileType */ + int32_t minLim, maxLim; + if (enableReLu) + { + minLim = XAI_CNN_CONV_GET_RELU_MIN(param); + maxLim = XAI_CNN_CONV_GET_RELU_MAX(param); + } + else + { + minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \ + SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0); + maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \ + : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX); + } + const int8_t typeFlag = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0; + const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile); + + /* Variable Declarations */ + int32_t outCh, x, y, ky, k; + valign vaOutData = IVP_ZALIGN(); + + xb_vecN_2x32v* restrict phvecBias; + xb_vec2Nx8* restrict pdvecCoeff; + xb_vec2Nx8U* restrict pdvecData1; + xb_vec2Nx8U* restrict pdvecData2; + xb_vec2Nx8U* restrict pdvecData3; + xb_vec2Nx8U* restrict pdvecData4; + xb_vec2Nx8* restrict pdvecOut; + xb_vecN_2x32v* restrict phvecAcc; + + /* + * inCh and kWidth loops are combined. Assumed that the + * edges along Depth dimension of input data is zero and also + * edges along depth dimension of coefficient data is zero. 
+ */ + + /* Loops Start */ + for (outCh = 0; outCh < numOutCh; outCh += 2 * XCHAL_IVPN_SIMD_WIDTH) + { /* walk across the kernels */ + /* To handle corner case when number of output channels + * is not a multiple of 2 * XCHAL_IVPN_SIMD_WIDTH*/ + int32_t remainingOutCh = numOutCh - outCh; +#ifdef DILATED_VQ_CONV_PARTIAL + xb_vecNx16U outScaleDataEven, outScaleDataOdd; + /*Load output scale values*/ + VQ_INIT_OUTSCALE(pOutScaleData, remainingOutCh, outScaleDataEven, outScaleDataOdd); +#endif + for (y = 0; y < outH; y += 2) /* Image Height */ + { /* walk down the rows */ + /* Variable to handle corner case when height is odd */ + int32_t numY = XT_MIN(1, outH - y - 1); + + for (x = 0; x < outW; x += 2) /* Image Width */ + { /* walk across the columns */ + /* Variable to handle corner case when width is odd */ + int32_t numX = XT_MIN(1, outW - x - 1); + + /* Output Data pointer */ + int8_t *pOut = pOutData + (x * outDataPitch1 + y * outDataPitch2) * bytesPerPixel; + int32_t *pAcc = pAccData + (x * accDataPitch1 + y * accDataPitch2); + + /* Initialize accumulators */ + xb_vec2Nx24 daccSum1, daccSum2, daccSum3, daccSum4; + if (inputFlag) /* Bias Values */ + { + phvecBias = (xb_vecN_2x32v *) (pBiasData + outCh); + ACC_INIT_BIAS(phvecBias, remainingOutCh, daccSum1, daccSum2, daccSum3, daccSum4); + } + else /* Accumulator tile*/ + { + xb_vecN_2x32v hvecAcc1LL, hvecAcc1LH, hvecAcc1HL, hvecAcc1HH; + xb_vecN_2x32v hvecAcc2LL, hvecAcc2LH, hvecAcc2HL, hvecAcc2HH; + xb_vecN_2x32v hvecAcc3LL, hvecAcc3LH, hvecAcc3HL, hvecAcc3HH; + xb_vecN_2x32v hvecAcc4LL, hvecAcc4LH, hvecAcc4HL, hvecAcc4HH; + + phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh); + valign vaAcc = IVP_LAN_2X32_PP(phvecAcc); + IVP_LAVN_2X32_XP(hvecAcc1LL, vaAcc, phvecAcc, 4 * remainingOutCh); + IVP_LAVN_2X32_XP(hvecAcc1LH, vaAcc, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH); + IVP_LAVN_2X32_XP(hvecAcc1HL, vaAcc, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH); + IVP_LAVN_2X32_XP(hvecAcc1HH, vaAcc, 
phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH); + daccSum1 = IVP_CVT24UNX32L(hvecAcc1LH, hvecAcc1LL); + IVP_CVT24UNX32H(daccSum1, hvecAcc1HH, hvecAcc1HL); + + phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh + accDataPitch1 * numX); + vaAcc = IVP_LAN_2X32_PP(phvecAcc); + IVP_LAVN_2X32_XP(hvecAcc2LL, vaAcc, phvecAcc, 4 * remainingOutCh); + IVP_LAVN_2X32_XP(hvecAcc2LH, vaAcc, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH); + IVP_LAVN_2X32_XP(hvecAcc2HL, vaAcc, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH); + IVP_LAVN_2X32_XP(hvecAcc2HH, vaAcc, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH); + daccSum2 = IVP_CVT24UNX32L(hvecAcc2LH, hvecAcc2LL); + IVP_CVT24UNX32H(daccSum2, hvecAcc2HH, hvecAcc2HL); + + phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh + accDataPitch2 * numY); + vaAcc = IVP_LAN_2X32_PP(phvecAcc); + IVP_LAVN_2X32_XP(hvecAcc3LL, vaAcc, phvecAcc, 4 * remainingOutCh); + IVP_LAVN_2X32_XP(hvecAcc3LH, vaAcc, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH); + IVP_LAVN_2X32_XP(hvecAcc3HL, vaAcc, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH); + IVP_LAVN_2X32_XP(hvecAcc3HH, vaAcc, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH); + daccSum3 = IVP_CVT24UNX32L(hvecAcc3LH, hvecAcc3LL); + IVP_CVT24UNX32H(daccSum3, hvecAcc3HH, hvecAcc3HL); + + phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh + accDataPitch1 * numX + accDataPitch2 * numY); + vaAcc = IVP_LAN_2X32_PP(phvecAcc); + IVP_LAVN_2X32_XP(hvecAcc4LL, vaAcc, phvecAcc, 4 * remainingOutCh); + IVP_LAVN_2X32_XP(hvecAcc4LH, vaAcc, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH); + IVP_LAVN_2X32_XP(hvecAcc4HL, vaAcc, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH); + IVP_LAVN_2X32_XP(hvecAcc4HH, vaAcc, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH); + daccSum4 = IVP_CVT24UNX32L(hvecAcc4LH, hvecAcc4LL); + IVP_CVT24UNX32H(daccSum4, hvecAcc4HH, hvecAcc4HL); + } + + /* Input Data and Coeff Data Pointers */ + uint8_t *pData = pInData + x * 
strideX * inDataPitch1 + y * strideY * inDataPitch2; + int8_t *pCoeff = pCoeffData + outCh; + + + for (ky = 0; ky < kHeightU; ky++) /* Kernel Height */ + { + /* Pointers for Input Data Loads */ + pdvecData1 = (xb_vec2Nx8U *) (pData + ky * inDataPitch2 * dilationY); + pdvecData2 = (xb_vec2Nx8U *) (pData + ky * inDataPitch2 * dilationY + strideX * inDataPitch1 * numX); + pdvecData3 = (xb_vec2Nx8U *) (pData + ky * inDataPitch2 * dilationY + strideY * inDataPitch2 * numY); + pdvecData4 = (xb_vec2Nx8U *) (pData + ky * inDataPitch2 * dilationY + (strideX * inDataPitch1 + strideY * inDataPitch2) * numX * numY); + + /* Pointer for Coefficient Load */ + pdvecCoeff = (xb_vec2Nx8 *) (pCoeff + ky * coeffPitch3); + + /* Primes for Aligning Load */ + valign vaData1 = IVP_LA2NX8U_PP(pdvecData1); + valign vaData2 = IVP_LA2NX8U_PP(pdvecData2); + valign vaData3 = IVP_LA2NX8U_PP(pdvecData3); + valign vaData4 = IVP_LA2NX8U_PP(pdvecData4); + +#ifdef __XCC__ +#pragma loop_count min=1 +#endif + for (k = 0; k < numIter; k += 4) /* (Input Channels * kWidth) loops combined */ + { + /* Aligning variable vector load of pixels */ + xb_vec2Nx8U dvecInp1; IVP_LAV2NX8U_XP(dvecInp1, vaData1, pdvecData1, 4); + xb_vec2Nx8U dvecInp2; IVP_LAV2NX8U_XP(dvecInp2, vaData2, pdvecData2, 4); + xb_vec2Nx8U dvecInp3; IVP_LAV2NX8U_XP(dvecInp3, vaData3, pdvecData3, 4); + xb_vec2Nx8U dvecInp4; IVP_LAV2NX8U_XP(dvecInp4, vaData4, pdvecData4, 4); + +#ifdef IVP_MULSUQA2N8XR8 + /* Extracting first 4 bytes of vector into address register */ + /* Scalar integers to be used for QMUL */ + int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8U(dvecInp1)), 0); + int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8U(dvecInp2)), 0); + int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8U(dvecInp3)), 0); + int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8U(dvecInp4)), 0); +#else + xb_vec2Nx8 
dvecData1; + xb_vec2Nx8 dvecData2; + xb_vec2Nx8 dvecData3; + xb_vec2Nx8 dvecData4; + + dvecData1 = IVP_SUB2NX8U(dvecInp1, 128); + dvecData2 = IVP_SUB2NX8U(dvecInp2, 128); + dvecData3 = IVP_SUB2NX8U(dvecInp3, 128); + dvecData4 = IVP_SUB2NX8U(dvecInp4, 128); + + /* Extracting first 4 bytes of vector into address register */ + /* Scalar integers to be used for QMUL */ + int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData1)), 0); + int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData2)), 0); + int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData3)), 0); + int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData4)), 0); +#endif + + /* Aligned Vector Loads of coefficients */ + xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1); + xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1); + xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1); + xb_vec2Nx8 dvecCoeff4; IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch1); + +#ifdef IVP_MULSUQA2N8XR8 + IVP_MULSUQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1); + IVP_MULSUQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2); + IVP_MULSUQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3); + IVP_MULSUQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4); +#else + IVP_MULQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1); + IVP_MULQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2); + IVP_MULQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3); + IVP_MULQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4); +#endif + } /* End Input Channels */ + } /* End Kernel Height * Width */ + + if 
(outputFlag) /* Store to ouput Tile*/ + { + /* Pack, Output Scale, Output Shift and clamping */ + xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L; + xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H; +#ifdef DILATED_VQ_CONV_PARTIAL + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); +#else + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); +#endif + /* Store the output dvecOut1 along the output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOut + outCh * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh); + IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Store the output dvecOut2 along the output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1) * numX * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * 
numX); + IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numX); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Store the output dvecOut3 along the output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch2) * numY * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numY); + IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numY); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Store the output dvecOut4 along the output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1 * numX + outDataPitch2 * numY) * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * \ + remainingOutCh * numX * numY); + IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numX * numY); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + } + else /* Store to accumulator tile*/ + { + xb_vecN_2x32v hvecAcc1LL = IVP_CVT32S2NX24LL(daccSum1); + xb_vecN_2x32v hvecAcc1LH = IVP_CVT32S2NX24LH(daccSum1); + xb_vecN_2x32v hvecAcc1HL = IVP_CVT32S2NX24HL(daccSum1); + xb_vecN_2x32v hvecAcc1HH = IVP_CVT32S2NX24HH(daccSum1); + + xb_vecN_2x32v hvecAcc2LL = IVP_CVT32S2NX24LL(daccSum2); + xb_vecN_2x32v hvecAcc2LH = IVP_CVT32S2NX24LH(daccSum2); + xb_vecN_2x32v hvecAcc2HL = IVP_CVT32S2NX24HL(daccSum2); + xb_vecN_2x32v hvecAcc2HH = IVP_CVT32S2NX24HH(daccSum2); + + xb_vecN_2x32v hvecAcc3LL = IVP_CVT32S2NX24LL(daccSum3); + xb_vecN_2x32v hvecAcc3LH = IVP_CVT32S2NX24LH(daccSum3); + xb_vecN_2x32v hvecAcc3HL = IVP_CVT32S2NX24HL(daccSum3); + xb_vecN_2x32v hvecAcc3HH = IVP_CVT32S2NX24HH(daccSum3); + + xb_vecN_2x32v hvecAcc4LL = IVP_CVT32S2NX24LL(daccSum4); + xb_vecN_2x32v hvecAcc4LH = IVP_CVT32S2NX24LH(daccSum4); + xb_vecN_2x32v hvecAcc4HL = IVP_CVT32S2NX24HL(daccSum4); + xb_vecN_2x32v hvecAcc4HH = IVP_CVT32S2NX24HH(daccSum4); + + + /* Store the hvecAcc1 
along the accTile depth */ + phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh); + IVP_SAVN_2X32_XP(hvecAcc1LL, vaOutData, phvecAcc, 4 * remainingOutCh); + IVP_SAVN_2X32_XP(hvecAcc1LH, vaOutData, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAVN_2X32_XP(hvecAcc1HL, vaOutData, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAVN_2X32_XP(hvecAcc1HH, vaOutData, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAPOSN_2X32_FP(vaOutData, phvecAcc); + + /* Store the hvecAcc2 along the accTile depth */ + phvecAcc = (xb_vecN_2x32v *) (pAcc + (outCh + accDataPitch1) * numX); + IVP_SAVN_2X32_XP(hvecAcc2LL, vaOutData, phvecAcc, 4 * remainingOutCh * numX); + IVP_SAVN_2X32_XP(hvecAcc2LH, vaOutData, phvecAcc, 4 * remainingOutCh * numX - 2 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAVN_2X32_XP(hvecAcc2HL, vaOutData, phvecAcc, 4 * remainingOutCh * numX - 4 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAVN_2X32_XP(hvecAcc2HH, vaOutData, phvecAcc, 4 * remainingOutCh * numX - 6 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAPOSN_2X32_FP(vaOutData, phvecAcc); + + /* Store the hvecAcc3 along the accTile depth */ + phvecAcc = (xb_vecN_2x32v *) (pAcc + (outCh + accDataPitch2) * numY); + IVP_SAVN_2X32_XP(hvecAcc3LL, vaOutData, phvecAcc, 4 * remainingOutCh * numY); + IVP_SAVN_2X32_XP(hvecAcc3LH, vaOutData, phvecAcc, 4 * remainingOutCh * numY - 2 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAVN_2X32_XP(hvecAcc3HL, vaOutData, phvecAcc, 4 * remainingOutCh * numY - 4 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAVN_2X32_XP(hvecAcc3HH, vaOutData, phvecAcc, 4 * remainingOutCh * numY - 6 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAPOSN_2X32_FP(vaOutData, phvecAcc); + + /* Store the hvecAcc4 along the accTile depth */ + phvecAcc = (xb_vecN_2x32v *) (pAcc + (outCh + accDataPitch1 * numX + accDataPitch2 * numY)); + IVP_SAVN_2X32_XP(hvecAcc4LL, vaOutData, phvecAcc, 4 * remainingOutCh * numX * numY); + IVP_SAVN_2X32_XP(hvecAcc4LH, vaOutData, phvecAcc, 4 * remainingOutCh * numX * numY - 2 * XCHAL_IVPN_SIMD_WIDTH); + 
IVP_SAVN_2X32_XP(hvecAcc4HL, vaOutData, phvecAcc, 4 * remainingOutCh * numX * numY - 4 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAVN_2X32_XP(hvecAcc4HH, vaOutData, phvecAcc, 4 * remainingOutCh * numX * numY - 6 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAPOSN_2X32_FP(vaOutData, phvecAcc); + } + } /* End image width */ + } /* End image height */ + } /* End Output Channels */ +} + +/****************************************************************************/ +/* Description : P6 optimized implementation of 3D partial convolution */ +/* Inputs : Input Data Tile, Coeff Data Tile, Bias Array, */ +/* CNN convolution params structure */ +/* InOuts : Output Tile */ +/* Assumptions : InData, CoeffData are S8 */ +/* biasArray is signed 32b, value not exceeding signed 24b */ +/* OutData is S8 / U8 / S16 */ +/* Kernel Size is MxNxDxNk. M and N sizes are less than or */ +/* equal to 16. */ +/* Input and Output are in DWH format */ +/* Coeff is in NDWH format */ +/* CoeffDim1Pitch is aligned to 2N (Ca2) */ +/* Edges along Depth dimension in inTile and coeffTile */ +/* are zero. 
*/ +/****************************************************************************/ + +#ifdef DILATED_VQ_CONV_PARTIAL +static _XAI_INLINE_ void partialConvolvedVQ3D_S_MxNd1_S8S8IXCa2_MOD_DWH_contiguous_depth( + const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D accTile, + xai_pTile3D outTile, + const xai_cnn_conv_params *param + ) +#else +static _XAI_INLINE_ void partialConvolved3D_S_MxNd1_S8S8IXCa2_MOD_DWH_contiguous_depth( + const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D accTile, + xai_pTile3D outTile, + const xai_cnn_conv_params *param + ) +#endif +{ + /* Overview: each (x, y) step computes a 2x2 block of output pixels (daccSum1..4) for up to 2 * XCHAL_IVPN_SIMD_WIDTH output channels. inputFlag selects whether the accumulators are seeded from biasArray or reloaded from accTile; outputFlag selects whether results are packed/scaled/clamped into outTile or written back as 32-bit values to accTile. */ /* Getting parameters from the tile structures */ + const int32_t outW = XAI_TILE3D_GET_DIM2(outTile); + const int32_t outH = XAI_TILE3D_GET_DIM3(outTile); + const int32_t numInCh = XAI_TILE3D_GET_DIM1(inTile); + const int32_t numOutCh = XAI_TILE3D_GET_DIM1(outTile); + + /* Kernel Size (NDWH) */ + const int32_t kWidthU = XAI_TILE4D_GET_DIM3(coeffTile); + const int32_t kHeightU = XAI_TILE4D_GET_DIM4(coeffTile); + + /* CNN convolution parameters */ + const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param); + const uint8_t outShiftU = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param); + const uint8_t enableReLu = XAI_CNN_CONV_GET_FLAG_RELU(param); + const uint8_t strideX = XAI_CNN_CONV_GET_STRIDEX(param); + const uint8_t strideY = XAI_CNN_CONV_GET_STRIDEY(param); + const uint8_t dilationX = 1; /* X dilation is hard-coded to 1 in this variant; only dilationY is read from param */ + const uint8_t dilationY = XAI_CNN_CONV_GET_DILATIONY(param); + const uint8_t leftEdgeFlag = XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param); + const uint8_t topEdgeFlag = XAI_CNN_CONV_GET_FLAG_TOPEDGE(param); + const uint8_t inputFlag = XAI_CNN_CONV_GET_FLAG_INPUT(param); + const uint8_t outputFlag = XAI_CNN_CONV_GET_FLAG_OUTPUT(param); + + /* Data Pointers of input, output, coefficient and bias data */ + int8_t *pInData = (int8_t *) XAI_TILE3D_GET_DATA_PTR(inTile); + int8_t *pOutData = (int8_t *)
XAI_TILE3D_GET_DATA_PTR(outTile); + int8_t *pCoeffData = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile); + int32_t *pBiasData = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray); + + /* accTile is dereferenced only when the INPUT and OUTPUT flags are not both set; otherwise pAccData stays NULL and the acc pitches below stay 0 */ int32_t * pAccData = NULL; + if (!(XAI_CNN_CONV_GET_FLAG_INPUT(param) && XAI_CNN_CONV_GET_FLAG_OUTPUT(param))) + { + pAccData = (int32_t *) XAI_TILE3D_GET_DATA_PTR(accTile); + } + +#ifdef DILATED_VQ_CONV_PARTIAL + xb_vecNx16U* restrict pOutScaleData = (xb_vecNx16U *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray); +#else + const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param); +#endif + + /* Pitches of Coefficient Data (NDWH) in dim1 and dim3 */ + const int32_t coeffPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTile); + const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile); + + /* Pitches of Input Data (DWH) in dim1 and dim2 */ + const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile); + const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile); + + /* Pitch of Output Data (DWH) in dim1 and dim2 */ + const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile); + const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile); + + /* Pitch of AccTile Data (DWH) in dim1 and dim2 */ + int32_t accDataPitch1 = 0; + int32_t accDataPitch2 = 0; + if (!(XAI_CNN_CONV_GET_FLAG_INPUT(param) && XAI_CNN_CONV_GET_FLAG_OUTPUT(param))) + { + accDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(accTile); + accDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(accTile); + } + + /* Number of consecutive input bytes walked per kernel row: kernel width and input channels are fused into one loop. NOTE(review): this presumably relies on the input depth being stored contiguously (inDataPitch1 == numInCh), as the function name suggests - confirm against the caller. */ int32_t numIter = kWidthU * numInCh; + + int32_t dilatedKWidthU = dilationX * (kWidthU - 1) + 1; + int32_t dilatedKHeightU = dilationY * (kHeightU - 1) + 1; + int32_t leftEdge, topEdge; + if ((dilatedKWidthU % 2) != 0) + { + leftEdge = dilatedKWidthU / 2; + } + else + { + leftEdge = leftEdgeFlag ? (dilatedKWidthU / 2) : ((dilatedKWidthU / 2) - 1); + } + + if ((dilatedKHeightU % 2) != 0) + { + topEdge = dilatedKHeightU / 2; + } + else + { + topEdge = topEdgeFlag ?
(dilatedKHeightU / 2) : ((dilatedKHeightU / 2) - 1); + } + + + /* Move pointer to the start of the data (including edge) */ + pInData = &pInData[-((leftEdge) * inDataPitch1 + (topEdge) * inDataPitch2)]; + + + /* Setting the limits for output data according to ReLu Flag and outTileType */ + int32_t minLim, maxLim; + if (enableReLu) + { + minLim = XAI_CNN_CONV_GET_RELU_MIN(param); + maxLim = XAI_CNN_CONV_GET_RELU_MAX(param); + } + else + { + minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \ + SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0); + maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \ + : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX); + } + /* typeFlag is 1 only for S16 output; it gates the second (high-half) output store further below */ const int8_t typeFlag = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0; + const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile); + + /* Variable Declarations */ + int32_t outCh, x, y, ky, k; + valign vaOutData = IVP_ZALIGN(); + + xb_vecN_2x32v* restrict phvecBias; + xb_vec2Nx8* restrict pdvecCoeff; + xb_vec2Nx8* restrict pdvecData1; + xb_vec2Nx8* restrict pdvecData2; + xb_vec2Nx8* restrict pdvecData3; + xb_vec2Nx8* restrict pdvecData4; + xb_vec2Nx8* restrict pdvecOut; + xb_vecN_2x32v* restrict phvecAcc; + + /* + * inCh and kWidth loops are combined. Assumed that the + * edges along Depth dimension of input data is zero and also + * edges along depth dimension of coefficient data is zero.
+ */ + + /* Loops Start */ + for (outCh = 0; outCh < numOutCh; outCh += 2 * XCHAL_IVPN_SIMD_WIDTH) + { /* walk across the kernels */ + /* To handle corner case when number of output channels + * is not a multiple of 2 * XCHAL_IVPN_SIMD_WIDTH*/ + int32_t remainingOutCh = numOutCh - outCh; +#ifdef DILATED_VQ_CONV_PARTIAL + xb_vecNx16U outScaleDataEven, outScaleDataOdd; + /*Load output scale values*/ + VQ_INIT_OUTSCALE(pOutScaleData, remainingOutCh, outScaleDataEven, outScaleDataOdd); +#endif + for (y = 0; y < outH; y += 2) /* Image Height */ + { /* walk down the rows */ + /* Variable to handle corner case when height is odd */ /* numY is 1 when row y + 1 exists and 0 on the last row of an odd-height tile; it scales every second-row offset and store size below */ + int32_t numY = XT_MIN(1, outH - y - 1); + + for (x = 0; x < outW; x += 2) /* Image Width */ + { /* walk across the columns */ + /* Variable to handle corner case when width is odd */ /* numX is 1 when column x + 1 exists and 0 on the last column of an odd-width tile */ + int32_t numX = XT_MIN(1, outW - x - 1); + + /* Output Data pointer */ + int8_t *pOut = pOutData + (x * outDataPitch1 + y * outDataPitch2) * bytesPerPixel; + int32_t *pAcc = pAccData + (x * accDataPitch1 + y * accDataPitch2); + + /* Initialize accumulators */ + xb_vec2Nx24 daccSum1, daccSum2, daccSum3, daccSum4; + if (inputFlag) /* Bias Values */ + { + phvecBias = (xb_vecN_2x32v *) (pBiasData + outCh); + ACC_INIT_BIAS(phvecBias, remainingOutCh, daccSum1, daccSum2, daccSum3, daccSum4); + } + else /* Accumulator tile*/ + { + /* Each of the four pixels is reloaded as four 32-bit sub-vectors (LL, LH, HL, HH) and repacked into one accumulator */ xb_vecN_2x32v hvecAcc1LL, hvecAcc1LH, hvecAcc1HL, hvecAcc1HH; + xb_vecN_2x32v hvecAcc2LL, hvecAcc2LH, hvecAcc2HL, hvecAcc2HH; + xb_vecN_2x32v hvecAcc3LL, hvecAcc3LH, hvecAcc3HL, hvecAcc3HH; + xb_vecN_2x32v hvecAcc4LL, hvecAcc4LH, hvecAcc4HL, hvecAcc4HH; + + phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh); + valign vaAcc = IVP_LAN_2X32_PP(phvecAcc); + IVP_LAVN_2X32_XP(hvecAcc1LL, vaAcc, phvecAcc, 4 * remainingOutCh); + IVP_LAVN_2X32_XP(hvecAcc1LH, vaAcc, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH); + IVP_LAVN_2X32_XP(hvecAcc1HL, vaAcc, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH); + IVP_LAVN_2X32_XP(hvecAcc1HH, vaAcc,
phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH); + daccSum1 = IVP_CVT24UNX32L(hvecAcc1LH, hvecAcc1LL); + IVP_CVT24UNX32H(daccSum1, hvecAcc1HH, hvecAcc1HL); + + phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh + accDataPitch1 * numX); + vaAcc = IVP_LAN_2X32_PP(phvecAcc); + IVP_LAVN_2X32_XP(hvecAcc2LL, vaAcc, phvecAcc, 4 * remainingOutCh); + IVP_LAVN_2X32_XP(hvecAcc2LH, vaAcc, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH); + IVP_LAVN_2X32_XP(hvecAcc2HL, vaAcc, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH); + IVP_LAVN_2X32_XP(hvecAcc2HH, vaAcc, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH); + daccSum2 = IVP_CVT24UNX32L(hvecAcc2LH, hvecAcc2LL); + IVP_CVT24UNX32H(daccSum2, hvecAcc2HH, hvecAcc2HL); + + phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh + accDataPitch2 * numY); + vaAcc = IVP_LAN_2X32_PP(phvecAcc); + IVP_LAVN_2X32_XP(hvecAcc3LL, vaAcc, phvecAcc, 4 * remainingOutCh); + IVP_LAVN_2X32_XP(hvecAcc3LH, vaAcc, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH); + IVP_LAVN_2X32_XP(hvecAcc3HL, vaAcc, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH); + IVP_LAVN_2X32_XP(hvecAcc3HH, vaAcc, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH); + daccSum3 = IVP_CVT24UNX32L(hvecAcc3LH, hvecAcc3LL); + IVP_CVT24UNX32H(daccSum3, hvecAcc3HH, hvecAcc3HL); + + phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh + accDataPitch1 * numX + accDataPitch2 * numY); + vaAcc = IVP_LAN_2X32_PP(phvecAcc); + IVP_LAVN_2X32_XP(hvecAcc4LL, vaAcc, phvecAcc, 4 * remainingOutCh); + IVP_LAVN_2X32_XP(hvecAcc4LH, vaAcc, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH); + IVP_LAVN_2X32_XP(hvecAcc4HL, vaAcc, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH); + IVP_LAVN_2X32_XP(hvecAcc4HH, vaAcc, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH); + daccSum4 = IVP_CVT24UNX32L(hvecAcc4LH, hvecAcc4LL); + IVP_CVT24UNX32H(daccSum4, hvecAcc4HH, hvecAcc4HL); + } + + /* Input Data and Coeff Data Pointers */ + int8_t *pData = pInData + x *
strideX * inDataPitch1 + y * strideY * inDataPitch2; + int8_t *pCoeff = pCoeffData + outCh; + +#ifdef __XCC__ +#pragma loop_count min=1 +#endif + for (ky = 0; ky < kHeightU; ky++) /* Kernel Height */ + { + /* Pointers for Input Data Loads */ /* Data1..4 address the input windows of output pixels (x, y), (x + numX, y), (x, y + numY) and the diagonal one; the numX * numY factor makes Data4 fall back to Data1 unless both the extra column and the extra row exist */ + pdvecData1 = (xb_vec2Nx8 *) (pData + ky * inDataPitch2 * dilationY); + pdvecData2 = (xb_vec2Nx8 *) (pData + ky * inDataPitch2 * dilationY + strideX * inDataPitch1 * numX); + pdvecData3 = (xb_vec2Nx8 *) (pData + ky * inDataPitch2 * dilationY + strideY * inDataPitch2 * numY); + pdvecData4 = (xb_vec2Nx8 *) (pData + ky * inDataPitch2 * dilationY + (strideX * inDataPitch1 + strideY * inDataPitch2) * numX * numY); + + /* Pointer for Coefficient Load */ + pdvecCoeff = (xb_vec2Nx8 *) (pCoeff + ky * coeffPitch3); + + /* Primes for Aligning Load */ + valign vaData1 = IVP_LA2NX8_PP(pdvecData1); + valign vaData2 = IVP_LA2NX8_PP(pdvecData2); + valign vaData3 = IVP_LA2NX8_PP(pdvecData3); + valign vaData4 = IVP_LA2NX8_PP(pdvecData4); + + for (k = 0; k < numIter - 3; k += 4) /* (Input Channels * kWidth) loops combined */ /* each pass consumes 4 input bytes per pixel and 4 coefficient vectors (one per input byte) */ + { + /* Aligning variable vector load of pixels */ + xb_vec2Nx8 dvecData1; IVP_LAV2NX8_XP(dvecData1, vaData1, pdvecData1, 4); + xb_vec2Nx8 dvecData2; IVP_LAV2NX8_XP(dvecData2, vaData2, pdvecData2, 4); + xb_vec2Nx8 dvecData3; IVP_LAV2NX8_XP(dvecData3, vaData3, pdvecData3, 4); + xb_vec2Nx8 dvecData4; IVP_LAV2NX8_XP(dvecData4, vaData4, pdvecData4, 4); + + /* Extracting first 4 bytes of vector into address register */ + /* Scalar integers to be used for QMUL */ + int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData1)), 0); + int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData2)), 0); + int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData3)), 0); + int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData4)), 0); + + /* Aligned Vector Loads of coefficients */ +
xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1); + xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1); + xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1); + xb_vec2Nx8 dvecCoeff4; IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch1); + + + IVP_MULQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1); + IVP_MULQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2); + IVP_MULQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3); + IVP_MULQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4); + } /* End Input Channels */ + /* Corner case handling as numIter is not a multiple of 4 */ + if (k < numIter) + { + int32_t remInCh = numIter - k; /* 1..3 leftover input bytes, because the loop above stops once fewer than 4 remain */ + + /* Aligning variable vector load of pixels */ + xb_vec2Nx8 dvecData1; IVP_LAV2NX8_XP(dvecData1, vaData1, pdvecData1, remInCh); + xb_vec2Nx8 dvecData2; IVP_LAV2NX8_XP(dvecData2, vaData2, pdvecData2, remInCh); + xb_vec2Nx8 dvecData3; IVP_LAV2NX8_XP(dvecData3, vaData3, pdvecData3, remInCh); + xb_vec2Nx8 dvecData4; IVP_LAV2NX8_XP(dvecData4, vaData4, pdvecData4, remInCh); + + /* Extracting first 4 bytes of vector into address register */ + /* Scalar integers to be used for QMUL */ + int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData1)), 0); + int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData2)), 0); + int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData3)), 0); + int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData4)), 0); + /* For conditional coefficient loads */ /* enable2 / enable3 zero the post-increment so the coefficient pointer does not advance past the rows that actually remain; the unused coefficient vectors then simply repeat the last valid row */ + int32_t enable2 = XT_SALT(1, remInCh); /* Will be 1 if remInCh > 1 */ + int32_t enable3 = XT_SALT(2, remInCh); /* Will be 1 if remInCh > 2 */ + + /* Aligned Vector Loads of coefficients */ + xb_vec2Nx8 dvecCoeff1;
IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1 * enable2); + xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1 * enable3); + xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1); + + /* The fourth coefficient operand is passed as 0 because at most 3 input bytes remain. NOTE(review): correctness for remInCh < 3 presumably relies on the variable-length load zero-filling the bytes beyond remInCh - confirm against the ISA reference. */ + IVP_MULQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1); + IVP_MULQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2); + IVP_MULQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3); + IVP_MULQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4); + } /* End if( k < numIter)*/ + } /* End Kernel Height * Width */ + + if (outputFlag) /* Store to output Tile*/ + { + /* Pack, Output Scale, Output Shift and clamping */ + xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L; + xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H; +#ifdef DILATED_VQ_CONV_PARTIAL + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); +#else + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \ + outScale,
outShiftU, minLim, maxLim, typeFlag); +#endif + /* Store the output dvecOut1 along the output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOut + outCh * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh); + IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* NOTE(review): for pixels 2..4 the store byte counts are multiplied by numX and/or numY, so they become 0 when the extra column/row does not exist; presumably a variable-length store with a non-positive count writes nothing - confirm against the ISA reference. */ /* Store the output dvecOut2 along the output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1) * numX * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numX); + IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numX); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Store the output dvecOut3 along the output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch2) * numY * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numY); + IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numY); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Store the output dvecOut4 along the output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1 * numX + outDataPitch2 * numY) * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * \ + remainingOutCh * numX * numY); + IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numX * numY); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + } + else /* Store to accumulator tile*/ + { + /* Unpack each accumulator into four 32-bit sub-vectors, mirroring the LL/LH/HL/HH layout used when accTile is reloaded at the top of the pixel loop */ xb_vecN_2x32v hvecAcc1LL = IVP_CVT32S2NX24LL(daccSum1); + xb_vecN_2x32v hvecAcc1LH = IVP_CVT32S2NX24LH(daccSum1); + xb_vecN_2x32v hvecAcc1HL = IVP_CVT32S2NX24HL(daccSum1); + xb_vecN_2x32v hvecAcc1HH = IVP_CVT32S2NX24HH(daccSum1); + + xb_vecN_2x32v hvecAcc2LL = IVP_CVT32S2NX24LL(daccSum2); + xb_vecN_2x32v hvecAcc2LH =
IVP_CVT32S2NX24LH(daccSum2); + xb_vecN_2x32v hvecAcc2HL = IVP_CVT32S2NX24HL(daccSum2); + xb_vecN_2x32v hvecAcc2HH = IVP_CVT32S2NX24HH(daccSum2); + + xb_vecN_2x32v hvecAcc3LL = IVP_CVT32S2NX24LL(daccSum3); + xb_vecN_2x32v hvecAcc3LH = IVP_CVT32S2NX24LH(daccSum3); + xb_vecN_2x32v hvecAcc3HL = IVP_CVT32S2NX24HL(daccSum3); + xb_vecN_2x32v hvecAcc3HH = IVP_CVT32S2NX24HH(daccSum3); + + xb_vecN_2x32v hvecAcc4LL = IVP_CVT32S2NX24LL(daccSum4); + xb_vecN_2x32v hvecAcc4LH = IVP_CVT32S2NX24LH(daccSum4); + xb_vecN_2x32v hvecAcc4HL = IVP_CVT32S2NX24HL(daccSum4); + xb_vecN_2x32v hvecAcc4HH = IVP_CVT32S2NX24HH(daccSum4); + + + /* Store the hvecAcc1 along the accTile depth */ + phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh); + IVP_SAVN_2X32_XP(hvecAcc1LL, vaOutData, phvecAcc, 4 * remainingOutCh); + IVP_SAVN_2X32_XP(hvecAcc1LH, vaOutData, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAVN_2X32_XP(hvecAcc1HL, vaOutData, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAVN_2X32_XP(hvecAcc1HH, vaOutData, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAPOSN_2X32_FP(vaOutData, phvecAcc); + + /* Store the hvecAcc2 along the accTile depth */ + phvecAcc = (xb_vecN_2x32v *) (pAcc + (outCh + accDataPitch1) * numX); + IVP_SAVN_2X32_XP(hvecAcc2LL, vaOutData, phvecAcc, 4 * remainingOutCh * numX); + IVP_SAVN_2X32_XP(hvecAcc2LH, vaOutData, phvecAcc, 4 * remainingOutCh * numX - 2 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAVN_2X32_XP(hvecAcc2HL, vaOutData, phvecAcc, 4 * remainingOutCh * numX - 4 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAVN_2X32_XP(hvecAcc2HH, vaOutData, phvecAcc, 4 * remainingOutCh * numX - 6 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAPOSN_2X32_FP(vaOutData, phvecAcc); + + /* Store the hvecAcc3 along the accTile depth */ + phvecAcc = (xb_vecN_2x32v *) (pAcc + (outCh + accDataPitch2) * numY); + IVP_SAVN_2X32_XP(hvecAcc3LL, vaOutData, phvecAcc, 4 * remainingOutCh * numY); + IVP_SAVN_2X32_XP(hvecAcc3LH, vaOutData, phvecAcc, 4 * remainingOutCh * numY - 2 *
XCHAL_IVPN_SIMD_WIDTH); + IVP_SAVN_2X32_XP(hvecAcc3HL, vaOutData, phvecAcc, 4 * remainingOutCh * numY - 4 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAVN_2X32_XP(hvecAcc3HH, vaOutData, phvecAcc, 4 * remainingOutCh * numY - 6 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAPOSN_2X32_FP(vaOutData, phvecAcc); + + /* Store the hvecAcc4 along the accTile depth */ + phvecAcc = (xb_vecN_2x32v *) (pAcc + (outCh + accDataPitch1 * numX + accDataPitch2 * numY)); + IVP_SAVN_2X32_XP(hvecAcc4LL, vaOutData, phvecAcc, 4 * remainingOutCh * numX * numY); + IVP_SAVN_2X32_XP(hvecAcc4LH, vaOutData, phvecAcc, 4 * remainingOutCh * numX * numY - 2 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAVN_2X32_XP(hvecAcc4HL, vaOutData, phvecAcc, 4 * remainingOutCh * numX * numY - 4 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAVN_2X32_XP(hvecAcc4HH, vaOutData, phvecAcc, 4 * remainingOutCh * numX * numY - 6 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAPOSN_2X32_FP(vaOutData, phvecAcc); + } + } /* End image width */ + } /* End image height */ + } /* End Output Channels */ +} + +/****************************************************************************/ +/* Description : P6 optimized implementation of 3D partial convolution */ +/* Inputs : Input Data Tile, Coeff Data Tile, Bias Array, */ +/* CNN convolution params structure */ +/* InOuts : Output Tile */ +/* Assumptions : InData is U8, CoeffData are S8 */ +/* biasArray is signed 32b, value not exceeding signed 24b */ +/* OutData is S8 / U8 / S16 */ +/* Kernel Size is MxNxDxNk. M and N sizes are less than or */ +/* equal to 16. */ +/* Input and Output are in DWH format */ +/* Coeff is in NDWH format */ +/* CoeffDim1Pitch is aligned to 2N (Ca2) */ +/* Edges along Depth dimension in inTile and coeffTile */ +/* are zero.
*/ +/****************************************************************************/ /* Review notes: each outCh step covers 2 * XCHAL_IVPN_SIMD_WIDTH output channels and each (x, y) step computes a 2x2 patch of output pixels in daccSum1..daccSum4. inputFlag seeds the accumulators from biasArray (otherwise from accTile); outputFlag writes packed results to outTile (otherwise 32-bit sums to accTile). */ + +#ifdef DILATED_VQ_CONV_PARTIAL +static _XAI_INLINE_ void partialConvolvedVQ3D_S_MxNd1_U8S8IXCa2_MOD_DWH_contiguous_depth( + const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D accTile, + xai_pTile3D outTile, + const xai_cnn_conv_params *param + ) +#else +static _XAI_INLINE_ void partialConvolved3D_S_MxNd1_U8S8IXCa2_MOD_DWH_contiguous_depth( + const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D accTile, + xai_pTile3D outTile, + const xai_cnn_conv_params *param + ) +#endif +{ + /* Getting parameters from the tile structures */ + const int32_t outW = XAI_TILE3D_GET_DIM2(outTile); + const int32_t outH = XAI_TILE3D_GET_DIM3(outTile); + const int32_t numInCh = XAI_TILE3D_GET_DIM1(inTile); + const int32_t numOutCh = XAI_TILE3D_GET_DIM1(outTile); + + /* Kernel Size (NDWH) */ + const int32_t kWidthU = XAI_TILE4D_GET_DIM3(coeffTile); + const int32_t kHeightU = XAI_TILE4D_GET_DIM4(coeffTile); + + /* CNN convolution parameters */ + const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param); + const uint8_t outShiftU = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param); + const uint8_t enableReLu = XAI_CNN_CONV_GET_FLAG_RELU(param); + const uint8_t strideX = XAI_CNN_CONV_GET_STRIDEX(param); + const uint8_t strideY = XAI_CNN_CONV_GET_STRIDEY(param); + /* dilationX is hard-wired to 1 in this variant; only dilationY is read from param */ const uint8_t dilationX = 1; + const uint8_t dilationY = XAI_CNN_CONV_GET_DILATIONY(param); + const uint8_t leftEdgeFlag = XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param); + const uint8_t topEdgeFlag = XAI_CNN_CONV_GET_FLAG_TOPEDGE(param); + const uint8_t inputFlag = XAI_CNN_CONV_GET_FLAG_INPUT(param); + const uint8_t outputFlag = XAI_CNN_CONV_GET_FLAG_OUTPUT(param); + + /* Data Pointers of input, output, coefficient and bias data */ + uint8_t *pInData = (uint8_t *) XAI_TILE3D_GET_DATA_PTR(inTile); + int8_t *pOutData = (int8_t *) 
XAI_TILE3D_GET_DATA_PTR(outTile); + int8_t *pCoeffData = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile); + int32_t *pBiasData = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray); + + /* accTile is read only when inputFlag is clear and written only when outputFlag is clear, so it is not touched when both flags are set */ int32_t * pAccData = NULL; + if (!(XAI_CNN_CONV_GET_FLAG_INPUT(param) && XAI_CNN_CONV_GET_FLAG_OUTPUT(param))) + { + pAccData = (int32_t *) XAI_TILE3D_GET_DATA_PTR(accTile); + } + +#ifdef DILATED_VQ_CONV_PARTIAL + xb_vecNx16U* restrict pOutScaleData = (xb_vecNx16U *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray); +#else + const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param); +#endif + + /* Pitches of Coefficient Data (NDWH) in dim1 and dim3 */ + const int32_t coeffPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTile); + const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile); + + /* Pitches of Input Data (DWH) in dim1 and dim2 */ + const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile); + const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile); + + /* Pitch of Output Data (DWH) in dim1 and dim2 */ + const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile); + const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile); + + /* Pitch of AccTile Data (DWH) in dim1 and dim2 */ + int32_t accDataPitch1 = 0; + int32_t accDataPitch2 = 0; + if (!(XAI_CNN_CONV_GET_FLAG_INPUT(param) && XAI_CNN_CONV_GET_FLAG_OUTPUT(param))) + { + accDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(accTile); + accDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(accTile); + } + + /* Inner-loop trip count: kWidthU * numInCh input bytes per kernel row */ int32_t numIter = kWidthU * numInCh; + + int32_t dilatedKWidthU = dilationX * (kWidthU - 1) + 1; + int32_t dilatedKHeightU = dilationY * (kHeightU - 1) + 1; + int32_t leftEdge, topEdge; + /* Odd dilated size: edge is size / 2. Even size: the edge flag picks size / 2 or size / 2 - 1 */ if ((dilatedKWidthU % 2) != 0) + { + leftEdge = dilatedKWidthU / 2; + } + else + { + leftEdge = leftEdgeFlag ? (dilatedKWidthU / 2) : ((dilatedKWidthU / 2) - 1); + } + + if ((dilatedKHeightU % 2) != 0) + { + topEdge = dilatedKHeightU / 2; + } + else + { + topEdge = topEdgeFlag ? 
(dilatedKHeightU / 2) : ((dilatedKHeightU / 2) - 1); + } + + + /* Move pointer to the start of the data (including edge) */ + pInData = &pInData[-((leftEdge) * inDataPitch1 + (topEdge) * inDataPitch2)]; + + + /* Setting the limits for output data according to ReLu Flag and outTileType */ + int32_t minLim, maxLim; + if (enableReLu) + { + minLim = XAI_CNN_CONV_GET_RELU_MIN(param); + maxLim = XAI_CNN_CONV_GET_RELU_MAX(param); + } + else + { + minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \ + SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0); + maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \ + : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX); + } + /* typeFlag is 1 only for S16 output; it scales the byte count of the dvecOutNH (high half) stores below */ const int8_t typeFlag = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0; + const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile); + + /* Variable Declarations */ + int32_t outCh, x, y, ky, k; + valign vaOutData = IVP_ZALIGN(); + + xb_vecN_2x32v* restrict phvecBias; + xb_vec2Nx8* restrict pdvecCoeff; + xb_vec2Nx8U* restrict pdvecData1; + xb_vec2Nx8U* restrict pdvecData2; + xb_vec2Nx8U* restrict pdvecData3; + xb_vec2Nx8U* restrict pdvecData4; + xb_vec2Nx8* restrict pdvecOut; + xb_vecN_2x32v* restrict phvecAcc; + + /* + * inCh and kWidth loops are combined. Assumed that the + * edges along Depth dimension of input data is zero and also + * edges along depth dimension of coefficient data is zero. 
+ */ + + /* Loops Start */ + for (outCh = 0; outCh < numOutCh; outCh += 2 * XCHAL_IVPN_SIMD_WIDTH) + { /* walk across the kernels */ + /* To handle corner case when number of output channels + * is not a multiple of 2 * XCHAL_IVPN_SIMD_WIDTH*/ + int32_t remainingOutCh = numOutCh - outCh; +#ifdef DILATED_VQ_CONV_PARTIAL + xb_vecNx16U outScaleDataEven, outScaleDataOdd; + /*Load output scale values*/ + VQ_INIT_OUTSCALE(pOutScaleData, remainingOutCh, outScaleDataEven, outScaleDataOdd); +#endif + for (y = 0; y < outH; y += 2) /* Image Height */ + { /* walk down the rows */ + /* Variable to handle corner case when height is odd: 1 if row y + 1 exists, else 0 */ + int32_t numY = XT_MIN(1, outH - y - 1); + + for (x = 0; x < outW; x += 2) /* Image Width */ + { /* walk across the columns */ + /* Variable to handle corner case when width is odd: 1 if column x + 1 exists, else 0 */ + int32_t numX = XT_MIN(1, outW - x - 1); + + /* Output Data pointer */ + int8_t *pOut = pOutData + (x * outDataPitch1 + y * outDataPitch2) * bytesPerPixel; + int32_t *pAcc = pAccData + (x * accDataPitch1 + y * accDataPitch2); + + /* Initialize accumulators */ + xb_vec2Nx24 daccSum1, daccSum2, daccSum3, daccSum4; + if (inputFlag) /* Bias Values */ + { + phvecBias = (xb_vecN_2x32v *) (pBiasData + outCh); + ACC_INIT_BIAS(phvecBias, remainingOutCh, daccSum1, daccSum2, daccSum3, daccSum4); + } + else /* Accumulator tile*/ + { + xb_vecN_2x32v hvecAcc1LL, hvecAcc1LH, hvecAcc1HL, hvecAcc1HH; + xb_vecN_2x32v hvecAcc2LL, hvecAcc2LH, hvecAcc2HL, hvecAcc2HH; + xb_vecN_2x32v hvecAcc3LL, hvecAcc3LH, hvecAcc3HL, hvecAcc3HH; + xb_vecN_2x32v hvecAcc4LL, hvecAcc4LH, hvecAcc4HL, hvecAcc4HH; + + phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh); + valign vaAcc = IVP_LAN_2X32_PP(phvecAcc); + IVP_LAVN_2X32_XP(hvecAcc1LL, vaAcc, phvecAcc, 4 * remainingOutCh); + IVP_LAVN_2X32_XP(hvecAcc1LH, vaAcc, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH); + IVP_LAVN_2X32_XP(hvecAcc1HL, vaAcc, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH); + IVP_LAVN_2X32_XP(hvecAcc1HH, vaAcc, 
phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH); + daccSum1 = IVP_CVT24UNX32L(hvecAcc1LH, hvecAcc1LL); + IVP_CVT24UNX32H(daccSum1, hvecAcc1HH, hvecAcc1HL); + + /* x neighbour; when numX == 0 the offset is 0, so the first pixel's values are re-read */ phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh + accDataPitch1 * numX); + vaAcc = IVP_LAN_2X32_PP(phvecAcc); + IVP_LAVN_2X32_XP(hvecAcc2LL, vaAcc, phvecAcc, 4 * remainingOutCh); + IVP_LAVN_2X32_XP(hvecAcc2LH, vaAcc, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH); + IVP_LAVN_2X32_XP(hvecAcc2HL, vaAcc, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH); + IVP_LAVN_2X32_XP(hvecAcc2HH, vaAcc, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH); + daccSum2 = IVP_CVT24UNX32L(hvecAcc2LH, hvecAcc2LL); + IVP_CVT24UNX32H(daccSum2, hvecAcc2HH, hvecAcc2HL); + + phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh + accDataPitch2 * numY); + vaAcc = IVP_LAN_2X32_PP(phvecAcc); + IVP_LAVN_2X32_XP(hvecAcc3LL, vaAcc, phvecAcc, 4 * remainingOutCh); + IVP_LAVN_2X32_XP(hvecAcc3LH, vaAcc, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH); + IVP_LAVN_2X32_XP(hvecAcc3HL, vaAcc, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH); + IVP_LAVN_2X32_XP(hvecAcc3HH, vaAcc, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH); + daccSum3 = IVP_CVT24UNX32L(hvecAcc3LH, hvecAcc3LL); + IVP_CVT24UNX32H(daccSum3, hvecAcc3HH, hvecAcc3HL); + + phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh + accDataPitch1 * numX + accDataPitch2 * numY); + vaAcc = IVP_LAN_2X32_PP(phvecAcc); + IVP_LAVN_2X32_XP(hvecAcc4LL, vaAcc, phvecAcc, 4 * remainingOutCh); + IVP_LAVN_2X32_XP(hvecAcc4LH, vaAcc, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH); + IVP_LAVN_2X32_XP(hvecAcc4HL, vaAcc, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH); + IVP_LAVN_2X32_XP(hvecAcc4HH, vaAcc, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH); + daccSum4 = IVP_CVT24UNX32L(hvecAcc4LH, hvecAcc4LL); + IVP_CVT24UNX32H(daccSum4, hvecAcc4HH, hvecAcc4HL); + } + + /* Input Data and Coeff Data Pointers */ + uint8_t *pData = pInData + x * 
strideX * inDataPitch1 + y * strideY * inDataPitch2; + int8_t *pCoeff = pCoeffData + outCh; + +#ifdef __XCC__ +#pragma loop_count min=1 +#endif + for (ky = 0; ky < kHeightU; ky++) /* Kernel Height */ + { + /* Pointers for Input Data Loads: one per pixel of the 2x2 output patch */ + pdvecData1 = (xb_vec2Nx8U *) (pData + ky * inDataPitch2 * dilationY); + pdvecData2 = (xb_vec2Nx8U *) (pData + ky * inDataPitch2 * dilationY + strideX * inDataPitch1 * numX); + pdvecData3 = (xb_vec2Nx8U *) (pData + ky * inDataPitch2 * dilationY + strideY * inDataPitch2 * numY); + pdvecData4 = (xb_vec2Nx8U *) (pData + ky * inDataPitch2 * dilationY + (strideX * inDataPitch1 + strideY * inDataPitch2) * numX * numY); + + /* Pointer for Coefficient Load */ + pdvecCoeff = (xb_vec2Nx8 *) (pCoeff + ky * coeffPitch3); + + /* Primes for Aligning Load */ + valign vaData1 = IVP_LA2NX8U_PP(pdvecData1); + valign vaData2 = IVP_LA2NX8U_PP(pdvecData2); + valign vaData3 = IVP_LA2NX8U_PP(pdvecData3); + valign vaData4 = IVP_LA2NX8U_PP(pdvecData4); + + for (k = 0; k < numIter - 3; k += 4) /* (Input Channels * kWidth) loops combined */ + { + /* Aligning variable vector load of pixels */ + xb_vec2Nx8U dvecInp1; IVP_LAV2NX8U_XP(dvecInp1, vaData1, pdvecData1, 4); + xb_vec2Nx8U dvecInp2; IVP_LAV2NX8U_XP(dvecInp2, vaData2, pdvecData2, 4); + xb_vec2Nx8U dvecInp3; IVP_LAV2NX8U_XP(dvecInp3, vaData3, pdvecData3, 4); + xb_vec2Nx8U dvecInp4; IVP_LAV2NX8U_XP(dvecInp4, vaData4, pdvecData4, 4); + +#ifdef IVP_MULSUQA2N8XR8 + /* Extracting first 4 bytes of vector into address register */ + /* Scalar integers to be used for QMUL */ + int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8U(dvecInp1)), 0); + int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8U(dvecInp2)), 0); + int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8U(dvecInp3)), 0); + int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8U(dvecInp4)), 0); +#else + xb_vec2Nx8 
dvecData1; + xb_vec2Nx8 dvecData2; + xb_vec2Nx8 dvecData3; + xb_vec2Nx8 dvecData4; + + /* Fallback when IVP_MULSUQA2N8XR8 is not defined: 128 is subtracted from the U8 input before the signed multiply. NOTE(review): the matching offset compensation is not visible in this function - confirm where it is applied */ dvecData1 = IVP_SUB2NX8U(dvecInp1, 128); + dvecData2 = IVP_SUB2NX8U(dvecInp2, 128); + dvecData3 = IVP_SUB2NX8U(dvecInp3, 128); + dvecData4 = IVP_SUB2NX8U(dvecInp4, 128); + + /* Extracting first 4 bytes of vector into address register */ + /* Scalar integers to be used for QMUL */ + int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData1)), 0); + int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData2)), 0); + int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData3)), 0); + int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData4)), 0); +#endif + /* Aligned Vector Loads of coefficients */ + xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1); + xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1); + xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1); + xb_vec2Nx8 dvecCoeff4; IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch1); + +#ifdef IVP_MULSUQA2N8XR8 + IVP_MULSUQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1); + IVP_MULSUQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2); + IVP_MULSUQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3); + IVP_MULSUQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4); +#else + IVP_MULQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1); + IVP_MULQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2); + IVP_MULQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3); + IVP_MULQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4); +#endif + } /* End Input Channels */ + /* Corner case handling as numIter is not a 
multiple of 4 */ + if (k < numIter) + { + int32_t remInCh = numIter - k; /* 1..3 here: the loop above exits once fewer than 4 remain */ + + /* Aligning variable vector load of pixels */ + xb_vec2Nx8U dvecInp1; IVP_LAV2NX8U_XP(dvecInp1, vaData1, pdvecData1, remInCh); + xb_vec2Nx8U dvecInp2; IVP_LAV2NX8U_XP(dvecInp2, vaData2, pdvecData2, remInCh); + xb_vec2Nx8U dvecInp3; IVP_LAV2NX8U_XP(dvecInp3, vaData3, pdvecData3, remInCh); + xb_vec2Nx8U dvecInp4; IVP_LAV2NX8U_XP(dvecInp4, vaData4, pdvecData4, remInCh); + +#ifdef IVP_MULSUQA2N8XR8 + /* Extracting first 4 bytes of vector into address register */ + /* Scalar integers to be used for QMUL */ + int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8U(dvecInp1)), 0); + int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8U(dvecInp2)), 0); + int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8U(dvecInp3)), 0); + int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8U(dvecInp4)), 0); +#else + xb_vec2Nx8 dvecData1 = 0; + xb_vec2Nx8 dvecData2 = 0; + xb_vec2Nx8 dvecData3 = 0; + xb_vec2Nx8 dvecData4 = 0; + + /* Subtract 128 only in lanes whose index is below remInCh. NOTE(review): presumably the other lanes keep their 0 initializer - confirm against the IVP_SUB2NX8UT definition */ IVP_SUB2NX8UT(dvecData1, dvecInp1, 128, IVP_LT2NX8(IVP_SEQ2NX8U(), remInCh)); + IVP_SUB2NX8UT(dvecData2, dvecInp2, 128, IVP_LT2NX8(IVP_SEQ2NX8U(), remInCh)); + IVP_SUB2NX8UT(dvecData3, dvecInp3, 128, IVP_LT2NX8(IVP_SEQ2NX8U(), remInCh)); + IVP_SUB2NX8UT(dvecData4, dvecInp4, 128, IVP_LT2NX8(IVP_SEQ2NX8U(), remInCh)); + + /* Extracting first 4 bytes of vector into address register */ + /* Scalar integers to be used for QMUL */ + int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData1)), 0); + int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData2)), 0); + int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData3)), 0); + int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData4)), 0); +#endif + /* For 
conditional coefficient loads */ + int32_t enable2 = XT_SALT(1, remInCh); /* Will be 1 if remInCh > 1 */ + int32_t enable3 = XT_SALT(2, remInCh); /* Will be 1 if remInCh > 2 */ + + /* Aligned Vector Loads of coefficients; the pointer is advanced by coeffPitch1 only when enable2 / enable3 is 1, and the multiplies below pass 0 as the fourth coefficient operand */ + xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1 * enable2); + xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1 * enable3); + xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1); + +#ifdef IVP_MULSUQA2N8XR8 + IVP_MULSUQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1); + IVP_MULSUQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2); + IVP_MULSUQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3); + IVP_MULSUQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4); +#else + IVP_MULQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1); + IVP_MULQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2); + IVP_MULQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3); + IVP_MULQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4); +#endif + } /* End if( k < numIter)*/ + } /* End Kernel Height * Width */ + + if (outputFlag) /* Store to output Tile*/ + { + /* Pack, Output Scale, Output Shift and clamping */ + xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L; + xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H; +#ifdef DILATED_VQ_CONV_PARTIAL + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); + 
PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); +#else + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); +#endif + /* Store the output dvecOut1 along the output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOut + outCh * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh); + IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Store the output dvecOut2 along the output depth; the byte counts are multiplied by numX, so they are 0 when column x + 1 does not exist */ + pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1) * numX * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numX); + IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numX); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Store the output dvecOut3 along the output depth; the byte counts are multiplied by numY, so they are 0 when row y + 1 does not exist */ + pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch2) * numY * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numY); + IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numY); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Store the output dvecOut4 along the output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1 * numX + outDataPitch2 * numY) * bytesPerPixel); + 
IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * \ + remainingOutCh * numX * numY); + IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numX * numY); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + } + else /* Store to accumulator tile*/ + { + /* Each accumulator is converted into four xb_vecN_2x32v parts (LL, LH, HL, HH) before being stored */ xb_vecN_2x32v hvecAcc1LL = IVP_CVT32S2NX24LL(daccSum1); + xb_vecN_2x32v hvecAcc1LH = IVP_CVT32S2NX24LH(daccSum1); + xb_vecN_2x32v hvecAcc1HL = IVP_CVT32S2NX24HL(daccSum1); + xb_vecN_2x32v hvecAcc1HH = IVP_CVT32S2NX24HH(daccSum1); + + xb_vecN_2x32v hvecAcc2LL = IVP_CVT32S2NX24LL(daccSum2); + xb_vecN_2x32v hvecAcc2LH = IVP_CVT32S2NX24LH(daccSum2); + xb_vecN_2x32v hvecAcc2HL = IVP_CVT32S2NX24HL(daccSum2); + xb_vecN_2x32v hvecAcc2HH = IVP_CVT32S2NX24HH(daccSum2); + + xb_vecN_2x32v hvecAcc3LL = IVP_CVT32S2NX24LL(daccSum3); + xb_vecN_2x32v hvecAcc3LH = IVP_CVT32S2NX24LH(daccSum3); + xb_vecN_2x32v hvecAcc3HL = IVP_CVT32S2NX24HL(daccSum3); + xb_vecN_2x32v hvecAcc3HH = IVP_CVT32S2NX24HH(daccSum3); + + xb_vecN_2x32v hvecAcc4LL = IVP_CVT32S2NX24LL(daccSum4); + xb_vecN_2x32v hvecAcc4LH = IVP_CVT32S2NX24LH(daccSum4); + xb_vecN_2x32v hvecAcc4HL = IVP_CVT32S2NX24HL(daccSum4); + xb_vecN_2x32v hvecAcc4HH = IVP_CVT32S2NX24HH(daccSum4); + + + /* Store the hvecAcc1 along the accTile depth */ + phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh); + IVP_SAVN_2X32_XP(hvecAcc1LL, vaOutData, phvecAcc, 4 * remainingOutCh); + IVP_SAVN_2X32_XP(hvecAcc1LH, vaOutData, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAVN_2X32_XP(hvecAcc1HL, vaOutData, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAVN_2X32_XP(hvecAcc1HH, vaOutData, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAPOSN_2X32_FP(vaOutData, phvecAcc); + + /* Store the hvecAcc2 along the accTile depth */ + phvecAcc = (xb_vecN_2x32v *) (pAcc + (outCh + accDataPitch1) * numX); + IVP_SAVN_2X32_XP(hvecAcc2LL, vaOutData, phvecAcc, 4 * remainingOutCh * numX); + 
IVP_SAVN_2X32_XP(hvecAcc2LH, vaOutData, phvecAcc, 4 * remainingOutCh * numX - 2 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAVN_2X32_XP(hvecAcc2HL, vaOutData, phvecAcc, 4 * remainingOutCh * numX - 4 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAVN_2X32_XP(hvecAcc2HH, vaOutData, phvecAcc, 4 * remainingOutCh * numX - 6 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAPOSN_2X32_FP(vaOutData, phvecAcc); + + /* Store the hvecAcc3 along the accTile depth */ + phvecAcc = (xb_vecN_2x32v *) (pAcc + (outCh + accDataPitch2) * numY); + IVP_SAVN_2X32_XP(hvecAcc3LL, vaOutData, phvecAcc, 4 * remainingOutCh * numY); + IVP_SAVN_2X32_XP(hvecAcc3LH, vaOutData, phvecAcc, 4 * remainingOutCh * numY - 2 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAVN_2X32_XP(hvecAcc3HL, vaOutData, phvecAcc, 4 * remainingOutCh * numY - 4 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAVN_2X32_XP(hvecAcc3HH, vaOutData, phvecAcc, 4 * remainingOutCh * numY - 6 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAPOSN_2X32_FP(vaOutData, phvecAcc); + + /* Store the hvecAcc4 along the accTile depth */ + phvecAcc = (xb_vecN_2x32v *) (pAcc + (outCh + accDataPitch1 * numX + accDataPitch2 * numY)); + IVP_SAVN_2X32_XP(hvecAcc4LL, vaOutData, phvecAcc, 4 * remainingOutCh * numX * numY); + IVP_SAVN_2X32_XP(hvecAcc4LH, vaOutData, phvecAcc, 4 * remainingOutCh * numX * numY - 2 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAVN_2X32_XP(hvecAcc4HL, vaOutData, phvecAcc, 4 * remainingOutCh * numX * numY - 4 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAVN_2X32_XP(hvecAcc4HH, vaOutData, phvecAcc, 4 * remainingOutCh * numX * numY - 6 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAPOSN_2X32_FP(vaOutData, phvecAcc); + } + } /* End image width */ + } /* End image height */ + } /* End Output Channels */ +} + +/****************************************************************************/ +/* Description : P6 optimized implementation of 3D partial convolution */ +/* Inputs : Input Data Tile, Coeff Data Tile, Bias Array, */ +/* CNN convolution params structure */ +/* InOuts : Output Tile */ +/* Assumptions : InData, CoeffData are S8 */ +/* biasArray is 
signed 32b, value not exceeding signed 24b */ +/* OutData is S8 / U8 / S16 */ +/* Kernel Size is MxNxDxNk. M and N sizes are less than or */ +/* equal to 16. */ +/* Input and Output are in DWH format */ +/* Coeff is in NDWH format */ +/* CoeffDim1Pitch is aligned to 2N (Ca2) */ +/* Edges along Depth dimension in inTile and coeffTile */ +/* are zero. */ +/****************************************************************************/ +#ifdef DILATED_VQ_CONV_PARTIAL +static _XAI_INLINE_ void partialConvolvedVQ3D_S_MxN_S8S8IXCa2_MOD_DWH( + const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D accTile, + xai_pTile3D outTile, + const xai_cnn_conv_params *param + ) +#else +static _XAI_INLINE_ void partialConvolved3D_S_MxN_S8S8IXCa2_MOD_DWH( + const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D accTile, + xai_pTile3D outTile, + const xai_cnn_conv_params *param + ) +#endif +{ /* Overview: each inner iteration handles a 2x2 block of output pixels for up to 2 * XCHAL_IVPN_SIMD_WIDTH output channels. The four accumulators start from the bias (input flag set) or from accTile, and end up packed in outTile (output flag set) or written back to accTile as 32-bit values. */ + /* Getting parameters from the tile structures */ + const int32_t outW = XAI_TILE3D_GET_DIM2(outTile); + const int32_t outH = XAI_TILE3D_GET_DIM3(outTile); + const int32_t numInCh = XAI_TILE3D_GET_DIM1(inTile); + const int32_t numOutCh = XAI_TILE3D_GET_DIM1(outTile); + const uint8_t dilationX = XAI_CNN_CONV_GET_DILATIONX(param); + const uint8_t dilationY = XAI_CNN_CONV_GET_DILATIONY(param); + + /* Kernel Size (NDWH): width and height, followed by their dilated extents */ + const int32_t kWidthU = XAI_TILE4D_GET_DIM3(coeffTile); + const int32_t kHeightU = XAI_TILE4D_GET_DIM4(coeffTile); + int32_t dilatedkWidthU = dilationX * (kWidthU - 1) + 1; + int32_t dilatedkHeightU = dilationY * (kHeightU - 1) + 1; + + /* CNN convolution parameters */ + const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param); + const uint8_t outShiftU = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param); + const uint8_t enableReLu = XAI_CNN_CONV_GET_FLAG_RELU(param); + const uint8_t strideX = XAI_CNN_CONV_GET_STRIDEX(param); + const uint8_t strideY =
XAI_CNN_CONV_GET_STRIDEY(param); + const uint8_t leftEdgeFlag = XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param); + const uint8_t topEdgeFlag = XAI_CNN_CONV_GET_FLAG_TOPEDGE(param); + const uint8_t inputFlag = XAI_CNN_CONV_GET_FLAG_INPUT(param); + const uint8_t outputFlag = XAI_CNN_CONV_GET_FLAG_OUTPUT(param); + + /* Data Pointers of input, output, coefficient and bias data */ + int8_t *pInData = (int8_t *) XAI_TILE3D_GET_DATA_PTR(inTile); + int8_t *pOutData = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile); + int8_t *pCoeffData = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile); + int32_t *pBiasData = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray); + + int32_t * pAccData = NULL; /* stays NULL when both the input and output flags are set; accTile is not dereferenced in that case */ + if (!(XAI_CNN_CONV_GET_FLAG_INPUT(param) && XAI_CNN_CONV_GET_FLAG_OUTPUT(param))) + { + pAccData = (int32_t *) XAI_TILE3D_GET_DATA_PTR(accTile); + } + +#ifdef DILATED_VQ_CONV_PARTIAL + xb_vecNx16U* restrict pOutScaleData = (xb_vecNx16U *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray); +#else + const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param); +#endif + + /* Pitches of Coefficient Data (NDWH) in dim1, dim2 and dim3 */ + const int32_t coeffPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTile); + const int32_t coeffPitch2 = XAI_TILE4D_GET_DIM2_PITCH(coeffTile); + const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile); + + /* Pitches of Input Data (DWH) in dim1 and dim2 */ + const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile); + const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile); + + /* Pitch of Output Data (DWH) in dim1 and dim2 */ + const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile); + const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile); + + /* Pitch of AccTile Data (DWH) in dim1 and dim2; both stay 0 when accTile is not used */ + int32_t accDataPitch1 = 0; + int32_t accDataPitch2 = 0; + if (!(XAI_CNN_CONV_GET_FLAG_INPUT(param) && XAI_CNN_CONV_GET_FLAG_OUTPUT(param))) + { + accDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(accTile); + accDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(accTile); + } + +
int32_t leftEdge, topEdge; /* Edge sizes: half the dilated kernel extent; for an even extent the edge flag selects between half and half - 1 */ + if ((dilatedkWidthU % 2) != 0) + { + leftEdge = dilatedkWidthU / 2; + } + else + { + leftEdge = leftEdgeFlag ? (dilatedkWidthU / 2) : ((dilatedkWidthU / 2) - 1); + } + + if ((dilatedkHeightU % 2) != 0) + { + topEdge = dilatedkHeightU / 2; + } + else + { + topEdge = topEdgeFlag ? (dilatedkHeightU / 2) : ((dilatedkHeightU / 2) - 1); + } + + + /* Move pointer to the start of the data (including edge) */ + pInData = &pInData[-((leftEdge) * inDataPitch1 + (topEdge) * inDataPitch2)]; + + /* Setting the limits for output data according to ReLu Flag and outTileType */ + int32_t minLim, maxLim; + if (enableReLu) + { + minLim = XAI_CNN_CONV_GET_RELU_MIN(param); + maxLim = XAI_CNN_CONV_GET_RELU_MAX(param); + } + else + { + minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \ + SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0); + maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \ + : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX); + } + const int8_t typeFlag = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ?
1 : 0; /* typeFlag is 1 only when outTile is S16 */ + const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile); + + /* Variable Declarations */ + int32_t inCh, outCh, x, y, k; + valign vaOutData = IVP_ZALIGN(); + + xb_vecN_2x32v* restrict phvecBias; + xb_vec2Nx8* restrict pdvecCoeff; + xb_vec2Nx8* restrict pdvecData1; + xb_vec2Nx8* restrict pdvecData2; + xb_vec2Nx8* restrict pdvecData3; + xb_vec2Nx8* restrict pdvecData4; + xb_vec2Nx8* restrict pdvecOut; + xb_vecN_2x32v* restrict phvecAcc; + + /* Loops Start: output channels outermost, then rows and columns in steps of two */ + for (outCh = 0; outCh < numOutCh; outCh += 2 * XCHAL_IVPN_SIMD_WIDTH) + { /* walk across the kernels */ + /* To handle corner case when number of output channels + * is not a multiple of 2 * XCHAL_IVPN_SIMD_WIDTH*/ + int32_t remainingOutCh = numOutCh - outCh; +#ifdef DILATED_VQ_CONV_PARTIAL + xb_vecNx16U outScaleDataEven, outScaleDataOdd; + /*Load output scale values*/ + VQ_INIT_OUTSCALE(pOutScaleData, remainingOutCh, outScaleDataEven, outScaleDataOdd); +#endif + for (y = 0; y < outH; y += 2) /* Image Height */ + { /* walk down the rows */ + /* Variable to handle corner case when height is odd: 0 when y is the last row, otherwise 1 */ + int32_t numY = XT_MIN(1, outH - y - 1); + for (x = 0; x < outW; x += 2) /* Image Width */ + { /* walk across the columns */ + /* Variable to handle corner case when width is odd: 0 when x is the last column, otherwise 1 */ + int32_t numX = XT_MIN(1, outW - x - 1); + + /* Output Data and accumulator pointers for the first pixel of the 2x2 block */ + int8_t *pOut = pOutData + (x * outDataPitch1 + y * outDataPitch2) * bytesPerPixel; + int32_t *pAcc = pAccData + (x * accDataPitch1 + y * accDataPitch2); + + /* Initialize accumulators with bias values, or with the values stored in accTile */ + xb_vec2Nx24 daccSum1, daccSum2, daccSum3, daccSum4; + if (inputFlag) /* Bias Values */ + { + phvecBias = (xb_vecN_2x32v *) (pBiasData + outCh); + ACC_INIT_BIAS(phvecBias, remainingOutCh, daccSum1, daccSum2, daccSum3, daccSum4); + } + else /* Accumulator tile*/ + { + xb_vecN_2x32v hvecAcc1LL, hvecAcc1LH, hvecAcc1HL, hvecAcc1HH; + xb_vecN_2x32v hvecAcc2LL, hvecAcc2LH, hvecAcc2HL, hvecAcc2HH; + xb_vecN_2x32v hvecAcc3LL, hvecAcc3LH, hvecAcc3HL,
hvecAcc3HH; + xb_vecN_2x32v hvecAcc4LL, hvecAcc4LH, hvecAcc4HL, hvecAcc4HH; + + phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh); + valign vaAcc = IVP_LAN_2X32_PP(phvecAcc); + IVP_LAVN_2X32_XP(hvecAcc1LL, vaAcc, phvecAcc, 4 * remainingOutCh); + IVP_LAVN_2X32_XP(hvecAcc1LH, vaAcc, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH); + IVP_LAVN_2X32_XP(hvecAcc1HL, vaAcc, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH); + IVP_LAVN_2X32_XP(hvecAcc1HH, vaAcc, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH); + daccSum1 = IVP_CVT24UNX32L(hvecAcc1LH, hvecAcc1LL); + IVP_CVT24UNX32H(daccSum1, hvecAcc1HH, hvecAcc1HL); + + phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh + accDataPitch1 * numX); + vaAcc = IVP_LAN_2X32_PP(phvecAcc); + IVP_LAVN_2X32_XP(hvecAcc2LL, vaAcc, phvecAcc, 4 * remainingOutCh); + IVP_LAVN_2X32_XP(hvecAcc2LH, vaAcc, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH); + IVP_LAVN_2X32_XP(hvecAcc2HL, vaAcc, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH); + IVP_LAVN_2X32_XP(hvecAcc2HH, vaAcc, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH); + daccSum2 = IVP_CVT24UNX32L(hvecAcc2LH, hvecAcc2LL); + IVP_CVT24UNX32H(daccSum2, hvecAcc2HH, hvecAcc2HL); + + phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh + accDataPitch2 * numY); + vaAcc = IVP_LAN_2X32_PP(phvecAcc); + IVP_LAVN_2X32_XP(hvecAcc3LL, vaAcc, phvecAcc, 4 * remainingOutCh); + IVP_LAVN_2X32_XP(hvecAcc3LH, vaAcc, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH); + IVP_LAVN_2X32_XP(hvecAcc3HL, vaAcc, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH); + IVP_LAVN_2X32_XP(hvecAcc3HH, vaAcc, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH); + daccSum3 = IVP_CVT24UNX32L(hvecAcc3LH, hvecAcc3LL); + IVP_CVT24UNX32H(daccSum3, hvecAcc3HH, hvecAcc3HL); + + phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh + accDataPitch1 * numX + accDataPitch2 * numY); + vaAcc = IVP_LAN_2X32_PP(phvecAcc); + IVP_LAVN_2X32_XP(hvecAcc4LL, vaAcc, phvecAcc, 4 * remainingOutCh); +
IVP_LAVN_2X32_XP(hvecAcc4LH, vaAcc, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH); + IVP_LAVN_2X32_XP(hvecAcc4HL, vaAcc, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH); + IVP_LAVN_2X32_XP(hvecAcc4HH, vaAcc, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH); + daccSum4 = IVP_CVT24UNX32L(hvecAcc4LH, hvecAcc4LL); + IVP_CVT24UNX32H(daccSum4, hvecAcc4HH, hvecAcc4HL); + } + + /* Input Data and Coeff Data Pointers */ + int8_t *pData = pInData + x * strideX * inDataPitch1 + y * strideY * inDataPitch2; + int8_t *pCoeff = pCoeffData + outCh; + + xb_vecN_2x32v hvecInAddrOff = 0; + xb_vecN_2x32v hvecCoeffAddrOff = 0; + xb_vecN_2x32v hvecLaneIdx = 0; + int32_t inAddrOff, coeffAddrOff; + + for (k = 0; k < kHeightU * kWidthU; k++) /* Kernel Height * Kernel Width */ + { + /* Condition checks performed to get the Input and Coefficient */ + /* Pointer Offsets after combining the Kernel Width and Height Loops */ + vboolN_2 vbN_2 = IVP_EQN_2X32(hvecLaneIdx, kWidthU); + /* hvecLaneIdx will be reset to zero after every kWidth */ + hvecLaneIdx = IVP_MOVN_2X32T(0, hvecLaneIdx, vbN_2); + /* InPitch added after every kWidth */ + IVP_ADDN_2X32T(hvecInAddrOff, hvecInAddrOff, inDataPitch2 * dilationY - kWidthU * inDataPitch1 * dilationX, vbN_2); + /* CoeffPitch added after every kWidth */ + IVP_ADDN_2X32T(hvecCoeffAddrOff, hvecCoeffAddrOff, coeffPitch3 - kWidthU * coeffPitch2, vbN_2); + /* Extracting Input and Coefficient address offsets */ + inAddrOff = IVP_EXTRN_2X32(hvecInAddrOff, 0); + coeffAddrOff = IVP_EXTRN_2X32(hvecCoeffAddrOff, 0); + hvecLaneIdx = IVP_ADDN_2X32(hvecLaneIdx, 1); + hvecCoeffAddrOff = IVP_ADDN_2X32(hvecCoeffAddrOff, coeffPitch2); + hvecInAddrOff = IVP_ADDN_2X32(hvecInAddrOff, inDataPitch1 * dilationX); + + /* Pointers for Input Data Loads, one per pixel of the 2x2 output block; a zero numX / numY removes the corresponding stride offset */ + pdvecData1 = (xb_vec2Nx8 *) (pData + inAddrOff); + pdvecData2 = (xb_vec2Nx8 *) (pData + inAddrOff + strideX * inDataPitch1 * numX); + pdvecData3 = (xb_vec2Nx8 *) (pData + inAddrOff + strideY *
inDataPitch2 * numY); + pdvecData4 = (xb_vec2Nx8 *) (pData + inAddrOff + (strideX * inDataPitch1 + strideY * inDataPitch2) * numX * numY); + + /* Pointer for Coefficient Load */ + pdvecCoeff = (xb_vec2Nx8 *) (pCoeff + coeffAddrOff); + + /* Primes registers for Aligning Load */ + valign vaData1 = IVP_LA2NX8_PP(pdvecData1); + valign vaData2 = IVP_LA2NX8_PP(pdvecData2); + valign vaData3 = IVP_LA2NX8_PP(pdvecData3); + valign vaData4 = IVP_LA2NX8_PP(pdvecData4); + + for (inCh = 0; inCh < numInCh - 3; inCh += 4) /* Input Channels, four per iteration */ + { + xb_vec2Nx8 dvecData1; IVP_LAV2NX8_XP(dvecData1, vaData1, pdvecData1, 4); + xb_vec2Nx8 dvecData2; IVP_LAV2NX8_XP(dvecData2, vaData2, pdvecData2, 4); + xb_vec2Nx8 dvecData3; IVP_LAV2NX8_XP(dvecData3, vaData3, pdvecData3, 4); + xb_vec2Nx8 dvecData4; IVP_LAV2NX8_XP(dvecData4, vaData4, pdvecData4, 4); + + /* Extracting first 4 bytes of vector into address register */ + /* Scalar integers to be used for QMUL */ + int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData1)), 0); + int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData2)), 0); + int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData3)), 0); + int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData4)), 0); + + /* Aligned Vector Loads of coefficients */ + xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1); + xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1); + xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1); + xb_vec2Nx8 dvecCoeff4; IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch1); + + IVP_MULQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1); + IVP_MULQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2); + IVP_MULQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3); +
IVP_MULQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4); + } /* End Input Channels */ + + /* Corner Case Handling if number of input channels not multiple of 4 */ + if (inCh < numInCh) + { + int32_t remInCh = numInCh - inCh; + + /* Aligning variable vector load of pixels */ + xb_vec2Nx8 dvecData1; IVP_LAV2NX8_XP(dvecData1, vaData1, pdvecData1, remInCh); + xb_vec2Nx8 dvecData2; IVP_LAV2NX8_XP(dvecData2, vaData2, pdvecData2, remInCh); + xb_vec2Nx8 dvecData3; IVP_LAV2NX8_XP(dvecData3, vaData3, pdvecData3, remInCh); + xb_vec2Nx8 dvecData4; IVP_LAV2NX8_XP(dvecData4, vaData4, pdvecData4, remInCh); + + /* Extracting first 4 bytes of vector into address register */ + /* Scalar integers to be used for QMUL */ + int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData1)), 0); + int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData2)), 0); + int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData3)), 0); + int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData4)), 0); + + /* For conditional coefficient loads */ + int32_t enable2 = XT_SALT(1, remInCh); /* Will be 1 if remInCh > 1 */ + int32_t enable3 = XT_SALT(2, remInCh); /* Will be 1 if remInCh > 2 */ + + /* Coefficient Loads: the pointer is advanced by coeffPitch1 only while further input channels remain */ + xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1 * enable2); + xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1 * enable3); + xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1); + + IVP_MULQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1); + IVP_MULQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2); + IVP_MULQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3); + IVP_MULQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4); + } /* End Corner case handling */ + } /* End
Kernel Height * Width */ + + if (outputFlag) /* Store to output Tile*/ + { + /* Pack, Output Scale, Output Shift and clamping */ + xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L; + xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H; +#ifdef DILATED_VQ_CONV_PARTIAL + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); +#else + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); +#endif + /* Store the output dvecOut1 along the output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOut + outCh * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh); + IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Store the output dvecOut2 along the output depth; every byte count is scaled by numX, so the requested length is 0 when numX is 0 */ + pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1) * numX * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut,
bytesPerPixel * remainingOutCh * numX); + IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numX); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Store the output dvecOut3 along the output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch2) * numY * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numY); + IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numY); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Store the output dvecOut4 along the output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1 * numX + outDataPitch2 * numY) * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numX * numY); + IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numX * numY); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + } + else /* Store to accumulator tile*/ + { + xb_vecN_2x32v hvecAcc1LL = IVP_CVT32S2NX24LL(daccSum1); + xb_vecN_2x32v hvecAcc1LH = IVP_CVT32S2NX24LH(daccSum1); + xb_vecN_2x32v hvecAcc1HL = IVP_CVT32S2NX24HL(daccSum1); + xb_vecN_2x32v hvecAcc1HH = IVP_CVT32S2NX24HH(daccSum1); + + xb_vecN_2x32v hvecAcc2LL = IVP_CVT32S2NX24LL(daccSum2); + xb_vecN_2x32v hvecAcc2LH = IVP_CVT32S2NX24LH(daccSum2); + xb_vecN_2x32v hvecAcc2HL = IVP_CVT32S2NX24HL(daccSum2); + xb_vecN_2x32v hvecAcc2HH = IVP_CVT32S2NX24HH(daccSum2); + + xb_vecN_2x32v hvecAcc3LL = IVP_CVT32S2NX24LL(daccSum3); + xb_vecN_2x32v hvecAcc3LH = IVP_CVT32S2NX24LH(daccSum3); + xb_vecN_2x32v hvecAcc3HL = IVP_CVT32S2NX24HL(daccSum3); + xb_vecN_2x32v hvecAcc3HH = IVP_CVT32S2NX24HH(daccSum3); + + xb_vecN_2x32v hvecAcc4LL = IVP_CVT32S2NX24LL(daccSum4); + xb_vecN_2x32v hvecAcc4LH = IVP_CVT32S2NX24LH(daccSum4); + xb_vecN_2x32v hvecAcc4HL = IVP_CVT32S2NX24HL(daccSum4); + xb_vecN_2x32v hvecAcc4HH = IVP_CVT32S2NX24HH(daccSum4);
+ + + /* Store the hvecAcc1 along the accTile depth */ + phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh); + IVP_SAVN_2X32_XP(hvecAcc1LL, vaOutData, phvecAcc, 4 * remainingOutCh); + IVP_SAVN_2X32_XP(hvecAcc1LH, vaOutData, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAVN_2X32_XP(hvecAcc1HL, vaOutData, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAVN_2X32_XP(hvecAcc1HH, vaOutData, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAPOSN_2X32_FP(vaOutData, phvecAcc); + + /* Store the hvecAcc2 along the accTile depth */ + phvecAcc = (xb_vecN_2x32v *) (pAcc + (outCh + accDataPitch1) * numX); + IVP_SAVN_2X32_XP(hvecAcc2LL, vaOutData, phvecAcc, 4 * remainingOutCh * numX); + IVP_SAVN_2X32_XP(hvecAcc2LH, vaOutData, phvecAcc, 4 * remainingOutCh * numX - 2 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAVN_2X32_XP(hvecAcc2HL, vaOutData, phvecAcc, 4 * remainingOutCh * numX - 4 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAVN_2X32_XP(hvecAcc2HH, vaOutData, phvecAcc, 4 * remainingOutCh * numX - 6 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAPOSN_2X32_FP(vaOutData, phvecAcc); + + /* Store the hvecAcc3 along the accTile depth */ + phvecAcc = (xb_vecN_2x32v *) (pAcc + (outCh + accDataPitch2) * numY); + IVP_SAVN_2X32_XP(hvecAcc3LL, vaOutData, phvecAcc, 4 * remainingOutCh * numY); + IVP_SAVN_2X32_XP(hvecAcc3LH, vaOutData, phvecAcc, 4 * remainingOutCh * numY - 2 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAVN_2X32_XP(hvecAcc3HL, vaOutData, phvecAcc, 4 * remainingOutCh * numY - 4 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAVN_2X32_XP(hvecAcc3HH, vaOutData, phvecAcc, 4 * remainingOutCh * numY - 6 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAPOSN_2X32_FP(vaOutData, phvecAcc); + + /* Store the hvecAcc4 along the accTile depth */ + phvecAcc = (xb_vecN_2x32v *) (pAcc + (outCh + accDataPitch1 * numX + accDataPitch2 * numY)); + IVP_SAVN_2X32_XP(hvecAcc4LL, vaOutData, phvecAcc, 4 * remainingOutCh * numX * numY); + IVP_SAVN_2X32_XP(hvecAcc4LH, vaOutData, phvecAcc, 4 * remainingOutCh * numX * numY - 2 *
XCHAL_IVPN_SIMD_WIDTH); + IVP_SAVN_2X32_XP(hvecAcc4HL, vaOutData, phvecAcc, 4 * remainingOutCh * numX * numY - 4 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAVN_2X32_XP(hvecAcc4HH, vaOutData, phvecAcc, 4 * remainingOutCh * numX * numY - 6 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAPOSN_2X32_FP(vaOutData, phvecAcc); + } + } /* End image width */ + } /* End image height */ + } /* End Output Channels */ +} + +/****************************************************************************/ +/* Description : P6 optimized implementation of 3D partial convolution */ +/* Inputs : Input Data Tile, Coeff Data Tile, Bias Array, */ +/* CNN convolution params structure */ +/* InOuts : Output Tile */ +/* Assumptions : InData is U8, CoeffData is S8 */ +/* biasArray is signed 32b, value not exceeding signed 24b */ +/* OutData is S8 / U8 / S16 */ +/* Kernel Size is MxNxDxNk. M and N sizes are less than or */ +/* equal to 16. */ +/* Input and Output are in DWH format */ +/* Coeff is in NDWH format */ +/* CoeffDim1Pitch is aligned to 2N (Ca2) */ +/* Edges along Depth dimension in inTile and coeffTile */ +/* are zero.
*/ +/****************************************************************************/ + +#ifdef DILATED_VQ_CONV_PARTIAL +static _XAI_INLINE_ void partialConvolvedVQ3D_S_MxN_U8S8IXCa2_MOD_DWH(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D accTile, + xai_pTile3D outTile, + const xai_cnn_conv_params *param + ) +#else +static _XAI_INLINE_ void partialConvolved3D_S_MxN_U8S8IXCa2_MOD_DWH(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D accTile, + xai_pTile3D outTile, + const xai_cnn_conv_params *param + ) +#endif +{ + /* Getting parameters from the tile structures */ + const int32_t outW = XAI_TILE3D_GET_DIM2(outTile); + const int32_t outH = XAI_TILE3D_GET_DIM3(outTile); + const int32_t numInCh = XAI_TILE3D_GET_DIM1(inTile); + const int32_t numOutCh = XAI_TILE3D_GET_DIM1(outTile); + const uint8_t dilationX = XAI_CNN_CONV_GET_DILATIONX(param); + const uint8_t dilationY = XAI_CNN_CONV_GET_DILATIONY(param); + + /* Kernel Size (NDWH) */ + const int32_t kWidthU = XAI_TILE4D_GET_DIM3(coeffTile); + const int32_t kHeightU = XAI_TILE4D_GET_DIM4(coeffTile); + int32_t dilatedkWidthU = dilationX * (kWidthU - 1) + 1; + int32_t dilatedkHeightU = dilationY * (kHeightU - 1) + 1; + + /* CNN convolution parameters */ + const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param); + const uint8_t outShiftU = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param); + const uint8_t enableReLu = XAI_CNN_CONV_GET_FLAG_RELU(param); + const uint8_t strideX = XAI_CNN_CONV_GET_STRIDEX(param); + const uint8_t strideY = XAI_CNN_CONV_GET_STRIDEY(param); + const uint8_t leftEdgeFlag = XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param); + const uint8_t topEdgeFlag = XAI_CNN_CONV_GET_FLAG_TOPEDGE(param); + const uint8_t inputFlag = XAI_CNN_CONV_GET_FLAG_INPUT(param); + const uint8_t outputFlag = XAI_CNN_CONV_GET_FLAG_OUTPUT(param); + + /* Data Pointers of input, output, coefficient and bias 
data */ + uint8_t *pInData = (uint8_t *) XAI_TILE3D_GET_DATA_PTR(inTile); + int8_t *pOutData = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile); + int8_t *pCoeffData = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile); + int32_t *pBiasData = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray); + + int32_t * pAccData = NULL; + if (!(XAI_CNN_CONV_GET_FLAG_INPUT(param) && XAI_CNN_CONV_GET_FLAG_OUTPUT(param))) + { + pAccData = (int32_t *) XAI_TILE3D_GET_DATA_PTR(accTile); + } + +#ifdef DILATED_VQ_CONV_PARTIAL + xb_vecNx16U* restrict pOutScaleData = (xb_vecNx16U *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray); +#else + const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param); +#endif + + /* Pitches of Coefficient Data (NDWH) in dim1, dim2 and dim3 */ + const int32_t coeffPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTile); + const int32_t coeffPitch2 = XAI_TILE4D_GET_DIM2_PITCH(coeffTile); + const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile); + + /* Pitches of Input Data (DWH) in dim1 and dim2 */ + const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile); + const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile); + + /* Pitch of Output Data (DWH) in dim1 and dim2 */ + const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile); + const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile); + + /* Pitch of AccTile Data (DWH) in dim1 and dim2 */ + int32_t accDataPitch1 = 0; + int32_t accDataPitch2 = 0; + if (!(XAI_CNN_CONV_GET_FLAG_INPUT(param) && XAI_CNN_CONV_GET_FLAG_OUTPUT(param))) + { + accDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(accTile); + accDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(accTile); + } + + int32_t leftEdge, topEdge; + if ((dilatedkWidthU % 2) != 0) + { + leftEdge = dilatedkWidthU / 2; + } + else + { + leftEdge = leftEdgeFlag ? (dilatedkWidthU / 2) : ((dilatedkWidthU / 2) - 1); + } + + if ((dilatedkHeightU % 2) != 0) + { + topEdge = dilatedkHeightU / 2; + } + else + { + topEdge = topEdgeFlag ? 
(dilatedkHeightU / 2) : ((dilatedkHeightU / 2) - 1); + } + + + /* Move pointer to the start of the data (including edge) */ + pInData = &pInData[-((leftEdge) * inDataPitch1 + (topEdge) * inDataPitch2)]; + + /* Setting the limits for output data according to ReLu Flag and outTileType */ + int32_t minLim, maxLim; + if (enableReLu) + { + minLim = XAI_CNN_CONV_GET_RELU_MIN(param); + maxLim = XAI_CNN_CONV_GET_RELU_MAX(param); + } + else + { + minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \ + SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0); + maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \ + : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX); + } + const int8_t typeFlag = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0; + const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile); + + /* Variable Declarations */ + int32_t inCh, outCh, x, y, k; + valign vaOutData = IVP_ZALIGN(); + + xb_vecN_2x32v* restrict phvecBias; + xb_vec2Nx8* restrict pdvecCoeff; + xb_vec2Nx8U* restrict pdvecData1; + xb_vec2Nx8U* restrict pdvecData2; + xb_vec2Nx8U* restrict pdvecData3; + xb_vec2Nx8U* restrict pdvecData4; + xb_vec2Nx8* restrict pdvecOut; + xb_vecN_2x32v* restrict phvecAcc; + + /* Loops Start */ + for (outCh = 0; outCh < numOutCh; outCh += 2 * XCHAL_IVPN_SIMD_WIDTH) + { /* walk across the kernels */ + /* To handle corner case when number of output channels + * is not a multiple of 2 * XCHAL_IVPN_SIMD_WIDTH*/ + int32_t remainingOutCh = numOutCh - outCh; +#ifdef DILATED_VQ_CONV_PARTIAL + xb_vecNx16U outScaleDataEven, outScaleDataOdd; + /*Load output scale values*/ + VQ_INIT_OUTSCALE(pOutScaleData, remainingOutCh, outScaleDataEven, outScaleDataOdd); +#endif + for (y = 0; y < outH; y += 2) /* Image Height */ + { /* walk down the rows */ + /* Variable to handle corner case when height is odd */ + int32_t numY = XT_MIN(1, outH - y - 1); + for (x = 0; x < outW; x += 2) /* Image Width */ + { /* walk across the columns */ + /* 
Variable to handle corner case when width is odd */ + int32_t numX = XT_MIN(1, outW - x - 1); + + /* Output Data pointer */ + int8_t *pOut = pOutData + (x * outDataPitch1 + y * outDataPitch2) * bytesPerPixel; + int32_t *pAcc = pAccData + (x * accDataPitch1 + y * accDataPitch2); + + /* Initialize accumulators with bias values */ + xb_vec2Nx24 daccSum1, daccSum2, daccSum3, daccSum4; + if (inputFlag) /* Bias Values */ + { + phvecBias = (xb_vecN_2x32v *) (pBiasData + outCh); + ACC_INIT_BIAS(phvecBias, remainingOutCh, daccSum1, daccSum2, daccSum3, daccSum4); + } + else /* Accumulator tile*/ + { + xb_vecN_2x32v hvecAcc1LL, hvecAcc1LH, hvecAcc1HL, hvecAcc1HH; + xb_vecN_2x32v hvecAcc2LL, hvecAcc2LH, hvecAcc2HL, hvecAcc2HH; + xb_vecN_2x32v hvecAcc3LL, hvecAcc3LH, hvecAcc3HL, hvecAcc3HH; + xb_vecN_2x32v hvecAcc4LL, hvecAcc4LH, hvecAcc4HL, hvecAcc4HH; + + phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh); + valign vaAcc = IVP_LAN_2X32_PP(phvecAcc); + IVP_LAVN_2X32_XP(hvecAcc1LL, vaAcc, phvecAcc, 4 * remainingOutCh); + IVP_LAVN_2X32_XP(hvecAcc1LH, vaAcc, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH); + IVP_LAVN_2X32_XP(hvecAcc1HL, vaAcc, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH); + IVP_LAVN_2X32_XP(hvecAcc1HH, vaAcc, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH); + daccSum1 = IVP_CVT24UNX32L(hvecAcc1LH, hvecAcc1LL); + IVP_CVT24UNX32H(daccSum1, hvecAcc1HH, hvecAcc1HL); + + phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh + accDataPitch1 * numX); + vaAcc = IVP_LAN_2X32_PP(phvecAcc); + IVP_LAVN_2X32_XP(hvecAcc2LL, vaAcc, phvecAcc, 4 * remainingOutCh); + IVP_LAVN_2X32_XP(hvecAcc2LH, vaAcc, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH); + IVP_LAVN_2X32_XP(hvecAcc2HL, vaAcc, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH); + IVP_LAVN_2X32_XP(hvecAcc2HH, vaAcc, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH); + daccSum2 = IVP_CVT24UNX32L(hvecAcc2LH, hvecAcc2LL); + IVP_CVT24UNX32H(daccSum2, hvecAcc2HH, hvecAcc2HL); + + 
phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh + accDataPitch2 * numY); + vaAcc = IVP_LAN_2X32_PP(phvecAcc); + IVP_LAVN_2X32_XP(hvecAcc3LL, vaAcc, phvecAcc, 4 * remainingOutCh); + IVP_LAVN_2X32_XP(hvecAcc3LH, vaAcc, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH); + IVP_LAVN_2X32_XP(hvecAcc3HL, vaAcc, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH); + IVP_LAVN_2X32_XP(hvecAcc3HH, vaAcc, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH); + daccSum3 = IVP_CVT24UNX32L(hvecAcc3LH, hvecAcc3LL); + IVP_CVT24UNX32H(daccSum3, hvecAcc3HH, hvecAcc3HL); + + phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh + accDataPitch1 * numX + accDataPitch2 * numY); + vaAcc = IVP_LAN_2X32_PP(phvecAcc); + IVP_LAVN_2X32_XP(hvecAcc4LL, vaAcc, phvecAcc, 4 * remainingOutCh); + IVP_LAVN_2X32_XP(hvecAcc4LH, vaAcc, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH); + IVP_LAVN_2X32_XP(hvecAcc4HL, vaAcc, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH); + IVP_LAVN_2X32_XP(hvecAcc4HH, vaAcc, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH); + daccSum4 = IVP_CVT24UNX32L(hvecAcc4LH, hvecAcc4LL); + IVP_CVT24UNX32H(daccSum4, hvecAcc4HH, hvecAcc4HL); + } + + /* Input Data and Coeff Data Pointers */ + uint8_t *pData = pInData + x * strideX * inDataPitch1 + y * strideY * inDataPitch2; + int8_t *pCoeff = pCoeffData + outCh; + + xb_vecN_2x32v hvecInAddrOff = 0; + xb_vecN_2x32v hvecCoeffAddrOff = 0; + xb_vecN_2x32v hvecLaneIdx = 0; + int32_t inAddrOff, coeffAddrOff; + + for (k = 0; k < kHeightU * kWidthU; k++) /* Kernel Height * Kernel Width */ + { + /* Condition checks performed to get the Input and Coefficient */ + /* Pointer Offsets after combining the Kernel Width and Height Loops */ + vboolN_2 vbN_2 = IVP_EQN_2X32(hvecLaneIdx, kWidthU); + /* hvecLaneIdx will be reset to zero after every kWidth */ + hvecLaneIdx = IVP_MOVN_2X32T(0, hvecLaneIdx, vbN_2); + /* InPitch added after every kWidth */ + IVP_ADDN_2X32T(hvecInAddrOff, hvecInAddrOff, inDataPitch2 * dilationY - 
kWidthU * inDataPitch1 * dilationX, vbN_2); + /* CoeffPitch added after every kWidth */ + IVP_ADDN_2X32T(hvecCoeffAddrOff, hvecCoeffAddrOff, coeffPitch3 - kWidthU * coeffPitch2, vbN_2); + /* Extracting Input and Coefficient address offsets */ + inAddrOff = IVP_EXTRN_2X32(hvecInAddrOff, 0); + coeffAddrOff = IVP_EXTRN_2X32(hvecCoeffAddrOff, 0); + hvecLaneIdx = IVP_ADDN_2X32(hvecLaneIdx, 1); + hvecCoeffAddrOff = IVP_ADDN_2X32(hvecCoeffAddrOff, coeffPitch2); + hvecInAddrOff = IVP_ADDN_2X32(hvecInAddrOff, inDataPitch1 * dilationX); + + /* Pointers for Input Data Loads */ + pdvecData1 = (xb_vec2Nx8U *) (pData + inAddrOff); + pdvecData2 = (xb_vec2Nx8U *) (pData + inAddrOff + strideX * inDataPitch1 * numX); + pdvecData3 = (xb_vec2Nx8U *) (pData + inAddrOff + strideY * inDataPitch2 * numY); + pdvecData4 = (xb_vec2Nx8U *) (pData + inAddrOff + (strideX * inDataPitch1 + strideY * inDataPitch2) * numX * numY); + + /* Pointer for Coefficient Load */ + pdvecCoeff = (xb_vec2Nx8 *) (pCoeff + coeffAddrOff); + + /* Primes registers for Aligning Load */ + valign vaData1 = IVP_LA2NX8U_PP(pdvecData1); + valign vaData2 = IVP_LA2NX8U_PP(pdvecData2); + valign vaData3 = IVP_LA2NX8U_PP(pdvecData3); + valign vaData4 = IVP_LA2NX8U_PP(pdvecData4); + + for (inCh = 0; inCh < numInCh - 3; inCh += 4) /* Input Channels */ + { + xb_vec2Nx8U dvecInp1; IVP_LAV2NX8U_XP(dvecInp1, vaData1, pdvecData1, 4); + xb_vec2Nx8U dvecInp2; IVP_LAV2NX8U_XP(dvecInp2, vaData2, pdvecData2, 4); + xb_vec2Nx8U dvecInp3; IVP_LAV2NX8U_XP(dvecInp3, vaData3, pdvecData3, 4); + xb_vec2Nx8U dvecInp4; IVP_LAV2NX8U_XP(dvecInp4, vaData4, pdvecData4, 4); + +#ifdef IVP_MULSUQA2N8XR8 + /* Extracting first 4 bytes of vector into address register */ + /* Scalar integers to be used for QMUL */ + int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8U(dvecInp1)), 0); + int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8U(dvecInp2)), 0); + int32_t qmulScalar3 = 
IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8U(dvecInp3)), 0); + int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8U(dvecInp4)), 0); +#else + xb_vec2Nx8 dvecData1; + xb_vec2Nx8 dvecData2; + xb_vec2Nx8 dvecData3; + xb_vec2Nx8 dvecData4; + + dvecData1 = IVP_SUB2NX8U(dvecInp1, 128); + dvecData2 = IVP_SUB2NX8U(dvecInp2, 128); + dvecData3 = IVP_SUB2NX8U(dvecInp3, 128); + dvecData4 = IVP_SUB2NX8U(dvecInp4, 128); + + /* Extracting first 4 bytes of vector into address register */ + /* Scalar integers to be used for QMUL */ + int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData1)), 0); + int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData2)), 0); + int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData3)), 0); + int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData4)), 0); +#endif + + /* Aligned Vector Loads of coefficients */ + xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1); + xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1); + xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1); + xb_vec2Nx8 dvecCoeff4; IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch1); + +#ifdef IVP_MULSUQA2N8XR8 + IVP_MULSUQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1); + IVP_MULSUQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2); + IVP_MULSUQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3); + IVP_MULSUQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4); +#else + IVP_MULQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1); + IVP_MULQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2); + IVP_MULQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, 
dvecCoeff2, dvecCoeff1, qmulScalar3); + IVP_MULQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4); +#endif + } /* End Input Channels */ + + /* Corner Case Handling if number of input channels not multiple of 4 */ + if (inCh < numInCh) + { + int32_t remInCh = numInCh - inCh; + + /* Aligning variable vector load of pixels */ + xb_vec2Nx8U dvecInp1; IVP_LAV2NX8U_XP(dvecInp1, vaData1, pdvecData1, remInCh); + xb_vec2Nx8U dvecInp2; IVP_LAV2NX8U_XP(dvecInp2, vaData2, pdvecData2, remInCh); + xb_vec2Nx8U dvecInp3; IVP_LAV2NX8U_XP(dvecInp3, vaData3, pdvecData3, remInCh); + xb_vec2Nx8U dvecInp4; IVP_LAV2NX8U_XP(dvecInp4, vaData4, pdvecData4, remInCh); + +#ifdef IVP_MULSUQA2N8XR8 + /* Extracting first 4 bytes of vector into address register */ + /* Scalar integers to be used for QMUL */ + int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8U(dvecInp1)), 0); + int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8U(dvecInp2)), 0); + int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8U(dvecInp3)), 0); + int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8U(dvecInp4)), 0); +#else + xb_vec2Nx8 dvecData1 = 0; + xb_vec2Nx8 dvecData2 = 0; + xb_vec2Nx8 dvecData3 = 0; + xb_vec2Nx8 dvecData4 = 0; + + IVP_SUB2NX8UT(dvecData1, dvecInp1, 128, IVP_LT2NX8(IVP_SEQ2NX8U(), remInCh)); + IVP_SUB2NX8UT(dvecData2, dvecInp2, 128, IVP_LT2NX8(IVP_SEQ2NX8U(), remInCh)); + IVP_SUB2NX8UT(dvecData3, dvecInp3, 128, IVP_LT2NX8(IVP_SEQ2NX8U(), remInCh)); + IVP_SUB2NX8UT(dvecData4, dvecInp4, 128, IVP_LT2NX8(IVP_SEQ2NX8U(), remInCh)); + + /* Extracting first 4 bytes of vector into address register */ + /* Scalar integers to be used for QMUL */ + int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData1)), 0); + int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData2)), 0); + 
int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData3)), 0); + int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData4)), 0); +#endif + /* For conditional coefficient loads */ + int32_t enable2 = XT_SALT(1, remInCh); /* Will be 1 if remInCh > 1 */ + int32_t enable3 = XT_SALT(2, remInCh); /* Will be 1 if remInCh > 2 */ + + /* Coefficient Loads */ + xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1 * enable2); + xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1 * enable3); + xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1); + +#ifdef IVP_MULSUQA2N8XR8 + IVP_MULSUQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1); + IVP_MULSUQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2); + IVP_MULSUQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3); + IVP_MULSUQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4); +#else + IVP_MULQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1); + IVP_MULQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2); + IVP_MULQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3); + IVP_MULQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4); +#endif + } /* End Corner case handling */ + } /* End Kernel Height * Width */ + + if (outputFlag) /* Store to ouput Tile*/ + { + /* Pack, Output Scale, Output Shift and clamping */ + xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L; + xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H; +#ifdef DILATED_VQ_CONV_PARTIAL + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, 
maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); +#else + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); +#endif + /* Store the output dvecOut1 along the output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOut + outCh * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh); + IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Store the output dvecOut2 along the output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1) * numX * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numX); + IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numX); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Store the output dvecOut3 along the output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch2) * numY * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numY); + IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numY); + IVP_SAPOS2NX8_FP(vaOutData, 
pdvecOut); + + /* Store the output dvecOut4 along the output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1 * numX + outDataPitch2 * numY) * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numX * numY); + IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numX * numY); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + } + else /* Store to accumulator tile*/ + { + xb_vecN_2x32v hvecAcc1LL = IVP_CVT32S2NX24LL(daccSum1); + xb_vecN_2x32v hvecAcc1LH = IVP_CVT32S2NX24LH(daccSum1); + xb_vecN_2x32v hvecAcc1HL = IVP_CVT32S2NX24HL(daccSum1); + xb_vecN_2x32v hvecAcc1HH = IVP_CVT32S2NX24HH(daccSum1); + + xb_vecN_2x32v hvecAcc2LL = IVP_CVT32S2NX24LL(daccSum2); + xb_vecN_2x32v hvecAcc2LH = IVP_CVT32S2NX24LH(daccSum2); + xb_vecN_2x32v hvecAcc2HL = IVP_CVT32S2NX24HL(daccSum2); + xb_vecN_2x32v hvecAcc2HH = IVP_CVT32S2NX24HH(daccSum2); + + xb_vecN_2x32v hvecAcc3LL = IVP_CVT32S2NX24LL(daccSum3); + xb_vecN_2x32v hvecAcc3LH = IVP_CVT32S2NX24LH(daccSum3); + xb_vecN_2x32v hvecAcc3HL = IVP_CVT32S2NX24HL(daccSum3); + xb_vecN_2x32v hvecAcc3HH = IVP_CVT32S2NX24HH(daccSum3); + + xb_vecN_2x32v hvecAcc4LL = IVP_CVT32S2NX24LL(daccSum4); + xb_vecN_2x32v hvecAcc4LH = IVP_CVT32S2NX24LH(daccSum4); + xb_vecN_2x32v hvecAcc4HL = IVP_CVT32S2NX24HL(daccSum4); + xb_vecN_2x32v hvecAcc4HH = IVP_CVT32S2NX24HH(daccSum4); + + + /* Store the hvecAcc1 along the accTile depth */ + phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh); + IVP_SAVN_2X32_XP(hvecAcc1LL, vaOutData, phvecAcc, 4 * remainingOutCh); + IVP_SAVN_2X32_XP(hvecAcc1LH, vaOutData, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAVN_2X32_XP(hvecAcc1HL, vaOutData, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAVN_2X32_XP(hvecAcc1HH, vaOutData, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAPOSN_2X32_FP(vaOutData, phvecAcc); + + /* Store the hvecAcc2 along the accTile depth */ + 
phvecAcc = (xb_vecN_2x32v *) (pAcc + (outCh + accDataPitch1) * numX); + IVP_SAVN_2X32_XP(hvecAcc2LL, vaOutData, phvecAcc, 4 * remainingOutCh * numX); + IVP_SAVN_2X32_XP(hvecAcc2LH, vaOutData, phvecAcc, 4 * remainingOutCh * numX - 2 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAVN_2X32_XP(hvecAcc2HL, vaOutData, phvecAcc, 4 * remainingOutCh * numX - 4 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAVN_2X32_XP(hvecAcc2HH, vaOutData, phvecAcc, 4 * remainingOutCh * numX - 6 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAPOSN_2X32_FP(vaOutData, phvecAcc); + + /* Store the hvecAcc3 along the accTile depth */ + phvecAcc = (xb_vecN_2x32v *) (pAcc + (outCh + accDataPitch2) * numY); + IVP_SAVN_2X32_XP(hvecAcc3LL, vaOutData, phvecAcc, 4 * remainingOutCh * numY); + IVP_SAVN_2X32_XP(hvecAcc3LH, vaOutData, phvecAcc, 4 * remainingOutCh * numY - 2 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAVN_2X32_XP(hvecAcc3HL, vaOutData, phvecAcc, 4 * remainingOutCh * numY - 4 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAVN_2X32_XP(hvecAcc3HH, vaOutData, phvecAcc, 4 * remainingOutCh * numY - 6 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAPOSN_2X32_FP(vaOutData, phvecAcc); + + /* Store the hvecAcc4 along the accTile depth */ + phvecAcc = (xb_vecN_2x32v *) (pAcc + (outCh + accDataPitch1 * numX + accDataPitch2 * numY)); + IVP_SAVN_2X32_XP(hvecAcc4LL, vaOutData, phvecAcc, 4 * remainingOutCh * numX * numY); + IVP_SAVN_2X32_XP(hvecAcc4LH, vaOutData, phvecAcc, 4 * remainingOutCh * numX * numY - 2 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAVN_2X32_XP(hvecAcc4HL, vaOutData, phvecAcc, 4 * remainingOutCh * numX * numY - 4 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAVN_2X32_XP(hvecAcc4HH, vaOutData, phvecAcc, 4 * remainingOutCh * numX * numY - 6 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAPOSN_2X32_FP(vaOutData, phvecAcc); + } + } /* End image width */ + } /* End image height */ + } /* End Output Channels */ +} + +/****************************************************************************/ +/* Description : P6 optimized implementation of 3D partial convolution */ +/* Inputs : Input Data Tile, Coeff Data 
Tile, Bias Array, */ +/* CNN convolution params structure */ +/* InOuts : Output Tile */ +/* Assumptions : InData, CoeffData are S8 */ +/* biasArray is signed 32b, value not exceeding signed 24b */ +/* OutData is S8 / U8 / S16 */ +/* Kernel Size is MxNxDxNk. M and N sizes are less than or */ +/* equal to 16. */ +/* Input and Output are in DWH format */ +/* Coeff is in NDWH format */ +/* CoeffDim1Pitch is aligned to 2N (Ca2) */ +/* Edges along Depth dimension in inTile and coeffTile */ +/* are zero. */ +/****************************************************************************/ + +#ifdef DILATED_VQ_CONV_PARTIAL +static _XAI_INLINE_ void partialConvolvedVQ3D_S_MxNd1_S8S8IXCa2_MOD_DWH_QM32_contiguous_depth_x4( + const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D accTile, + xai_pTile3D outTile, + const xai_cnn_conv_params *param + ) +#else +static _XAI_INLINE_ void partialConvolved3D_S_MxNd1_S8S8IXCa2_MOD_DWH_QM32_contiguous_depth_x4( + const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D accTile, + xai_pTile3D outTile, + const xai_cnn_conv_params *param + ) +#endif +{ + /* Getting parameters from the tile structures */ + const int32_t outW = XAI_TILE3D_GET_DIM2(outTile); + const int32_t outH = XAI_TILE3D_GET_DIM3(outTile); + const int32_t numInCh = XAI_TILE3D_GET_DIM1(inTile); + const int32_t numOutCh = XAI_TILE3D_GET_DIM1(outTile); + + /* Kernel Size (NDWH) */ + const int32_t kWidthU = XAI_TILE4D_GET_DIM3(coeffTile); + const int32_t kHeightU = XAI_TILE4D_GET_DIM4(coeffTile); + + /* CNN convolution parameters */ + const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param); + const uint8_t outShiftU = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param); + const uint8_t enableReLu = XAI_CNN_CONV_GET_FLAG_RELU(param); + const uint8_t strideX = XAI_CNN_CONV_GET_STRIDEX(param); + const uint8_t strideY = XAI_CNN_CONV_GET_STRIDEY(param); + const 
uint8_t dilationX = 1; + const uint8_t dilationY = XAI_CNN_CONV_GET_DILATIONY(param); + const uint8_t leftEdgeFlag = XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param); + const uint8_t topEdgeFlag = XAI_CNN_CONV_GET_FLAG_TOPEDGE(param); + const uint8_t inputFlag = XAI_CNN_CONV_GET_FLAG_INPUT(param); + const uint8_t outputFlag = XAI_CNN_CONV_GET_FLAG_OUTPUT(param); + + /* Data Pointers of input, output, coefficient and bias data */ + int8_t *pInData = (int8_t *) XAI_TILE3D_GET_DATA_PTR(inTile); + int8_t *pOutData = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile); + int8_t *pCoeffData = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile); + int32_t *pBiasData = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray); + + int32_t * pAccData = NULL; + if (!(XAI_CNN_CONV_GET_FLAG_INPUT(param) && XAI_CNN_CONV_GET_FLAG_OUTPUT(param))) + { + pAccData = (int32_t *) XAI_TILE3D_GET_DATA_PTR(accTile); + } + +#ifdef DILATED_VQ_CONV_PARTIAL + xb_vecNx16U* restrict pOutScaleData = (xb_vecNx16U *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray); +#else + const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param); +#endif + + /* Pitches of Coefficient Data (NDWH) in dim1, dim2 and dim3 */ + const int32_t coeffPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTile); + const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile); + + /* Pitches of Input Data (DWH) in dim1 and dim2 */ + const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile); + const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile); + + /* Pitch of Output Data (DWH) in dim1 and dim2 */ + const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile); + const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile); + + /* Pitch of AccTile Data (DWH) in dim1 and dim2 */ + int32_t accDataPitch1 = 0; + int32_t accDataPitch2 = 0; + if (!(XAI_CNN_CONV_GET_FLAG_INPUT(param) && XAI_CNN_CONV_GET_FLAG_OUTPUT(param))) + { + accDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(accTile); + accDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(accTile); + } + + int32_t 
dilatedKWidthU = dilationX * (kWidthU - 1) + 1; + int32_t dilatedKHeightU = dilationY * (kHeightU - 1) + 1; + int32_t leftEdge, topEdge; + if ((dilatedKWidthU % 2) != 0) + { + leftEdge = dilatedKWidthU / 2; + } + else + { + leftEdge = leftEdgeFlag ? (dilatedKWidthU / 2) : ((dilatedKWidthU / 2) - 1); + } + + if ((dilatedKHeightU % 2) != 0) + { + topEdge = dilatedKHeightU / 2; + } + else + { + topEdge = topEdgeFlag ? (dilatedKHeightU / 2) : ((dilatedKHeightU / 2) - 1); + } + + + /* Move pointer to the start of the data (including edge) */ + pInData = &pInData[-((leftEdge) * inDataPitch1 + (topEdge) * inDataPitch2)]; + + + /* Setting the limits for output data according to ReLu Flag and outTileType */ + int32_t minLim, maxLim; + if (enableReLu) + { + minLim = XAI_CNN_CONV_GET_RELU_MIN(param); + maxLim = XAI_CNN_CONV_GET_RELU_MAX(param); + } + else + { + minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \ + SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0); + maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \ + : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX); + } + const int8_t typeFlag = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0; + const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile); + + /* Variable Declarations */ + int32_t outCh, x, y, ky, k, j; + valign vaOutData = IVP_ZALIGN(); + + xb_vecN_2x32v* restrict phvecBias; + xb_vec2Nx8* restrict pdvecCoeff; + xb_vec2Nx8* restrict pdvecData1; + xb_vec2Nx8* restrict pdvecData2; + xb_vec2Nx8* restrict pdvecData3; + xb_vec2Nx8* restrict pdvecData4; + xb_vec2Nx8* restrict pdvecOut; + xb_vecN_2x32v* restrict phvecAcc; + + xb_vecN_2x32v hvecSum1LL, hvecSum1LH, hvecSum1HL, hvecSum1HH; + xb_vecN_2x32v hvecSum2LL, hvecSum2LH, hvecSum2HL, hvecSum2HH; + xb_vecN_2x32v hvecSum3LL, hvecSum3LH, hvecSum3HL, hvecSum3HH; + xb_vecN_2x32v hvecSum4LL, hvecSum4LH, hvecSum4HL, hvecSum4HH; + + /* + * inCh and kWidth loops are combined. 
Assumed that the + * edges along Depth dimension of input data is zero and also + * edges along depth dimension of coefficient data is zero. + */ + + /* Loops Start */ + for (outCh = 0; outCh < numOutCh; outCh += 2 * XCHAL_IVPN_SIMD_WIDTH) + { /* walk across the kernels */ + /* To handle corner case when number of output channels + * is not a multiple of 2 * XCHAL_IVPN_SIMD_WIDTH*/ + int32_t remainingOutCh = numOutCh - outCh; +#ifdef DILATED_VQ_CONV_PARTIAL + xb_vecNx16U outScaleDataL, outScaleDataH; + /*Load output scale values*/ + valign vaScale = IVP_LANX16U_PP(pOutScaleData); + IVP_LAVNX16_XP(outScaleDataL, vaScale, pOutScaleData, 2 * remainingOutCh); + IVP_LAVNX16_XP(outScaleDataH, vaScale, pOutScaleData, 2 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH); +#endif + for (y = 0; y < outH; y += 2) /* Image Height */ + { /* walk down the rows */ + /* Variable to handle corner case when height is odd */ + int32_t numY = XT_MIN(1, outH - y - 1); + + for (x = 0; x < outW; x += 2) /* Image Width */ + { /* walk across the columns */ + /* Variable to handle corner case when width is odd */ + int32_t numX = XT_MIN(1, outW - x - 1); + + /* Output Data pointer */ + int8_t *pOut = pOutData + (x * outDataPitch1 + y * outDataPitch2) * bytesPerPixel; + int32_t *pAcc = pAccData + (x * accDataPitch1 + y * accDataPitch2); + + /* Initialize accumulators */ + hvecSum1LL = hvecSum1LH = hvecSum1HL = hvecSum1HH = 0; + hvecSum2LL = hvecSum2LH = hvecSum2HL = hvecSum2HH = 0; + hvecSum3LL = hvecSum3LH = hvecSum3HL = hvecSum3HH = 0; + hvecSum4LL = hvecSum4LH = hvecSum4HL = hvecSum4HH = 0; + /* Input Data and Coeff Data Pointers */ + int8_t *pData = pInData + x * strideX * inDataPitch1 + y * strideY * inDataPitch2; + int8_t *pCoeff = pCoeffData + outCh; + + + for (ky = 0; ky < kHeightU; ky++) /* Kernel Height */ + { + /* Pointers for Input Data Loads */ + pdvecData1 = (xb_vec2Nx8 *) (pData + ky * inDataPitch2 * dilationY); + pdvecData2 = (xb_vec2Nx8 *) (pData + ky * inDataPitch2 * dilationY + 
strideX * inDataPitch1 * numX); + pdvecData3 = (xb_vec2Nx8 *) (pData + ky * inDataPitch2 * dilationY + strideY * inDataPitch2 * numY); + pdvecData4 = (xb_vec2Nx8 *) (pData + ky * inDataPitch2 * dilationY + (strideX * inDataPitch1 + strideY * inDataPitch2) * numX * numY); + + /* Pointer for Coefficient Load */ + pdvecCoeff = (xb_vec2Nx8 *) (pCoeff + ky * coeffPitch3); + + /* Primes for Aligning Load */ + valign vaData1 = IVP_LA2NX8_PP(pdvecData1); + valign vaData2 = IVP_LA2NX8_PP(pdvecData2); + valign vaData3 = IVP_LA2NX8_PP(pdvecData3); + valign vaData4 = IVP_LA2NX8_PP(pdvecData4); + + /* (Input Channels * kWidth) loops combined */ + for (j = 0; j < kWidthU * numInCh; j += 508) /* Emulation: To avoid 24 bit overflow 2^23-1 / 128 / 128 = 511.99 */ + { + xb_vec2Nx24 daccSum1 = 0, daccSum2 = 0, daccSum3 = 0, daccSum4 = 0; + int32_t numIter = XT_MIN(508, kWidthU * numInCh - j); +#ifdef __XCC__ +#pragma loop_count min=1 +#endif + for (k = 0; k < numIter; k += 4) + { + /* Aligning variable vector load of pixels */ + xb_vec2Nx8 dvecData1; IVP_LAV2NX8_XP(dvecData1, vaData1, pdvecData1, 4); + xb_vec2Nx8 dvecData2; IVP_LAV2NX8_XP(dvecData2, vaData2, pdvecData2, 4); + xb_vec2Nx8 dvecData3; IVP_LAV2NX8_XP(dvecData3, vaData3, pdvecData3, 4); + xb_vec2Nx8 dvecData4; IVP_LAV2NX8_XP(dvecData4, vaData4, pdvecData4, 4); + + /* Extracting first 4 bytes of vector into address register */ + /* Scalar integers to be used for QMUL */ + int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData1)), 0); + int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData2)), 0); + int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData3)), 0); + int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData4)), 0); + + /* Aligned Vector Loads of coefficients */ + xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1); + xb_vec2Nx8 
dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1); + xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1); + xb_vec2Nx8 dvecCoeff4; IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch1); + + + IVP_MULQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1); + IVP_MULQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2); + IVP_MULQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3); + IVP_MULQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4); + } /* End for (k = 0; k < row; k += 4) */ + + hvecSum1LL = IVP_ADDN_2X32(IVP_CVT32S2NX24LL(daccSum1), hvecSum1LL); + hvecSum1LH = IVP_ADDN_2X32(IVP_CVT32S2NX24LH(daccSum1), hvecSum1LH); + hvecSum1HL = IVP_ADDN_2X32(IVP_CVT32S2NX24HL(daccSum1), hvecSum1HL); + hvecSum1HH = IVP_ADDN_2X32(IVP_CVT32S2NX24HH(daccSum1), hvecSum1HH); + + hvecSum2LL = IVP_ADDN_2X32(IVP_CVT32S2NX24LL(daccSum2), hvecSum2LL); + hvecSum2LH = IVP_ADDN_2X32(IVP_CVT32S2NX24LH(daccSum2), hvecSum2LH); + hvecSum2HL = IVP_ADDN_2X32(IVP_CVT32S2NX24HL(daccSum2), hvecSum2HL); + hvecSum2HH = IVP_ADDN_2X32(IVP_CVT32S2NX24HH(daccSum2), hvecSum2HH); + + hvecSum3LL = IVP_ADDN_2X32(IVP_CVT32S2NX24LL(daccSum3), hvecSum3LL); + hvecSum3LH = IVP_ADDN_2X32(IVP_CVT32S2NX24LH(daccSum3), hvecSum3LH); + hvecSum3HL = IVP_ADDN_2X32(IVP_CVT32S2NX24HL(daccSum3), hvecSum3HL); + hvecSum3HH = IVP_ADDN_2X32(IVP_CVT32S2NX24HH(daccSum3), hvecSum3HH); + + hvecSum4LL = IVP_ADDN_2X32(IVP_CVT32S2NX24LL(daccSum4), hvecSum4LL); + hvecSum4LH = IVP_ADDN_2X32(IVP_CVT32S2NX24LH(daccSum4), hvecSum4LH); + hvecSum4HL = IVP_ADDN_2X32(IVP_CVT32S2NX24HL(daccSum4), hvecSum4HL); + hvecSum4HH = IVP_ADDN_2X32(IVP_CVT32S2NX24HH(daccSum4), hvecSum4HH); + } /* End Kernel Height * Width */ + } /* End for (k = 0; k < row; k += 4)*/ + + if (inputFlag) /* Bias Values */ + { + phvecBias = (xb_vecN_2x32v *) (pBiasData + outCh); + xb_vecN_2x32v hvecBiasLL, hvecBiasLH, hvecBiasHL, 
hvecBiasHH; + valign vaBias = IVP_LAN_2X32_PP(phvecBias); + IVP_LAVN_2X32_XP(hvecBiasLL, vaBias, phvecBias, 4 * remainingOutCh); + IVP_LAVN_2X32_XP(hvecBiasLH, vaBias, phvecBias, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH); + IVP_LAVN_2X32_XP(hvecBiasHL, vaBias, phvecBias, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH); + IVP_LAVN_2X32_XP(hvecBiasHH, vaBias, phvecBias, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH); + + hvecSum1LL = IVP_ADDN_2X32(hvecSum1LL, hvecBiasLL); + hvecSum1LH = IVP_ADDN_2X32(hvecSum1LH, hvecBiasLH); + hvecSum1HL = IVP_ADDN_2X32(hvecSum1HL, hvecBiasHL); + hvecSum1HH = IVP_ADDN_2X32(hvecSum1HH, hvecBiasHH); + + hvecSum2LL = IVP_ADDN_2X32(hvecSum2LL, hvecBiasLL); + hvecSum2LH = IVP_ADDN_2X32(hvecSum2LH, hvecBiasLH); + hvecSum2HL = IVP_ADDN_2X32(hvecSum2HL, hvecBiasHL); + hvecSum2HH = IVP_ADDN_2X32(hvecSum2HH, hvecBiasHH); + + hvecSum3LL = IVP_ADDN_2X32(hvecSum3LL, hvecBiasLL); + hvecSum3LH = IVP_ADDN_2X32(hvecSum3LH, hvecBiasLH); + hvecSum3HL = IVP_ADDN_2X32(hvecSum3HL, hvecBiasHL); + hvecSum3HH = IVP_ADDN_2X32(hvecSum3HH, hvecBiasHH); + + hvecSum4LL = IVP_ADDN_2X32(hvecSum4LL, hvecBiasLL); + hvecSum4LH = IVP_ADDN_2X32(hvecSum4LH, hvecBiasLH); + hvecSum4HL = IVP_ADDN_2X32(hvecSum4HL, hvecBiasHL); + hvecSum4HH = IVP_ADDN_2X32(hvecSum4HH, hvecBiasHH); + } + else /* Accumulator tile*/ + { + xb_vecN_2x32v hvecAcc1LL, hvecAcc1LH, hvecAcc1HL, hvecAcc1HH; + xb_vecN_2x32v hvecAcc2LL, hvecAcc2LH, hvecAcc2HL, hvecAcc2HH; + xb_vecN_2x32v hvecAcc3LL, hvecAcc3LH, hvecAcc3HL, hvecAcc3HH; + xb_vecN_2x32v hvecAcc4LL, hvecAcc4LH, hvecAcc4HL, hvecAcc4HH; + + phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh); + valign vaAcc = IVP_LAN_2X32_PP(phvecAcc); + IVP_LAVN_2X32_XP(hvecAcc1LL, vaAcc, phvecAcc, 4 * remainingOutCh); + IVP_LAVN_2X32_XP(hvecAcc1LH, vaAcc, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH); + IVP_LAVN_2X32_XP(hvecAcc1HL, vaAcc, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH); + IVP_LAVN_2X32_XP(hvecAcc1HH, vaAcc, phvecAcc, 4 * 
remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH); + + hvecSum1LL = IVP_ADDN_2X32(hvecSum1LL, hvecAcc1LL); + hvecSum1LH = IVP_ADDN_2X32(hvecSum1LH, hvecAcc1LH); + hvecSum1HL = IVP_ADDN_2X32(hvecSum1HL, hvecAcc1HL); + hvecSum1HH = IVP_ADDN_2X32(hvecSum1HH, hvecAcc1HH); + + phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh + accDataPitch1 * numX); + vaAcc = IVP_LAN_2X32_PP(phvecAcc); + IVP_LAVN_2X32_XP(hvecAcc2LL, vaAcc, phvecAcc, 4 * remainingOutCh); + IVP_LAVN_2X32_XP(hvecAcc2LH, vaAcc, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH); + IVP_LAVN_2X32_XP(hvecAcc2HL, vaAcc, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH); + IVP_LAVN_2X32_XP(hvecAcc2HH, vaAcc, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH); + + hvecSum2LL = IVP_ADDN_2X32(hvecSum2LL, hvecAcc2LL); + hvecSum2LH = IVP_ADDN_2X32(hvecSum2LH, hvecAcc2LH); + hvecSum2HL = IVP_ADDN_2X32(hvecSum2HL, hvecAcc2HL); + hvecSum2HH = IVP_ADDN_2X32(hvecSum2HH, hvecAcc2HH); + + phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh + accDataPitch2 * numY); + vaAcc = IVP_LAN_2X32_PP(phvecAcc); + IVP_LAVN_2X32_XP(hvecAcc3LL, vaAcc, phvecAcc, 4 * remainingOutCh); + IVP_LAVN_2X32_XP(hvecAcc3LH, vaAcc, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH); + IVP_LAVN_2X32_XP(hvecAcc3HL, vaAcc, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH); + IVP_LAVN_2X32_XP(hvecAcc3HH, vaAcc, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH); + hvecSum3LL = IVP_ADDN_2X32(hvecSum3LL, hvecAcc3LL); + hvecSum3LH = IVP_ADDN_2X32(hvecSum3LH, hvecAcc3LH); + hvecSum3HL = IVP_ADDN_2X32(hvecSum3HL, hvecAcc3HL); + hvecSum3HH = IVP_ADDN_2X32(hvecSum3HH, hvecAcc3HH); + + phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh + accDataPitch1 * numX + accDataPitch2 * numY); + vaAcc = IVP_LAN_2X32_PP(phvecAcc); + IVP_LAVN_2X32_XP(hvecAcc4LL, vaAcc, phvecAcc, 4 * remainingOutCh); + IVP_LAVN_2X32_XP(hvecAcc4LH, vaAcc, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH); + IVP_LAVN_2X32_XP(hvecAcc4HL, vaAcc, phvecAcc, 4 * remainingOutCh 
- 4 * XCHAL_IVPN_SIMD_WIDTH); + IVP_LAVN_2X32_XP(hvecAcc4HH, vaAcc, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH); + hvecSum4LL = IVP_ADDN_2X32(hvecSum4LL, hvecAcc4LL); + hvecSum4LH = IVP_ADDN_2X32(hvecSum4LH, hvecAcc4LH); + hvecSum4HL = IVP_ADDN_2X32(hvecSum4HL, hvecAcc4HL); + hvecSum4HH = IVP_ADDN_2X32(hvecSum4HH, hvecAcc4HH); + } + + + if (outputFlag) /* Store to ouput Tile*/ + { + /* Pack, Output Scale, Output Shift and clamping */ + xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L; + xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H; + + +#ifdef DILATED_VQ_CONV_PARTIAL + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ_QM32(dvecOut1L, dvecOut1H, hvecSum1LL, hvecSum1LH, hvecSum1HL, hvecSum1HH, \ + packShiftAccU, outScaleDataL, outScaleDataH, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ_QM32(dvecOut2L, dvecOut2H, hvecSum2LL, hvecSum2LH, hvecSum2HL, hvecSum2HH, \ + packShiftAccU, outScaleDataL, outScaleDataH, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ_QM32(dvecOut3L, dvecOut3H, hvecSum3LL, hvecSum3LH, hvecSum3HL, hvecSum3HH, \ + packShiftAccU, outScaleDataL, outScaleDataH, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ_QM32(dvecOut4L, dvecOut4H, hvecSum4LL, hvecSum4LH, hvecSum4HL, hvecSum4HH, \ + packShiftAccU, outScaleDataL, outScaleDataH, outShiftU, minLim, maxLim, typeFlag); +#else + PACK_SCALE_SHIFT_CLAMP_LIMITS_QM32(dvecOut1L, dvecOut1H, hvecSum1LL, hvecSum1LH, hvecSum1HL, hvecSum1HH, \ + packShiftAccU, outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_QM32(dvecOut2L, dvecOut2H, hvecSum2LL, hvecSum2LH, hvecSum2HL, hvecSum2HH, \ + packShiftAccU, outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_QM32(dvecOut3L, dvecOut3H, hvecSum3LL, hvecSum3LH, hvecSum3HL, hvecSum3HH, \ + packShiftAccU, outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_QM32(dvecOut4L, dvecOut4H, hvecSum4LL, 
hvecSum4LH, hvecSum4HL, hvecSum4HH, \ + packShiftAccU, outScale, outShiftU, minLim, maxLim, typeFlag); +#endif + /* Store the output dvecOut1 along the output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOut + outCh * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh); + IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Store the output dvecOut2 along the output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1) * numX * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numX); + IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numX); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Store the output dvecOut3 along the output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch2) * numY * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numY); + IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numY); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Store the output dvecOut4 along the output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1 * numX + outDataPitch2 * numY) * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * \ + remainingOutCh * numX * numY); + IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numX * numY); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + } + else /* Store to accumulator tile*/ + { + /* Store the hvecAcc1 along the accTile depth */ + phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh); + IVP_SAVN_2X32_XP(hvecSum1LL, vaOutData, phvecAcc, 4 * remainingOutCh); + IVP_SAVN_2X32_XP(hvecSum1LH, vaOutData, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH); + 
IVP_SAVN_2X32_XP(hvecSum1HL, vaOutData, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAVN_2X32_XP(hvecSum1HH, vaOutData, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAPOSN_2X32_FP(vaOutData, phvecAcc); + + /* Store the hvecAcc2 along the accTile depth */ + phvecAcc = (xb_vecN_2x32v *) (pAcc + (outCh + accDataPitch1) * numX); + IVP_SAVN_2X32_XP(hvecSum2LL, vaOutData, phvecAcc, 4 * remainingOutCh * numX); + IVP_SAVN_2X32_XP(hvecSum2LH, vaOutData, phvecAcc, 4 * remainingOutCh * numX - 2 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAVN_2X32_XP(hvecSum2HL, vaOutData, phvecAcc, 4 * remainingOutCh * numX - 4 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAVN_2X32_XP(hvecSum2HH, vaOutData, phvecAcc, 4 * remainingOutCh * numX - 6 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAPOSN_2X32_FP(vaOutData, phvecAcc); + + /* Store the hvecAcc3 along the accTile depth */ + phvecAcc = (xb_vecN_2x32v *) (pAcc + (outCh + accDataPitch2) * numY); + IVP_SAVN_2X32_XP(hvecSum3LL, vaOutData, phvecAcc, 4 * remainingOutCh * numY); + IVP_SAVN_2X32_XP(hvecSum3LH, vaOutData, phvecAcc, 4 * remainingOutCh * numY - 2 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAVN_2X32_XP(hvecSum3HL, vaOutData, phvecAcc, 4 * remainingOutCh * numY - 4 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAVN_2X32_XP(hvecSum3HH, vaOutData, phvecAcc, 4 * remainingOutCh * numY - 6 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAPOSN_2X32_FP(vaOutData, phvecAcc); + + /* Store the hvecAcc4 along the accTile depth */ + phvecAcc = (xb_vecN_2x32v *) (pAcc + (outCh + accDataPitch1 * numX + accDataPitch2 * numY)); + IVP_SAVN_2X32_XP(hvecSum4LL, vaOutData, phvecAcc, 4 * remainingOutCh * numX * numY); + IVP_SAVN_2X32_XP(hvecSum4LH, vaOutData, phvecAcc, 4 * remainingOutCh * numX * numY - 2 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAVN_2X32_XP(hvecSum4HL, vaOutData, phvecAcc, 4 * remainingOutCh * numX * numY - 4 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAVN_2X32_XP(hvecSum4HH, vaOutData, phvecAcc, 4 * remainingOutCh * numX * numY - 6 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAPOSN_2X32_FP(vaOutData, 
phvecAcc); + } + } /* End image width */ + } /* End image height */ + } /* End Output Channels */ +} + +/****************************************************************************/ +/* Description : P6 optimized implementation of 3D partial convolution (dilationX fixed to 1; kernel-width and input-channel loops combined) */ +/* Inputs : Input Data Tile, Coeff Data Tile, Bias Array, */ +/* CNN convolution params structure */ +/* InOuts : Accumulator Tile, Output Tile */ +/* Assumptions : InData, CoeffData are S8 */ +/* biasArray is signed 32b, value not exceeding signed 24b */ +/* OutData is S8 / U8 / S16 */ +/* Kernel Size is MxNxDxNk. M and N sizes are less than or */ +/* equal to 16. */ +/* Input and Output are in DWH format */ +/* Coeff is in NDWH format */ +/* CoeffDim1Pitch is aligned to 2N (Ca2) */ +/* Edges along Depth dimension in inTile and coeffTile */ +/* are zero. */ +/****************************************************************************/ + +#ifdef DILATED_VQ_CONV_PARTIAL +static _XAI_INLINE_ void partialConvolvedVQ3D_S_MxNd1_S8S8IXCa2_MOD_DWH_QM32_contiguous_depth( + const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D accTile, + xai_pTile3D outTile, + const xai_cnn_conv_params *param + ) +#else +static _XAI_INLINE_ void partialConvolved3D_S_MxNd1_S8S8IXCa2_MOD_DWH_QM32_contiguous_depth( + const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D accTile, + xai_pTile3D outTile, + const xai_cnn_conv_params *param + ) +#endif +{ + /* Getting parameters from the tile structures */ + const int32_t outW = XAI_TILE3D_GET_DIM2(outTile); + const int32_t outH = XAI_TILE3D_GET_DIM3(outTile); + const int32_t numInCh = XAI_TILE3D_GET_DIM1(inTile); + const int32_t numOutCh = XAI_TILE3D_GET_DIM1(outTile); + + /* Kernel Size (NDWH) */ + const int32_t kWidthU = XAI_TILE4D_GET_DIM3(coeffTile); + const int32_t kHeightU = XAI_TILE4D_GET_DIM4(coeffTile); + + /* CNN convolution parameters */ + const uint8_t packShiftAccU =
XAI_CNN_CONV_GET_ACCUM_SHIFT(param); + const uint8_t outShiftU = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param); + const uint8_t enableReLu = XAI_CNN_CONV_GET_FLAG_RELU(param); + const uint8_t strideX = XAI_CNN_CONV_GET_STRIDEX(param); + const uint8_t strideY = XAI_CNN_CONV_GET_STRIDEY(param); + const uint8_t dilationX = 1; /* this variant does not read dilationX from param */ + const uint8_t dilationY = XAI_CNN_CONV_GET_DILATIONY(param); + const uint8_t leftEdgeFlag = XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param); + const uint8_t topEdgeFlag = XAI_CNN_CONV_GET_FLAG_TOPEDGE(param); + const uint8_t inputFlag = XAI_CNN_CONV_GET_FLAG_INPUT(param); + const uint8_t outputFlag = XAI_CNN_CONV_GET_FLAG_OUTPUT(param); + + /* Data Pointers of input, output, coefficient and bias data */ + int8_t *pInData = (int8_t *) XAI_TILE3D_GET_DATA_PTR(inTile); + int8_t *pOutData = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile); + int8_t *pCoeffData = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile); + int32_t *pBiasData = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray); + + int32_t * pAccData = NULL; /* accTile is dereferenced unless both the input and output flags are set */ + if (!(XAI_CNN_CONV_GET_FLAG_INPUT(param) && XAI_CNN_CONV_GET_FLAG_OUTPUT(param))) + { + pAccData = (int32_t *) XAI_TILE3D_GET_DATA_PTR(accTile); + } + +#ifdef DILATED_VQ_CONV_PARTIAL + xb_vecNx16U* restrict pOutScaleData = (xb_vecNx16U *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray); +#else + const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param); +#endif + + /* Pitches of Coefficient Data (NDWH) in dim1 and dim3 */ + const int32_t coeffPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTile); + const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile); + + /* Pitches of Input Data (DWH) in dim1 and dim2 */ + const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile); + const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile); + + /* Pitch of Output Data (DWH) in dim1 and dim2 */ + const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile); + const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile); + + /* Pitch of AccTile Data (DWH) in
dim1 and dim2 */ + int32_t accDataPitch1 = 0; + int32_t accDataPitch2 = 0; + if (!(XAI_CNN_CONV_GET_FLAG_INPUT(param) && XAI_CNN_CONV_GET_FLAG_OUTPUT(param))) + { + accDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(accTile); + accDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(accTile); + } + + int32_t dilatedKWidthU = dilationX * (kWidthU - 1) + 1; + int32_t dilatedKHeightU = dilationY * (kHeightU - 1) + 1; + int32_t leftEdge, topEdge; /* edge = half the dilated kernel size; for even sizes the edge flag selects size / 2 or size / 2 - 1 */ + if ((dilatedKWidthU % 2) != 0) + { + leftEdge = dilatedKWidthU / 2; + } + else + { + leftEdge = leftEdgeFlag ? (dilatedKWidthU / 2) : ((dilatedKWidthU / 2) - 1); + } + + if ((dilatedKHeightU % 2) != 0) + { + topEdge = dilatedKHeightU / 2; + } + else + { + topEdge = topEdgeFlag ? (dilatedKHeightU / 2) : ((dilatedKHeightU / 2) - 1); + } + + + /* Move pointer to the start of the data (including edge) */ + pInData = &pInData[-((leftEdge) * inDataPitch1 + (topEdge) * inDataPitch2)]; + + + /* Setting the limits for output data according to ReLu Flag and outTileType */ + int32_t minLim, maxLim; + if (enableReLu) + { + minLim = XAI_CNN_CONV_GET_RELU_MIN(param); + maxLim = XAI_CNN_CONV_GET_RELU_MAX(param); + } + else + { + minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \ + SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0); + maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \ + : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX); + } + const int8_t typeFlag = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ?
1 : 0; + const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile); + + /* Variable Declarations */ + int32_t outCh, x, y, ky, k, j; + valign vaOutData = IVP_ZALIGN(); + + xb_vecN_2x32v* restrict phvecBias; + xb_vec2Nx8* restrict pdvecCoeff; + xb_vec2Nx8* restrict pdvecData1; + xb_vec2Nx8* restrict pdvecData2; + xb_vec2Nx8* restrict pdvecData3; + xb_vec2Nx8* restrict pdvecData4; + xb_vec2Nx8* restrict pdvecOut; + xb_vecN_2x32v* restrict phvecAcc; + + xb_vecN_2x32v hvecSum1LL, hvecSum1LH, hvecSum1HL, hvecSum1HH; + xb_vecN_2x32v hvecSum2LL, hvecSum2LH, hvecSum2HL, hvecSum2HH; + xb_vecN_2x32v hvecSum3LL, hvecSum3LH, hvecSum3HL, hvecSum3HH; + xb_vecN_2x32v hvecSum4LL, hvecSum4LH, hvecSum4HL, hvecSum4HH; + + /* + * inCh and kWidth loops are combined. Assumed that the + * edges along Depth dimension of input data are zero and also + * edges along depth dimension of coefficient data are zero. + */ + + /* Loops Start */ + for (outCh = 0; outCh < numOutCh; outCh += 2 * XCHAL_IVPN_SIMD_WIDTH) + { /* walk across the kernels */ + /* To handle corner case when number of output channels + * is not a multiple of 2 * XCHAL_IVPN_SIMD_WIDTH*/ + int32_t remainingOutCh = numOutCh - outCh; +#ifdef DILATED_VQ_CONV_PARTIAL + xb_vecNx16U outScaleDataL, outScaleDataH; + /* Load output scale values */ + valign vaScale = IVP_LANX16U_PP(pOutScaleData); + IVP_LAVNX16_XP(outScaleDataL, vaScale, pOutScaleData, 2 * remainingOutCh); + IVP_LAVNX16_XP(outScaleDataH, vaScale, pOutScaleData, 2 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH); +#endif + for (y = 0; y < outH; y += 2) /* Image Height */ + { /* walk down the rows */ + /* Variable to handle corner case when height is odd: numY is 1 when row y + 1 exists, else 0 */ + int32_t numY = XT_MIN(1, outH - y - 1); + + for (x = 0; x < outW; x += 2) /* Image Width */ + { /* walk across the columns */ + /* Variable to handle corner case when width is odd: numX is 1 when column x + 1 exists, else 0 */ + int32_t numX = XT_MIN(1, outW - x - 1); + + /* Output Data pointer */ + int8_t *pOut = pOutData + (x * outDataPitch1 + y *
outDataPitch2) * bytesPerPixel; + int32_t *pAcc = pAccData + (x * accDataPitch1 + y * accDataPitch2); + + /* Initialize accumulators */ + hvecSum1LL = hvecSum1LH = hvecSum1HL = hvecSum1HH = 0; + hvecSum2LL = hvecSum2LH = hvecSum2HL = hvecSum2HH = 0; + hvecSum3LL = hvecSum3LH = hvecSum3HL = hvecSum3HH = 0; + hvecSum4LL = hvecSum4LH = hvecSum4HL = hvecSum4HH = 0; + /* Input Data and Coeff Data Pointers */ + int8_t *pData = pInData + x * strideX * inDataPitch1 + y * strideY * inDataPitch2; + int8_t *pCoeff = pCoeffData + outCh; + + + for (ky = 0; ky < kHeightU; ky++) /* Kernel Height */ + { + /* Pointers for Input Data Loads: inputs for outputs (x, y), (x + numX, y), (x, y + numY) and (x + numX * numY, y + numX * numY) */ + pdvecData1 = (xb_vec2Nx8 *) (pData + ky * inDataPitch2 * dilationY); + pdvecData2 = (xb_vec2Nx8 *) (pData + ky * inDataPitch2 * dilationY + strideX * inDataPitch1 * numX); + pdvecData3 = (xb_vec2Nx8 *) (pData + ky * inDataPitch2 * dilationY + strideY * inDataPitch2 * numY); + pdvecData4 = (xb_vec2Nx8 *) (pData + ky * inDataPitch2 * dilationY + (strideX * inDataPitch1 + strideY * inDataPitch2) * numX * numY); + + /* Pointer for Coefficient Load */ + pdvecCoeff = (xb_vec2Nx8 *) (pCoeff + ky * coeffPitch3); + + /* Primes for Aligning Load */ + valign vaData1 = IVP_LA2NX8_PP(pdvecData1); + valign vaData2 = IVP_LA2NX8_PP(pdvecData2); + valign vaData3 = IVP_LA2NX8_PP(pdvecData3); + valign vaData4 = IVP_LA2NX8_PP(pdvecData4); + + for (j = 0; j < kWidthU * numInCh; j += 508) /* daccSum* are xb_vec2Nx24 accumulators; each chunk of at most 508 products is folded into hvecSum* below */ + { + xb_vec2Nx24 daccSum1 = 0, daccSum2 = 0, daccSum3 = 0, daccSum4 = 0; + int32_t numIter = XT_MIN(508, kWidthU * numInCh - j); + for (k = 0; k < numIter - 3; k += 4) /* (Input Channels * kWidth) loops combined */ + { + /* Aligning variable vector load of pixels */ + xb_vec2Nx8 dvecData1; IVP_LAV2NX8_XP(dvecData1, vaData1, pdvecData1, 4); + xb_vec2Nx8 dvecData2; IVP_LAV2NX8_XP(dvecData2, vaData2, pdvecData2, 4); + xb_vec2Nx8 dvecData3; IVP_LAV2NX8_XP(dvecData3, vaData3, pdvecData3, 4); + xb_vec2Nx8 dvecData4; IVP_LAV2NX8_XP(dvecData4, vaData4, pdvecData4, 4); + + /* Extracting
first 4 bytes of vector into address register */ + /* Scalar integers to be used for QMUL */ + int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData1)), 0); + int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData2)), 0); + int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData3)), 0); + int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData4)), 0); + + /* Aligned Vector Loads of coefficients */ + xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1); + xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1); + xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1); + xb_vec2Nx8 dvecCoeff4; IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch1); + + + IVP_MULQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1); + IVP_MULQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2); + IVP_MULQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3); + IVP_MULQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4); + } /* End Input Channels */ + /* Corner case handling when numIter is not a multiple of 4 (1 to 3 values remain) */ + if (k < numIter) + { + int32_t remInCh = numIter - k; + + /* Aligning variable vector load of pixels */ + xb_vec2Nx8 dvecData1; IVP_LAV2NX8_XP(dvecData1, vaData1, pdvecData1, remInCh); + xb_vec2Nx8 dvecData2; IVP_LAV2NX8_XP(dvecData2, vaData2, pdvecData2, remInCh); + xb_vec2Nx8 dvecData3; IVP_LAV2NX8_XP(dvecData3, vaData3, pdvecData3, remInCh); + xb_vec2Nx8 dvecData4; IVP_LAV2NX8_XP(dvecData4, vaData4, pdvecData4, remInCh); + + /* Extracting first 4 bytes of vector into address register */ + /* Scalar integers to be used for QMUL */ + int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData1)), 0); + int32_t qmulScalar2 =
IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData2)), 0); + int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData3)), 0); + int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData4)), 0); + /* For conditional coefficient loads: the coefficient pointer advances by coeffPitch1 only while another input value remains */ + int32_t enable2 = XT_SALT(1, remInCh); /* Will be 1 if remInCh > 1 */ + int32_t enable3 = XT_SALT(2, remInCh); /* Will be 1 if remInCh > 2 */ + + /* Aligned Vector Loads of coefficients */ + xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1 * enable2); + xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1 * enable3); + xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1); + + + IVP_MULQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1); + IVP_MULQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2); + IVP_MULQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3); + IVP_MULQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4); + } /* End if( k < numIter)*/ + + hvecSum1LL = IVP_ADDN_2X32(IVP_CVT32S2NX24LL(daccSum1), hvecSum1LL); + hvecSum1LH = IVP_ADDN_2X32(IVP_CVT32S2NX24LH(daccSum1), hvecSum1LH); + hvecSum1HL = IVP_ADDN_2X32(IVP_CVT32S2NX24HL(daccSum1), hvecSum1HL); + hvecSum1HH = IVP_ADDN_2X32(IVP_CVT32S2NX24HH(daccSum1), hvecSum1HH); + + hvecSum2LL = IVP_ADDN_2X32(IVP_CVT32S2NX24LL(daccSum2), hvecSum2LL); + hvecSum2LH = IVP_ADDN_2X32(IVP_CVT32S2NX24LH(daccSum2), hvecSum2LH); + hvecSum2HL = IVP_ADDN_2X32(IVP_CVT32S2NX24HL(daccSum2), hvecSum2HL); + hvecSum2HH = IVP_ADDN_2X32(IVP_CVT32S2NX24HH(daccSum2), hvecSum2HH); + + hvecSum3LL = IVP_ADDN_2X32(IVP_CVT32S2NX24LL(daccSum3), hvecSum3LL); + hvecSum3LH = IVP_ADDN_2X32(IVP_CVT32S2NX24LH(daccSum3), hvecSum3LH); + hvecSum3HL = IVP_ADDN_2X32(IVP_CVT32S2NX24HL(daccSum3), hvecSum3HL); + hvecSum3HH = IVP_ADDN_2X32(IVP_CVT32S2NX24HH(daccSum3), hvecSum3HH); + +
hvecSum4LL = IVP_ADDN_2X32(IVP_CVT32S2NX24LL(daccSum4), hvecSum4LL); + hvecSum4LH = IVP_ADDN_2X32(IVP_CVT32S2NX24LH(daccSum4), hvecSum4LH); + hvecSum4HL = IVP_ADDN_2X32(IVP_CVT32S2NX24HL(daccSum4), hvecSum4HL); + hvecSum4HH = IVP_ADDN_2X32(IVP_CVT32S2NX24HH(daccSum4), hvecSum4HH); + } + } /* End Kernel Height * Width */ + if (inputFlag) /* Bias Values: added to all four sums when inputFlag is set */ + { + phvecBias = (xb_vecN_2x32v *) (pBiasData + outCh); + xb_vecN_2x32v hvecBiasLL, hvecBiasLH, hvecBiasHL, hvecBiasHH; + valign vaBias = IVP_LAN_2X32_PP(phvecBias); + IVP_LAVN_2X32_XP(hvecBiasLL, vaBias, phvecBias, 4 * remainingOutCh); + IVP_LAVN_2X32_XP(hvecBiasLH, vaBias, phvecBias, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH); + IVP_LAVN_2X32_XP(hvecBiasHL, vaBias, phvecBias, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH); + IVP_LAVN_2X32_XP(hvecBiasHH, vaBias, phvecBias, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH); + + hvecSum1LL = IVP_ADDN_2X32(hvecSum1LL, hvecBiasLL); + hvecSum1LH = IVP_ADDN_2X32(hvecSum1LH, hvecBiasLH); + hvecSum1HL = IVP_ADDN_2X32(hvecSum1HL, hvecBiasHL); + hvecSum1HH = IVP_ADDN_2X32(hvecSum1HH, hvecBiasHH); + + hvecSum2LL = IVP_ADDN_2X32(hvecSum2LL, hvecBiasLL); + hvecSum2LH = IVP_ADDN_2X32(hvecSum2LH, hvecBiasLH); + hvecSum2HL = IVP_ADDN_2X32(hvecSum2HL, hvecBiasHL); + hvecSum2HH = IVP_ADDN_2X32(hvecSum2HH, hvecBiasHH); + + hvecSum3LL = IVP_ADDN_2X32(hvecSum3LL, hvecBiasLL); + hvecSum3LH = IVP_ADDN_2X32(hvecSum3LH, hvecBiasLH); + hvecSum3HL = IVP_ADDN_2X32(hvecSum3HL, hvecBiasHL); + hvecSum3HH = IVP_ADDN_2X32(hvecSum3HH, hvecBiasHH); + + hvecSum4LL = IVP_ADDN_2X32(hvecSum4LL, hvecBiasLL); + hvecSum4LH = IVP_ADDN_2X32(hvecSum4LH, hvecBiasLH); + hvecSum4HL = IVP_ADDN_2X32(hvecSum4HL, hvecBiasHL); + hvecSum4HH = IVP_ADDN_2X32(hvecSum4HH, hvecBiasHH); + } + else /* Accumulator tile: add the partial sums loaded from accTile */ + { + xb_vecN_2x32v hvecAcc1LL, hvecAcc1LH, hvecAcc1HL, hvecAcc1HH; + xb_vecN_2x32v hvecAcc2LL, hvecAcc2LH, hvecAcc2HL, hvecAcc2HH; + xb_vecN_2x32v hvecAcc3LL, hvecAcc3LH, hvecAcc3HL, hvecAcc3HH; +
xb_vecN_2x32v hvecAcc4LL, hvecAcc4LH, hvecAcc4HL, hvecAcc4HH; + + phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh); + valign vaAcc = IVP_LAN_2X32_PP(phvecAcc); + IVP_LAVN_2X32_XP(hvecAcc1LL, vaAcc, phvecAcc, 4 * remainingOutCh); + IVP_LAVN_2X32_XP(hvecAcc1LH, vaAcc, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH); + IVP_LAVN_2X32_XP(hvecAcc1HL, vaAcc, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH); + IVP_LAVN_2X32_XP(hvecAcc1HH, vaAcc, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH); + + hvecSum1LL = IVP_ADDN_2X32(hvecSum1LL, hvecAcc1LL); + hvecSum1LH = IVP_ADDN_2X32(hvecSum1LH, hvecAcc1LH); + hvecSum1HL = IVP_ADDN_2X32(hvecSum1HL, hvecAcc1HL); + hvecSum1HH = IVP_ADDN_2X32(hvecSum1HH, hvecAcc1HH); + + phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh + accDataPitch1 * numX); + vaAcc = IVP_LAN_2X32_PP(phvecAcc); + IVP_LAVN_2X32_XP(hvecAcc2LL, vaAcc, phvecAcc, 4 * remainingOutCh); + IVP_LAVN_2X32_XP(hvecAcc2LH, vaAcc, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH); + IVP_LAVN_2X32_XP(hvecAcc2HL, vaAcc, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH); + IVP_LAVN_2X32_XP(hvecAcc2HH, vaAcc, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH); + + hvecSum2LL = IVP_ADDN_2X32(hvecSum2LL, hvecAcc2LL); + hvecSum2LH = IVP_ADDN_2X32(hvecSum2LH, hvecAcc2LH); + hvecSum2HL = IVP_ADDN_2X32(hvecSum2HL, hvecAcc2HL); + hvecSum2HH = IVP_ADDN_2X32(hvecSum2HH, hvecAcc2HH); + + phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh + accDataPitch2 * numY); + vaAcc = IVP_LAN_2X32_PP(phvecAcc); + IVP_LAVN_2X32_XP(hvecAcc3LL, vaAcc, phvecAcc, 4 * remainingOutCh); + IVP_LAVN_2X32_XP(hvecAcc3LH, vaAcc, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH); + IVP_LAVN_2X32_XP(hvecAcc3HL, vaAcc, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH); + IVP_LAVN_2X32_XP(hvecAcc3HH, vaAcc, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH); + hvecSum3LL = IVP_ADDN_2X32(hvecSum3LL, hvecAcc3LL); + hvecSum3LH = IVP_ADDN_2X32(hvecSum3LH, hvecAcc3LH); +
hvecSum3HL = IVP_ADDN_2X32(hvecSum3HL, hvecAcc3HL); + hvecSum3HH = IVP_ADDN_2X32(hvecSum3HH, hvecAcc3HH); + + phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh + accDataPitch1 * numX + accDataPitch2 * numY); + vaAcc = IVP_LAN_2X32_PP(phvecAcc); + IVP_LAVN_2X32_XP(hvecAcc4LL, vaAcc, phvecAcc, 4 * remainingOutCh); + IVP_LAVN_2X32_XP(hvecAcc4LH, vaAcc, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH); + IVP_LAVN_2X32_XP(hvecAcc4HL, vaAcc, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH); + IVP_LAVN_2X32_XP(hvecAcc4HH, vaAcc, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH); + hvecSum4LL = IVP_ADDN_2X32(hvecSum4LL, hvecAcc4LL); + hvecSum4LH = IVP_ADDN_2X32(hvecSum4LH, hvecAcc4LH); + hvecSum4HL = IVP_ADDN_2X32(hvecSum4HL, hvecAcc4HL); + hvecSum4HH = IVP_ADDN_2X32(hvecSum4HH, hvecAcc4HH); + } + + if (outputFlag) /* Store to output tile */ + { + /* Pack, Output Scale, Output Shift and clamping */ + xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L; + xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H; +#ifdef DILATED_VQ_CONV_PARTIAL + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ_QM32(dvecOut1L, dvecOut1H, hvecSum1LL, hvecSum1LH, hvecSum1HL, hvecSum1HH, \ + packShiftAccU, outScaleDataL, outScaleDataH, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ_QM32(dvecOut2L, dvecOut2H, hvecSum2LL, hvecSum2LH, hvecSum2HL, hvecSum2HH, \ + packShiftAccU, outScaleDataL, outScaleDataH, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ_QM32(dvecOut3L, dvecOut3H, hvecSum3LL, hvecSum3LH, hvecSum3HL, hvecSum3HH, \ + packShiftAccU, outScaleDataL, outScaleDataH, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ_QM32(dvecOut4L, dvecOut4H, hvecSum4LL, hvecSum4LH, hvecSum4HL, hvecSum4HH, \ + packShiftAccU, outScaleDataL, outScaleDataH, outShiftU, minLim, maxLim, typeFlag); +#else + PACK_SCALE_SHIFT_CLAMP_LIMITS_QM32(dvecOut1L, dvecOut1H, hvecSum1LL, hvecSum1LH, hvecSum1HL, hvecSum1HH, \ + packShiftAccU, outScale,
outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_QM32(dvecOut2L, dvecOut2H, hvecSum2LL, hvecSum2LH, hvecSum2HL, hvecSum2HH, \ + packShiftAccU, outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_QM32(dvecOut3L, dvecOut3H, hvecSum3LL, hvecSum3LH, hvecSum3HL, hvecSum3HH, \ + packShiftAccU, outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_QM32(dvecOut4L, dvecOut4H, hvecSum4LL, hvecSum4LH, hvecSum4HL, hvecSum4HH, \ + packShiftAccU, outScale, outShiftU, minLim, maxLim, typeFlag); +#endif + /* Store the output dvecOut1 along the output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOut + outCh * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh); + IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Store the output dvecOut2 along the output depth; the byte counts are scaled by numX, so they are 0 when column x + 1 is absent */ + pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1) * numX * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numX); + IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numX); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Store the output dvecOut3 along the output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch2) * numY * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numY); + IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numY); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Store the output dvecOut4 along the output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1 * numX + outDataPitch2 * numY) * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * \ + remainingOutCh * numX * numY); + IVP_SAV2NX8_XP(dvecOut4H,
vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numX * numY); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + } + else /* Store to accumulator tile */ + { + /* Store the hvecAcc1 along the accTile depth */ + phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh); + IVP_SAVN_2X32_XP(hvecSum1LL, vaOutData, phvecAcc, 4 * remainingOutCh); + IVP_SAVN_2X32_XP(hvecSum1LH, vaOutData, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAVN_2X32_XP(hvecSum1HL, vaOutData, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAVN_2X32_XP(hvecSum1HH, vaOutData, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAPOSN_2X32_FP(vaOutData, phvecAcc); + + /* Store the hvecAcc2 along the accTile depth */ + phvecAcc = (xb_vecN_2x32v *) (pAcc + (outCh + accDataPitch1) * numX); + IVP_SAVN_2X32_XP(hvecSum2LL, vaOutData, phvecAcc, 4 * remainingOutCh * numX); + IVP_SAVN_2X32_XP(hvecSum2LH, vaOutData, phvecAcc, 4 * remainingOutCh * numX - 2 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAVN_2X32_XP(hvecSum2HL, vaOutData, phvecAcc, 4 * remainingOutCh * numX - 4 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAVN_2X32_XP(hvecSum2HH, vaOutData, phvecAcc, 4 * remainingOutCh * numX - 6 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAPOSN_2X32_FP(vaOutData, phvecAcc); + + /* Store the hvecAcc3 along the accTile depth */ + phvecAcc = (xb_vecN_2x32v *) (pAcc + (outCh + accDataPitch2) * numY); + IVP_SAVN_2X32_XP(hvecSum3LL, vaOutData, phvecAcc, 4 * remainingOutCh * numY); + IVP_SAVN_2X32_XP(hvecSum3LH, vaOutData, phvecAcc, 4 * remainingOutCh * numY - 2 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAVN_2X32_XP(hvecSum3HL, vaOutData, phvecAcc, 4 * remainingOutCh * numY - 4 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAVN_2X32_XP(hvecSum3HH, vaOutData, phvecAcc, 4 * remainingOutCh * numY - 6 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAPOSN_2X32_FP(vaOutData, phvecAcc); + + /* Store the hvecAcc4 along the accTile depth */ + phvecAcc = (xb_vecN_2x32v *) (pAcc + (outCh + accDataPitch1 * numX + accDataPitch2 * numY)); +
IVP_SAVN_2X32_XP(hvecSum4LL, vaOutData, phvecAcc, 4 * remainingOutCh * numX * numY); + IVP_SAVN_2X32_XP(hvecSum4LH, vaOutData, phvecAcc, 4 * remainingOutCh * numX * numY - 2 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAVN_2X32_XP(hvecSum4HL, vaOutData, phvecAcc, 4 * remainingOutCh * numX * numY - 4 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAVN_2X32_XP(hvecSum4HH, vaOutData, phvecAcc, 4 * remainingOutCh * numX * numY - 6 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAPOSN_2X32_FP(vaOutData, phvecAcc); + } + } /* End image width */ + } /* End image height */ + } /* End Output Channels */ +} + +/****************************************************************************/ +/* Description : P6 optimized implementation of 3D partial convolution */ +/* Inputs : Input Data Tile, Coeff Data Tile, Bias Array, */ +/* CNN convolution params structure */ +/* InOuts : Output Tile */ +/* Assumptions : InData, CoeffData are S8 */ +/* biasArray is signed 32b, value not exceeding signed 24b */ +/* OutData is S8 / U8 / S16 */ +/* Kernel Size is MxNxDxNk. M and N sizes are less than or */ +/* equal to 16. */ +/* Input and Output are in DWH format */ +/* Coeff is in NDWH format */ +/* CoeffDim1Pitch is aligned to 2N (Ca2) */ +/* Edges along Depth dimension in inTile and coeffTile */ +/* are zero.
*/ +/****************************************************************************/ + +#ifdef DILATED_VQ_CONV_PARTIAL +static _XAI_INLINE_ void partialConvolvedVQ3D_S_MxN_S8S8IXCa2_MOD_DWH_QM32( + const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D accTile, + xai_pTile3D outTile, + const xai_cnn_conv_params *param + ) +#else +static _XAI_INLINE_ void partialConvolved3D_S_MxN_S8S8IXCa2_MOD_DWH_QM32( + const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D accTile, + xai_pTile3D outTile, + const xai_cnn_conv_params *param + ) +#endif +{ + /* Getting parameters from the tile structures */ + const int32_t outW = XAI_TILE3D_GET_DIM2(outTile); + const int32_t outH = XAI_TILE3D_GET_DIM3(outTile); + const int32_t numInCh = XAI_TILE3D_GET_DIM1(inTile); + const int32_t numOutCh = XAI_TILE3D_GET_DIM1(outTile); + const uint8_t dilationX = XAI_CNN_CONV_GET_DILATIONX(param); + const uint8_t dilationY = XAI_CNN_CONV_GET_DILATIONY(param); + + /* Kernel Size (NDWH) */ + const int32_t kWidthU = XAI_TILE4D_GET_DIM3(coeffTile); + const int32_t kHeightU = XAI_TILE4D_GET_DIM4(coeffTile); + int32_t dilatedkWidthU = dilationX * (kWidthU - 1) + 1; + int32_t dilatedkHeightU = dilationY * (kHeightU - 1) + 1; + + /* CNN convolution parameters */ + const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param); + const uint8_t outShiftU = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param); + const uint8_t enableReLu = XAI_CNN_CONV_GET_FLAG_RELU(param); + const uint8_t strideX = XAI_CNN_CONV_GET_STRIDEX(param); + const uint8_t strideY = XAI_CNN_CONV_GET_STRIDEY(param); + const uint8_t leftEdgeFlag = XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param); + const uint8_t topEdgeFlag = XAI_CNN_CONV_GET_FLAG_TOPEDGE(param); + const uint8_t inputFlag = XAI_CNN_CONV_GET_FLAG_INPUT(param); + const uint8_t outputFlag = XAI_CNN_CONV_GET_FLAG_OUTPUT(param); + + /* Data Pointers of input, output, 
coefficient and bias data */ + int8_t *pInData = (int8_t *) XAI_TILE3D_GET_DATA_PTR(inTile); + int8_t *pOutData = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile); + int8_t *pCoeffData = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile); + int32_t *pBiasData = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray); + + int32_t * pAccData = NULL; + if (!(XAI_CNN_CONV_GET_FLAG_INPUT(param) && XAI_CNN_CONV_GET_FLAG_OUTPUT(param))) + { + pAccData = (int32_t *) XAI_TILE3D_GET_DATA_PTR(accTile); + } + +#ifdef DILATED_VQ_CONV_PARTIAL + xb_vecNx16U* restrict pOutScaleData = (xb_vecNx16U *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray); +#else + const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param); +#endif + + /* Pitches of Coefficient Data (NDWH) in dim1, dim2 and dim3 */ + const int32_t coeffPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTile); + const int32_t coeffPitch2 = XAI_TILE4D_GET_DIM2_PITCH(coeffTile); + const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile); + + /* Pitches of Input Data (DWH) in dim1 and dim2 */ + const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile); + const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile); + + /* Pitch of Output Data (DWH) in dim1 and dim2 */ + const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile); + const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile); + + /* Pitch of AccTile Data (DWH) in dim1 and dim2 */ + int32_t accDataPitch1 = 0; + int32_t accDataPitch2 = 0; + if (!(XAI_CNN_CONV_GET_FLAG_INPUT(param) && XAI_CNN_CONV_GET_FLAG_OUTPUT(param))) + { + accDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(accTile); + accDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(accTile); + } + + int32_t leftEdge, topEdge; + if ((dilatedkWidthU % 2) != 0) + { + leftEdge = dilatedkWidthU / 2; + } + else + { + leftEdge = leftEdgeFlag ? (dilatedkWidthU / 2) : ((dilatedkWidthU / 2) - 1); + } + + if ((dilatedkHeightU % 2) != 0) + { + topEdge = dilatedkHeightU / 2; + } + else + { + topEdge = topEdgeFlag ? 
(dilatedkHeightU / 2) : ((dilatedkHeightU / 2) - 1); + } + + + /* Move pointer to the start of the data (including edge) */ + pInData = &pInData[-((leftEdge) * inDataPitch1 + (topEdge) * inDataPitch2)]; + + /* Setting the limits for output data according to ReLu Flag and outTileType */ + int32_t minLim, maxLim; + if (enableReLu) + { + minLim = XAI_CNN_CONV_GET_RELU_MIN(param); + maxLim = XAI_CNN_CONV_GET_RELU_MAX(param); + } + else + { + minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \ + SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0); + maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \ + : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX); + } + const int8_t typeFlag = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0; + const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile); + + /* Variable Declarations */ + int32_t inCh, outCh, x, y, k, j; + valign vaOutData = IVP_ZALIGN(); + + xb_vecN_2x32v* restrict phvecBias; + xb_vec2Nx8* restrict pdvecCoeff; + xb_vec2Nx8* restrict pdvecData1; + xb_vec2Nx8* restrict pdvecData2; + xb_vec2Nx8* restrict pdvecData3; + xb_vec2Nx8* restrict pdvecData4; + xb_vec2Nx8* restrict pdvecOut; + xb_vecN_2x32v* restrict phvecAcc; + + xb_vecN_2x32v hvecSum1LL, hvecSum1LH, hvecSum1HL, hvecSum1HH; + xb_vecN_2x32v hvecSum2LL, hvecSum2LH, hvecSum2HL, hvecSum2HH; + xb_vecN_2x32v hvecSum3LL, hvecSum3LH, hvecSum3HL, hvecSum3HH; + xb_vecN_2x32v hvecSum4LL, hvecSum4LH, hvecSum4HL, hvecSum4HH; + /* Loops Start */ + for (outCh = 0; outCh < numOutCh; outCh += 2 * XCHAL_IVPN_SIMD_WIDTH) + { /* walk across the kernels */ + /* To handle corner case when number of output channels + * is not a multiple of 2 * XCHAL_IVPN_SIMD_WIDTH*/ + int32_t remainingOutCh = numOutCh - outCh; +#ifdef DILATED_VQ_CONV_PARTIAL + xb_vecNx16U outScaleDataL, outScaleDataH; + /*Load output scale values*/ + valign vaScale = IVP_LANX16U_PP(pOutScaleData); + IVP_LAVNX16_XP(outScaleDataL, vaScale, pOutScaleData, 2 * 
remainingOutCh); + IVP_LAVNX16_XP(outScaleDataH, vaScale, pOutScaleData, 2 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH); +#endif + for (y = 0; y < outH; y += 2) /* Image Height */ + { /* walk down the rows */ + /* Variable to handle corner case when height is odd */ + int32_t numY = XT_MIN(1, outH - y - 1); + for (x = 0; x < outW; x += 2) /* Image Width */ + { /* walk across the columns */ + /* Variable to handle corner case when width is odd */ + int32_t numX = XT_MIN(1, outW - x - 1); + + /* Output Data pointer */ + int8_t *pOut = pOutData + (x * outDataPitch1 + y * outDataPitch2) * bytesPerPixel; + int32_t *pAcc = pAccData + (x * accDataPitch1 + y * accDataPitch2); + + /* Initialize accumulators */ + hvecSum1LL = hvecSum1LH = hvecSum1HL = hvecSum1HH = 0; + hvecSum2LL = hvecSum2LH = hvecSum2HL = hvecSum2HH = 0; + hvecSum3LL = hvecSum3LH = hvecSum3HL = hvecSum3HH = 0; + hvecSum4LL = hvecSum4LH = hvecSum4HL = hvecSum4HH = 0; + /* Input Data and Coeff Data Pointers */ + int8_t *pData = pInData + x * strideX * inDataPitch1 + y * strideY * inDataPitch2; + int8_t *pCoeff = pCoeffData + outCh; + + xb_vecN_2x32v hvecInAddrOff = 0; + xb_vecN_2x32v hvecCoeffAddrOff = 0; + xb_vecN_2x32v hvecLaneIdx = 0; + int32_t inAddrOff, coeffAddrOff; + + for (k = 0; k < kHeightU * kWidthU; k++) /* Kernel Height * Kernel Width */ + { + /* Condition checks performed to get the Input and Coefficient */ + /* Pointer Offsets after combining the Kernel Width and Height Loops */ + vboolN_2 vbN_2 = IVP_EQN_2X32(hvecLaneIdx, kWidthU); + /* hvecLaneIdx will be reset to zero after every kWidth */ + hvecLaneIdx = IVP_MOVN_2X32T(0, hvecLaneIdx, vbN_2); + /* InPitch added after every kWidth */ + IVP_ADDN_2X32T(hvecInAddrOff, hvecInAddrOff, inDataPitch2 * dilationY - kWidthU * inDataPitch1 * dilationX, vbN_2); + /* CoeffPitch added after every kWidth */ + IVP_ADDN_2X32T(hvecCoeffAddrOff, hvecCoeffAddrOff, coeffPitch3 - kWidthU * coeffPitch2, vbN_2); + /* Extracting Input and Coefficient address 
offsets */ + inAddrOff = IVP_EXTRN_2X32(hvecInAddrOff, 0); + coeffAddrOff = IVP_EXTRN_2X32(hvecCoeffAddrOff, 0); + hvecLaneIdx = IVP_ADDN_2X32(hvecLaneIdx, 1); + hvecCoeffAddrOff = IVP_ADDN_2X32(hvecCoeffAddrOff, coeffPitch2); + hvecInAddrOff = IVP_ADDN_2X32(hvecInAddrOff, inDataPitch1 * dilationX); + + /* Pointers for Input Data Loads */ + pdvecData1 = (xb_vec2Nx8 *) (pData + inAddrOff); + pdvecData2 = (xb_vec2Nx8 *) (pData + inAddrOff + strideX * inDataPitch1 * numX); + pdvecData3 = (xb_vec2Nx8 *) (pData + inAddrOff + strideY * inDataPitch2 * numY); + pdvecData4 = (xb_vec2Nx8 *) (pData + inAddrOff + (strideX * inDataPitch1 + strideY * inDataPitch2) * numX * numY); + + /* Pointer for Coefficient Load */ + pdvecCoeff = (xb_vec2Nx8 *) (pCoeff + coeffAddrOff); + + /* Primes registers for Aligning Load */ + valign vaData1 = IVP_LA2NX8_PP(pdvecData1); + valign vaData2 = IVP_LA2NX8_PP(pdvecData2); + valign vaData3 = IVP_LA2NX8_PP(pdvecData3); + valign vaData4 = IVP_LA2NX8_PP(pdvecData4); + + for (j = 0; j < numInCh; j += 508) /* Emulation: To avoid 24 bit overflow 2^23-1 / 128 / 128 = 511.99 */ + { + xb_vec2Nx24 daccSum1 = 0, daccSum2 = 0, daccSum3 = 0, daccSum4 = 0; + int32_t numIter = XT_MIN(508, numInCh - j); + for (inCh = 0; inCh < numIter - 3; inCh += 4) + { + xb_vec2Nx8 dvecData1; IVP_LAV2NX8_XP(dvecData1, vaData1, pdvecData1, 4); + xb_vec2Nx8 dvecData2; IVP_LAV2NX8_XP(dvecData2, vaData2, pdvecData2, 4); + xb_vec2Nx8 dvecData3; IVP_LAV2NX8_XP(dvecData3, vaData3, pdvecData3, 4); + xb_vec2Nx8 dvecData4; IVP_LAV2NX8_XP(dvecData4, vaData4, pdvecData4, 4); + + /* Extracting first 4 bytes of vector into address register */ + /* Scalar integers to be used for QMUL */ + int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData1)), 0); + int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData2)), 0); + int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + 
(IVP_MOVNX16_FROM2NX8(dvecData3)), 0); + int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData4)), 0); + + /* Aligned Vector Loads of coefficients */ + xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1); + xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1); + xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1); + xb_vec2Nx8 dvecCoeff4; IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch1); + + IVP_MULQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1); + IVP_MULQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2); + IVP_MULQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3); + IVP_MULQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4); + } /* End for (inCh = 0; inCh < row - 3; inCh += 4) */ + + /* Corner Case Handling if number of input channels not multiple of 4 */ + if (inCh < numIter) + { + int32_t remInCh = numIter - inCh; + + /* Aligning variable vector load of pixels */ + xb_vec2Nx8 dvecData1; IVP_LAV2NX8_XP(dvecData1, vaData1, pdvecData1, remInCh); + xb_vec2Nx8 dvecData2; IVP_LAV2NX8_XP(dvecData2, vaData2, pdvecData2, remInCh); + xb_vec2Nx8 dvecData3; IVP_LAV2NX8_XP(dvecData3, vaData3, pdvecData3, remInCh); + xb_vec2Nx8 dvecData4; IVP_LAV2NX8_XP(dvecData4, vaData4, pdvecData4, remInCh); + + /* Extracting first 4 bytes of vector into address register */ + /* Scalar integers to be used for QMUL */ + int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData1)), 0); + int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData2)), 0); + int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData3)), 0); + int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData4)), 0); + + /* For conditional coefficient 
loads */ + int32_t enable2 = XT_SALT(1, remInCh); /* Will be 1 if remInCh > 1 */ + int32_t enable3 = XT_SALT(2, remInCh); /* Will be 1 if remInCh > 2 */ + + /* Coefficient Loads */ + xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1 * enable2); + xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1 * enable3); + xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1); + + IVP_MULQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1); + IVP_MULQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2); + IVP_MULQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3); + IVP_MULQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4); + } /* End Corner case handling */ + + hvecSum1LL = IVP_ADDN_2X32(IVP_CVT32S2NX24LL(daccSum1), hvecSum1LL); + hvecSum1LH = IVP_ADDN_2X32(IVP_CVT32S2NX24LH(daccSum1), hvecSum1LH); + hvecSum1HL = IVP_ADDN_2X32(IVP_CVT32S2NX24HL(daccSum1), hvecSum1HL); + hvecSum1HH = IVP_ADDN_2X32(IVP_CVT32S2NX24HH(daccSum1), hvecSum1HH); + + hvecSum2LL = IVP_ADDN_2X32(IVP_CVT32S2NX24LL(daccSum2), hvecSum2LL); + hvecSum2LH = IVP_ADDN_2X32(IVP_CVT32S2NX24LH(daccSum2), hvecSum2LH); + hvecSum2HL = IVP_ADDN_2X32(IVP_CVT32S2NX24HL(daccSum2), hvecSum2HL); + hvecSum2HH = IVP_ADDN_2X32(IVP_CVT32S2NX24HH(daccSum2), hvecSum2HH); + + hvecSum3LL = IVP_ADDN_2X32(IVP_CVT32S2NX24LL(daccSum3), hvecSum3LL); + hvecSum3LH = IVP_ADDN_2X32(IVP_CVT32S2NX24LH(daccSum3), hvecSum3LH); + hvecSum3HL = IVP_ADDN_2X32(IVP_CVT32S2NX24HL(daccSum3), hvecSum3HL); + hvecSum3HH = IVP_ADDN_2X32(IVP_CVT32S2NX24HH(daccSum3), hvecSum3HH); + + hvecSum4LL = IVP_ADDN_2X32(IVP_CVT32S2NX24LL(daccSum4), hvecSum4LL); + hvecSum4LH = IVP_ADDN_2X32(IVP_CVT32S2NX24LH(daccSum4), hvecSum4LH); + hvecSum4HL = IVP_ADDN_2X32(IVP_CVT32S2NX24HL(daccSum4), hvecSum4HL); + hvecSum4HH = IVP_ADDN_2X32(IVP_CVT32S2NX24HH(daccSum4), hvecSum4HH); + } /* End for(j = 0; j < numInCh; j += 508)*/ + } /* End 
Kernel Height * Width */ + + if (inputFlag) /* Bias Values */ + { + phvecBias = (xb_vecN_2x32v *) (pBiasData + outCh); + xb_vecN_2x32v hvecBiasLL, hvecBiasLH, hvecBiasHL, hvecBiasHH; + valign vaBias = IVP_LAN_2X32_PP(phvecBias); + IVP_LAVN_2X32_XP(hvecBiasLL, vaBias, phvecBias, 4 * remainingOutCh); + IVP_LAVN_2X32_XP(hvecBiasLH, vaBias, phvecBias, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH); + IVP_LAVN_2X32_XP(hvecBiasHL, vaBias, phvecBias, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH); + IVP_LAVN_2X32_XP(hvecBiasHH, vaBias, phvecBias, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH); + + hvecSum1LL = IVP_ADDN_2X32(hvecSum1LL, hvecBiasLL); + hvecSum1LH = IVP_ADDN_2X32(hvecSum1LH, hvecBiasLH); + hvecSum1HL = IVP_ADDN_2X32(hvecSum1HL, hvecBiasHL); + hvecSum1HH = IVP_ADDN_2X32(hvecSum1HH, hvecBiasHH); + + hvecSum2LL = IVP_ADDN_2X32(hvecSum2LL, hvecBiasLL); + hvecSum2LH = IVP_ADDN_2X32(hvecSum2LH, hvecBiasLH); + hvecSum2HL = IVP_ADDN_2X32(hvecSum2HL, hvecBiasHL); + hvecSum2HH = IVP_ADDN_2X32(hvecSum2HH, hvecBiasHH); + + hvecSum3LL = IVP_ADDN_2X32(hvecSum3LL, hvecBiasLL); + hvecSum3LH = IVP_ADDN_2X32(hvecSum3LH, hvecBiasLH); + hvecSum3HL = IVP_ADDN_2X32(hvecSum3HL, hvecBiasHL); + hvecSum3HH = IVP_ADDN_2X32(hvecSum3HH, hvecBiasHH); + + hvecSum4LL = IVP_ADDN_2X32(hvecSum4LL, hvecBiasLL); + hvecSum4LH = IVP_ADDN_2X32(hvecSum4LH, hvecBiasLH); + hvecSum4HL = IVP_ADDN_2X32(hvecSum4HL, hvecBiasHL); + hvecSum4HH = IVP_ADDN_2X32(hvecSum4HH, hvecBiasHH); + } + else /* Accumulator tile*/ + { + xb_vecN_2x32v hvecAcc1LL, hvecAcc1LH, hvecAcc1HL, hvecAcc1HH; + xb_vecN_2x32v hvecAcc2LL, hvecAcc2LH, hvecAcc2HL, hvecAcc2HH; + xb_vecN_2x32v hvecAcc3LL, hvecAcc3LH, hvecAcc3HL, hvecAcc3HH; + xb_vecN_2x32v hvecAcc4LL, hvecAcc4LH, hvecAcc4HL, hvecAcc4HH; + + phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh); + valign vaAcc = IVP_LAN_2X32_PP(phvecAcc); + IVP_LAVN_2X32_XP(hvecAcc1LL, vaAcc, phvecAcc, 4 * remainingOutCh); + IVP_LAVN_2X32_XP(hvecAcc1LH, vaAcc, phvecAcc, 4 * remainingOutCh - 2 * 
XCHAL_IVPN_SIMD_WIDTH); + IVP_LAVN_2X32_XP(hvecAcc1HL, vaAcc, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH); + IVP_LAVN_2X32_XP(hvecAcc1HH, vaAcc, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH); + + hvecSum1LL = IVP_ADDN_2X32(hvecSum1LL, hvecAcc1LL); + hvecSum1LH = IVP_ADDN_2X32(hvecSum1LH, hvecAcc1LH); + hvecSum1HL = IVP_ADDN_2X32(hvecSum1HL, hvecAcc1HL); + hvecSum1HH = IVP_ADDN_2X32(hvecSum1HH, hvecAcc1HH); + + phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh + accDataPitch1 * numX); + vaAcc = IVP_LAN_2X32_PP(phvecAcc); + IVP_LAVN_2X32_XP(hvecAcc2LL, vaAcc, phvecAcc, 4 * remainingOutCh); + IVP_LAVN_2X32_XP(hvecAcc2LH, vaAcc, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH); + IVP_LAVN_2X32_XP(hvecAcc2HL, vaAcc, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH); + IVP_LAVN_2X32_XP(hvecAcc2HH, vaAcc, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH); + + hvecSum2LL = IVP_ADDN_2X32(hvecSum2LL, hvecAcc2LL); + hvecSum2LH = IVP_ADDN_2X32(hvecSum2LH, hvecAcc2LH); + hvecSum2HL = IVP_ADDN_2X32(hvecSum2HL, hvecAcc2HL); + hvecSum2HH = IVP_ADDN_2X32(hvecSum2HH, hvecAcc2HH); + + phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh + accDataPitch2 * numY); + vaAcc = IVP_LAN_2X32_PP(phvecAcc); + IVP_LAVN_2X32_XP(hvecAcc3LL, vaAcc, phvecAcc, 4 * remainingOutCh); + IVP_LAVN_2X32_XP(hvecAcc3LH, vaAcc, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH); + IVP_LAVN_2X32_XP(hvecAcc3HL, vaAcc, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH); + IVP_LAVN_2X32_XP(hvecAcc3HH, vaAcc, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH); + hvecSum3LL = IVP_ADDN_2X32(hvecSum3LL, hvecAcc3LL); + hvecSum3LH = IVP_ADDN_2X32(hvecSum3LH, hvecAcc3LH); + hvecSum3HL = IVP_ADDN_2X32(hvecSum3HL, hvecAcc3HL); + hvecSum3HH = IVP_ADDN_2X32(hvecSum3HH, hvecAcc3HH); + + phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh + accDataPitch1 * numX + accDataPitch2 * numY); + vaAcc = IVP_LAN_2X32_PP(phvecAcc); + IVP_LAVN_2X32_XP(hvecAcc4LL, vaAcc, phvecAcc, 4 * 
remainingOutCh); + IVP_LAVN_2X32_XP(hvecAcc4LH, vaAcc, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH); + IVP_LAVN_2X32_XP(hvecAcc4HL, vaAcc, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH); + IVP_LAVN_2X32_XP(hvecAcc4HH, vaAcc, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH); + hvecSum4LL = IVP_ADDN_2X32(hvecSum4LL, hvecAcc4LL); + hvecSum4LH = IVP_ADDN_2X32(hvecSum4LH, hvecAcc4LH); + hvecSum4HL = IVP_ADDN_2X32(hvecSum4HL, hvecAcc4HL); + hvecSum4HH = IVP_ADDN_2X32(hvecSum4HH, hvecAcc4HH); + } + + if (outputFlag) /* Store to ouput Tile*/ + { + /* Pack, Output Scale, Output Shift and clamping */ + xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L; + xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H; +#ifdef DILATED_VQ_CONV_PARTIAL + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ_QM32(dvecOut1L, dvecOut1H, hvecSum1LL, hvecSum1LH, hvecSum1HL, hvecSum1HH, \ + packShiftAccU, outScaleDataL, outScaleDataH, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ_QM32(dvecOut2L, dvecOut2H, hvecSum2LL, hvecSum2LH, hvecSum2HL, hvecSum2HH, \ + packShiftAccU, outScaleDataL, outScaleDataH, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ_QM32(dvecOut3L, dvecOut3H, hvecSum3LL, hvecSum3LH, hvecSum3HL, hvecSum3HH, \ + packShiftAccU, outScaleDataL, outScaleDataH, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ_QM32(dvecOut4L, dvecOut4H, hvecSum4LL, hvecSum4LH, hvecSum4HL, hvecSum4HH, \ + packShiftAccU, outScaleDataL, outScaleDataH, outShiftU, minLim, maxLim, typeFlag); +#else + PACK_SCALE_SHIFT_CLAMP_LIMITS_QM32(dvecOut1L, dvecOut1H, hvecSum1LL, hvecSum1LH, hvecSum1HL, hvecSum1HH, \ + packShiftAccU, outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_QM32(dvecOut2L, dvecOut2H, hvecSum2LL, hvecSum2LH, hvecSum2HL, hvecSum2HH, \ + packShiftAccU, outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_QM32(dvecOut3L, dvecOut3H, hvecSum3LL, 
hvecSum3LH, hvecSum3HL, hvecSum3HH, \ + packShiftAccU, outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_QM32(dvecOut4L, dvecOut4H, hvecSum4LL, hvecSum4LH, hvecSum4HL, hvecSum4HH, \ + packShiftAccU, outScale, outShiftU, minLim, maxLim, typeFlag); +#endif + /* Store the output dvecOut1 along the output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOut + outCh * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh); + IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Store the output dvecOut2 along the output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1) * numX * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numX); + IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numX); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Store the output dvecOut3 along the output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch2) * numY * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numY); + IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numY); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Store the output dvecOut4 along the output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1 * numX + outDataPitch2 * numY) * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * numX * numY); + IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * numX * numY); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + } + else /* Store to accumulator tile*/ + { + /* Store the hvecAcc1 along the accTile depth */ + phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh); + 
IVP_SAVN_2X32_XP(hvecSum1LL, vaOutData, phvecAcc, 4 * remainingOutCh); + IVP_SAVN_2X32_XP(hvecSum1LH, vaOutData, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAVN_2X32_XP(hvecSum1HL, vaOutData, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAVN_2X32_XP(hvecSum1HH, vaOutData, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAPOSN_2X32_FP(vaOutData, phvecAcc); + + /* Store the hvecAcc2 along the accTile depth */ + phvecAcc = (xb_vecN_2x32v *) (pAcc + (outCh + accDataPitch1) * numX); + IVP_SAVN_2X32_XP(hvecSum2LL, vaOutData, phvecAcc, 4 * remainingOutCh * numX); + IVP_SAVN_2X32_XP(hvecSum2LH, vaOutData, phvecAcc, 4 * remainingOutCh * numX - 2 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAVN_2X32_XP(hvecSum2HL, vaOutData, phvecAcc, 4 * remainingOutCh * numX - 4 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAVN_2X32_XP(hvecSum2HH, vaOutData, phvecAcc, 4 * remainingOutCh * numX - 6 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAPOSN_2X32_FP(vaOutData, phvecAcc); + + /* Store the hvecAcc3 along the accTile depth */ + phvecAcc = (xb_vecN_2x32v *) (pAcc + (outCh + accDataPitch2) * numY); + IVP_SAVN_2X32_XP(hvecSum3LL, vaOutData, phvecAcc, 4 * remainingOutCh * numY); + IVP_SAVN_2X32_XP(hvecSum3LH, vaOutData, phvecAcc, 4 * remainingOutCh * numY - 2 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAVN_2X32_XP(hvecSum3HL, vaOutData, phvecAcc, 4 * remainingOutCh * numY - 4 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAVN_2X32_XP(hvecSum3HH, vaOutData, phvecAcc, 4 * remainingOutCh * numY - 6 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAPOSN_2X32_FP(vaOutData, phvecAcc); + + /* Store the hvecAcc4 along the accTile depth */ + phvecAcc = (xb_vecN_2x32v *) (pAcc + (outCh + accDataPitch1 * numX + accDataPitch2 * numY)); + IVP_SAVN_2X32_XP(hvecSum4LL, vaOutData, phvecAcc, 4 * remainingOutCh * numX * numY); + IVP_SAVN_2X32_XP(hvecSum4LH, vaOutData, phvecAcc, 4 * remainingOutCh * numX * numY - 2 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAVN_2X32_XP(hvecSum4HL, vaOutData, phvecAcc, 4 * remainingOutCh * numX * numY - 4 
* XCHAL_IVPN_SIMD_WIDTH); + IVP_SAVN_2X32_XP(hvecSum4HH, vaOutData, phvecAcc, 4 * remainingOutCh * numX * numY - 6 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAPOSN_2X32_FP(vaOutData, phvecAcc); + } + } /* End image width */ + } /* End image height */ + } /* End Output Channels */ +} + +/***************************************************************************** +* xaiPartialConvolved3D_S_MxN_S8S8IXCa2_MOD_DWH \ +* xaiPartialConvolvedVQ3D_S_MxN_S8S8IXCa2_MOD_DWH +* **************************************************************************/ + +/****************************************************************************/ +/* Description : P6 optimized generic implementation for MxN MOD_DWH */ +/* 3D convolution. Based on pre-processor specifiers. Code */ +/* implementation is generated during preprocessing stage. */ +/* This method can be used to generate MxN MOD_DWH 3D partial */ +/* dilated convolution function and MxN MOD_DWH 3D VQ partial */ +/* dilated convolution function */ +/* Stride values = 1, 2 and 4 are supported */ +/* Implementation also supports dilation >= 1 for stride = 1 */ +/* and dilation = 1 for stride = 2, 4 */ +/* Inputs : Input Data Tile, Coeff Data Tile, Bias Array, */ +/* Output scale array, CNN convolution params structure */ +/* Outputs : XI Error Code */ +/* InOuts : Accumulator Tile, Output Tile */ +/* Assumptions : InData, CoeffData are S8 */ +/* biasArray is signed 32b, value not exceeding signed 24b */ +/* Output scale array is U16 */ +/* OutData is S8 / U8 / S16 */ +/* Kernel Size is MxNxDxNk. M and N sizes are less than or */ +/* equal to 16. 
*/ +/* Input and Output are in DWH format */ +/* Coeff is in NDWH format */ +/* CoeffDim1Pitch is aligned to 2N (Ca2) */ +/* Accumulated value will be within 24bit range */ +/****************************************************************************/ +#ifdef DILATED_VQ_CONV_PARTIAL +XAI_ERR_TYPE xaiPartialConvolvedVQ3D_S_MxN_S8S8IXCa2_MOD_DWH( + const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D accTile, + xai_pTile3D outTile, + const xai_cnn_conv_params *param + ) +#else +XAI_ERR_TYPE xaiPartialConvolved3D_S_MxN_S8S8IXCa2_MOD_DWH( + const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D accTile, + xai_pTile3D outTile, + const xai_cnn_conv_params *param + ) +#endif +{ + /* Error Checks */ + XAI_ERROR_CHECKS() + { + XAI_CHECK_TILE3D_S8(inTile); + XAI_CHECK_CONV_OUTPUT_TILE3D(outTile); + XAI_CHECK_TILE4D_S8(coeffTile); + XAI_CHECK_POINTER(biasArray); + XAI_CHECK_POINTER(param); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile); + XAI_CHECK_TILE4D_IN_DRAM_BOUNDARY(coeffTile); + XAI_CHECK_ERROR((XAI_TILE4D_GET_DIM3(coeffTile) <= 64) && (XAI_TILE4D_GET_DIM4(coeffTile) <= 64), XAI_ERR_KSIZE, \ + "\nKernel height = %d and width = %d\nKernel width and height should be less than or equal to 64", \ + XAI_TILE4D_GET_DIM4(coeffTile), XAI_TILE4D_GET_DIM3(coeffTile)); + XAI_CHECK_EDGES_MOD_DWH(inTile, coeffTile, param); + XAI_CHECK_ERROR(((XAI_CNN_CONV_GET_STRIDEX(param) > 0) && (XAI_CNN_CONV_GET_STRIDEY(param) > 0)) && \ + ((XAI_CNN_CONV_GET_STRIDEX(param) <= 64) && (XAI_CNN_CONV_GET_STRIDEY(param) <= 64)), XAI_ERR_BADARG, \ + "\nStrideX = %hhu, StrideY = %hhu\nStride along width and height should be greater than 0 and less than or equal to 64", \ + XAI_CNN_CONV_GET_STRIDEX(param), XAI_CNN_CONV_GET_STRIDEY(param)); + XAI_CHECK_ERROR((XAI_CNN_CONV_GET_DILATIONX(param) == 1) || \ + ((XAI_CNN_CONV_GET_DILATIONX(param) >= 1) && \ + 
(XAI_CNN_CONV_GET_STRIDEX(param) == 1)), XAI_ERR_BADARG, \ + "\nDilationX = %hhu\nDilationX should be 1. It can be greater than 1 only when strideX is equal to 1", \ + XAI_CNN_CONV_GET_DILATIONX(param)); + XAI_CHECK_ERROR((XAI_CNN_CONV_GET_DILATIONY(param) == 1) || \ + ((XAI_CNN_CONV_GET_DILATIONY(param) >= 1) && \ + (XAI_CNN_CONV_GET_STRIDEY(param) == 1)), XAI_ERR_BADARG, \ + "\nDilationY = %hhu\nDilationY should be 1. It can be greater than 1 only when strideY is equal to 1", \ + XAI_CNN_CONV_GET_DILATIONY(param)); + XAI_CHECK_TILE4D_IALIGNMENT_2NX8(coeffTile); + XAI_CHECK_TILE3D_DATA_ORDER(inTile, XAI_DWH); + XAI_CHECK_TILE4D_DATA_ORDER(coeffTile, XAI_NDWH); + XAI_CHECK_TILE3D_DATA_ORDER(outTile, XAI_DWH); + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_ACCUM_SHIFT(param) < 24, \ + XAI_ERR_NORM, "\nThe accumulator shift = %hhu, value should be less than 24", \ + XAI_CNN_CONV_GET_ACCUM_SHIFT(param)); + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_OUTPUT_SHIFT(param) < 32, \ + XAI_ERR_NORM, "\nThe output shift = %hhu, value should be less than 32", \ + XAI_CNN_CONV_GET_OUTPUT_SHIFT(param)); + XAI_CHECK_CONV_RELU_LIMITS_IX(param, outTile); +#ifdef DILATED_VQ_CONV_PARTIAL + XAI_CHECK_ARRAY_U16(outputScaleArray); + XAI_CHECK_ERROR(XAI_ARRAY_GET_WIDTH(outputScaleArray) >= XAI_TILE4D_GET_DIM1(coeffTile), XAI_ERR_DATASIZE, \ + "\nWidth of Output Scale Array = %d, Number of Kernels = %d\nWidth of Output Scale Array should be greater than or equal to Number of Kernels", \ + XAI_ARRAY_GET_WIDTH(outputScaleArray), XAI_TILE4D_GET_DIM1(coeffTile)); +#endif + + XAI_CHECK_CONSISTENCY_MOD_DWH(inTile, coeffTile, biasArray, outTile, param); + + if (XAI_CNN_CONV_GET_FLAG_INPUT(param)) + { + XAI_CHECK_ARRAY_S32(biasArray); + } + if (!(XAI_CNN_CONV_GET_FLAG_INPUT(param) && XAI_CNN_CONV_GET_FLAG_OUTPUT(param))) + { + XAI_CHECK_TILE3D_S32(accTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(accTile); + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, accTile); + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(coeffTile, accTile); + 
XAI_CHECK_TILE3D_DATA_ORDER(accTile, XAI_DWH); + XAI_CHECK_TILE3D_SIZE_EQ(accTile, outTile); + } + if (XAI_CNN_CONV_GET_FLAG_OUTPUT(param)) + { + XAI_CHECK_TILE3D(outTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile); + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTile); + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(coeffTile, outTile); + if (!(XAI_CNN_CONV_GET_FLAG_INPUT(param))) + { + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(accTile, outTile); + } + } + } +#ifndef DILATED_VQ_CONV_PARTIAL + if ((XAI_CNN_CONV_GET_OUTPUT_SCALE(param) == 0) && \ + XAI_CNN_CONV_GET_FLAG_OUTPUT(param)) + { + int32_t fillValue; + int32_t reluFlag = XAI_CNN_CONV_GET_FLAG_RELU(param); + fillValue = reluFlag ? (CLAMP(0, XAI_CNN_CONV_GET_RELU_MIN(param), XAI_CNN_CONV_GET_RELU_MAX(param))) : 0; + return(xaiFillTile3D(outTile, fillValue, 0)); + } +#endif + + /* Calling further optimized function if dilation = 1 and (no edges along depth or kernelWidth = 1)*/ + if ((XAI_CNN_CONV_GET_DILATIONX(param) == 1) && \ + ((XAI_TILE3D_GET_DIM1(inTile) == XAI_TILE3D_GET_DIM1_PITCH(inTile)) || \ + (XAI_TILE4D_GET_DIM3(coeffTile) == 1))) + { + if ((XAI_TILE3D_GET_DIM1(inTile) * XAI_TILE4D_GET_DIM3(coeffTile)) % 4 == 0) + { +#ifdef DILATED_VQ_CONV_PARTIAL + partialConvolvedVQ3D_S_MxNd1_S8S8IXCa2_MOD_DWH_contiguous_depth_x4(inTile, \ + coeffTile, biasArray, outputScaleArray, accTile, outTile, param); +#else + partialConvolved3D_S_MxNd1_S8S8IXCa2_MOD_DWH_contiguous_depth_x4(inTile, \ + coeffTile, biasArray, accTile, outTile, param); +#endif + } + else + { +#ifdef DILATED_VQ_CONV_PARTIAL + partialConvolvedVQ3D_S_MxNd1_S8S8IXCa2_MOD_DWH_contiguous_depth(inTile, \ + coeffTile, biasArray, outputScaleArray, accTile, outTile, param); +#else + partialConvolved3D_S_MxNd1_S8S8IXCa2_MOD_DWH_contiguous_depth(inTile, \ + coeffTile, biasArray, accTile, outTile, param); +#endif + } + } + else + { +#ifdef DILATED_VQ_CONV_PARTIAL + partialConvolvedVQ3D_S_MxN_S8S8IXCa2_MOD_DWH(inTile, \ + coeffTile, biasArray, outputScaleArray, accTile, 
outTile, param); +#else + partialConvolved3D_S_MxN_S8S8IXCa2_MOD_DWH(inTile, \ + coeffTile, biasArray, accTile, outTile, param); +#endif + } + return(XAI_ERROR_STATUS()); +} + +/***************************************************************************** +* xaiPartialConvolved3D_S_MxN_U8S8IXCa2_MOD_DWH \ +* xaiPartialConvolvedVQ3D_S_MxN_U8S8IXCa2_MOD_DWH +* **************************************************************************/ + +/****************************************************************************/ +/* Description : P6 optimized generic implementation for MxN MOD_DWH */ +/* 3D convolution. Based on pre-processor specifiers. Code */ +/* implementation is generated during preprocessing stage. */ +/* This method can be used to generate MxN MOD_DWH 3D partial */ +/* dilated convolution function and MxN MOD_DWH 3D VQ partial */ +/* dilated convolution function */ +/* Stride values = 1, 2 and 4 are supported */ +/* Implementation also supports dilation >= 1 for stride = 1 */ +/* and dilation = 1 for stride = 2, 4 */ +/* Inputs : Input Data Tile, Coeff Data Tile, Bias Array, */ +/* Output scale array, CNN convolution params structure */ +/* Outputs : XI Error Code */ +/* InOuts : Accumulator Tile, Output Tile */ +/* Assumptions : InData are U8, CoeffData are S8 */ +/* biasArray is signed 32b, value not exceeding signed 24b */ +/* Output scale array is U16 */ +/* OutData is S8 / U8 / S16 */ +/* Kernel Size is MxNxDxNk. M and N sizes are less than or */ +/* equal to 16. 
*/ +/* Input and Output are in DWH format */ +/* Coeff is in NDWH format */ +/* CoeffDim1Pitch is aligned to 2N (Ca2) */ +/* Accumulated value will be within 24bit range */ +/****************************************************************************/ +#ifdef DILATED_VQ_CONV_PARTIAL +XAI_ERR_TYPE xaiPartialConvolvedVQ3D_S_MxN_U8S8IXCa2_MOD_DWH( + const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D accTile, + xai_pTile3D outTile, + const xai_cnn_conv_params *param + ) +#else +XAI_ERR_TYPE xaiPartialConvolved3D_S_MxN_U8S8IXCa2_MOD_DWH( + const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D accTile, + xai_pTile3D outTile, + const xai_cnn_conv_params *param + ) +#endif +{ + /* Error Checks */ + XAI_ERROR_CHECKS() + { + XAI_CHECK_TILE3D_U8(inTile); + XAI_CHECK_CONV_OUTPUT_TILE3D(outTile); + XAI_CHECK_TILE4D_S8(coeffTile); + XAI_CHECK_POINTER(biasArray); + XAI_CHECK_POINTER(param); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile); + XAI_CHECK_TILE4D_IN_DRAM_BOUNDARY(coeffTile); + XAI_CHECK_ERROR((XAI_TILE4D_GET_DIM3(coeffTile) <= 64) && (XAI_TILE4D_GET_DIM4(coeffTile) <= 64), XAI_ERR_KSIZE, \ + "\nKernel height = %d and width = %d\nKernel width and height should be less than or equal to 64", \ + XAI_TILE4D_GET_DIM4(coeffTile), XAI_TILE4D_GET_DIM3(coeffTile)); + XAI_CHECK_EDGES_MOD_DWH(inTile, coeffTile, param); + XAI_CHECK_ERROR(((XAI_CNN_CONV_GET_STRIDEX(param) > 0) && (XAI_CNN_CONV_GET_STRIDEY(param) > 0)) && \ + ((XAI_CNN_CONV_GET_STRIDEX(param) <= 64) && (XAI_CNN_CONV_GET_STRIDEY(param) <= 64)), XAI_ERR_BADARG, \ + "\nStrideX = %hhu, StrideY = %hhu\nStride along width and height should be greater than 0 and less than or equal to 64", \ + XAI_CNN_CONV_GET_STRIDEX(param), XAI_CNN_CONV_GET_STRIDEY(param)); + XAI_CHECK_ERROR((XAI_CNN_CONV_GET_DILATIONX(param) == 1) || \ + ((XAI_CNN_CONV_GET_DILATIONX(param) >= 1) && \ + 
(XAI_CNN_CONV_GET_STRIDEX(param) == 1)), XAI_ERR_BADARG, \ + "\nDilationX = %hhu\nDilationX should be 1. It can be greater than 1 only when strideX is equal to 1", \ + XAI_CNN_CONV_GET_DILATIONX(param)); + XAI_CHECK_ERROR((XAI_CNN_CONV_GET_DILATIONY(param) == 1) || \ + ((XAI_CNN_CONV_GET_DILATIONY(param) >= 1) && \ + (XAI_CNN_CONV_GET_STRIDEY(param) == 1)), XAI_ERR_BADARG, \ + "\nDilationY = %hhu\nDilationY should be 1. It can be greater than 1 only when strideY is equal to 1", \ + XAI_CNN_CONV_GET_DILATIONY(param)); + XAI_CHECK_TILE4D_IALIGNMENT_2NX8(coeffTile); + XAI_CHECK_TILE3D_DATA_ORDER(inTile, XAI_DWH); + XAI_CHECK_TILE4D_DATA_ORDER(coeffTile, XAI_NDWH); + XAI_CHECK_TILE3D_DATA_ORDER(outTile, XAI_DWH); + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_ACCUM_SHIFT(param) < 24, \ + XAI_ERR_NORM, "\nThe accumulator shift = %hhu, value should be less than 24", \ + XAI_CNN_CONV_GET_ACCUM_SHIFT(param)); + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_OUTPUT_SHIFT(param) < 32, \ + XAI_ERR_NORM, "\nThe output shift = %hhu, value should be less than 32", \ + XAI_CNN_CONV_GET_OUTPUT_SHIFT(param)); + XAI_CHECK_CONV_RELU_LIMITS_IX(param, outTile); +#ifdef DILATED_VQ_CONV_PARTIAL + XAI_CHECK_ARRAY_U16(outputScaleArray); + XAI_CHECK_ERROR(XAI_ARRAY_GET_WIDTH(outputScaleArray) >= XAI_TILE4D_GET_DIM1(coeffTile), XAI_ERR_DATASIZE, \ + "\nWidth of Output Scale Array = %d, Number of Kernels = %d\nWidth of Output Scale Array should be greater than or equal to Number of Kernels", \ + XAI_ARRAY_GET_WIDTH(outputScaleArray), XAI_TILE4D_GET_DIM1(coeffTile)); +#endif + + XAI_CHECK_CONSISTENCY_MOD_DWH(inTile, coeffTile, biasArray, outTile, param); + + if (XAI_CNN_CONV_GET_FLAG_INPUT(param)) + { + XAI_CHECK_ARRAY_S32(biasArray); + } + if (!(XAI_CNN_CONV_GET_FLAG_INPUT(param) && XAI_CNN_CONV_GET_FLAG_OUTPUT(param))) + { + XAI_CHECK_TILE3D_S32(accTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(accTile); + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, accTile); + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(coeffTile, accTile); + 
XAI_CHECK_TILE3D_DATA_ORDER(accTile, XAI_DWH); + XAI_CHECK_TILE3D_SIZE_EQ(accTile, outTile); + } + if (XAI_CNN_CONV_GET_FLAG_OUTPUT(param)) + { + XAI_CHECK_TILE3D(outTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile); + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTile); + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(coeffTile, outTile); + if (!(XAI_CNN_CONV_GET_FLAG_INPUT(param))) + { + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(accTile, outTile); + } + } + } +#ifndef DILATED_VQ_CONV_PARTIAL + if ((XAI_CNN_CONV_GET_OUTPUT_SCALE(param) == 0) && \ + XAI_CNN_CONV_GET_FLAG_OUTPUT(param)) + { + int32_t fillValue; + int32_t reluFlag = XAI_CNN_CONV_GET_FLAG_RELU(param); + fillValue = reluFlag ? (CLAMP(0, XAI_CNN_CONV_GET_RELU_MIN(param), XAI_CNN_CONV_GET_RELU_MAX(param))) : 0; + return(xaiFillTile3D(outTile, fillValue, 0)); + } +#endif + + /* Calling further optimized function if dilation = 1 and (no edges along depth or kernelWidth = 1)*/ + if ((XAI_CNN_CONV_GET_DILATIONX(param) == 1) && \ + ((XAI_TILE3D_GET_DIM1(inTile) == XAI_TILE3D_GET_DIM1_PITCH(inTile)) || \ + (XAI_TILE4D_GET_DIM3(coeffTile) == 1))) + { + if ((XAI_TILE3D_GET_DIM1(inTile) * XAI_TILE4D_GET_DIM3(coeffTile)) % 4 == 0) + { +#ifdef DILATED_VQ_CONV_PARTIAL + partialConvolvedVQ3D_S_MxNd1_U8S8IXCa2_MOD_DWH_contiguous_depth_x4(inTile, \ + coeffTile, biasArray, outputScaleArray, accTile, outTile, param); +#else + partialConvolved3D_S_MxNd1_U8S8IXCa2_MOD_DWH_contiguous_depth_x4(inTile, \ + coeffTile, biasArray, accTile, outTile, param); +#endif + } + else + { +#ifdef DILATED_VQ_CONV_PARTIAL + partialConvolvedVQ3D_S_MxNd1_U8S8IXCa2_MOD_DWH_contiguous_depth(inTile, \ + coeffTile, biasArray, outputScaleArray, accTile, outTile, param); +#else + partialConvolved3D_S_MxNd1_U8S8IXCa2_MOD_DWH_contiguous_depth(inTile, \ + coeffTile, biasArray, accTile, outTile, param); +#endif + } + } + else + { +#ifdef DILATED_VQ_CONV_PARTIAL + partialConvolvedVQ3D_S_MxN_U8S8IXCa2_MOD_DWH(inTile, \ + coeffTile, biasArray, outputScaleArray, accTile, 
outTile, param); +#else + partialConvolved3D_S_MxN_U8S8IXCa2_MOD_DWH(inTile, \ + coeffTile, biasArray, accTile, outTile, param); +#endif + } + return(XAI_ERROR_STATUS()); +} + +/**********partialConvolvedVQ3D_S_MxN_U8S8IXCa2_noUnrollH_MOD_DWH************/ +/**********partialConvolved3D_S_MxN_U8S8IXCa2_noUnrollH_MOD_DWH************/ +/* Description : P6 optimized implementation of 3D partial convolution */ +/* Inputs : Input Data Tile, Coeff Data Tile, Bias Array, */ +/* CNN convolution params structure (the VQ variant also takes an Output Scale Array) */ +/* InOuts : Output Tile, Accumulator Tile (read when the INPUT flag is clear, written when the OUTPUT flag is clear) */ +/* Assumptions : InData is U8, CoeffData is S8 */ +/* biasArray is signed 32b, value not exceeding signed 24b */ +/* OutData is S8 / U8 / S16 */ +/* Kernel Size is MxNxDxNk. M and N sizes are less than or */ +/* equal to 16. */ +/* Input and Output are in DWH format */ +/* Coeff is in NDWH format */ +/* CoeffDim1Pitch is aligned to 2N (Ca2) */ +/* Edges along Depth dimension in inTile and coeffTile */ +/* are zero. */ +/****************************************************************************/ + +#ifdef DILATED_VQ_CONV_PARTIAL +static _XAI_INLINE_ void partialConvolvedVQ3D_S_MxN_U8S8IXCa2_noUnrollH_MOD_DWH(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D accTile, + xai_pTile3D outTile, + const xai_cnn_conv_params *param + ) +#else +static _XAI_INLINE_ void partialConvolved3D_S_MxN_U8S8IXCa2_noUnrollH_MOD_DWH(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D accTile, + xai_pTile3D outTile, + const xai_cnn_conv_params *param + ) +#endif +{ + /* Overview: each innermost iteration handles one output row, up to 2 * XCHAL_IVPN_SIMD_WIDTH output channels and four adjacent output columns. Accumulators start from biasArray when the INPUT flag is set, otherwise from accTile; results are packed, scaled and clamped into outTile when the OUTPUT flag is set, otherwise the raw accumulators are written back to accTile. */ /* Getting parameters from the tile structures */ + const int32_t outW = XAI_TILE3D_GET_DIM2(outTile); + const int32_t outH = XAI_TILE3D_GET_DIM3(outTile); + const int32_t numInCh = XAI_TILE3D_GET_DIM1(inTile); + const int32_t numOutCh = XAI_TILE3D_GET_DIM1(outTile); + const uint8_t dilationX = XAI_CNN_CONV_GET_DILATIONX(param); + const uint8_t dilationY = 
XAI_CNN_CONV_GET_DILATIONY(param); + + /* Kernel Size (NDWH); the dilated sizes are the kernel extents once the dilation gaps are included */ + const int32_t kWidthU = XAI_TILE4D_GET_DIM3(coeffTile); + const int32_t kHeightU = XAI_TILE4D_GET_DIM4(coeffTile); + int32_t dilatedkWidthU = dilationX * (kWidthU - 1) + 1; + int32_t dilatedkHeightU = dilationY * (kHeightU - 1) + 1; + + /* CNN convolution parameters */ + const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param); + const uint8_t outShiftU = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param); + const uint8_t enableReLu = XAI_CNN_CONV_GET_FLAG_RELU(param); + const uint8_t strideX = XAI_CNN_CONV_GET_STRIDEX(param); + const uint8_t strideY = XAI_CNN_CONV_GET_STRIDEY(param); + const uint8_t leftEdgeFlag = XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param); + const uint8_t topEdgeFlag = XAI_CNN_CONV_GET_FLAG_TOPEDGE(param); + const uint8_t inputFlag = XAI_CNN_CONV_GET_FLAG_INPUT(param); + const uint8_t outputFlag = XAI_CNN_CONV_GET_FLAG_OUTPUT(param); + + /* Data Pointers of input, output, coefficient and bias data */ + uint8_t *pInData = (uint8_t *) XAI_TILE3D_GET_DATA_PTR(inTile); + int8_t *pOutData = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile); + int8_t *pCoeffData = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile); + int32_t *pBiasData = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray); + + int32_t * pAccData = NULL; /* accTile is only dereferenced when the INPUT and OUTPUT flags are not both set */ + if (!(XAI_CNN_CONV_GET_FLAG_INPUT(param) && XAI_CNN_CONV_GET_FLAG_OUTPUT(param))) + { + pAccData = (int32_t *) XAI_TILE3D_GET_DATA_PTR(accTile); + } + +#ifdef DILATED_VQ_CONV_PARTIAL + uint16_t *pScale = (uint16_t *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray); +#else + const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param); +#endif + + /* Pitches of Coefficient Data (NDWH) in dim1, dim2 and dim3 */ + const int32_t coeffPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTile); + const int32_t coeffPitch2 = XAI_TILE4D_GET_DIM2_PITCH(coeffTile); + const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile); + + /* Pitches of Input Data (DWH) in dim1 and dim2 */ + const int32_t inDataPitch1 = 
XAI_TILE3D_GET_DIM1_PITCH(inTile); + const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile); + + /* Pitch of Output Data (DWH) in dim1 and dim2 */ + const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile); + const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile); + + /* Pitch of AccTile Data (DWH) in dim1 and dim2; left at 0 when accTile is not used */ + int32_t accDataPitch1 = 0; + int32_t accDataPitch2 = 0; + if (!(XAI_CNN_CONV_GET_FLAG_INPUT(param) && XAI_CNN_CONV_GET_FLAG_OUTPUT(param))) + { + accDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(accTile); + accDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(accTile); + } + + int32_t leftEdge, topEdge; /* Odd dilated kernel sizes use size / 2 as the edge; even sizes use size / 2 when the matching edge flag is set and size / 2 - 1 otherwise */ + if ((dilatedkWidthU % 2) != 0) + { + leftEdge = dilatedkWidthU / 2; + } + else + { + leftEdge = leftEdgeFlag ? (dilatedkWidthU / 2) : ((dilatedkWidthU / 2) - 1); + } + + if ((dilatedkHeightU % 2) != 0) + { + topEdge = dilatedkHeightU / 2; + } + else + { + topEdge = topEdgeFlag ? (dilatedkHeightU / 2) : ((dilatedkHeightU / 2) - 1); + } + + + /* Move pointer to the start of the data (including edge) */ + pInData = &pInData[-((leftEdge) * inDataPitch1 + (topEdge) * inDataPitch2)]; + + /* Setting the limits for output data according to ReLu Flag and outTileType */ + int32_t minLim, maxLim; + if (enableReLu) + { + minLim = XAI_CNN_CONV_GET_RELU_MIN(param); + maxLim = XAI_CNN_CONV_GET_RELU_MAX(param); + } + else + { + minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \ + SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0); + maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \ + : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX); + } + const int8_t typeFlag = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 
1 : 0; /* typeFlag is 1 when outTile is S16, else 0 */ + const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile); + + /* Variable Declarations */ + int32_t inCh, outCh, x, y, k; + valign vaOutData = IVP_ZALIGN(); + + xb_vecN_2x32v* restrict phvecBias; + xb_vec2Nx8* restrict pdvecCoeff; + xb_vec2Nx8U* restrict pdvecData1; + xb_vec2Nx8U* restrict pdvecData2; + xb_vec2Nx8U* restrict pdvecData3; + xb_vec2Nx8U* restrict pdvecData4; + xb_vec2Nx8* restrict pdvecOut; + xb_vecN_2x32v* restrict phvecAcc; + + /* Loops Start: rows (y) outermost, then output-channel groups (outCh), then groups of four columns (x) */ + for (y = 0; y < outH; y++) /* Image Height */ + { + for (outCh = 0; outCh < numOutCh; outCh += 2 * XCHAL_IVPN_SIMD_WIDTH) + { /* walk across the kernels */ + /* To handle corner case when number of output channels + * is not a multiple of 2 * XCHAL_IVPN_SIMD_WIDTH*/ + int32_t remainingOutCh = numOutCh - outCh; +#ifdef DILATED_VQ_CONV_PARTIAL + xb_vecNx16U outScaleDataEven, outScaleDataOdd; + /*Load output scale values*/ + xb_vecNx16U* restrict pOutScaleData = (xb_vecNx16U *) (pScale + outCh); + VQ_INIT_OUTSCALE(pOutScaleData, remainingOutCh, outScaleDataEven, outScaleDataOdd); +#endif + for (x = 0; x < outW; x += 4) /* Image Width */ + { /* walk across the columns */ + int32_t enable2ndWidth = XT_SALT(1, outW - x); + int32_t enable3rdWidth = XT_SALT(2, outW - x); + int32_t enable4thWidth = XT_SALT(3, outW - x); /* enableNthWidth is 1 only when more than N - 1 columns remain (XT_SALT(a, b) is 1 when a < b, as noted at the coefficient loads below); when it is 0 the Nth column's address offsets collapse onto the first column and its output store byte count becomes 0 */ + + /* Output Data pointer */ + int8_t *pOut = pOutData + (x * outDataPitch1 + y * outDataPitch2) * bytesPerPixel; + int32_t *pAcc = pAccData + (x * accDataPitch1 + y * accDataPitch2); + + /* Initialize accumulators from the bias array (INPUT flag set) or from the accumulator tile */ + xb_vec2Nx24 daccSum1, daccSum2, daccSum3, daccSum4; + if (inputFlag) /* Bias Values */ + { + phvecBias = (xb_vecN_2x32v *) (pBiasData + outCh); + ACC_INIT_BIAS(phvecBias, remainingOutCh, daccSum1, daccSum2, daccSum3, daccSum4); + } + else /* Accumulator tile*/ + { + xb_vecN_2x32v hvecAcc1LL, hvecAcc1LH, hvecAcc1HL, hvecAcc1HH; + xb_vecN_2x32v hvecAcc2LL, hvecAcc2LH, hvecAcc2HL, hvecAcc2HH; + xb_vecN_2x32v hvecAcc3LL, hvecAcc3LH, hvecAcc3HL, 
hvecAcc3HH; + xb_vecN_2x32v hvecAcc4LL, hvecAcc4LH, hvecAcc4HL, hvecAcc4HH; + + phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh); + valign vaAcc = IVP_LAN_2X32_PP(phvecAcc); + IVP_LAVN_2X32_XP(hvecAcc1LL, vaAcc, phvecAcc, 4 * remainingOutCh); + IVP_LAVN_2X32_XP(hvecAcc1LH, vaAcc, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH); + IVP_LAVN_2X32_XP(hvecAcc1HL, vaAcc, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH); + IVP_LAVN_2X32_XP(hvecAcc1HH, vaAcc, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH); + daccSum1 = IVP_CVT24UNX32L(hvecAcc1LH, hvecAcc1LL); + IVP_CVT24UNX32H(daccSum1, hvecAcc1HH, hvecAcc1HL); + + phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh + accDataPitch1 * enable2ndWidth); + vaAcc = IVP_LAN_2X32_PP(phvecAcc); + IVP_LAVN_2X32_XP(hvecAcc2LL, vaAcc, phvecAcc, 4 * remainingOutCh); + IVP_LAVN_2X32_XP(hvecAcc2LH, vaAcc, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH); + IVP_LAVN_2X32_XP(hvecAcc2HL, vaAcc, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH); + IVP_LAVN_2X32_XP(hvecAcc2HH, vaAcc, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH); + daccSum2 = IVP_CVT24UNX32L(hvecAcc2LH, hvecAcc2LL); + IVP_CVT24UNX32H(daccSum2, hvecAcc2HH, hvecAcc2HL); + + phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh + accDataPitch1 * 2 * enable3rdWidth); + vaAcc = IVP_LAN_2X32_PP(phvecAcc); + IVP_LAVN_2X32_XP(hvecAcc3LL, vaAcc, phvecAcc, 4 * remainingOutCh); + IVP_LAVN_2X32_XP(hvecAcc3LH, vaAcc, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH); + IVP_LAVN_2X32_XP(hvecAcc3HL, vaAcc, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH); + IVP_LAVN_2X32_XP(hvecAcc3HH, vaAcc, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH); + daccSum3 = IVP_CVT24UNX32L(hvecAcc3LH, hvecAcc3LL); + IVP_CVT24UNX32H(daccSum3, hvecAcc3HH, hvecAcc3HL); + + phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh + accDataPitch1 * 3 * enable4thWidth); + vaAcc = IVP_LAN_2X32_PP(phvecAcc); + IVP_LAVN_2X32_XP(hvecAcc4LL, vaAcc, phvecAcc, 4 * 
remainingOutCh); + IVP_LAVN_2X32_XP(hvecAcc4LH, vaAcc, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH); + IVP_LAVN_2X32_XP(hvecAcc4HL, vaAcc, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH); + IVP_LAVN_2X32_XP(hvecAcc4HH, vaAcc, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH); + daccSum4 = IVP_CVT24UNX32L(hvecAcc4LH, hvecAcc4LL); + IVP_CVT24UNX32H(daccSum4, hvecAcc4HH, hvecAcc4HL); + } + + /* Input Data and Coeff Data Pointers */ + uint8_t *pData = pInData + x * strideX * inDataPitch1 + y * strideY * inDataPitch2; + int8_t *pCoeff = pCoeffData + outCh; + + xb_vecN_2x32v hvecInAddrOff = 0; + xb_vecN_2x32v hvecCoeffAddrOff = 0; + xb_vecN_2x32v hvecLaneIdx = 0; + int32_t inAddrOff, coeffAddrOff; + + for (k = 0; k < kHeightU * kWidthU; k++) /* Kernel Height * Kernel Width */ + { + /* Condition checks performed to get the Input and Coefficient */ + /* Pointer Offsets after combining the Kernel Width and Height Loops */ + vboolN_2 vbN_2 = IVP_EQN_2X32(hvecLaneIdx, kWidthU); + /* hvecLaneIdx will be reset to zero after every kWidth */ + hvecLaneIdx = IVP_MOVN_2X32T(0, hvecLaneIdx, vbN_2); + /* InPitch added after every kWidth */ + IVP_ADDN_2X32T(hvecInAddrOff, hvecInAddrOff, inDataPitch2 * dilationY - kWidthU * inDataPitch1 * dilationX, vbN_2); + /* CoeffPitch added after every kWidth */ + IVP_ADDN_2X32T(hvecCoeffAddrOff, hvecCoeffAddrOff, coeffPitch3 - kWidthU * coeffPitch2, vbN_2); + /* Extracting Input and Coefficient address offsets */ + inAddrOff = IVP_EXTRN_2X32(hvecInAddrOff, 0); + coeffAddrOff = IVP_EXTRN_2X32(hvecCoeffAddrOff, 0); + hvecLaneIdx = IVP_ADDN_2X32(hvecLaneIdx, 1); + hvecCoeffAddrOff = IVP_ADDN_2X32(hvecCoeffAddrOff, coeffPitch2); + hvecInAddrOff = IVP_ADDN_2X32(hvecInAddrOff, inDataPitch1 * dilationX); + + /* Pointers for Input Data Loads */ + pdvecData1 = (xb_vec2Nx8U *) (pData + inAddrOff); + pdvecData2 = (xb_vec2Nx8U *) (pData + inAddrOff + strideX * inDataPitch1 * enable2ndWidth); + pdvecData3 = (xb_vec2Nx8U *) 
(pData + inAddrOff + strideX * inDataPitch1 * 2 * enable3rdWidth); + pdvecData4 = (xb_vec2Nx8U *) (pData + inAddrOff + strideX * inDataPitch1 * 3 * enable4thWidth); + + /* Pointer for Coefficient Load */ + pdvecCoeff = (xb_vec2Nx8 *) (pCoeff + coeffAddrOff); + + /* Primes registers for Aligning Load */ + valign vaData1 = IVP_LA2NX8U_PP(pdvecData1); + valign vaData2 = IVP_LA2NX8U_PP(pdvecData2); + valign vaData3 = IVP_LA2NX8U_PP(pdvecData3); + valign vaData4 = IVP_LA2NX8U_PP(pdvecData4); + + for (inCh = 0; inCh < numInCh - 3; inCh += 4) /* Input Channels, four at a time */ + { + xb_vec2Nx8U dvecInp1; IVP_LAV2NX8U_XP(dvecInp1, vaData1, pdvecData1, 4); + xb_vec2Nx8U dvecInp2; IVP_LAV2NX8U_XP(dvecInp2, vaData2, pdvecData2, 4); + xb_vec2Nx8U dvecInp3; IVP_LAV2NX8U_XP(dvecInp3, vaData3, pdvecData3, 4); + xb_vec2Nx8U dvecInp4; IVP_LAV2NX8U_XP(dvecInp4, vaData4, pdvecData4, 4); + +#ifdef IVP_MULSUQA2N8XR8 + /* Extracting first 4 bytes of vector into address register */ + /* Scalar integers to be used for QMUL */ + int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8U(dvecInp1)), 0); + int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8U(dvecInp2)), 0); + int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8U(dvecInp3)), 0); + int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8U(dvecInp4)), 0); +#else + xb_vec2Nx8 dvecData1; + xb_vec2Nx8 dvecData2; + xb_vec2Nx8 dvecData3; + xb_vec2Nx8 dvecData4; + + dvecData1 = IVP_SUB2NX8U(dvecInp1, 128); + dvecData2 = IVP_SUB2NX8U(dvecInp2, 128); + dvecData3 = IVP_SUB2NX8U(dvecInp3, 128); + dvecData4 = IVP_SUB2NX8U(dvecInp4, 128); + + /* Extracting first 4 bytes of vector into address register */ + /* Scalar integers to be used for QMUL */ + int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData1)), 0); + int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + 
(IVP_MOVNX16_FROM2NX8(dvecData2)), 0); + int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData3)), 0); + int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData4)), 0); +#endif + + /* Aligned Vector Loads of coefficients */ + xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1); + xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1); + xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1); + xb_vec2Nx8 dvecCoeff4; IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch1); + +#ifdef IVP_MULSUQA2N8XR8 + IVP_MULSUQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1); + IVP_MULSUQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2); + IVP_MULSUQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3); + IVP_MULSUQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4); +#else + IVP_MULQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1); + IVP_MULQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2); + IVP_MULQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3); + IVP_MULQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4); +#endif + } /* End Input Channels */ + + /* Corner Case Handling if number of input channels not multiple of 4 */ + if (inCh < numInCh) + { + int32_t remInCh = numInCh - inCh; + + /* Aligning variable vector load of pixels */ + xb_vec2Nx8U dvecInp1; IVP_LAV2NX8U_XP(dvecInp1, vaData1, pdvecData1, remInCh); + xb_vec2Nx8U dvecInp2; IVP_LAV2NX8U_XP(dvecInp2, vaData2, pdvecData2, remInCh); + xb_vec2Nx8U dvecInp3; IVP_LAV2NX8U_XP(dvecInp3, vaData3, pdvecData3, remInCh); + xb_vec2Nx8U dvecInp4; IVP_LAV2NX8U_XP(dvecInp4, vaData4, pdvecData4, remInCh); + +#ifdef IVP_MULSUQA2N8XR8 + /* Extracting first 4 bytes of vector into 
address register */ + /* Scalar integers to be used for QMUL */ + int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8U(dvecInp1)), 0); + int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8U(dvecInp2)), 0); + int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8U(dvecInp3)), 0); + int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8U(dvecInp4)), 0); +#else + xb_vec2Nx8 dvecData1 = 0; + xb_vec2Nx8 dvecData2 = 0; + xb_vec2Nx8 dvecData3 = 0; + xb_vec2Nx8 dvecData4 = 0; + + IVP_SUB2NX8UT(dvecData1, dvecInp1, 128, IVP_LT2NX8(IVP_SEQ2NX8U(), remInCh)); + IVP_SUB2NX8UT(dvecData2, dvecInp2, 128, IVP_LT2NX8(IVP_SEQ2NX8U(), remInCh)); + IVP_SUB2NX8UT(dvecData3, dvecInp3, 128, IVP_LT2NX8(IVP_SEQ2NX8U(), remInCh)); + IVP_SUB2NX8UT(dvecData4, dvecInp4, 128, IVP_LT2NX8(IVP_SEQ2NX8U(), remInCh)); + + /* Extracting first 4 bytes of vector into address register */ + /* Scalar integers to be used for QMUL */ + int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData1)), 0); + int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData2)), 0); + int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData3)), 0); + int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData4)), 0); +#endif + /* For conditional coefficient loads */ + int32_t enable2 = XT_SALT(1, remInCh); /* Will be 1 if remInCh > 1 */ + int32_t enable3 = XT_SALT(2, remInCh); /* Will be 1 if remInCh > 2 */ + + /* Coefficient Loads */ + xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1 * enable2); + xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1 * enable3); + xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1); + +#ifdef IVP_MULSUQA2N8XR8 + IVP_MULSUQA2N8XR8(daccSum1, 0, 
dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1); + IVP_MULSUQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2); + IVP_MULSUQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3); + IVP_MULSUQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4); +#else + IVP_MULQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1); + IVP_MULQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2); + IVP_MULQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3); + IVP_MULQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4); +#endif + } /* End Corner case handling */ + } /* End Kernel Height * Width */ + + if (outputFlag) /* Store to output Tile */ + { + /* Pack, Output Scale, Output Shift and clamping */ + xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L; + xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H; +#ifdef DILATED_VQ_CONV_PARTIAL + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); +#else + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + 
PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); +#endif + /* Store the output dvecOut1 along the output depth; the byte count passed for the H half is multiplied by typeFlag, so it is 0 unless outTile is S16 */ + pdvecOut = (xb_vec2Nx8 *) (pOut + outCh * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh); + IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Store the output dvecOut2 along the output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1 * enable2ndWidth) * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * enable2ndWidth); + IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * enable2ndWidth); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Store the output dvecOut3 along the output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1 * 2 * enable3rdWidth) * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * enable3rdWidth); + IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * enable3rdWidth); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Store the output dvecOut4 along the output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1 * 3 * enable4thWidth) * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * enable4thWidth); + IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * enable4thWidth); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + } + else /* Store to accumulator tile*/ + { + xb_vecN_2x32v hvecAcc1LL = IVP_CVT32S2NX24LL(daccSum1); + xb_vecN_2x32v hvecAcc1LH = IVP_CVT32S2NX24LH(daccSum1); + xb_vecN_2x32v hvecAcc1HL = IVP_CVT32S2NX24HL(daccSum1); + xb_vecN_2x32v 
hvecAcc1HH = IVP_CVT32S2NX24HH(daccSum1); + + xb_vecN_2x32v hvecAcc2LL = IVP_CVT32S2NX24LL(daccSum2); + xb_vecN_2x32v hvecAcc2LH = IVP_CVT32S2NX24LH(daccSum2); + xb_vecN_2x32v hvecAcc2HL = IVP_CVT32S2NX24HL(daccSum2); + xb_vecN_2x32v hvecAcc2HH = IVP_CVT32S2NX24HH(daccSum2); + + xb_vecN_2x32v hvecAcc3LL = IVP_CVT32S2NX24LL(daccSum3); + xb_vecN_2x32v hvecAcc3LH = IVP_CVT32S2NX24LH(daccSum3); + xb_vecN_2x32v hvecAcc3HL = IVP_CVT32S2NX24HL(daccSum3); + xb_vecN_2x32v hvecAcc3HH = IVP_CVT32S2NX24HH(daccSum3); + + xb_vecN_2x32v hvecAcc4LL = IVP_CVT32S2NX24LL(daccSum4); + xb_vecN_2x32v hvecAcc4LH = IVP_CVT32S2NX24LH(daccSum4); + xb_vecN_2x32v hvecAcc4HL = IVP_CVT32S2NX24HL(daccSum4); + xb_vecN_2x32v hvecAcc4HH = IVP_CVT32S2NX24HH(daccSum4); + + + /* Store the hvecAcc1 along the accTile depth */ + phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh); + IVP_SAVN_2X32_XP(hvecAcc1LL, vaOutData, phvecAcc, 4 * remainingOutCh); + IVP_SAVN_2X32_XP(hvecAcc1LH, vaOutData, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAVN_2X32_XP(hvecAcc1HL, vaOutData, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAVN_2X32_XP(hvecAcc1HH, vaOutData, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAPOSN_2X32_FP(vaOutData, phvecAcc); + + /* Store the hvecAcc2 along the accTile depth */ + phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh + accDataPitch1 * enable2ndWidth); + IVP_SAVN_2X32_XP(hvecAcc2LL, vaOutData, phvecAcc, 4 * remainingOutCh); + IVP_SAVN_2X32_XP(hvecAcc2LH, vaOutData, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAVN_2X32_XP(hvecAcc2HL, vaOutData, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAVN_2X32_XP(hvecAcc2HH, vaOutData, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAPOSN_2X32_FP(vaOutData, phvecAcc); + + /* Store the hvecAcc3 along the accTile depth */ + phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh + accDataPitch1 * 2 * enable3rdWidth); + IVP_SAVN_2X32_XP(hvecAcc3LL, vaOutData, 
phvecAcc, 4 * remainingOutCh); + IVP_SAVN_2X32_XP(hvecAcc3LH, vaOutData, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAVN_2X32_XP(hvecAcc3HL, vaOutData, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAVN_2X32_XP(hvecAcc3HH, vaOutData, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAPOSN_2X32_FP(vaOutData, phvecAcc); + + /* Store the hvecAcc4 along the accTile depth */ + phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh + accDataPitch1 * 3 * enable4thWidth); + IVP_SAVN_2X32_XP(hvecAcc4LL, vaOutData, phvecAcc, 4 * remainingOutCh); + IVP_SAVN_2X32_XP(hvecAcc4LH, vaOutData, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAVN_2X32_XP(hvecAcc4HL, vaOutData, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAVN_2X32_XP(hvecAcc4HH, vaOutData, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAPOSN_2X32_FP(vaOutData, phvecAcc); + } + } /* End image width */ + } /* End Output Channels */ + } /* End image height */ +} + +/****************************************************************************/ +/* Description : P6 optimized implementation of 3D partial convolution */ +/* Inputs : Input Data Tile, Coeff Data Tile, Bias Array, */ +/* CNN convolution params structure */ +/* InOuts : Output Tile */ +/* Assumptions : InData is U8, CoeffData is S8 */ +/* biasArray is signed 32b, value not exceeding signed 24b */ +/* OutData is S8 / U8 / S16 */ +/* Kernel Size is MxNxDxNk. M and N sizes are less than or */ +/* equal to 16. */ +/* Input and Output are in DWH format */ +/* Coeff is in NDWH format */ +/* CoeffDim1Pitch is aligned to 2N (Ca2) */ +/* Edges along Depth dimension in inTile and coeffTile */ +/* are zero. 
*/ +/****************************************************************************/ + +#ifdef DILATED_VQ_CONV_PARTIAL +static _XAI_INLINE_ void partialConvolvedVQ3D_S_MxNd1_U8S8IXCa2_noUnrollH_MOD_DWH_contiguous_depth_x4( + const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D accTile, + xai_pTile3D outTile, + const xai_cnn_conv_params *param + ) +#else +static _XAI_INLINE_ void partialConvolved3D_S_MxNd1_U8S8IXCa2_noUnrollH_MOD_DWH_contiguous_depth_x4( + const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D accTile, + xai_pTile3D outTile, + const xai_cnn_conv_params *param + ) +#endif +{ + /* Getting parameters from the tile structures */ + const int32_t outW = XAI_TILE3D_GET_DIM2(outTile); + const int32_t outH = XAI_TILE3D_GET_DIM3(outTile); + const int32_t numInCh = XAI_TILE3D_GET_DIM1(inTile); + const int32_t numOutCh = XAI_TILE3D_GET_DIM1(outTile); + + /* Kernel Size (NDWH) */ + const int32_t kWidthU = XAI_TILE4D_GET_DIM3(coeffTile); + const int32_t kHeightU = XAI_TILE4D_GET_DIM4(coeffTile); + + /* CNN convolution parameters */ + const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param); + const uint8_t outShiftU = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param); + const uint8_t enableReLu = XAI_CNN_CONV_GET_FLAG_RELU(param); + const uint8_t strideX = XAI_CNN_CONV_GET_STRIDEX(param); + const uint8_t strideY = XAI_CNN_CONV_GET_STRIDEY(param); + const uint8_t dilationX = 1; + const uint8_t dilationY = XAI_CNN_CONV_GET_DILATIONY(param); + const uint8_t leftEdgeFlag = XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param); + const uint8_t topEdgeFlag = XAI_CNN_CONV_GET_FLAG_TOPEDGE(param); + const uint8_t inputFlag = XAI_CNN_CONV_GET_FLAG_INPUT(param); + const uint8_t outputFlag = XAI_CNN_CONV_GET_FLAG_OUTPUT(param); + + /* Data Pointers of input, output, coefficient and bias data */ + uint8_t *pInData = (uint8_t *) XAI_TILE3D_GET_DATA_PTR(inTile); + 
int8_t *pOutData = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile); + int8_t *pCoeffData = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile); + int32_t *pBiasData = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray); + + int32_t * pAccData = NULL; + if (!(XAI_CNN_CONV_GET_FLAG_INPUT(param) && XAI_CNN_CONV_GET_FLAG_OUTPUT(param))) + { + pAccData = (int32_t *) XAI_TILE3D_GET_DATA_PTR(accTile); + } + +#ifdef DILATED_VQ_CONV_PARTIAL + uint16_t *pScale = (uint16_t *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray); +#else + const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param); +#endif + + /* Pitches of Coefficient Data (NDWH) in dim1, dim2 and dim3 */ + const int32_t coeffPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTile); + const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile); + + /* Pitches of Input Data (DWH) in dim1 and dim2 */ + const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile); + const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile); + + /* Pitch of Output Data (DWH) in dim1 and dim2 */ + const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile); + const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile); + + /* Pitch of AccTile Data (DWH) in dim1 and dim2 */ + int32_t accDataPitch1 = 0; + int32_t accDataPitch2 = 0; + if (!(XAI_CNN_CONV_GET_FLAG_INPUT(param) && XAI_CNN_CONV_GET_FLAG_OUTPUT(param))) + { + accDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(accTile); + accDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(accTile); + } + + int32_t numIter = kWidthU * numInCh; + + int32_t dilatedKWidthU = dilationX * (kWidthU - 1) + 1; + int32_t dilatedKHeightU = dilationY * (kHeightU - 1) + 1; + int32_t leftEdge, topEdge; + if ((dilatedKWidthU % 2) != 0) + { + leftEdge = dilatedKWidthU / 2; + } + else + { + leftEdge = leftEdgeFlag ? (dilatedKWidthU / 2) : ((dilatedKWidthU / 2) - 1); + } + + if ((dilatedKHeightU % 2) != 0) + { + topEdge = dilatedKHeightU / 2; + } + else + { + topEdge = topEdgeFlag ? 
(dilatedKHeightU / 2) : ((dilatedKHeightU / 2) - 1); + } + + + /* Move pointer to the start of the data (including edge) */ + pInData = &pInData[-((leftEdge) * inDataPitch1 + (topEdge) * inDataPitch2)]; + + + /* Setting the limits for output data according to ReLu Flag and outTileType */ + int32_t minLim, maxLim; + if (enableReLu) + { + minLim = XAI_CNN_CONV_GET_RELU_MIN(param); + maxLim = XAI_CNN_CONV_GET_RELU_MAX(param); + } + else + { + minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \ + SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0); + maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \ + : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX); + } + const int8_t typeFlag = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0; + const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile); + + /* Variable Declarations */ + int32_t outCh, x, y, ky, k; + valign vaOutData = IVP_ZALIGN(); + + xb_vecN_2x32v* restrict phvecBias; + xb_vec2Nx8* restrict pdvecCoeff; + xb_vec2Nx8U* restrict pdvecData1; + xb_vec2Nx8U* restrict pdvecData2; + xb_vec2Nx8U* restrict pdvecData3; + xb_vec2Nx8U* restrict pdvecData4; + xb_vec2Nx8* restrict pdvecOut; + xb_vecN_2x32v* restrict phvecAcc; + + /* + * inCh and kWidth loops are combined. Assumed that the + * edges along Depth dimension of input data is zero and also + * edges along depth dimension of coefficient data is zero. 
+ */ + + /* Loops Start */ + for (y = 0; y < outH; y++) /* Image Height */ + { /* walk down the rows */ + for (outCh = 0; outCh < numOutCh; outCh += 2 * XCHAL_IVPN_SIMD_WIDTH) + { /* walk across the kernels */ + /* To handle corner case when number of output channels + * is not a multiple of 2 * XCHAL_IVPN_SIMD_WIDTH*/ + int32_t remainingOutCh = numOutCh - outCh; +#ifdef DILATED_VQ_CONV_PARTIAL + xb_vecNx16U outScaleDataEven, outScaleDataOdd; + /*Load output scale values*/ + xb_vecNx16U* restrict pOutScaleData = (xb_vecNx16U *) (pScale + outCh); + VQ_INIT_OUTSCALE(pOutScaleData, remainingOutCh, outScaleDataEven, outScaleDataOdd); +#endif + for (x = 0; x < outW; x += 4) /* Image Width */ + { /* walk across the columns */ + int32_t enable2ndWidth = XT_SALT(1, outW - x); + int32_t enable3rdWidth = XT_SALT(2, outW - x); + int32_t enable4thWidth = XT_SALT(3, outW - x); + /* Output Data pointer */ + int8_t *pOut = pOutData + (x * outDataPitch1 + y * outDataPitch2) * bytesPerPixel; + int32_t *pAcc = pAccData + (x * accDataPitch1 + y * accDataPitch2); + + /* Initialize accumulators */ + xb_vec2Nx24 daccSum1, daccSum2, daccSum3, daccSum4; + if (inputFlag) /* Bias Values */ + { + phvecBias = (xb_vecN_2x32v *) (pBiasData + outCh); + ACC_INIT_BIAS(phvecBias, remainingOutCh, daccSum1, daccSum2, daccSum3, daccSum4); + } + else /* Accumulator tile*/ + { + xb_vecN_2x32v hvecAcc1LL, hvecAcc1LH, hvecAcc1HL, hvecAcc1HH; + xb_vecN_2x32v hvecAcc2LL, hvecAcc2LH, hvecAcc2HL, hvecAcc2HH; + xb_vecN_2x32v hvecAcc3LL, hvecAcc3LH, hvecAcc3HL, hvecAcc3HH; + xb_vecN_2x32v hvecAcc4LL, hvecAcc4LH, hvecAcc4HL, hvecAcc4HH; + + phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh); + valign vaAcc = IVP_LAN_2X32_PP(phvecAcc); + IVP_LAVN_2X32_XP(hvecAcc1LL, vaAcc, phvecAcc, 4 * remainingOutCh); + IVP_LAVN_2X32_XP(hvecAcc1LH, vaAcc, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH); + IVP_LAVN_2X32_XP(hvecAcc1HL, vaAcc, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH); + 
IVP_LAVN_2X32_XP(hvecAcc1HH, vaAcc, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH); + daccSum1 = IVP_CVT24UNX32L(hvecAcc1LH, hvecAcc1LL); + IVP_CVT24UNX32H(daccSum1, hvecAcc1HH, hvecAcc1HL); + + phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh + accDataPitch1 * enable2ndWidth); + vaAcc = IVP_LAN_2X32_PP(phvecAcc); + IVP_LAVN_2X32_XP(hvecAcc2LL, vaAcc, phvecAcc, 4 * remainingOutCh); + IVP_LAVN_2X32_XP(hvecAcc2LH, vaAcc, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH); + IVP_LAVN_2X32_XP(hvecAcc2HL, vaAcc, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH); + IVP_LAVN_2X32_XP(hvecAcc2HH, vaAcc, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH); + daccSum2 = IVP_CVT24UNX32L(hvecAcc2LH, hvecAcc2LL); + IVP_CVT24UNX32H(daccSum2, hvecAcc2HH, hvecAcc2HL); + + phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh + accDataPitch1 * 2 * enable3rdWidth); + vaAcc = IVP_LAN_2X32_PP(phvecAcc); + IVP_LAVN_2X32_XP(hvecAcc3LL, vaAcc, phvecAcc, 4 * remainingOutCh); + IVP_LAVN_2X32_XP(hvecAcc3LH, vaAcc, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH); + IVP_LAVN_2X32_XP(hvecAcc3HL, vaAcc, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH); + IVP_LAVN_2X32_XP(hvecAcc3HH, vaAcc, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH); + daccSum3 = IVP_CVT24UNX32L(hvecAcc3LH, hvecAcc3LL); + IVP_CVT24UNX32H(daccSum3, hvecAcc3HH, hvecAcc3HL); + + phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh + accDataPitch1 * 3 * enable4thWidth); + vaAcc = IVP_LAN_2X32_PP(phvecAcc); + IVP_LAVN_2X32_XP(hvecAcc4LL, vaAcc, phvecAcc, 4 * remainingOutCh); + IVP_LAVN_2X32_XP(hvecAcc4LH, vaAcc, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH); + IVP_LAVN_2X32_XP(hvecAcc4HL, vaAcc, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH); + IVP_LAVN_2X32_XP(hvecAcc4HH, vaAcc, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH); + daccSum4 = IVP_CVT24UNX32L(hvecAcc4LH, hvecAcc4LL); + IVP_CVT24UNX32H(daccSum4, hvecAcc4HH, hvecAcc4HL); + } + + /* Input Data and Coeff 
Data Pointers */ + uint8_t *pData = pInData + x * strideX * inDataPitch1 + y * strideY * inDataPitch2; + int8_t *pCoeff = pCoeffData + outCh; + + + for (ky = 0; ky < kHeightU; ky++) /* Kernel Height */ + { + /* Pointers for Input Data Loads */ + pdvecData1 = (xb_vec2Nx8U *) (pData + ky * inDataPitch2 * dilationY); + pdvecData2 = (xb_vec2Nx8U *) (pData + ky * inDataPitch2 * dilationY + strideX * inDataPitch1 * enable2ndWidth); + pdvecData3 = (xb_vec2Nx8U *) (pData + ky * inDataPitch2 * dilationY + strideX * inDataPitch1 * 2 * enable3rdWidth); + pdvecData4 = (xb_vec2Nx8U *) (pData + ky * inDataPitch2 * dilationY + strideX * inDataPitch1 * 3 * enable4thWidth); + + /* Pointer for Coefficient Load */ + pdvecCoeff = (xb_vec2Nx8 *) (pCoeff + ky * coeffPitch3); + + /* Primes for Aligning Load */ + valign vaData1 = IVP_LA2NX8U_PP(pdvecData1); + valign vaData2 = IVP_LA2NX8U_PP(pdvecData2); + valign vaData3 = IVP_LA2NX8U_PP(pdvecData3); + valign vaData4 = IVP_LA2NX8U_PP(pdvecData4); + +#ifdef __XCC__ +#pragma loop_count min=1 +#endif + for (k = 0; k < numIter; k += 4) /* (Input Channels * kWidth) loops combined */ + { + xb_vec2Nx8U dvecInp1; IVP_LAV2NX8U_XP(dvecInp1, vaData1, pdvecData1, 4); + xb_vec2Nx8U dvecInp2; IVP_LAV2NX8U_XP(dvecInp2, vaData2, pdvecData2, 4); + xb_vec2Nx8U dvecInp3; IVP_LAV2NX8U_XP(dvecInp3, vaData3, pdvecData3, 4); + xb_vec2Nx8U dvecInp4; IVP_LAV2NX8U_XP(dvecInp4, vaData4, pdvecData4, 4); + +#ifdef IVP_MULSUQA2N8XR8 + /* Extracting first 4 bytes of vector into address register */ + /* Scalar integers to be used for QMUL */ + int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8U(dvecInp1)), 0); + int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8U(dvecInp2)), 0); + int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8U(dvecInp3)), 0); + int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8U(dvecInp4)), 0); +#else + 
xb_vec2Nx8 dvecData1; + xb_vec2Nx8 dvecData2; + xb_vec2Nx8 dvecData3; + xb_vec2Nx8 dvecData4; + + dvecData1 = IVP_SUB2NX8U(dvecInp1, 128); + dvecData2 = IVP_SUB2NX8U(dvecInp2, 128); + dvecData3 = IVP_SUB2NX8U(dvecInp3, 128); + dvecData4 = IVP_SUB2NX8U(dvecInp4, 128); + + /* Extracting first 4 bytes of vector into address register */ + /* Scalar integers to be used for QMUL */ + int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData1)), 0); + int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData2)), 0); + int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData3)), 0); + int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData4)), 0); +#endif + + /* Aligned Vector Loads of coefficients */ + xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1); + xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1); + xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1); + xb_vec2Nx8 dvecCoeff4; IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch1); + +#ifdef IVP_MULSUQA2N8XR8 + IVP_MULSUQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1); + IVP_MULSUQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2); + IVP_MULSUQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3); + IVP_MULSUQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4); +#else + IVP_MULQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1); + IVP_MULQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2); + IVP_MULQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3); + IVP_MULQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4); +#endif + } /* End Input Channels */ + } /* End Kernel Height * Width */ + 
+ if (outputFlag) /* Store to ouput Tile*/ + { + /* Pack, Output Scale, Output Shift and clamping */ + xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L; + xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H; +#ifdef DILATED_VQ_CONV_PARTIAL + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); +#else + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); +#endif + /* Store the output dvecOut1 along the output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOut + outCh * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh); + IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Store the output dvecOut2 along the output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1 * enable2ndWidth) * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * 
remainingOutCh * enable2ndWidth); + IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * enable2ndWidth); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Store the output dvecOut3 along the output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1 * 2 * enable3rdWidth) * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * enable3rdWidth); + IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * enable3rdWidth); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Store the output dvecOut4 along the output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1 * 3 * enable4thWidth) * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * \ + remainingOutCh * enable4thWidth); + IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * enable4thWidth); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + } + else /* Store to accumulator tile*/ + { + xb_vecN_2x32v hvecAcc1LL = IVP_CVT32S2NX24LL(daccSum1); + xb_vecN_2x32v hvecAcc1LH = IVP_CVT32S2NX24LH(daccSum1); + xb_vecN_2x32v hvecAcc1HL = IVP_CVT32S2NX24HL(daccSum1); + xb_vecN_2x32v hvecAcc1HH = IVP_CVT32S2NX24HH(daccSum1); + + xb_vecN_2x32v hvecAcc2LL = IVP_CVT32S2NX24LL(daccSum2); + xb_vecN_2x32v hvecAcc2LH = IVP_CVT32S2NX24LH(daccSum2); + xb_vecN_2x32v hvecAcc2HL = IVP_CVT32S2NX24HL(daccSum2); + xb_vecN_2x32v hvecAcc2HH = IVP_CVT32S2NX24HH(daccSum2); + + xb_vecN_2x32v hvecAcc3LL = IVP_CVT32S2NX24LL(daccSum3); + xb_vecN_2x32v hvecAcc3LH = IVP_CVT32S2NX24LH(daccSum3); + xb_vecN_2x32v hvecAcc3HL = IVP_CVT32S2NX24HL(daccSum3); + xb_vecN_2x32v hvecAcc3HH = IVP_CVT32S2NX24HH(daccSum3); + + xb_vecN_2x32v hvecAcc4LL = IVP_CVT32S2NX24LL(daccSum4); + xb_vecN_2x32v hvecAcc4LH = IVP_CVT32S2NX24LH(daccSum4); + xb_vecN_2x32v hvecAcc4HL = IVP_CVT32S2NX24HL(daccSum4); + xb_vecN_2x32v 
hvecAcc4HH = IVP_CVT32S2NX24HH(daccSum4); + + + /* Store the hvecAcc1 along the accTile depth */ + phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh); + IVP_SAVN_2X32_XP(hvecAcc1LL, vaOutData, phvecAcc, 4 * remainingOutCh); + IVP_SAVN_2X32_XP(hvecAcc1LH, vaOutData, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAVN_2X32_XP(hvecAcc1HL, vaOutData, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAVN_2X32_XP(hvecAcc1HH, vaOutData, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAPOSN_2X32_FP(vaOutData, phvecAcc); + + /* Store the hvecAcc2 along the accTile depth */ + phvecAcc = (xb_vecN_2x32v *) (pAcc + (outCh + accDataPitch1 * enable2ndWidth)); + IVP_SAVN_2X32_XP(hvecAcc2LL, vaOutData, phvecAcc, 4 * remainingOutCh); + IVP_SAVN_2X32_XP(hvecAcc2LH, vaOutData, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAVN_2X32_XP(hvecAcc2HL, vaOutData, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAVN_2X32_XP(hvecAcc2HH, vaOutData, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAPOSN_2X32_FP(vaOutData, phvecAcc); + + /* Store the hvecAcc3 along the accTile depth */ + phvecAcc = (xb_vecN_2x32v *) (pAcc + (outCh + accDataPitch1 * 2 * enable3rdWidth)); + IVP_SAVN_2X32_XP(hvecAcc3LL, vaOutData, phvecAcc, 4 * remainingOutCh); + IVP_SAVN_2X32_XP(hvecAcc3LH, vaOutData, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAVN_2X32_XP(hvecAcc3HL, vaOutData, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAVN_2X32_XP(hvecAcc3HH, vaOutData, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAPOSN_2X32_FP(vaOutData, phvecAcc); + + /* Store the hvecAcc4 along the accTile depth */ + phvecAcc = (xb_vecN_2x32v *) (pAcc + (outCh + accDataPitch1 * 3 * enable4thWidth)); + IVP_SAVN_2X32_XP(hvecAcc4LL, vaOutData, phvecAcc, 4 * remainingOutCh); + IVP_SAVN_2X32_XP(hvecAcc4LH, vaOutData, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH); + 
IVP_SAVN_2X32_XP(hvecAcc4HL, vaOutData, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAVN_2X32_XP(hvecAcc4HH, vaOutData, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAPOSN_2X32_FP(vaOutData, phvecAcc); + } + } /* End image width */ + } /* End image height */ + } /* End Output Channels */ +} + +/****************************************************************************/ +/* Description : P6 optimized implementation of 3D partial convolution */ +/* Inputs : Input Data Tile, Coeff Data Tile, Bias Array, */ +/* CNN convolution params structure */ +/* InOuts : Output Tile */ +/* Assumptions : InData is U8, CoeffData is S8 */ +/* biasArray is signed 32b, value not exceeding signed 24b */ +/* OutData is S8 / U8 / S16 */ +/* Kernel Size is MxNxDxNk. M and N sizes are less than or */ +/* equal to 16. */ +/* Input and Output are in DWH format */ +/* Coeff is in NDWH format */ +/* CoeffDim1Pitch is aligned to 2N (Ca2) */ +/* Edges along Depth dimension in inTile and coeffTile */ +/* are zero. 
*/ +/****************************************************************************/ + +#ifdef DILATED_VQ_CONV_PARTIAL +static _XAI_INLINE_ void partialConvolvedVQ3D_S_MxNd1_U8S8IXCa2_noUnrollH_MOD_DWH_contiguous_depth( + const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D accTile, + xai_pTile3D outTile, + const xai_cnn_conv_params *param + ) +#else +static _XAI_INLINE_ void partialConvolved3D_S_MxNd1_U8S8IXCa2_noUnrollH_MOD_DWH_contiguous_depth( + const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D accTile, + xai_pTile3D outTile, + const xai_cnn_conv_params *param + ) +#endif +{ + /* Partial 3D convolution kernel. Each pass of the innermost (width) loop produces up to 4 neighbouring output columns for one group of 2 * XCHAL_IVPN_SIMD_WIDTH output channels. inputFlag selects how the accumulators are seeded (bias array vs. accTile) and outputFlag selects where they end up (packed/scaled/clamped into outTile vs. raw 32-bit sums into accTile). */ /* Getting parameters from the tile structures */ + const int32_t outW = XAI_TILE3D_GET_DIM2(outTile); + const int32_t outH = XAI_TILE3D_GET_DIM3(outTile); + const int32_t numInCh = XAI_TILE3D_GET_DIM1(inTile); + const int32_t numOutCh = XAI_TILE3D_GET_DIM1(outTile); + + /* Kernel Size (NDWH) */ + const int32_t kWidthU = XAI_TILE4D_GET_DIM3(coeffTile); + const int32_t kHeightU = XAI_TILE4D_GET_DIM4(coeffTile); + + /* CNN convolution parameters */ + const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param); + const uint8_t outShiftU = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param); + const uint8_t enableReLu = XAI_CNN_CONV_GET_FLAG_RELU(param); + const uint8_t strideX = XAI_CNN_CONV_GET_STRIDEX(param); + const uint8_t strideY = XAI_CNN_CONV_GET_STRIDEY(param); + const uint8_t dilationX = 1; /* X dilation is hard-wired to 1 in this variant, so dilatedKWidthU below equals kWidthU */ + const uint8_t dilationY = XAI_CNN_CONV_GET_DILATIONY(param); + const uint8_t leftEdgeFlag = XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param); + const uint8_t topEdgeFlag = XAI_CNN_CONV_GET_FLAG_TOPEDGE(param); + const uint8_t inputFlag = XAI_CNN_CONV_GET_FLAG_INPUT(param); + const uint8_t outputFlag = XAI_CNN_CONV_GET_FLAG_OUTPUT(param); + + /* Data Pointers of input, output, coefficient and bias data */ + uint8_t *pInData = (uint8_t *) XAI_TILE3D_GET_DATA_PTR(inTile); + int8_t
*pOutData = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile); + int8_t *pCoeffData = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile); + int32_t *pBiasData = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray); + + /* accTile is only queried when the INPUT and OUTPUT flags are not both set; otherwise pAccData stays NULL and the acc pitches further down stay 0 */ int32_t * pAccData = NULL; + if (!(XAI_CNN_CONV_GET_FLAG_INPUT(param) && XAI_CNN_CONV_GET_FLAG_OUTPUT(param))) + { + pAccData = (int32_t *) XAI_TILE3D_GET_DATA_PTR(accTile); + } + +#ifdef DILATED_VQ_CONV_PARTIAL + uint16_t *pScale = (uint16_t *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray); +#else + const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param); +#endif + + /* Pitches of Coefficient Data (NDWH) in dim1 and dim3 (the dim2 pitch is not read by this function) */ + const int32_t coeffPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTile); + const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile); + + /* Pitches of Input Data (DWH) in dim1 and dim2 */ + const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile); + const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile); + + /* Pitch of Output Data (DWH) in dim1 and dim2 */ + const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile); + const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile); + + /* Pitch of AccTile Data (DWH) in dim1 and dim2 */ + int32_t accDataPitch1 = 0; + int32_t accDataPitch2 = 0; + if (!(XAI_CNN_CONV_GET_FLAG_INPUT(param) && XAI_CNN_CONV_GET_FLAG_OUTPUT(param))) + { + accDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(accTile); + accDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(accTile); + } + + /* Trip count of the combined (kernel width * input channel) inner loop. The k loop reads this many consecutive input bytes per kernel row, which presumably relies on the input depth being stored contiguously (inDataPitch1 == numInCh) - TODO confirm against the caller */ int32_t numIter = kWidthU * numInCh; + + int32_t dilatedKWidthU = dilationX * (kWidthU - 1) + 1; + int32_t dilatedKHeightU = dilationY * (kHeightU - 1) + 1; + /* Number of input columns/rows that precede the kernel centre: half the dilated size for odd sizes; for even sizes the LEFTEDGE/TOPEDGE flags choose between size/2 and size/2 - 1 */ int32_t leftEdge, topEdge; + if ((dilatedKWidthU % 2) != 0) + { + leftEdge = dilatedKWidthU / 2; + } + else + { + leftEdge = leftEdgeFlag ? (dilatedKWidthU / 2) : ((dilatedKWidthU / 2) - 1); + } + + if ((dilatedKHeightU % 2) != 0) + { + topEdge = dilatedKHeightU / 2; + } + else + { + topEdge = topEdgeFlag ?
(dilatedKHeightU / 2) : ((dilatedKHeightU / 2) - 1); + } + + + /* Move pointer to the start of the data (including edge) */ + pInData = &pInData[-((leftEdge) * inDataPitch1 + (topEdge) * inDataPitch2)]; + + + /* Setting the limits for output data: the ReLU min/max from param when ReLU is enabled, otherwise the full range of the output element type (S16, S8, or 0..UCHAR_MAX for anything else) */ + int32_t minLim, maxLim; + if (enableReLu) + { + minLim = XAI_CNN_CONV_GET_RELU_MIN(param); + maxLim = XAI_CNN_CONV_GET_RELU_MAX(param); + } + else + { + minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \ + SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0); + maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \ + : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX); + } + /* typeFlag is 1 only for an S16 output tile; it scales the byte count of the second (high-half) output store below, which is therefore empty for 8-bit outputs */ const int8_t typeFlag = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0; + const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile); + + /* Variable Declarations */ + int32_t outCh, x, y, ky, k; + valign vaOutData = IVP_ZALIGN(); + + xb_vecN_2x32v* restrict phvecBias; + xb_vec2Nx8* restrict pdvecCoeff; + xb_vec2Nx8U* restrict pdvecData1; + xb_vec2Nx8U* restrict pdvecData2; + xb_vec2Nx8U* restrict pdvecData3; + xb_vec2Nx8U* restrict pdvecData4; + xb_vec2Nx8* restrict pdvecOut; + xb_vecN_2x32v* restrict phvecAcc; + + /* + * inCh and kWidth loops are combined. Assumed that the + * edges along Depth dimension of input data is zero and also + * edges along depth dimension of coefficient data is zero.
+ */ + + /* Loops Start: nesting order is output row (y) -> output channel group (outCh) -> group of 4 output columns (x) */ + for (y = 0; y < outH; y++) /* Image Height */ + { /* walk down the rows */ + for (outCh = 0; outCh < numOutCh; outCh += 2 * XCHAL_IVPN_SIMD_WIDTH) + { /* walk across the kernels */ + /* To handle corner case when number of output channels + * is not a multiple of 2 * XCHAL_IVPN_SIMD_WIDTH*/ + int32_t remainingOutCh = numOutCh - outCh; +#ifdef DILATED_VQ_CONV_PARTIAL + xb_vecNx16U outScaleDataEven, outScaleDataOdd; + /*Load output scale values*/ + xb_vecNx16U* restrict pOutScaleData = (xb_vecNx16U *) (pScale + outCh); + VQ_INIT_OUTSCALE(pOutScaleData, remainingOutCh, outScaleDataEven, outScaleDataOdd); +#endif + + for (x = 0; x < outW; x += 4) /* Image Width */ + { /* walk across the columns */ + /* Column-enable flags for the 2nd/3rd/4th column of this group of four, covering the case where outW is not a multiple of 4. XT_SALT(a, b) is taken to yield 1 when a < b (consistent with the remInCh notes further down), so a flag is 0 when its column lies at or beyond outW. Every pointer offset of a disabled column is multiplied by 0, i.e. the column aliases column 1. */ + int32_t enable2ndWidth = XT_SALT(1, outW - x); + int32_t enable3rdWidth = XT_SALT(2, outW - x); + int32_t enable4thWidth = XT_SALT(3, outW - x); + + /* Output Data pointer */ + int8_t *pOut = pOutData + (x * outDataPitch1 + y * outDataPitch2) * bytesPerPixel; + int32_t *pAcc = pAccData + (x * accDataPitch1 + y * accDataPitch2); + + /* Initialize accumulators */ + xb_vec2Nx24 daccSum1, daccSum2, daccSum3, daccSum4; + if (inputFlag) /* Bias Values */ + { + phvecBias = (xb_vecN_2x32v *) (pBiasData + outCh); + ACC_INIT_BIAS(phvecBias, remainingOutCh, daccSum1, daccSum2, daccSum3, daccSum4); + } + else /* Accumulator tile: reload the 32-bit partial sums of each column as four quarter-vectors (LL, LH, HL, HH) and repack them into that column's xb_vec2Nx24 accumulator */ + { + xb_vecN_2x32v hvecAcc1LL, hvecAcc1LH, hvecAcc1HL, hvecAcc1HH; + xb_vecN_2x32v hvecAcc2LL, hvecAcc2LH, hvecAcc2HL, hvecAcc2HH; + xb_vecN_2x32v hvecAcc3LL, hvecAcc3LH, hvecAcc3HL, hvecAcc3HH; + xb_vecN_2x32v hvecAcc4LL, hvecAcc4LH, hvecAcc4HL, hvecAcc4HH; + + phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh); + valign vaAcc = IVP_LAN_2X32_PP(phvecAcc); + IVP_LAVN_2X32_XP(hvecAcc1LL, vaAcc, phvecAcc, 4 * remainingOutCh); + IVP_LAVN_2X32_XP(hvecAcc1LH, vaAcc, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH); + IVP_LAVN_2X32_XP(hvecAcc1HL, vaAcc, phvecAcc, 4 *
remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH); + IVP_LAVN_2X32_XP(hvecAcc1HH, vaAcc, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH); + daccSum1 = IVP_CVT24UNX32L(hvecAcc1LH, hvecAcc1LL); + IVP_CVT24UNX32H(daccSum1, hvecAcc1HH, hvecAcc1HL); + + /* Columns 2 to 4 repeat the same sequence; a disabled column has its offset multiplied by 0 and so re-reads column 1's sums */ phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh + accDataPitch1 * enable2ndWidth); + vaAcc = IVP_LAN_2X32_PP(phvecAcc); + IVP_LAVN_2X32_XP(hvecAcc2LL, vaAcc, phvecAcc, 4 * remainingOutCh); + IVP_LAVN_2X32_XP(hvecAcc2LH, vaAcc, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH); + IVP_LAVN_2X32_XP(hvecAcc2HL, vaAcc, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH); + IVP_LAVN_2X32_XP(hvecAcc2HH, vaAcc, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH); + daccSum2 = IVP_CVT24UNX32L(hvecAcc2LH, hvecAcc2LL); + IVP_CVT24UNX32H(daccSum2, hvecAcc2HH, hvecAcc2HL); + + phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh + accDataPitch1 * 2 * enable3rdWidth); + vaAcc = IVP_LAN_2X32_PP(phvecAcc); + IVP_LAVN_2X32_XP(hvecAcc3LL, vaAcc, phvecAcc, 4 * remainingOutCh); + IVP_LAVN_2X32_XP(hvecAcc3LH, vaAcc, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH); + IVP_LAVN_2X32_XP(hvecAcc3HL, vaAcc, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH); + IVP_LAVN_2X32_XP(hvecAcc3HH, vaAcc, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH); + daccSum3 = IVP_CVT24UNX32L(hvecAcc3LH, hvecAcc3LL); + IVP_CVT24UNX32H(daccSum3, hvecAcc3HH, hvecAcc3HL); + + phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh + accDataPitch1 * 3 * enable4thWidth); + vaAcc = IVP_LAN_2X32_PP(phvecAcc); + IVP_LAVN_2X32_XP(hvecAcc4LL, vaAcc, phvecAcc, 4 * remainingOutCh); + IVP_LAVN_2X32_XP(hvecAcc4LH, vaAcc, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH); + IVP_LAVN_2X32_XP(hvecAcc4HL, vaAcc, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH); + IVP_LAVN_2X32_XP(hvecAcc4HH, vaAcc, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH); + daccSum4 = IVP_CVT24UNX32L(hvecAcc4LH, hvecAcc4LL); + IVP_CVT24UNX32H(daccSum4,
hvecAcc4HH, hvecAcc4HL); + } + + /* Input Data and Coeff Data Pointers: first input byte under the kernel for column x / row y, and the first coefficient of this output channel group */ + uint8_t *pData = pInData + x * strideX * inDataPitch1 + y * strideY * inDataPitch2; + int8_t *pCoeff = pCoeffData + outCh; + +#ifdef __XCC__ +#pragma loop_count min=1 +#endif + for (ky = 0; ky < kHeightU; ky++) /* Kernel Height */ + { + /* Pointers for Input Data Loads: one per output column, strideX input pixels apart; disabled columns collapse onto column 1 */ + pdvecData1 = (xb_vec2Nx8U *) (pData + ky * inDataPitch2 * dilationY); + pdvecData2 = (xb_vec2Nx8U *) (pData + ky * inDataPitch2 * dilationY + strideX * inDataPitch1 * enable2ndWidth); + pdvecData3 = (xb_vec2Nx8U *) (pData + ky * inDataPitch2 * dilationY + strideX * inDataPitch1 * 2 * enable3rdWidth); + pdvecData4 = (xb_vec2Nx8U *) (pData + ky * inDataPitch2 * dilationY + strideX * inDataPitch1 * 3 * enable4thWidth); + + /* Pointer for Coefficient Load */ + pdvecCoeff = (xb_vec2Nx8 *) (pCoeff + ky * coeffPitch3); + + /* Primes for Aligning Load */ + valign vaData1 = IVP_LA2NX8U_PP(pdvecData1); + valign vaData2 = IVP_LA2NX8U_PP(pdvecData2); + valign vaData3 = IVP_LA2NX8U_PP(pdvecData3); + valign vaData4 = IVP_LA2NX8U_PP(pdvecData4); + + for (k = 0; k < numIter - 3; k += 4) /* (Input Channels * kWidth) loops combined; only full groups of 4 input bytes are handled here, any remainder is processed after the loop */ + { + /* Aligning variable vector load of pixels */ + xb_vec2Nx8U dvecInp1; IVP_LAV2NX8U_XP(dvecInp1, vaData1, pdvecData1, 4); + xb_vec2Nx8U dvecInp2; IVP_LAV2NX8U_XP(dvecInp2, vaData2, pdvecData2, 4); + xb_vec2Nx8U dvecInp3; IVP_LAV2NX8U_XP(dvecInp3, vaData3, pdvecData3, 4); + xb_vec2Nx8U dvecInp4; IVP_LAV2NX8U_XP(dvecInp4, vaData4, pdvecData4, 4); + +#ifdef IVP_MULSUQA2N8XR8 + /* Extracting first 4 bytes of vector into address register */ + /* Scalar integers to be used for QMUL */ + int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8U(dvecInp1)), 0); + int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8U(dvecInp2)), 0); + int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8U(dvecInp3)), 0); + int32_t
qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8U(dvecInp4)), 0); +#else + xb_vec2Nx8 dvecData1; + xb_vec2Nx8 dvecData2; + xb_vec2Nx8 dvecData3; + xb_vec2Nx8 dvecData4; + + dvecData1 = IVP_SUB2NX8U(dvecInp1, 128); + dvecData2 = IVP_SUB2NX8U(dvecInp2, 128); + dvecData3 = IVP_SUB2NX8U(dvecInp3, 128); + dvecData4 = IVP_SUB2NX8U(dvecInp4, 128); + + /* Extracting first 4 bytes of vector into address register */ + /* Scalar integers to be used for QMUL */ + int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData1)), 0); + int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData2)), 0); + int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData3)), 0); + int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData4)), 0); +#endif + + /* Aligned Vector Loads of coefficients: four consecutive rows, coeffPitch1 apart, one per input byte packed in the scalar */ + xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1); + xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1); + xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1); + xb_vec2Nx8 dvecCoeff4; IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch1); + +#ifdef IVP_MULSUQA2N8XR8 + IVP_MULSUQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1); + IVP_MULSUQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2); + IVP_MULSUQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3); + IVP_MULSUQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4); +#else + IVP_MULQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1); + IVP_MULQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2); + IVP_MULQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3); + IVP_MULQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3,
dvecCoeff2, dvecCoeff1, qmulScalar4); +#endif + } /* End Input Channels */ + /* Corner case handling when numIter is not a multiple of 4: between 1 and 3 input bytes are left for this kernel row */ + if (k < numIter) + { + int32_t remInCh = numIter - k; + + /* Aligning variable vector load of pixels */ + xb_vec2Nx8U dvecInp1; IVP_LAV2NX8U_XP(dvecInp1, vaData1, pdvecData1, remInCh); + xb_vec2Nx8U dvecInp2; IVP_LAV2NX8U_XP(dvecInp2, vaData2, pdvecData2, remInCh); + xb_vec2Nx8U dvecInp3; IVP_LAV2NX8U_XP(dvecInp3, vaData3, pdvecData3, remInCh); + xb_vec2Nx8U dvecInp4; IVP_LAV2NX8U_XP(dvecInp4, vaData4, pdvecData4, remInCh); + +#ifdef IVP_MULSUQA2N8XR8 + /* Extracting first 4 bytes of vector into address register */ + /* Scalar integers to be used for QMUL */ + int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8U(dvecInp1)), 0); + int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8U(dvecInp2)), 0); + int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8U(dvecInp3)), 0); + int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8U(dvecInp4)), 0); +#else + xb_vec2Nx8 dvecData1 = 0; + xb_vec2Nx8 dvecData2 = 0; + xb_vec2Nx8 dvecData3 = 0; + xb_vec2Nx8 dvecData4 = 0; + + /* The mask selects only lanes with index < remInCh; the remaining lanes presumably keep their 0 initialiser so the unused bytes of the scalar contribute nothing - confirm against the ISA reference */ IVP_SUB2NX8UT(dvecData1, dvecInp1, 128, IVP_LT2NX8(IVP_SEQ2NX8U(), remInCh)); + IVP_SUB2NX8UT(dvecData2, dvecInp2, 128, IVP_LT2NX8(IVP_SEQ2NX8U(), remInCh)); + IVP_SUB2NX8UT(dvecData3, dvecInp3, 128, IVP_LT2NX8(IVP_SEQ2NX8U(), remInCh)); + IVP_SUB2NX8UT(dvecData4, dvecInp4, 128, IVP_LT2NX8(IVP_SEQ2NX8U(), remInCh)); + + /* Extracting first 4 bytes of vector into address register */ + /* Scalar integers to be used for QMUL */ + int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData1)), 0); + int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData2)), 0); + int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData3)), 0); +
int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData4)), 0); +#endif + /* For conditional coefficient loads */ + int32_t enable2 = XT_SALT(1, remInCh); /* Will be 1 if remInCh > 1 */ + int32_t enable3 = XT_SALT(2, remInCh); /* Will be 1 if remInCh > 2 */ + + /* Aligned Vector Loads of coefficients: the pointer only steps to the next coefficient row when a matching leftover input byte exists (the step after row 1 is zeroed when remInCh is 1, the step after row 2 when remInCh is at most 2). At most 3 rows are needed, so the 4th coefficient operand of the multiply below is the constant 0. */ + xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1 * enable2); + xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1 * enable3); + xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1); + +#ifdef IVP_MULSUQA2N8XR8 + IVP_MULSUQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1); + IVP_MULSUQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2); + IVP_MULSUQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3); + IVP_MULSUQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4); +#else + IVP_MULQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1); + IVP_MULQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2); + IVP_MULQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3); + IVP_MULQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4); +#endif + } /* End Corner case handling */ + } /* End Kernel Height * Width */ + + if (outputFlag) /* Store to output Tile */ + { + /* Pack, Output Scale, Output Shift and clamping */ + xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L; + xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H; +#ifdef DILATED_VQ_CONV_PARTIAL + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut3L, dvecOut3H, daccSum3,
packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); +#else + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); +#endif + /* Store the output dvecOut1 along the output depth; the second store carries the high half and has a byte count of 0 unless typeFlag is set (S16 output) */ + pdvecOut = (xb_vec2Nx8 *) (pOut + outCh * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh); + IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Store the output dvecOut2 along the output depth; byte counts are multiplied by the column-enable flag so a disabled column requests 0 bytes */ + pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1 * enable2ndWidth) * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * enable2ndWidth); + IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * enable2ndWidth); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Store the output dvecOut3 along the output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1 * 2 * enable3rdWidth) * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * enable3rdWidth); + IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * enable3rdWidth); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /*
Store the output dvecOut4 along the output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1 * 3 * enable4thWidth) * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * \ + remainingOutCh * enable4thWidth); + IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * enable4thWidth); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + } + else /* Store to accumulator tile: the sums are written back as 32-bit values, four quarter-vectors (LL, LH, HL, HH) per column. NOTE(review): unlike the outTile stores above, these byte counts are not multiplied by the column-enable flags, so a disabled column is written to column 1's address; it was seeded and fed from column 1's addresses as well and should therefore hold the same sums - confirm this is intended. */ + { + xb_vecN_2x32v hvecAcc1LL = IVP_CVT32S2NX24LL(daccSum1); + xb_vecN_2x32v hvecAcc1LH = IVP_CVT32S2NX24LH(daccSum1); + xb_vecN_2x32v hvecAcc1HL = IVP_CVT32S2NX24HL(daccSum1); + xb_vecN_2x32v hvecAcc1HH = IVP_CVT32S2NX24HH(daccSum1); + + xb_vecN_2x32v hvecAcc2LL = IVP_CVT32S2NX24LL(daccSum2); + xb_vecN_2x32v hvecAcc2LH = IVP_CVT32S2NX24LH(daccSum2); + xb_vecN_2x32v hvecAcc2HL = IVP_CVT32S2NX24HL(daccSum2); + xb_vecN_2x32v hvecAcc2HH = IVP_CVT32S2NX24HH(daccSum2); + + xb_vecN_2x32v hvecAcc3LL = IVP_CVT32S2NX24LL(daccSum3); + xb_vecN_2x32v hvecAcc3LH = IVP_CVT32S2NX24LH(daccSum3); + xb_vecN_2x32v hvecAcc3HL = IVP_CVT32S2NX24HL(daccSum3); + xb_vecN_2x32v hvecAcc3HH = IVP_CVT32S2NX24HH(daccSum3); + + xb_vecN_2x32v hvecAcc4LL = IVP_CVT32S2NX24LL(daccSum4); + xb_vecN_2x32v hvecAcc4LH = IVP_CVT32S2NX24LH(daccSum4); + xb_vecN_2x32v hvecAcc4HL = IVP_CVT32S2NX24HL(daccSum4); + xb_vecN_2x32v hvecAcc4HH = IVP_CVT32S2NX24HH(daccSum4); + + + /* Store the hvecAcc1 along the accTile depth */ + phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh); + IVP_SAVN_2X32_XP(hvecAcc1LL, vaOutData, phvecAcc, 4 * remainingOutCh); + IVP_SAVN_2X32_XP(hvecAcc1LH, vaOutData, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAVN_2X32_XP(hvecAcc1HL, vaOutData, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAVN_2X32_XP(hvecAcc1HH, vaOutData, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAPOSN_2X32_FP(vaOutData, phvecAcc); + + /* Store the hvecAcc2 along the accTile depth */ + phvecAcc =
(xb_vecN_2x32v *) (pAcc + (outCh + accDataPitch1 * enable2ndWidth)); + IVP_SAVN_2X32_XP(hvecAcc2LL, vaOutData, phvecAcc, 4 * remainingOutCh); + IVP_SAVN_2X32_XP(hvecAcc2LH, vaOutData, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAVN_2X32_XP(hvecAcc2HL, vaOutData, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAVN_2X32_XP(hvecAcc2HH, vaOutData, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAPOSN_2X32_FP(vaOutData, phvecAcc); + + /* Store the hvecAcc3 along the accTile depth */ + phvecAcc = (xb_vecN_2x32v *) (pAcc + (outCh + accDataPitch1 * 2 * enable3rdWidth)); + IVP_SAVN_2X32_XP(hvecAcc3LL, vaOutData, phvecAcc, 4 * remainingOutCh); + IVP_SAVN_2X32_XP(hvecAcc3LH, vaOutData, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAVN_2X32_XP(hvecAcc3HL, vaOutData, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAVN_2X32_XP(hvecAcc3HH, vaOutData, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAPOSN_2X32_FP(vaOutData, phvecAcc); + + /* Store the hvecAcc4 along the accTile depth */ + phvecAcc = (xb_vecN_2x32v *) (pAcc + (outCh + accDataPitch1 * 3 * enable4thWidth)); + IVP_SAVN_2X32_XP(hvecAcc4LL, vaOutData, phvecAcc, 4 * remainingOutCh); + IVP_SAVN_2X32_XP(hvecAcc4LH, vaOutData, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAVN_2X32_XP(hvecAcc4HL, vaOutData, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAVN_2X32_XP(hvecAcc4HH, vaOutData, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAPOSN_2X32_FP(vaOutData, phvecAcc); + } + } /* End image width */ + } /* End output channels */ + } /* End image height */ +} + +/***************************************************************************** +* xaiPartialConvolved3D_S_MxN_U8S8IXCa2_noUnrollH_MOD_DWH \ +* xaiPartialConvolvedVQ3D_S_MxN_U8S8IXCa2_noUnrollH_MOD_DWH +* **************************************************************************/ + 
+/****************************************************************************/ +/* Description : P6 optimized generic implementation for MxN MOD_DWH */ +/* 3D convolution. Based on pre-processor specifiers. Code */ +/* implementation is generated during preprocessing stage. */ +/* This method can be used to generate MxN MOD_DWH 3D partial */ +/* dilated convolution function and MxN MOD_DWH 3D VQ partial */ +/* dilated convolution function */ +/* Stride values from 1 to 64 are accepted by the error checks */ +/* Implementation also supports dilation >= 1 for stride = 1 */ +/* and dilation = 1 for any stride greater than 1 */ +/* Inputs : Input Data Tile, Coeff Data Tile, Bias Array, */ +/* Output scale array (VQ build only), CNN convolution params structure */ +/* Outputs : XAI Error Code */ +/* InOuts : Accumulator Tile, Output Tile */ +/* Assumptions : InData are U8, CoeffData are S8 */ +/* biasArray is signed 32b, value not exceeding signed 24b */ +/* Output scale array is U16 */ +/* OutData is S8 / U8 / S16 */ +/* Kernel Size is MxNxDxNk. M and N sizes are less than or */ +/* equal to 64 (enforced by the kernel-size check). 
*/ +/* Input and Output are in DWH format */ +/* Coeff is in NDWH format */ +/* CoeffDim1Pitch is aligned to 2N (Ca2) */ +/* Accumulated value will be within 24bit range */ +/****************************************************************************/ +#ifdef DILATED_VQ_CONV_PARTIAL +XAI_ERR_TYPE xaiPartialConvolvedVQ3D_S_MxN_U8S8IXCa2_noUnrollH_MOD_DWH( + const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D accTile, + xai_pTile3D outTile, + const xai_cnn_conv_params *param + ) +#else +XAI_ERR_TYPE xaiPartialConvolved3D_S_MxN_U8S8IXCa2_noUnrollH_MOD_DWH( + const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D accTile, + xai_pTile3D outTile, + const xai_cnn_conv_params *param + ) +#endif +{ + /* Error Checks: argument validation, placed ahead of the dispatch logic */ + XAI_ERROR_CHECKS() + { + XAI_CHECK_TILE3D_U8(inTile); + XAI_CHECK_CONV_OUTPUT_TILE3D(outTile); + XAI_CHECK_TILE4D_S8(coeffTile); + XAI_CHECK_POINTER(biasArray); + XAI_CHECK_POINTER(param); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile); + XAI_CHECK_TILE4D_IN_DRAM_BOUNDARY(coeffTile); /* Next check: coeff DIM3 is reported as the kernel width and DIM4 as the kernel height; both must be <= 64 */ + XAI_CHECK_ERROR((XAI_TILE4D_GET_DIM3(coeffTile) <= 64) && (XAI_TILE4D_GET_DIM4(coeffTile) <= 64), XAI_ERR_KSIZE, \ + "\nKernel height = %d and width = %d\nKernel width and height should be less than or equal to 64", \ + XAI_TILE4D_GET_DIM4(coeffTile), XAI_TILE4D_GET_DIM3(coeffTile)); + XAI_CHECK_EDGES_MOD_DWH(inTile, coeffTile, param); /* Next checks: strides must lie in 1..64; dilation above 1 is accepted only together with stride 1 on the same axis */ + XAI_CHECK_ERROR(((XAI_CNN_CONV_GET_STRIDEX(param) > 0) && (XAI_CNN_CONV_GET_STRIDEY(param) > 0)) && \ + ((XAI_CNN_CONV_GET_STRIDEX(param) <= 64) && (XAI_CNN_CONV_GET_STRIDEY(param) <= 64)), XAI_ERR_BADARG, \ + "\nStrideX = %hhu, StrideY = %hhu\nStride along width and height should be greater than 0 and less than or equal to 64", \ + XAI_CNN_CONV_GET_STRIDEX(param), XAI_CNN_CONV_GET_STRIDEY(param)); + XAI_CHECK_ERROR((XAI_CNN_CONV_GET_DILATIONX(param) == 1) || \ + ((XAI_CNN_CONV_GET_DILATIONX(param) >= 1) && \ + 
(XAI_CNN_CONV_GET_STRIDEX(param) == 1)), XAI_ERR_BADARG, \ + "\nDilationX = %hhu\nDilationX should be 1. It can be greater than 1 only when strideX is equal to 1", \ + XAI_CNN_CONV_GET_DILATIONX(param)); + XAI_CHECK_ERROR((XAI_CNN_CONV_GET_DILATIONY(param) == 1) || \ + ((XAI_CNN_CONV_GET_DILATIONY(param) >= 1) && \ + (XAI_CNN_CONV_GET_STRIDEY(param) == 1)), XAI_ERR_BADARG, \ + "\nDilationY = %hhu\nDilationY should be 1. It can be greater than 1 only when strideY is equal to 1", \ + XAI_CNN_CONV_GET_DILATIONY(param)); + XAI_CHECK_TILE4D_IALIGNMENT_2NX8(coeffTile); + XAI_CHECK_TILE3D_DATA_ORDER(inTile, XAI_DWH); + XAI_CHECK_TILE4D_DATA_ORDER(coeffTile, XAI_NDWH); + XAI_CHECK_TILE3D_DATA_ORDER(outTile, XAI_DWH); + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_ACCUM_SHIFT(param) < 24, \ + XAI_ERR_NORM, "\nThe accumulator shift = %hhu, value should be less than 24", \ + XAI_CNN_CONV_GET_ACCUM_SHIFT(param)); + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_OUTPUT_SHIFT(param) < 32, \ + XAI_ERR_NORM, "\nThe output shift = %hhu, value should be less than 32", \ + XAI_CNN_CONV_GET_OUTPUT_SHIFT(param)); + XAI_CHECK_CONV_RELU_LIMITS_IX(param, outTile); +#ifdef DILATED_VQ_CONV_PARTIAL + XAI_CHECK_ARRAY_U16(outputScaleArray); /* VQ build: the scale array must hold at least one entry per kernel (coeff DIM1) */ + XAI_CHECK_ERROR(XAI_ARRAY_GET_WIDTH(outputScaleArray) >= XAI_TILE4D_GET_DIM1(coeffTile), XAI_ERR_DATASIZE, \ + "\nWidth of Output Scale Array = %d, Number of Kernels = %d\nWidth of Output Scale Array should be greater than or equal to Number of Kernels", \ + XAI_ARRAY_GET_WIDTH(outputScaleArray), XAI_TILE4D_GET_DIM1(coeffTile)); +#endif + + XAI_CHECK_CONSISTENCY_MOD_DWH(inTile, coeffTile, biasArray, outTile, param); + + if (XAI_CNN_CONV_GET_FLAG_INPUT(param)) + { + XAI_CHECK_ARRAY_S32(biasArray); /* the bias element type is only checked when the input flag is set */ + } + if (!(XAI_CNN_CONV_GET_FLAG_INPUT(param) && XAI_CNN_CONV_GET_FLAG_OUTPUT(param))) + { /* accTile is validated unless both the input and the output flag are set */ + XAI_CHECK_TILE3D_S32(accTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(accTile); + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, accTile); + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(coeffTile, accTile); + 
XAI_CHECK_TILE3D_DATA_ORDER(accTile, XAI_DWH); + XAI_CHECK_TILE3D_SIZE_EQ(accTile, outTile); + } + if (XAI_CNN_CONV_GET_FLAG_OUTPUT(param)) + { /* outTile is validated only when the output flag is set */ + XAI_CHECK_TILE3D(outTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile); + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTile); + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(coeffTile, outTile); + if (!(XAI_CNN_CONV_GET_FLAG_INPUT(param))) + { + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(accTile, outTile); + } + } + } +#ifndef DILATED_VQ_CONV_PARTIAL + if ((XAI_CNN_CONV_GET_OUTPUT_SCALE(param) == 0) && \ + XAI_CNN_CONV_GET_FLAG_OUTPUT(param)) + { /* Non-VQ shortcut: with an output scale of 0 and the output flag set, no convolution is run; outTile is filled with 0, clamped to the ReLU limits when ReLU is enabled */ + int32_t fillValue; + int32_t reluFlag = XAI_CNN_CONV_GET_FLAG_RELU(param); + fillValue = reluFlag ? (CLAMP(0, XAI_CNN_CONV_GET_RELU_MIN(param), XAI_CNN_CONV_GET_RELU_MAX(param))) : 0; + return(xaiFillTile3D(outTile, fillValue, 0)); + } +#endif + /* Call a further optimized function if dilationX = 1 and (input DIM1 equals its DIM1 pitch, i.e. no edges along depth, or kernelWidth = 1) */ + if ((XAI_CNN_CONV_GET_DILATIONX(param) == 1) && \ + ((XAI_TILE3D_GET_DIM1(inTile) == XAI_TILE3D_GET_DIM1_PITCH(inTile)) || \ + (XAI_TILE4D_GET_DIM3(coeffTile) == 1))) + { + if ((XAI_TILE3D_GET_DIM1(inTile) * XAI_TILE4D_GET_DIM3(coeffTile)) % 4 == 0) + { /* input depth * kernel width is a multiple of 4: use the _x4 variant */ +#ifdef DILATED_VQ_CONV_PARTIAL + partialConvolvedVQ3D_S_MxNd1_U8S8IXCa2_noUnrollH_MOD_DWH_contiguous_depth_x4(inTile, \ + coeffTile, biasArray, outputScaleArray, accTile, outTile, param); +#else + partialConvolved3D_S_MxNd1_U8S8IXCa2_noUnrollH_MOD_DWH_contiguous_depth_x4(inTile, \ + coeffTile, biasArray, accTile, outTile, param); +#endif + } + else + { /* otherwise use the contiguous-depth variant without the x4 requirement */ +#ifdef DILATED_VQ_CONV_PARTIAL + partialConvolvedVQ3D_S_MxNd1_U8S8IXCa2_noUnrollH_MOD_DWH_contiguous_depth(inTile, \ + coeffTile, biasArray, outputScaleArray, accTile, outTile, param); +#else + partialConvolved3D_S_MxNd1_U8S8IXCa2_noUnrollH_MOD_DWH_contiguous_depth(inTile, \ + coeffTile, biasArray, accTile, outTile, param); +#endif + } + } + else + { /* generic fallback for every other configuration */ +#ifdef DILATED_VQ_CONV_PARTIAL + partialConvolvedVQ3D_S_MxN_U8S8IXCa2_noUnrollH_MOD_DWH(inTile, \ + 
coeffTile, biasArray, outputScaleArray, accTile, outTile, param); +#else + partialConvolved3D_S_MxN_U8S8IXCa2_noUnrollH_MOD_DWH(inTile, \ + coeffTile, biasArray, accTile, outTile, param); +#endif + } + return(XAI_ERROR_STATUS()); +} + +/****************************************************************************/ +/* Description : P6 optimized generic implementation for MxN MOD_DWH */ +/* 3D convolution. Based on pre-processor specifiers. Code */ +/* implementation is generated during preprocessing stage. */ +/* This method can be used to generate MxN MOD_DWH 3D partial */ +/* dilated convolution function and MxN MOD_DWH 3D VQ partial */ +/* dilated convolution function */ +/* Stride values from 1 to 64 are accepted by the error checks */ +/* Implementation also supports dilation >= 1 for stride = 1 */ +/* and dilation = 1 for any stride greater than 1 */ +/* Inputs : Input Data Tile, Coeff Data Tile, Bias Array, */ +/* Output scale array (VQ build only), CNN convolution params structure */ +/* Outputs : XAI Error Code */ +/* InOuts : Accumulator Tile, Output Tile */ +/* Assumptions : InData, CoeffData are S8 */ +/* biasArray is signed 32b, value not exceeding signed 24b */ +/* Output scale array is U16 */ +/* OutData is S8 / U8 / S16 */ +/* Kernel Size is MxNxDxNk. M and N sizes are less than or */ +/* equal to 64 (enforced by the kernel-size check). 
*/ +/* Input and Output are in DWH format */ +/* Coeff is in NDWH format */ +/* CoeffDim1Pitch is aligned to 2N (Ca2) */ +/* Accumulated value will be within 32bit range */ +/****************************************************************************/ +#ifdef DILATED_VQ_CONV_PARTIAL +XAI_ERR_TYPE xaiPartialConvolvedVQ3D_S_MxN_S8S8IXCa2_MOD_DWH_QM32( + const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D accTile, + xai_pTile3D outTile, + const xai_cnn_conv_params *param + ) +#else +XAI_ERR_TYPE xaiPartialConvolved3D_S_MxN_S8S8IXCa2_MOD_DWH_QM32( + const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D accTile, + xai_pTile3D outTile, + const xai_cnn_conv_params *param + ) +#endif +{ + /* Error Checks: argument validation, placed ahead of the dispatch logic */ + XAI_ERROR_CHECKS() + { + XAI_CHECK_TILE3D_S8(inTile); + XAI_CHECK_CONV_OUTPUT_TILE3D(outTile); + XAI_CHECK_TILE4D_S8(coeffTile); + XAI_CHECK_POINTER(biasArray); + XAI_CHECK_POINTER(param); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile); + XAI_CHECK_TILE4D_IN_DRAM_BOUNDARY(coeffTile); /* Next check: coeff DIM3 is reported as the kernel width and DIM4 as the kernel height; both must be <= 64 */ + XAI_CHECK_ERROR((XAI_TILE4D_GET_DIM3(coeffTile) <= 64) && (XAI_TILE4D_GET_DIM4(coeffTile) <= 64), XAI_ERR_KSIZE, \ + "\nKernel height = %d and width = %d\nKernel width and height should be less than or equal to 64", \ + XAI_TILE4D_GET_DIM4(coeffTile), XAI_TILE4D_GET_DIM3(coeffTile)); + XAI_CHECK_EDGES_MOD_DWH(inTile, coeffTile, param); /* Next checks: strides must lie in 1..64; dilation above 1 is accepted only together with stride 1 on the same axis */ + XAI_CHECK_ERROR(((XAI_CNN_CONV_GET_STRIDEX(param) > 0) && (XAI_CNN_CONV_GET_STRIDEY(param) > 0)) && \ + ((XAI_CNN_CONV_GET_STRIDEX(param) <= 64) && (XAI_CNN_CONV_GET_STRIDEY(param) <= 64)), XAI_ERR_BADARG, \ + "\nStrideX = %hhu, StrideY = %hhu\nStride along width and height should be greater than 0 and less than or equal to 64", \ + XAI_CNN_CONV_GET_STRIDEX(param), XAI_CNN_CONV_GET_STRIDEY(param)); + XAI_CHECK_ERROR((XAI_CNN_CONV_GET_DILATIONX(param) == 1) || \ + ((XAI_CNN_CONV_GET_DILATIONX(param) >= 1) && \ + 
(XAI_CNN_CONV_GET_STRIDEX(param) == 1)), XAI_ERR_BADARG, \ + "\nDilationX = %hhu\nDilationX should be 1. It can be greater than 1 only when stride is equal to 1", \ + XAI_CNN_CONV_GET_DILATIONX(param)); + XAI_CHECK_ERROR((XAI_CNN_CONV_GET_DILATIONY(param) == 1) || \ + ((XAI_CNN_CONV_GET_DILATIONY(param) >= 1) && \ + (XAI_CNN_CONV_GET_STRIDEY(param) == 1)), XAI_ERR_BADARG, \ + "\nDilationY = %hhu\nDilationY should be 1. It can be greater than 1 only when stride is equal to 1", \ + XAI_CNN_CONV_GET_DILATIONY(param)); + XAI_CHECK_TILE4D_IALIGNMENT_2NX8(coeffTile); + XAI_CHECK_TILE3D_DATA_ORDER(inTile, XAI_DWH); + XAI_CHECK_TILE4D_DATA_ORDER(coeffTile, XAI_NDWH); + XAI_CHECK_TILE3D_DATA_ORDER(outTile, XAI_DWH); + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_ACCUM_SHIFT(param) < 24, \ + XAI_ERR_NORM, "\nThe accumulator shift = %hhu, value should be less than 24", \ + XAI_CNN_CONV_GET_ACCUM_SHIFT(param)); + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_OUTPUT_SHIFT(param) < 32, \ + XAI_ERR_NORM, "\nThe output shift = %hhu, value should be less than 32", \ + XAI_CNN_CONV_GET_OUTPUT_SHIFT(param)); + XAI_CHECK_CONV_RELU_LIMITS_IX(param, outTile); +#ifdef DILATED_VQ_CONV_PARTIAL + XAI_CHECK_ARRAY_U16(outputScaleArray); /* VQ build: the scale array must hold at least one entry per kernel (coeff DIM1) */ + XAI_CHECK_ERROR(XAI_ARRAY_GET_WIDTH(outputScaleArray) >= XAI_TILE4D_GET_DIM1(coeffTile), XAI_ERR_DATASIZE, \ + "\nWidth of Output Scale Array = %d, Number of Kernels = %d\nWidth of Output Scale Array should be greater than or equal to Number of Kernels", \ + XAI_ARRAY_GET_WIDTH(outputScaleArray), XAI_TILE4D_GET_DIM1(coeffTile)); +#endif + + XAI_CHECK_CONSISTENCY_MOD_DWH(inTile, coeffTile, biasArray, outTile, param); + + if (XAI_CNN_CONV_GET_FLAG_INPUT(param)) + { + XAI_CHECK_ARRAY_S32(biasArray); /* the bias element type is only checked when the input flag is set */ + } + if (!(XAI_CNN_CONV_GET_FLAG_INPUT(param) && XAI_CNN_CONV_GET_FLAG_OUTPUT(param))) + { /* accTile is validated unless both the input and the output flag are set */ + XAI_CHECK_TILE3D_S32(accTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(accTile); + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, accTile); + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(coeffTile, accTile); + 
XAI_CHECK_TILE3D_DATA_ORDER(accTile, XAI_DWH); + XAI_CHECK_TILE3D_SIZE_EQ(accTile, outTile); + } + if (XAI_CNN_CONV_GET_FLAG_OUTPUT(param)) + { /* outTile is validated only when the output flag is set */ + XAI_CHECK_TILE3D(outTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile); + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTile); + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(coeffTile, outTile); + if (!(XAI_CNN_CONV_GET_FLAG_INPUT(param))) + { + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(accTile, outTile); + } + } + } +#ifndef DILATED_VQ_CONV_PARTIAL + if ((XAI_CNN_CONV_GET_OUTPUT_SCALE(param) == 0) && \ + XAI_CNN_CONV_GET_FLAG_OUTPUT(param)) + { /* Non-VQ shortcut: with an output scale of 0 and the output flag set, no convolution is run; outTile is filled with 0, clamped to the ReLU limits when ReLU is enabled */ + int32_t fillValue; + int32_t reluFlag = XAI_CNN_CONV_GET_FLAG_RELU(param); + fillValue = reluFlag ? (CLAMP(0, XAI_CNN_CONV_GET_RELU_MIN(param), XAI_CNN_CONV_GET_RELU_MAX(param))) : 0; + return(xaiFillTile3D(outTile, fillValue, 0)); + } +#endif + + /* Call a further optimized function if dilationX = 1 and (input DIM1 equals its DIM1 pitch, i.e. no edges along depth, or kernelWidth = 1) */ + if ((XAI_CNN_CONV_GET_DILATIONX(param) == 1) && \ + ((XAI_TILE3D_GET_DIM1(inTile) == XAI_TILE3D_GET_DIM1_PITCH(inTile)) || \ + (XAI_TILE4D_GET_DIM3(coeffTile) == 1))) + { + if ((XAI_TILE3D_GET_DIM1(inTile) * XAI_TILE4D_GET_DIM3(coeffTile)) % 4 == 0) + { /* input depth * kernel width is a multiple of 4: use the _x4 variant */ +#ifdef DILATED_VQ_CONV_PARTIAL + partialConvolvedVQ3D_S_MxNd1_S8S8IXCa2_MOD_DWH_QM32_contiguous_depth_x4(inTile, \ + coeffTile, biasArray, outputScaleArray, accTile, outTile, param); +#else + partialConvolved3D_S_MxNd1_S8S8IXCa2_MOD_DWH_QM32_contiguous_depth_x4(inTile, \ + coeffTile, biasArray, accTile, outTile, param); +#endif + } + else + { /* otherwise use the contiguous-depth variant without the x4 requirement */ +#ifdef DILATED_VQ_CONV_PARTIAL + partialConvolvedVQ3D_S_MxNd1_S8S8IXCa2_MOD_DWH_QM32_contiguous_depth(inTile, \ + coeffTile, biasArray, outputScaleArray, accTile, outTile, param); +#else + partialConvolved3D_S_MxNd1_S8S8IXCa2_MOD_DWH_QM32_contiguous_depth(inTile, \ + coeffTile, biasArray, accTile, outTile, param); +#endif + } + } + else + { /* generic fallback for every other configuration */ +#ifdef DILATED_VQ_CONV_PARTIAL + partialConvolvedVQ3D_S_MxN_S8S8IXCa2_MOD_DWH_QM32(inTile, \ + coeffTile, biasArray, 
outputScaleArray, accTile, outTile, param); +#else + partialConvolved3D_S_MxN_S8S8IXCa2_MOD_DWH_QM32(inTile, \ + coeffTile, biasArray, accTile, outTile, param); +#endif + } + return(XAI_ERROR_STATUS()); +} + +/****************************************************************************/ +/* Description : P6 optimized implementation of 3D partial convolution. The kernel-width and input-channel loops are fused and consumed 4 bytes per step; four output columns are produced per pass. */ +/* Inputs : Input Data Tile, Coeff Data Tile, Bias Array, */ +/* Output scale array (VQ build only), CNN convolution params structure */ +/* InOuts : Accumulator Tile, Output Tile */ +/* Assumptions : InData, CoeffData are S8 */ +/* biasArray is signed 32b, value not exceeding signed 24b */ +/* OutData is S8 / U8 / S16 */ +/* Kernel Size is MxNxDxNk. M and N sizes are less than or */ +/* equal to 16. */ +/* Input and Output are in DWH format */ +/* Coeff is in NDWH format */ +/* CoeffDim1Pitch is aligned to 2N (Ca2) */ +/* Edges along Depth dimension in inTile and coeffTile */ +/* are zero. kWidth * numInCh is presumably a multiple of 4 (the inner loop steps by 4) - TODO confirm against callers. */ +/****************************************************************************/ + +#ifdef DILATED_VQ_CONV_PARTIAL +static _XAI_INLINE_ void partialConvolvedVQ3D_S_MxNd1_S8S8IXCa2_noUnrollH_MOD_DWH_contiguous_depth_x4( + const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D accTile, + xai_pTile3D outTile, + const xai_cnn_conv_params *param + ) +#else +static _XAI_INLINE_ void partialConvolved3D_S_MxNd1_S8S8IXCa2_noUnrollH_MOD_DWH_contiguous_depth_x4( + const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D accTile, + xai_pTile3D outTile, + const xai_cnn_conv_params *param + ) +#endif +{ + /* Getting parameters from the tile structures */ + const int32_t outW = XAI_TILE3D_GET_DIM2(outTile); + const int32_t outH = XAI_TILE3D_GET_DIM3(outTile); + const int32_t numInCh = XAI_TILE3D_GET_DIM1(inTile); + const int32_t numOutCh = XAI_TILE3D_GET_DIM1(outTile); + + /* Kernel Size (NDWH): width from DIM3, height from DIM4 */ + const int32_t kWidthU = XAI_TILE4D_GET_DIM3(coeffTile); + const 
int32_t kHeightU = XAI_TILE4D_GET_DIM4(coeffTile); + + /* CNN convolution parameters */ + const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param); + const uint8_t outShiftU = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param); + const uint8_t enableReLu = XAI_CNN_CONV_GET_FLAG_RELU(param); + const uint8_t strideX = XAI_CNN_CONV_GET_STRIDEX(param); + const uint8_t strideY = XAI_CNN_CONV_GET_STRIDEY(param); + const uint8_t dilationX = 1; /* this d1 variant fixes the dilation along width to 1 instead of reading it from param */ + const uint8_t dilationY = XAI_CNN_CONV_GET_DILATIONY(param); + const uint8_t leftEdgeFlag = XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param); + const uint8_t topEdgeFlag = XAI_CNN_CONV_GET_FLAG_TOPEDGE(param); + const uint8_t inputFlag = XAI_CNN_CONV_GET_FLAG_INPUT(param); + const uint8_t outputFlag = XAI_CNN_CONV_GET_FLAG_OUTPUT(param); + + /* Data Pointers of input, output, coefficient and bias data */ + int8_t *pInData = (int8_t *) XAI_TILE3D_GET_DATA_PTR(inTile); + int8_t *pOutData = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile); + int8_t *pCoeffData = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile); + int32_t *pBiasData = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray); + + int32_t * pAccData = NULL; /* accTile is only read when the input and output flags are not both set; otherwise this stays NULL */ + if (!(XAI_CNN_CONV_GET_FLAG_INPUT(param) && XAI_CNN_CONV_GET_FLAG_OUTPUT(param))) + { + pAccData = (int32_t *) XAI_TILE3D_GET_DATA_PTR(accTile); + } + +#ifdef DILATED_VQ_CONV_PARTIAL + uint16_t *pScale = (uint16_t *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray); +#else + const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param); +#endif + + /* Pitches of Coefficient Data (NDWH) in dim1 and dim3 (the dim2 pitch is not used here) */ + const int32_t coeffPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTile); + const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile); + + /* Pitches of Input Data (DWH) in dim1 and dim2 */ + const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile); + const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile); + + /* Pitch of Output Data (DWH) in dim1 and dim2 */ + const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile); + const 
int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile); + + /* Pitch of AccTile Data (DWH) in dim1 and dim2; left at 0 when accTile is not used */ + int32_t accDataPitch1 = 0; + int32_t accDataPitch2 = 0; + if (!(XAI_CNN_CONV_GET_FLAG_INPUT(param) && XAI_CNN_CONV_GET_FLAG_OUTPUT(param))) + { + accDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(accTile); + accDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(accTile); + } + + int32_t numIter = kWidthU * numInCh; /* trip count of the fused (kernel width x input channel) loop, stepped by 4 below */ + + int32_t dilatedKWidthU = dilationX * (kWidthU - 1) + 1; + int32_t dilatedKHeightU = dilationY * (kHeightU - 1) + 1; + int32_t leftEdge, topEdge; /* odd dilated extent: edge = extent / 2; even extent: the edge flag selects extent / 2 or extent / 2 - 1 */ + if ((dilatedKWidthU % 2) != 0) + { + leftEdge = dilatedKWidthU / 2; + } + else + { + leftEdge = leftEdgeFlag ? (dilatedKWidthU / 2) : ((dilatedKWidthU / 2) - 1); + } + + if ((dilatedKHeightU % 2) != 0) + { + topEdge = dilatedKHeightU / 2; + } + else + { + topEdge = topEdgeFlag ? (dilatedKHeightU / 2) : ((dilatedKHeightU / 2) - 1); + } + + + /* Move pointer to the start of the data (including edge) */ + pInData = &pInData[-((leftEdge) * inDataPitch1 + (topEdge) * inDataPitch2)]; + + + /* Setting the limits for output data according to ReLu Flag and outTileType */ + int32_t minLim, maxLim; + if (enableReLu) + { + minLim = XAI_CNN_CONV_GET_RELU_MIN(param); + maxLim = XAI_CNN_CONV_GET_RELU_MAX(param); + } + else + { /* no ReLU: clamp to the full range of the output element type (S16, S8, otherwise 0..UCHAR_MAX) */ + minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \ + SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0); + maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \ + : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX); + } + const int8_t typeFlag = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 
1 : 0; /* typeFlag is 1 only for S16 output */ + const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile); + + /* Variable Declarations */ + int32_t outCh, x, y, ky, k; + valign vaOutData = IVP_ZALIGN(); + + xb_vecN_2x32v* restrict phvecBias; + xb_vec2Nx8* restrict pdvecCoeff; + xb_vec2Nx8* restrict pdvecData1; + xb_vec2Nx8* restrict pdvecData2; + xb_vec2Nx8* restrict pdvecData3; + xb_vec2Nx8* restrict pdvecData4; + xb_vec2Nx8* restrict pdvecOut; + xb_vecN_2x32v* restrict phvecAcc; + + /* + * inCh and kWidth loops are combined. Assumed that the + * edges along Depth dimension of input data is zero and also + * edges along depth dimension of coefficient data is zero. + */ + + /* Loops Start: nesting order is image height -> output channels -> image width */ + for (y = 0; y < outH; y++) /* Image Height */ + { /* walk down the rows */ + for (outCh = 0; outCh < numOutCh; outCh += 2 * XCHAL_IVPN_SIMD_WIDTH) + { /* walk across the kernels */ + /* To handle corner case when number of output channels + * is not a multiple of 2 * XCHAL_IVPN_SIMD_WIDTH*/ + int32_t remainingOutCh = numOutCh - outCh; +#ifdef DILATED_VQ_CONV_PARTIAL + xb_vecNx16U outScaleDataEven, outScaleDataOdd; + /*Load output scale values*/ + xb_vecNx16U* restrict pOutScaleData = (xb_vecNx16U *) (pScale + outCh); + VQ_INIT_OUTSCALE(pOutScaleData, remainingOutCh, outScaleDataEven, outScaleDataOdd); +#endif + for (x = 0; x < outW; x += 4) /* Image Width */ + { /* walk across the columns, four output columns per pass */ + int32_t enable2ndWidth = XT_SALT(1, outW - x); /* Will be 1 if (outW - x) > 1 */ + int32_t enable3rdWidth = XT_SALT(2, outW - x); /* Will be 1 if (outW - x) > 2 */ + int32_t enable4thWidth = XT_SALT(3, outW - x); /* Will be 1 if (outW - x) > 3 */ + /* Output Data pointer */ + int8_t *pOut = pOutData + (x * outDataPitch1 + y * outDataPitch2) * bytesPerPixel; + int32_t *pAcc = pAccData + (x * accDataPitch1 + y * accDataPitch2); + + /* Initialize accumulators */ + xb_vec2Nx24 daccSum1, daccSum2, daccSum3, daccSum4; + if (inputFlag) /* Bias Values */ + { + phvecBias = (xb_vecN_2x32v *) (pBiasData + outCh); + ACC_INIT_BIAS(phvecBias, remainingOutCh, daccSum1, daccSum2, daccSum3, daccSum4); + } + else /* Accumulator 
tile*/ + { + xb_vecN_2x32v hvecAcc1LL, hvecAcc1LH, hvecAcc1HL, hvecAcc1HH; + xb_vecN_2x32v hvecAcc2LL, hvecAcc2LH, hvecAcc2HL, hvecAcc2HH; + xb_vecN_2x32v hvecAcc3LL, hvecAcc3LH, hvecAcc3HL, hvecAcc3HH; + xb_vecN_2x32v hvecAcc4LL, hvecAcc4LH, hvecAcc4HL, hvecAcc4HH; + + phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh); + valign vaAcc = IVP_LAN_2X32_PP(phvecAcc); + IVP_LAVN_2X32_XP(hvecAcc1LL, vaAcc, phvecAcc, 4 * remainingOutCh); + IVP_LAVN_2X32_XP(hvecAcc1LH, vaAcc, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH); + IVP_LAVN_2X32_XP(hvecAcc1HL, vaAcc, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH); + IVP_LAVN_2X32_XP(hvecAcc1HH, vaAcc, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH); + daccSum1 = IVP_CVT24UNX32L(hvecAcc1LH, hvecAcc1LL); + IVP_CVT24UNX32H(daccSum1, hvecAcc1HH, hvecAcc1HL); + + phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh + accDataPitch1 * enable2ndWidth); + vaAcc = IVP_LAN_2X32_PP(phvecAcc); + IVP_LAVN_2X32_XP(hvecAcc2LL, vaAcc, phvecAcc, 4 * remainingOutCh); + IVP_LAVN_2X32_XP(hvecAcc2LH, vaAcc, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH); + IVP_LAVN_2X32_XP(hvecAcc2HL, vaAcc, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH); + IVP_LAVN_2X32_XP(hvecAcc2HH, vaAcc, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH); + daccSum2 = IVP_CVT24UNX32L(hvecAcc2LH, hvecAcc2LL); + IVP_CVT24UNX32H(daccSum2, hvecAcc2HH, hvecAcc2HL); + + phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh + accDataPitch1 * 2 * enable3rdWidth); + vaAcc = IVP_LAN_2X32_PP(phvecAcc); + IVP_LAVN_2X32_XP(hvecAcc3LL, vaAcc, phvecAcc, 4 * remainingOutCh); + IVP_LAVN_2X32_XP(hvecAcc3LH, vaAcc, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH); + IVP_LAVN_2X32_XP(hvecAcc3HL, vaAcc, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH); + IVP_LAVN_2X32_XP(hvecAcc3HH, vaAcc, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH); + daccSum3 = IVP_CVT24UNX32L(hvecAcc3LH, hvecAcc3LL); + IVP_CVT24UNX32H(daccSum3, hvecAcc3HH, 
hvecAcc3HL); + + phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh + accDataPitch1 * 3 * enable4thWidth); + vaAcc = IVP_LAN_2X32_PP(phvecAcc); + IVP_LAVN_2X32_XP(hvecAcc4LL, vaAcc, phvecAcc, 4 * remainingOutCh); + IVP_LAVN_2X32_XP(hvecAcc4LH, vaAcc, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH); + IVP_LAVN_2X32_XP(hvecAcc4HL, vaAcc, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH); + IVP_LAVN_2X32_XP(hvecAcc4HH, vaAcc, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH); + daccSum4 = IVP_CVT24UNX32L(hvecAcc4LH, hvecAcc4LL); + IVP_CVT24UNX32H(daccSum4, hvecAcc4HH, hvecAcc4HL); + } + + /* Input Data and Coeff Data Pointers */ + int8_t *pData = pInData + x * strideX * inDataPitch1 + y * strideY * inDataPitch2; + int8_t *pCoeff = pCoeffData + outCh; + + + for (ky = 0; ky < kHeightU; ky++) /* Kernel Height */ + { + /* Pointers for Input Data Loads; a disabled column (enable flag 0) aliases the first column's address */ + pdvecData1 = (xb_vec2Nx8 *) (pData + ky * inDataPitch2 * dilationY); + pdvecData2 = (xb_vec2Nx8 *) (pData + ky * inDataPitch2 * dilationY + strideX * inDataPitch1 * enable2ndWidth); + pdvecData3 = (xb_vec2Nx8 *) (pData + ky * inDataPitch2 * dilationY + strideX * inDataPitch1 * 2 * enable3rdWidth); + pdvecData4 = (xb_vec2Nx8 *) (pData + ky * inDataPitch2 * dilationY + strideX * inDataPitch1 * 3 * enable4thWidth); + + /* Pointer for Coefficient Load */ + pdvecCoeff = (xb_vec2Nx8 *) (pCoeff + ky * coeffPitch3); + + /* Primes for Aligning Load */ + valign vaData1 = IVP_LA2NX8_PP(pdvecData1); + valign vaData2 = IVP_LA2NX8_PP(pdvecData2); + valign vaData3 = IVP_LA2NX8_PP(pdvecData3); + valign vaData4 = IVP_LA2NX8_PP(pdvecData4); + +#ifdef __XCC__ +#pragma loop_count min=1 +#endif + for (k = 0; k < numIter; k += 4) /* (Input Channels * kWidth) loops combined */ + { + /* Aligning variable vector load of pixels: 4 bytes per output column per iteration */ + xb_vec2Nx8 dvecData1; IVP_LAV2NX8_XP(dvecData1, vaData1, pdvecData1, 4); + xb_vec2Nx8 dvecData2; IVP_LAV2NX8_XP(dvecData2, vaData2, pdvecData2, 4); + xb_vec2Nx8 dvecData3; IVP_LAV2NX8_XP(dvecData3, 
vaData3, pdvecData3, 4); + xb_vec2Nx8 dvecData4; IVP_LAV2NX8_XP(dvecData4, vaData4, pdvecData4, 4); + + /* Extracting first 4 bytes of vector into address register */ + /* Scalar integers to be used for QMUL */ + int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData1)), 0); + int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData2)), 0); + int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData3)), 0); + int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData4)), 0); + + /* Aligned Vector Loads of coefficients: four loads, each advancing by coeffPitch1 */ + xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1); + xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1); + xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1); + xb_vec2Nx8 dvecCoeff4; IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch1); + + + IVP_MULQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1); + IVP_MULQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2); + IVP_MULQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3); + IVP_MULQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4); + } /* End Input Channels */ + } /* End Kernel Height * Width */ + + if (outputFlag) /* Store to output Tile*/ + { + /* Pack, Output Scale, Output Shift and clamping */ + xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L; + xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H; +#ifdef DILATED_VQ_CONV_PARTIAL + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); + 
PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); +#else + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); +#endif + /* Store the output dvecOut1 along the output depth; the high-half store count is scaled by typeFlag, so it is 0 unless the output is S16 */ + pdvecOut = (xb_vec2Nx8 *) (pOut + outCh * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh); + IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Store the output dvecOut2 along the output depth; byte counts are multiplied by the enable flag, so a disabled column requests 0 bytes */ + pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1 * enable2ndWidth) * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * enable2ndWidth); + IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * enable2ndWidth); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Store the output dvecOut3 along the output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1 * 2 * enable3rdWidth) * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * enable3rdWidth); + IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * 
enable3rdWidth); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Store the output dvecOut4 along the output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1 * 3 * enable4thWidth) * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * \ + remainingOutCh * enable4thWidth); + IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * enable4thWidth); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + } + else /* Store to accumulator tile*/ + { + xb_vecN_2x32v hvecAcc1LL = IVP_CVT32S2NX24LL(daccSum1); + xb_vecN_2x32v hvecAcc1LH = IVP_CVT32S2NX24LH(daccSum1); + xb_vecN_2x32v hvecAcc1HL = IVP_CVT32S2NX24HL(daccSum1); + xb_vecN_2x32v hvecAcc1HH = IVP_CVT32S2NX24HH(daccSum1); + + xb_vecN_2x32v hvecAcc2LL = IVP_CVT32S2NX24LL(daccSum2); + xb_vecN_2x32v hvecAcc2LH = IVP_CVT32S2NX24LH(daccSum2); + xb_vecN_2x32v hvecAcc2HL = IVP_CVT32S2NX24HL(daccSum2); + xb_vecN_2x32v hvecAcc2HH = IVP_CVT32S2NX24HH(daccSum2); + + xb_vecN_2x32v hvecAcc3LL = IVP_CVT32S2NX24LL(daccSum3); + xb_vecN_2x32v hvecAcc3LH = IVP_CVT32S2NX24LH(daccSum3); + xb_vecN_2x32v hvecAcc3HL = IVP_CVT32S2NX24HL(daccSum3); + xb_vecN_2x32v hvecAcc3HH = IVP_CVT32S2NX24HH(daccSum3); + + xb_vecN_2x32v hvecAcc4LL = IVP_CVT32S2NX24LL(daccSum4); + xb_vecN_2x32v hvecAcc4LH = IVP_CVT32S2NX24LH(daccSum4); + xb_vecN_2x32v hvecAcc4HL = IVP_CVT32S2NX24HL(daccSum4); + xb_vecN_2x32v hvecAcc4HH = IVP_CVT32S2NX24HH(daccSum4); + + + /* Store the hvecAcc1 along the accTile depth */ + phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh); + IVP_SAVN_2X32_XP(hvecAcc1LL, vaOutData, phvecAcc, 4 * remainingOutCh); + IVP_SAVN_2X32_XP(hvecAcc1LH, vaOutData, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAVN_2X32_XP(hvecAcc1HL, vaOutData, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAVN_2X32_XP(hvecAcc1HH, vaOutData, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAPOSN_2X32_FP(vaOutData, phvecAcc); + + /* 
Store the hvecAcc2 along the accTile depth; NOTE(review): unlike the output-tile stores, these byte counts are not multiplied by the enable flag, so a disabled column is written at the first column's address - presumably with identical values, confirm */ + phvecAcc = (xb_vecN_2x32v *) (pAcc + (outCh + accDataPitch1 * enable2ndWidth)); + IVP_SAVN_2X32_XP(hvecAcc2LL, vaOutData, phvecAcc, 4 * remainingOutCh); + IVP_SAVN_2X32_XP(hvecAcc2LH, vaOutData, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAVN_2X32_XP(hvecAcc2HL, vaOutData, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAVN_2X32_XP(hvecAcc2HH, vaOutData, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAPOSN_2X32_FP(vaOutData, phvecAcc); + + /* Store the hvecAcc3 along the accTile depth */ + phvecAcc = (xb_vecN_2x32v *) (pAcc + (outCh + accDataPitch1 * 2 * enable3rdWidth)); + IVP_SAVN_2X32_XP(hvecAcc3LL, vaOutData, phvecAcc, 4 * remainingOutCh); + IVP_SAVN_2X32_XP(hvecAcc3LH, vaOutData, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAVN_2X32_XP(hvecAcc3HL, vaOutData, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAVN_2X32_XP(hvecAcc3HH, vaOutData, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAPOSN_2X32_FP(vaOutData, phvecAcc); + + /* Store the hvecAcc4 along the accTile depth */ + phvecAcc = (xb_vecN_2x32v *) (pAcc + (outCh + accDataPitch1 * 3 * enable4thWidth)); + IVP_SAVN_2X32_XP(hvecAcc4LL, vaOutData, phvecAcc, 4 * remainingOutCh); + IVP_SAVN_2X32_XP(hvecAcc4LH, vaOutData, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAVN_2X32_XP(hvecAcc4HL, vaOutData, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAVN_2X32_XP(hvecAcc4HH, vaOutData, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAPOSN_2X32_FP(vaOutData, phvecAcc); + } + } /* End image width */ + } /* End Output Channels */ + } /* End image height */ +} + +/****************************************************************************/ +/* Description : P6 optimized implementation of 3D partial convolution */ +/* Inputs : Input Data Tile, Coeff Data Tile, Bias Array, */ +/* CNN convolution params 
structure */ +/* InOuts : Accumulator Tile (read when the INPUT flag is 0, written when the OUTPUT flag is 0), Output Tile (written when the OUTPUT flag is set) */ +/* Assumptions : InData, CoeffData are S8 */ +/* biasArray is signed 32b, value not exceeding signed 24b */ +/* OutData is S8 / U8 / S16 */ +/* Kernel Size is MxNxDxNk. M and N sizes are less than or */ +/* equal to 16. */ +/* Input and Output are in DWH format */ +/* Coeff is in NDWH format */ +/* CoeffDim1Pitch is aligned to 2N (Ca2) */ +/* Edges along Depth dimension in inTile and coeffTile */ +/* are zero. */ +/****************************************************************************/ + +#ifdef DILATED_VQ_CONV_PARTIAL +static _XAI_INLINE_ void partialConvolvedVQ3D_S_MxNd1_S8S8IXCa2_noUnrollH_MOD_DWH_contiguous_depth( + const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D accTile, + xai_pTile3D outTile, + const xai_cnn_conv_params *param + ) +#else +static _XAI_INLINE_ void partialConvolved3D_S_MxNd1_S8S8IXCa2_noUnrollH_MOD_DWH_contiguous_depth( + const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D accTile, + xai_pTile3D outTile, + const xai_cnn_conv_params *param + ) +#endif +{ + /* Getting parameters from the tile structures */ + const int32_t outW = XAI_TILE3D_GET_DIM2(outTile); + const int32_t outH = XAI_TILE3D_GET_DIM3(outTile); + const int32_t numInCh = XAI_TILE3D_GET_DIM1(inTile); + const int32_t numOutCh = XAI_TILE3D_GET_DIM1(outTile); + + /* Kernel Size (NDWH) */ + const int32_t kWidthU = XAI_TILE4D_GET_DIM3(coeffTile); + const int32_t kHeightU = XAI_TILE4D_GET_DIM4(coeffTile); + + /* CNN convolution parameters */ + const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param); + const uint8_t outShiftU = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param); + const uint8_t enableReLu = XAI_CNN_CONV_GET_FLAG_RELU(param); + const uint8_t strideX = XAI_CNN_CONV_GET_STRIDEX(param); + const uint8_t strideY = XAI_CNN_CONV_GET_STRIDEY(param); + const uint8_t dilationX = 1; /* X dilation is fixed to 1 in this variant; the DILATIONX field of param is not read */ + const uint8_t
dilationY = XAI_CNN_CONV_GET_DILATIONY(param); + const uint8_t leftEdgeFlag = XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param); + const uint8_t topEdgeFlag = XAI_CNN_CONV_GET_FLAG_TOPEDGE(param); + const uint8_t inputFlag = XAI_CNN_CONV_GET_FLAG_INPUT(param); + const uint8_t outputFlag = XAI_CNN_CONV_GET_FLAG_OUTPUT(param); + + /* Data Pointers of input, output, coefficient and bias data */ + int8_t *pInData = (int8_t *) XAI_TILE3D_GET_DATA_PTR(inTile); + int8_t *pOutData = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile); + int8_t *pCoeffData = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile); + int32_t *pBiasData = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray); + + int32_t * pAccData = NULL; /* accTile is dereferenced only when the INPUT and OUTPUT flags are not both set */ + if (!(XAI_CNN_CONV_GET_FLAG_INPUT(param) && XAI_CNN_CONV_GET_FLAG_OUTPUT(param))) + { + pAccData = (int32_t *) XAI_TILE3D_GET_DATA_PTR(accTile); + } + +#ifdef DILATED_VQ_CONV_PARTIAL + uint16_t *pScale = (uint16_t *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray); +#else + const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param); +#endif + + /* Pitches of Coefficient Data (NDWH) in dim1 and dim3 (the dim2 pitch is not used by this variant) */ + const int32_t coeffPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTile); + const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile); + + /* Pitches of Input Data (DWH) in dim1 and dim2 */ + const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile); + const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile); + + /* Pitch of Output Data (DWH) in dim1 and dim2 */ + const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile); + const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile); + + /* Pitch of AccTile Data (DWH) in dim1 and dim2 */ + int32_t accDataPitch1 = 0; + int32_t accDataPitch2 = 0; + if (!(XAI_CNN_CONV_GET_FLAG_INPUT(param) && XAI_CNN_CONV_GET_FLAG_OUTPUT(param))) + { + accDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(accTile); + accDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(accTile); + } + + int32_t numIter = kWidthU * numInCh; /* trip count of the merged (kWidth * input channel) inner loop */ + + int32_t dilatedKWidthU = dilationX
* (kWidthU - 1) + 1; + int32_t dilatedKHeightU = dilationY * (kHeightU - 1) + 1; + int32_t leftEdge, topEdge; /* odd dilated size: edge = size / 2; even size: the edge flag selects size / 2 or size / 2 - 1 */ + if ((dilatedKWidthU % 2) != 0) + { + leftEdge = dilatedKWidthU / 2; + } + else + { + leftEdge = leftEdgeFlag ? (dilatedKWidthU / 2) : ((dilatedKWidthU / 2) - 1); + } + + if ((dilatedKHeightU % 2) != 0) + { + topEdge = dilatedKHeightU / 2; + } + else + { + topEdge = topEdgeFlag ? (dilatedKHeightU / 2) : ((dilatedKHeightU / 2) - 1); + } + + + /* Move pointer to the start of the data (including edge) */ + pInData = &pInData[-((leftEdge) * inDataPitch1 + (topEdge) * inDataPitch2)]; + + + /* Setting the limits for output data according to ReLu Flag and outTileType */ + int32_t minLim, maxLim; + if (enableReLu) + { + minLim = XAI_CNN_CONV_GET_RELU_MIN(param); + maxLim = XAI_CNN_CONV_GET_RELU_MAX(param); + } + else + { + minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \ + SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0); + maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \ + : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX); + } + const int8_t typeFlag = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0; + const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile); + + /* Variable Declarations */ + int32_t outCh, x, y, ky, k; + valign vaOutData = IVP_ZALIGN(); + + xb_vecN_2x32v* restrict phvecBias; + xb_vec2Nx8* restrict pdvecCoeff; + xb_vec2Nx8* restrict pdvecData1; + xb_vec2Nx8* restrict pdvecData2; + xb_vec2Nx8* restrict pdvecData3; + xb_vec2Nx8* restrict pdvecData4; + xb_vec2Nx8* restrict pdvecOut; + xb_vecN_2x32v* restrict phvecAcc; + + /* + * inCh and kWidth loops are combined. Assumed that the + * edges along Depth dimension of input data is zero and also + * edges along depth dimension of coefficient data is zero.
+ */ + + /* Loops Start */ + for (y = 0; y < outH; y++) /* Image Height */ + { /* walk down the rows */ + for (outCh = 0; outCh < numOutCh; outCh += 2 * XCHAL_IVPN_SIMD_WIDTH) + { /* walk across the kernels */ + /* To handle corner case when number of output channels + * is not a multiple of 2 * XCHAL_IVPN_SIMD_WIDTH*/ + int32_t remainingOutCh = numOutCh - outCh; +#ifdef DILATED_VQ_CONV_PARTIAL + xb_vecNx16U outScaleDataEven, outScaleDataOdd; + /*Load output scale values*/ + xb_vecNx16U* restrict pOutScaleData = (xb_vecNx16U *) (pScale + outCh); + VQ_INIT_OUTSCALE(pOutScaleData, remainingOutCh, outScaleDataEven, outScaleDataOdd); +#endif + + for (x = 0; x < outW; x += 4) /* Image Width */ + { /* walk across the columns */ + /* Flags for the corner case when outW is not a multiple of 4: each is 1 only when outW - x is greater than 1 / 2 / 3, i.e. the 2nd / 3rd / 4th column of this group exists; when 0 the column's pointers fall back to the 1st column */ + int32_t enable2ndWidth = XT_SALT(1, outW - x); + int32_t enable3rdWidth = XT_SALT(2, outW - x); + int32_t enable4thWidth = XT_SALT(3, outW - x); + + /* Output Data pointer */ + int8_t *pOut = pOutData + (x * outDataPitch1 + y * outDataPitch2) * bytesPerPixel; + int32_t *pAcc = pAccData + (x * accDataPitch1 + y * accDataPitch2); + + /* Initialize accumulators */ + xb_vec2Nx24 daccSum1, daccSum2, daccSum3, daccSum4; + if (inputFlag) /* Bias Values */ + { + phvecBias = (xb_vecN_2x32v *) (pBiasData + outCh); + ACC_INIT_BIAS(phvecBias, remainingOutCh, daccSum1, daccSum2, daccSum3, daccSum4); + } + else /* Accumulator tile*/ + { + xb_vecN_2x32v hvecAcc1LL, hvecAcc1LH, hvecAcc1HL, hvecAcc1HH; + xb_vecN_2x32v hvecAcc2LL, hvecAcc2LH, hvecAcc2HL, hvecAcc2HH; + xb_vecN_2x32v hvecAcc3LL, hvecAcc3LH, hvecAcc3HL, hvecAcc3HH; + xb_vecN_2x32v hvecAcc4LL, hvecAcc4LH, hvecAcc4HL, hvecAcc4HH; + + phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh); + valign vaAcc = IVP_LAN_2X32_PP(phvecAcc); + IVP_LAVN_2X32_XP(hvecAcc1LL, vaAcc, phvecAcc, 4 * remainingOutCh); + IVP_LAVN_2X32_XP(hvecAcc1LH, vaAcc, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH); + IVP_LAVN_2X32_XP(hvecAcc1HL, vaAcc, phvecAcc, 4 *
remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH); + IVP_LAVN_2X32_XP(hvecAcc1HH, vaAcc, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH); + daccSum1 = IVP_CVT24UNX32L(hvecAcc1LH, hvecAcc1LL); + IVP_CVT24UNX32H(daccSum1, hvecAcc1HH, hvecAcc1HL); + + phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh + accDataPitch1 * enable2ndWidth); + vaAcc = IVP_LAN_2X32_PP(phvecAcc); + IVP_LAVN_2X32_XP(hvecAcc2LL, vaAcc, phvecAcc, 4 * remainingOutCh); + IVP_LAVN_2X32_XP(hvecAcc2LH, vaAcc, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH); + IVP_LAVN_2X32_XP(hvecAcc2HL, vaAcc, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH); + IVP_LAVN_2X32_XP(hvecAcc2HH, vaAcc, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH); + daccSum2 = IVP_CVT24UNX32L(hvecAcc2LH, hvecAcc2LL); + IVP_CVT24UNX32H(daccSum2, hvecAcc2HH, hvecAcc2HL); + + phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh + accDataPitch1 * 2 * enable3rdWidth); + vaAcc = IVP_LAN_2X32_PP(phvecAcc); + IVP_LAVN_2X32_XP(hvecAcc3LL, vaAcc, phvecAcc, 4 * remainingOutCh); + IVP_LAVN_2X32_XP(hvecAcc3LH, vaAcc, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH); + IVP_LAVN_2X32_XP(hvecAcc3HL, vaAcc, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH); + IVP_LAVN_2X32_XP(hvecAcc3HH, vaAcc, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH); + daccSum3 = IVP_CVT24UNX32L(hvecAcc3LH, hvecAcc3LL); + IVP_CVT24UNX32H(daccSum3, hvecAcc3HH, hvecAcc3HL); + + phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh + accDataPitch1 * 3 * enable4thWidth); + vaAcc = IVP_LAN_2X32_PP(phvecAcc); + IVP_LAVN_2X32_XP(hvecAcc4LL, vaAcc, phvecAcc, 4 * remainingOutCh); + IVP_LAVN_2X32_XP(hvecAcc4LH, vaAcc, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH); + IVP_LAVN_2X32_XP(hvecAcc4HL, vaAcc, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH); + IVP_LAVN_2X32_XP(hvecAcc4HH, vaAcc, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH); + daccSum4 = IVP_CVT24UNX32L(hvecAcc4LH, hvecAcc4LL); + IVP_CVT24UNX32H(daccSum4,
hvecAcc4HH, hvecAcc4HL); + } + + /* Input Data and Coeff Data Pointers */ + int8_t *pData = pInData + x * strideX * inDataPitch1 + y * strideY * inDataPitch2; + int8_t *pCoeff = pCoeffData + outCh; + +#ifdef __XCC__ +#pragma loop_count min=1 +#endif + for (ky = 0; ky < kHeightU; ky++) /* Kernel Height */ + { + /* Pointers for Input Data Loads */ + pdvecData1 = (xb_vec2Nx8 *) (pData + ky * inDataPitch2 * dilationY); + pdvecData2 = (xb_vec2Nx8 *) (pData + ky * inDataPitch2 * dilationY + strideX * inDataPitch1 * enable2ndWidth); + pdvecData3 = (xb_vec2Nx8 *) (pData + ky * inDataPitch2 * dilationY + strideX * inDataPitch1 * 2 * enable3rdWidth); + pdvecData4 = (xb_vec2Nx8 *) (pData + ky * inDataPitch2 * dilationY + strideX * inDataPitch1 * 3 * enable4thWidth); + + /* Pointer for Coefficient Load */ + pdvecCoeff = (xb_vec2Nx8 *) (pCoeff + ky * coeffPitch3); + + /* Primes for Aligning Load */ + valign vaData1 = IVP_LA2NX8_PP(pdvecData1); + valign vaData2 = IVP_LA2NX8_PP(pdvecData2); + valign vaData3 = IVP_LA2NX8_PP(pdvecData3); + valign vaData4 = IVP_LA2NX8_PP(pdvecData4); + + for (k = 0; k < numIter - 3; k += 4) /* (Input Channels * kWidth) loops combined */ + { + /* Aligning variable vector load of pixels */ + xb_vec2Nx8 dvecData1; IVP_LAV2NX8_XP(dvecData1, vaData1, pdvecData1, 4); + xb_vec2Nx8 dvecData2; IVP_LAV2NX8_XP(dvecData2, vaData2, pdvecData2, 4); + xb_vec2Nx8 dvecData3; IVP_LAV2NX8_XP(dvecData3, vaData3, pdvecData3, 4); + xb_vec2Nx8 dvecData4; IVP_LAV2NX8_XP(dvecData4, vaData4, pdvecData4, 4); + + /* Extracting first 4 bytes of vector into address register */ + /* Scalar integers to be used for QMUL */ + int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData1)), 0); + int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData2)), 0); + int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData3)), 0); + int32_t qmulScalar4 =
IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData4)), 0); + + /* Aligned Vector Loads of coefficients */ + xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1); + xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1); + xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1); + xb_vec2Nx8 dvecCoeff4; IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch1); + + + IVP_MULQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1); + IVP_MULQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2); + IVP_MULQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3); + IVP_MULQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4); + } /* End (Input Channels * kWidth) combined loop */ + /* Corner case handling when numIter is not a multiple of 4 (1 to 3 taps remain) */ + if (k < numIter) + { + int32_t remInCh = numIter - k; + + /* Aligning variable vector load of pixels */ + xb_vec2Nx8 dvecData1; IVP_LAV2NX8_XP(dvecData1, vaData1, pdvecData1, remInCh); + xb_vec2Nx8 dvecData2; IVP_LAV2NX8_XP(dvecData2, vaData2, pdvecData2, remInCh); + xb_vec2Nx8 dvecData3; IVP_LAV2NX8_XP(dvecData3, vaData3, pdvecData3, remInCh); + xb_vec2Nx8 dvecData4; IVP_LAV2NX8_XP(dvecData4, vaData4, pdvecData4, remInCh); + + /* Extracting first 4 bytes of vector into address register */ + /* Scalar integers to be used for QMUL */ + int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData1)), 0); + int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData2)), 0); + int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData3)), 0); + int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData4)), 0); + /* For conditional coefficient loads */ + int32_t enable2 = XT_SALT(1, remInCh); /* Will be 1 if remInCh > 1 */ + int32_t enable3 = XT_SALT(2,
remInCh); /* Will be 1 if remInCh > 2 */ + + /* Aligned Vector Loads of coefficients; the pointer advances only while another tap remains, so surplus loads re-read an already visited coefficient row */ + xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1 * enable2); + xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1 * enable3); + xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1); + + + IVP_MULQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1); + IVP_MULQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2); + IVP_MULQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3); + IVP_MULQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4); + } /* End if( k < numIter)*/ + } /* End Kernel Height */ + + if (outputFlag) /* Store to output Tile */ + { + /* Pack, Output Scale, Output Shift and clamping */ + xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L; + xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H; +#ifdef DILATED_VQ_CONV_PARTIAL + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); +#else + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \ + outScale, outShiftU, minLim,
maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); +#endif + /* Store the output dvecOut1 along the output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOut + outCh * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh); + IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Store the output dvecOut2 along the output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1 * enable2ndWidth) * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * enable2ndWidth); + IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * enable2ndWidth); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Store the output dvecOut3 along the output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1 * 2 * enable3rdWidth) * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * enable3rdWidth); + IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * enable3rdWidth); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Store the output dvecOut4 along the output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1 * 3 * enable4thWidth) * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * \ + remainingOutCh * enable4thWidth); + IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * enable4thWidth); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + } + else /* Store to accumulator tile*/ + { + xb_vecN_2x32v hvecAcc1LL = IVP_CVT32S2NX24LL(daccSum1); + xb_vecN_2x32v hvecAcc1LH = IVP_CVT32S2NX24LH(daccSum1); + xb_vecN_2x32v hvecAcc1HL =
IVP_CVT32S2NX24HL(daccSum1); + xb_vecN_2x32v hvecAcc1HH = IVP_CVT32S2NX24HH(daccSum1); + + xb_vecN_2x32v hvecAcc2LL = IVP_CVT32S2NX24LL(daccSum2); + xb_vecN_2x32v hvecAcc2LH = IVP_CVT32S2NX24LH(daccSum2); + xb_vecN_2x32v hvecAcc2HL = IVP_CVT32S2NX24HL(daccSum2); + xb_vecN_2x32v hvecAcc2HH = IVP_CVT32S2NX24HH(daccSum2); + + xb_vecN_2x32v hvecAcc3LL = IVP_CVT32S2NX24LL(daccSum3); + xb_vecN_2x32v hvecAcc3LH = IVP_CVT32S2NX24LH(daccSum3); + xb_vecN_2x32v hvecAcc3HL = IVP_CVT32S2NX24HL(daccSum3); + xb_vecN_2x32v hvecAcc3HH = IVP_CVT32S2NX24HH(daccSum3); + + xb_vecN_2x32v hvecAcc4LL = IVP_CVT32S2NX24LL(daccSum4); + xb_vecN_2x32v hvecAcc4LH = IVP_CVT32S2NX24LH(daccSum4); + xb_vecN_2x32v hvecAcc4HL = IVP_CVT32S2NX24HL(daccSum4); + xb_vecN_2x32v hvecAcc4HH = IVP_CVT32S2NX24HH(daccSum4); + + + /* Store the hvecAcc1 along the accTile depth */ + phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh); + IVP_SAVN_2X32_XP(hvecAcc1LL, vaOutData, phvecAcc, 4 * remainingOutCh); + IVP_SAVN_2X32_XP(hvecAcc1LH, vaOutData, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAVN_2X32_XP(hvecAcc1HL, vaOutData, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAVN_2X32_XP(hvecAcc1HH, vaOutData, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAPOSN_2X32_FP(vaOutData, phvecAcc); + + /* Store the hvecAcc2 along the accTile depth */ + phvecAcc = (xb_vecN_2x32v *) (pAcc + (outCh + accDataPitch1 * enable2ndWidth)); + IVP_SAVN_2X32_XP(hvecAcc2LL, vaOutData, phvecAcc, 4 * remainingOutCh); + IVP_SAVN_2X32_XP(hvecAcc2LH, vaOutData, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAVN_2X32_XP(hvecAcc2HL, vaOutData, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAVN_2X32_XP(hvecAcc2HH, vaOutData, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAPOSN_2X32_FP(vaOutData, phvecAcc); + + /* Store the hvecAcc3 along the accTile depth */ + phvecAcc = (xb_vecN_2x32v *) (pAcc + (outCh + accDataPitch1 * 2 *
enable3rdWidth)); + IVP_SAVN_2X32_XP(hvecAcc3LL, vaOutData, phvecAcc, 4 * remainingOutCh); + IVP_SAVN_2X32_XP(hvecAcc3LH, vaOutData, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAVN_2X32_XP(hvecAcc3HL, vaOutData, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAVN_2X32_XP(hvecAcc3HH, vaOutData, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAPOSN_2X32_FP(vaOutData, phvecAcc); + + /* Store the hvecAcc4 along the accTile depth */ + phvecAcc = (xb_vecN_2x32v *) (pAcc + (outCh + accDataPitch1 * 3 * enable4thWidth)); + IVP_SAVN_2X32_XP(hvecAcc4LL, vaOutData, phvecAcc, 4 * remainingOutCh); + IVP_SAVN_2X32_XP(hvecAcc4LH, vaOutData, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAVN_2X32_XP(hvecAcc4HL, vaOutData, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAVN_2X32_XP(hvecAcc4HH, vaOutData, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAPOSN_2X32_FP(vaOutData, phvecAcc); + } + } /* End image width */ + } /* End Output Channels */ + } /* End image height */ +} + +/****************************************************************************/ +/* Description : P6 optimized implementation of 3D partial convolution */ +/* Inputs : Input Data Tile, Coeff Data Tile, Bias Array, */ +/* CNN convolution params structure */ +/* InOuts : Output Tile */ +/* Assumptions : InData, CoeffData are S8 */ +/* biasArray is signed 32b, value not exceeding signed 24b */ +/* OutData is S8 / U8 / S16 */ +/* Kernel Size is MxNxDxNk. M and N sizes are less than or */ +/* equal to 16. */ +/* Input and Output are in DWH format */ +/* Coeff is in NDWH format */ +/* CoeffDim1Pitch is aligned to 2N (Ca2) */ +/* Edges along Depth dimension in inTile and coeffTile */ +/* are zero. 
*/ +/****************************************************************************/ +#ifdef DILATED_VQ_CONV_PARTIAL +static _XAI_INLINE_ void partialConvolvedVQ3D_S_MxN_S8S8IXCa2_noUnrollH_MOD_DWH( + const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D accTile, + xai_pTile3D outTile, + const xai_cnn_conv_params *param + ) +#else +static _XAI_INLINE_ void partialConvolved3D_S_MxN_S8S8IXCa2_noUnrollH_MOD_DWH( + const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D accTile, + xai_pTile3D outTile, + const xai_cnn_conv_params *param + ) +#endif +{ + /* Getting parameters from the tile structures */ + const int32_t outW = XAI_TILE3D_GET_DIM2(outTile); + const int32_t outH = XAI_TILE3D_GET_DIM3(outTile); + const int32_t numInCh = XAI_TILE3D_GET_DIM1(inTile); + const int32_t numOutCh = XAI_TILE3D_GET_DIM1(outTile); + const uint8_t dilationX = XAI_CNN_CONV_GET_DILATIONX(param); + const uint8_t dilationY = XAI_CNN_CONV_GET_DILATIONY(param); + + /* Kernel Size (NDWH) */ + const int32_t kWidthU = XAI_TILE4D_GET_DIM3(coeffTile); + const int32_t kHeightU = XAI_TILE4D_GET_DIM4(coeffTile); + int32_t dilatedkWidthU = dilationX * (kWidthU - 1) + 1; + int32_t dilatedkHeightU = dilationY * (kHeightU - 1) + 1; + + /* CNN convolution parameters */ + const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param); + const uint8_t outShiftU = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param); + const uint8_t enableReLu = XAI_CNN_CONV_GET_FLAG_RELU(param); + const uint8_t strideX = XAI_CNN_CONV_GET_STRIDEX(param); + const uint8_t strideY = XAI_CNN_CONV_GET_STRIDEY(param); + const uint8_t leftEdgeFlag = XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param); + const uint8_t topEdgeFlag = XAI_CNN_CONV_GET_FLAG_TOPEDGE(param); + const uint8_t inputFlag = XAI_CNN_CONV_GET_FLAG_INPUT(param); + const uint8_t outputFlag = XAI_CNN_CONV_GET_FLAG_OUTPUT(param); + + /* Data Pointers of input, 
output, coefficient and bias data */ + int8_t *pInData = (int8_t *) XAI_TILE3D_GET_DATA_PTR(inTile); + int8_t *pOutData = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile); + int8_t *pCoeffData = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile); + int32_t *pBiasData = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray); + + int32_t * pAccData = NULL; + if (!(XAI_CNN_CONV_GET_FLAG_INPUT(param) && XAI_CNN_CONV_GET_FLAG_OUTPUT(param))) + { + pAccData = (int32_t *) XAI_TILE3D_GET_DATA_PTR(accTile); + } + +#ifdef DILATED_VQ_CONV_PARTIAL + uint16_t *pScale = (uint16_t *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray); +#else + const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param); +#endif + + /* Pitches of Coefficient Data (NDWH) in dim1, dim2 and dim3 */ + const int32_t coeffPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTile); + const int32_t coeffPitch2 = XAI_TILE4D_GET_DIM2_PITCH(coeffTile); + const int32_t coeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile); + + /* Pitches of Input Data (DWH) in dim1 and dim2 */ + const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile); + const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile); + + /* Pitch of Output Data (DWH) in dim1 and dim2 */ + const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile); + const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile); + + /* Pitch of AccTile Data (DWH) in dim1 and dim2 */ + int32_t accDataPitch1 = 0; + int32_t accDataPitch2 = 0; + if (!(XAI_CNN_CONV_GET_FLAG_INPUT(param) && XAI_CNN_CONV_GET_FLAG_OUTPUT(param))) + { + accDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(accTile); + accDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(accTile); + } + + int32_t leftEdge, topEdge; + if ((dilatedkWidthU % 2) != 0) + { + leftEdge = dilatedkWidthU / 2; + } + else + { + leftEdge = leftEdgeFlag ? (dilatedkWidthU / 2) : ((dilatedkWidthU / 2) - 1); + } + + if ((dilatedkHeightU % 2) != 0) + { + topEdge = dilatedkHeightU / 2; + } + else + { + topEdge = topEdgeFlag ? 
(dilatedkHeightU / 2) : ((dilatedkHeightU / 2) - 1); + } + + + /* Move pointer to the start of the data (including edge) */ + pInData = &pInData[-((leftEdge) * inDataPitch1 + (topEdge) * inDataPitch2)]; + + /* Setting the limits for output data according to ReLu Flag and outTileType */ + int32_t minLim, maxLim; + if (enableReLu) + { + minLim = XAI_CNN_CONV_GET_RELU_MIN(param); + maxLim = XAI_CNN_CONV_GET_RELU_MAX(param); + } + else + { + minLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? \ + SHRT_MIN : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MIN : 0); + maxLim = XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MAX \ + : (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8) ? SCHAR_MAX : UCHAR_MAX); + } + const int8_t typeFlag = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) ? 1 : 0; + const uint8_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(outTile); + + /* Variable Declarations */ + int32_t inCh, outCh, x, y, k; + valign vaOutData = IVP_ZALIGN(); + + xb_vecN_2x32v* restrict phvecBias; + xb_vec2Nx8* restrict pdvecCoeff; + xb_vec2Nx8* restrict pdvecData1; + xb_vec2Nx8* restrict pdvecData2; + xb_vec2Nx8* restrict pdvecData3; + xb_vec2Nx8* restrict pdvecData4; + xb_vec2Nx8* restrict pdvecOut; + xb_vecN_2x32v* restrict phvecAcc; + + /* Loops Start */ + for (y = 0; y < outH; y++) /* Image Height */ + { /* walk down the rows */ + for (outCh = 0; outCh < numOutCh; outCh += 2 * XCHAL_IVPN_SIMD_WIDTH) + { /* walk across the kernels */ + /* To handle corner case when number of output channels + * is not a multiple of 2 * XCHAL_IVPN_SIMD_WIDTH*/ + int32_t remainingOutCh = numOutCh - outCh; +#ifdef DILATED_VQ_CONV_PARTIAL + xb_vecNx16U outScaleDataEven, outScaleDataOdd; + /*Load output scale values*/ + xb_vecNx16U* restrict pOutScaleData = (xb_vecNx16U *) (pScale + outCh); + VQ_INIT_OUTSCALE(pOutScaleData, remainingOutCh, outScaleDataEven, outScaleDataOdd); +#endif + for (x = 0; x < outW; x += 4) /* Image Width */ + { /* walk across the columns */ + int32_t enable2ndWidth = XT_SALT(1, 
outW - x); + int32_t enable3rdWidth = XT_SALT(2, outW - x); + int32_t enable4thWidth = XT_SALT(3, outW - x); + /* Output Data pointer */ + int8_t *pOut = pOutData + (x * outDataPitch1 + y * outDataPitch2) * bytesPerPixel; + int32_t *pAcc = pAccData + (x * accDataPitch1 + y * accDataPitch2); + + /* Initialize accumulators with bias values */ + xb_vec2Nx24 daccSum1, daccSum2, daccSum3, daccSum4; + if (inputFlag) /* Bias Values */ + { + phvecBias = (xb_vecN_2x32v *) (pBiasData + outCh); + ACC_INIT_BIAS(phvecBias, remainingOutCh, daccSum1, daccSum2, daccSum3, daccSum4); + } + else /* Accumulator tile*/ + { + xb_vecN_2x32v hvecAcc1LL, hvecAcc1LH, hvecAcc1HL, hvecAcc1HH; + xb_vecN_2x32v hvecAcc2LL, hvecAcc2LH, hvecAcc2HL, hvecAcc2HH; + xb_vecN_2x32v hvecAcc3LL, hvecAcc3LH, hvecAcc3HL, hvecAcc3HH; + xb_vecN_2x32v hvecAcc4LL, hvecAcc4LH, hvecAcc4HL, hvecAcc4HH; + + phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh); + valign vaAcc = IVP_LAN_2X32_PP(phvecAcc); + IVP_LAVN_2X32_XP(hvecAcc1LL, vaAcc, phvecAcc, 4 * remainingOutCh); + IVP_LAVN_2X32_XP(hvecAcc1LH, vaAcc, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH); + IVP_LAVN_2X32_XP(hvecAcc1HL, vaAcc, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH); + IVP_LAVN_2X32_XP(hvecAcc1HH, vaAcc, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH); + daccSum1 = IVP_CVT24UNX32L(hvecAcc1LH, hvecAcc1LL); + IVP_CVT24UNX32H(daccSum1, hvecAcc1HH, hvecAcc1HL); + + phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh + accDataPitch1 * enable2ndWidth); + vaAcc = IVP_LAN_2X32_PP(phvecAcc); + IVP_LAVN_2X32_XP(hvecAcc2LL, vaAcc, phvecAcc, 4 * remainingOutCh); + IVP_LAVN_2X32_XP(hvecAcc2LH, vaAcc, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH); + IVP_LAVN_2X32_XP(hvecAcc2HL, vaAcc, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH); + IVP_LAVN_2X32_XP(hvecAcc2HH, vaAcc, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH); + daccSum2 = IVP_CVT24UNX32L(hvecAcc2LH, hvecAcc2LL); + IVP_CVT24UNX32H(daccSum2, hvecAcc2HH, 
hvecAcc2HL); + + phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh + accDataPitch1 * 2 * enable3rdWidth); + vaAcc = IVP_LAN_2X32_PP(phvecAcc); + IVP_LAVN_2X32_XP(hvecAcc3LL, vaAcc, phvecAcc, 4 * remainingOutCh); + IVP_LAVN_2X32_XP(hvecAcc3LH, vaAcc, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH); + IVP_LAVN_2X32_XP(hvecAcc3HL, vaAcc, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH); + IVP_LAVN_2X32_XP(hvecAcc3HH, vaAcc, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH); + daccSum3 = IVP_CVT24UNX32L(hvecAcc3LH, hvecAcc3LL); + IVP_CVT24UNX32H(daccSum3, hvecAcc3HH, hvecAcc3HL); + + phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh + accDataPitch1 * 3 * enable4thWidth); + vaAcc = IVP_LAN_2X32_PP(phvecAcc); + IVP_LAVN_2X32_XP(hvecAcc4LL, vaAcc, phvecAcc, 4 * remainingOutCh); + IVP_LAVN_2X32_XP(hvecAcc4LH, vaAcc, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH); + IVP_LAVN_2X32_XP(hvecAcc4HL, vaAcc, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH); + IVP_LAVN_2X32_XP(hvecAcc4HH, vaAcc, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH); + daccSum4 = IVP_CVT24UNX32L(hvecAcc4LH, hvecAcc4LL); + IVP_CVT24UNX32H(daccSum4, hvecAcc4HH, hvecAcc4HL); + } + + /* Input Data and Coeff Data Pointers */ + int8_t *pData = pInData + x * strideX * inDataPitch1 + y * strideY * inDataPitch2; + int8_t *pCoeff = pCoeffData + outCh; + + xb_vecN_2x32v hvecInAddrOff = 0; + xb_vecN_2x32v hvecCoeffAddrOff = 0; + xb_vecN_2x32v hvecLaneIdx = 0; + int32_t inAddrOff, coeffAddrOff; + + for (k = 0; k < kHeightU * kWidthU; k++) /* Kernel Height * Kernel Width */ + { + /* Condition checks performed to get the Input and Coefficient */ + /* Pointer Offsets after combining the Kernel Width and Height Loops */ + vboolN_2 vbN_2 = IVP_EQN_2X32(hvecLaneIdx, kWidthU); + /* hvecLaneIdx will be reset to zero after every kWidth */ + hvecLaneIdx = IVP_MOVN_2X32T(0, hvecLaneIdx, vbN_2); + /* InPitch added after every kWidth */ + IVP_ADDN_2X32T(hvecInAddrOff, hvecInAddrOff, 
inDataPitch2 * dilationY - kWidthU * inDataPitch1 * dilationX, vbN_2); + /* CoeffPitch added after every kWidth */ + IVP_ADDN_2X32T(hvecCoeffAddrOff, hvecCoeffAddrOff, coeffPitch3 - kWidthU * coeffPitch2, vbN_2); + /* Extracting Input and Coefficient address offsets */ + inAddrOff = IVP_EXTRN_2X32(hvecInAddrOff, 0); + coeffAddrOff = IVP_EXTRN_2X32(hvecCoeffAddrOff, 0); + hvecLaneIdx = IVP_ADDN_2X32(hvecLaneIdx, 1); + hvecCoeffAddrOff = IVP_ADDN_2X32(hvecCoeffAddrOff, coeffPitch2); + hvecInAddrOff = IVP_ADDN_2X32(hvecInAddrOff, inDataPitch1 * dilationX); + + /* Pointers for Input Data Loads */ + pdvecData1 = (xb_vec2Nx8 *) (pData + inAddrOff); + pdvecData2 = (xb_vec2Nx8 *) (pData + inAddrOff + strideX * inDataPitch1 * enable2ndWidth); + pdvecData3 = (xb_vec2Nx8 *) (pData + inAddrOff + strideX * inDataPitch1 * 2 * enable3rdWidth); + pdvecData4 = (xb_vec2Nx8 *) (pData + inAddrOff + (strideX * inDataPitch1 * 3 * enable4thWidth)); + + /* Pointer for Coefficient Load */ + pdvecCoeff = (xb_vec2Nx8 *) (pCoeff + coeffAddrOff); + + /* Primes registers for Aligning Load */ + valign vaData1 = IVP_LA2NX8_PP(pdvecData1); + valign vaData2 = IVP_LA2NX8_PP(pdvecData2); + valign vaData3 = IVP_LA2NX8_PP(pdvecData3); + valign vaData4 = IVP_LA2NX8_PP(pdvecData4); + + for (inCh = 0; inCh < numInCh - 3; inCh += 4) /* Input Channels */ + { + xb_vec2Nx8 dvecData1; IVP_LAV2NX8_XP(dvecData1, vaData1, pdvecData1, 4); + xb_vec2Nx8 dvecData2; IVP_LAV2NX8_XP(dvecData2, vaData2, pdvecData2, 4); + xb_vec2Nx8 dvecData3; IVP_LAV2NX8_XP(dvecData3, vaData3, pdvecData3, 4); + xb_vec2Nx8 dvecData4; IVP_LAV2NX8_XP(dvecData4, vaData4, pdvecData4, 4); + + /* Extracting first 4 bytes of vector into address register */ + /* Scalar integers to be used for QMUL */ + int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData1)), 0); + int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData2)), 0); + int32_t qmulScalar3 = 
IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData3)), 0); + int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData4)), 0); + + /* Aligned Vector Loads of coefficients */ + xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1); + xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1); + xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1); + xb_vec2Nx8 dvecCoeff4; IVP_LV2NX8_XP(dvecCoeff4, pdvecCoeff, coeffPitch1); + + IVP_MULQA2N8XR8(daccSum1, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1); + IVP_MULQA2N8XR8(daccSum2, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2); + IVP_MULQA2N8XR8(daccSum3, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3); + IVP_MULQA2N8XR8(daccSum4, dvecCoeff4, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4); + } /* End Input Channels */ + + /* Corner Case Handling if number of input channels not multiple of 4 */ + if (inCh < numInCh) + { + int32_t remInCh = numInCh - inCh; + + /* Aligning variable vector load of pixels */ + xb_vec2Nx8 dvecData1; IVP_LAV2NX8_XP(dvecData1, vaData1, pdvecData1, remInCh); + xb_vec2Nx8 dvecData2; IVP_LAV2NX8_XP(dvecData2, vaData2, pdvecData2, remInCh); + xb_vec2Nx8 dvecData3; IVP_LAV2NX8_XP(dvecData3, vaData3, pdvecData3, remInCh); + xb_vec2Nx8 dvecData4; IVP_LAV2NX8_XP(dvecData4, vaData4, pdvecData4, remInCh); + + /* Extracting first 4 bytes of vector into address register */ + /* Scalar integers to be used for QMUL */ + int32_t qmulScalar1 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData1)), 0); + int32_t qmulScalar2 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData2)), 0); + int32_t qmulScalar3 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData3)), 0); + int32_t qmulScalar4 = IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16 \ + (IVP_MOVNX16_FROM2NX8(dvecData4)), 0); + + /* For conditional 
coefficient loads */ + int32_t enable2 = XT_SALT(1, remInCh); /* Will be 1 if remInCh > 1 */ + int32_t enable3 = XT_SALT(2, remInCh); /* Will be 1 if remInCh > 2 */ + + /* Coefficient Loads */ + xb_vec2Nx8 dvecCoeff1; IVP_LV2NX8_XP(dvecCoeff1, pdvecCoeff, coeffPitch1 * enable2); + xb_vec2Nx8 dvecCoeff2; IVP_LV2NX8_XP(dvecCoeff2, pdvecCoeff, coeffPitch1 * enable3); + xb_vec2Nx8 dvecCoeff3; IVP_LV2NX8_XP(dvecCoeff3, pdvecCoeff, coeffPitch1); + + IVP_MULQA2N8XR8(daccSum1, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar1); + IVP_MULQA2N8XR8(daccSum2, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar2); + IVP_MULQA2N8XR8(daccSum3, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar3); + IVP_MULQA2N8XR8(daccSum4, 0, dvecCoeff3, dvecCoeff2, dvecCoeff1, qmulScalar4); + } /* End Corner case handling */ + } /* End Kernel Height * Width */ + + if (outputFlag) /* Store to ouput Tile*/ + { + /* Pack, Output Scale, Output Shift and clamping */ + xb_vec2Nx8 dvecOut1L, dvecOut2L, dvecOut3L, dvecOut4L; + xb_vec2Nx8 dvecOut1H, dvecOut2H, dvecOut3H, dvecOut4H; +#ifdef DILATED_VQ_CONV_PARTIAL + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \ + outScaleDataEven, outScaleDataOdd, outShiftU, minLim, maxLim, typeFlag); +#else + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1L, dvecOut1H, daccSum1, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut2L, dvecOut2H, daccSum2, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + 
PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut3L, dvecOut3H, daccSum3, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); + PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut4L, dvecOut4H, daccSum4, packShiftAccU, \ + outScale, outShiftU, minLim, maxLim, typeFlag); +#endif + /* Store the output dvecOut1 along the output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOut + outCh * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut1L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh); + IVP_SAV2NX8_XP(dvecOut1H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Store the output dvecOut2 along the output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1 * enable2ndWidth) * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut2L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * enable2ndWidth); + IVP_SAV2NX8_XP(dvecOut2H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * enable2ndWidth); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Store the output dvecOut3 along the output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1 * 2 * enable3rdWidth) * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut3L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * enable3rdWidth); + IVP_SAV2NX8_XP(dvecOut3H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * enable3rdWidth); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + + /* Store the output dvecOut4 along the output depth */ + pdvecOut = (xb_vec2Nx8 *) (pOut + (outCh + outDataPitch1 * 3 * enable4thWidth) * bytesPerPixel); + IVP_SAV2NX8_XP(dvecOut4L, vaOutData, pdvecOut, bytesPerPixel * remainingOutCh * enable4thWidth); + IVP_SAV2NX8_XP(dvecOut4H, vaOutData, pdvecOut, typeFlag * 2 * \ + (remainingOutCh - XCHAL_IVPN_SIMD_WIDTH) * enable4thWidth); + IVP_SAPOS2NX8_FP(vaOutData, pdvecOut); + } + else /* Store to accumulator tile*/ + { + xb_vecN_2x32v hvecAcc1LL = IVP_CVT32S2NX24LL(daccSum1); + 
xb_vecN_2x32v hvecAcc1LH = IVP_CVT32S2NX24LH(daccSum1); + xb_vecN_2x32v hvecAcc1HL = IVP_CVT32S2NX24HL(daccSum1); + xb_vecN_2x32v hvecAcc1HH = IVP_CVT32S2NX24HH(daccSum1); + + xb_vecN_2x32v hvecAcc2LL = IVP_CVT32S2NX24LL(daccSum2); + xb_vecN_2x32v hvecAcc2LH = IVP_CVT32S2NX24LH(daccSum2); + xb_vecN_2x32v hvecAcc2HL = IVP_CVT32S2NX24HL(daccSum2); + xb_vecN_2x32v hvecAcc2HH = IVP_CVT32S2NX24HH(daccSum2); + + xb_vecN_2x32v hvecAcc3LL = IVP_CVT32S2NX24LL(daccSum3); + xb_vecN_2x32v hvecAcc3LH = IVP_CVT32S2NX24LH(daccSum3); + xb_vecN_2x32v hvecAcc3HL = IVP_CVT32S2NX24HL(daccSum3); + xb_vecN_2x32v hvecAcc3HH = IVP_CVT32S2NX24HH(daccSum3); + + xb_vecN_2x32v hvecAcc4LL = IVP_CVT32S2NX24LL(daccSum4); + xb_vecN_2x32v hvecAcc4LH = IVP_CVT32S2NX24LH(daccSum4); + xb_vecN_2x32v hvecAcc4HL = IVP_CVT32S2NX24HL(daccSum4); + xb_vecN_2x32v hvecAcc4HH = IVP_CVT32S2NX24HH(daccSum4); + + + /* Store the hvecAcc1 along the accTile depth */ + phvecAcc = (xb_vecN_2x32v *) (pAcc + outCh); + IVP_SAVN_2X32_XP(hvecAcc1LL, vaOutData, phvecAcc, 4 * remainingOutCh); + IVP_SAVN_2X32_XP(hvecAcc1LH, vaOutData, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAVN_2X32_XP(hvecAcc1HL, vaOutData, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAVN_2X32_XP(hvecAcc1HH, vaOutData, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAPOSN_2X32_FP(vaOutData, phvecAcc); + + /* Store the hvecAcc2 along the accTile depth */ + phvecAcc = (xb_vecN_2x32v *) (pAcc + (outCh + accDataPitch1 * enable2ndWidth)); + IVP_SAVN_2X32_XP(hvecAcc2LL, vaOutData, phvecAcc, 4 * remainingOutCh); + IVP_SAVN_2X32_XP(hvecAcc2LH, vaOutData, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAVN_2X32_XP(hvecAcc2HL, vaOutData, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAVN_2X32_XP(hvecAcc2HH, vaOutData, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAPOSN_2X32_FP(vaOutData, phvecAcc); + + /* Store the hvecAcc3 along the accTile depth 
*/ + phvecAcc = (xb_vecN_2x32v *) (pAcc + (outCh + accDataPitch1 * 2 * enable3rdWidth)); + IVP_SAVN_2X32_XP(hvecAcc3LL, vaOutData, phvecAcc, 4 * remainingOutCh); + IVP_SAVN_2X32_XP(hvecAcc3LH, vaOutData, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAVN_2X32_XP(hvecAcc3HL, vaOutData, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAVN_2X32_XP(hvecAcc3HH, vaOutData, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAPOSN_2X32_FP(vaOutData, phvecAcc); + + /* Store the hvecAcc4 along the accTile depth */ + phvecAcc = (xb_vecN_2x32v *) (pAcc + (outCh + accDataPitch1 * 3 * enable4thWidth)); + IVP_SAVN_2X32_XP(hvecAcc4LL, vaOutData, phvecAcc, 4 * remainingOutCh); + IVP_SAVN_2X32_XP(hvecAcc4LH, vaOutData, phvecAcc, 4 * remainingOutCh - 2 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAVN_2X32_XP(hvecAcc4HL, vaOutData, phvecAcc, 4 * remainingOutCh - 4 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAVN_2X32_XP(hvecAcc4HH, vaOutData, phvecAcc, 4 * remainingOutCh - 6 * XCHAL_IVPN_SIMD_WIDTH); + IVP_SAPOSN_2X32_FP(vaOutData, phvecAcc); + } + } /* End image width */ + } /* End image height */ + } /* End Output Channels */ +} + +/***************************************************************************** +* xaiPartialConvolved3D_S_MxN_S8S8IXCa2_noUnrollH_MOD_DWH \ +* xaiPartialConvolvedVQ3D_S_MxN_S8S8IXCa2_noUnrollH_MOD_DWH +* **************************************************************************/ + +/****************************************************************************/ +/* Description : P6 optimized generic implementation for MxN MOD_DWH */ +/* 3D convolution. Based on pre-processor specifiers. Code */ +/* implementation is generated during preprocessing stage. 
*/ +/* This method can be used to generate MxN MOD_DWH 3D partial */ +/* dilated convolution function and MxN MOD_DWH 3D VQ partial */ +/* dilated convolution function */ +/* Stride values from 1 to 64 are accepted by the argument checks */ +/* Implementation also supports dilation >= 1 for stride = 1 */ +/* and requires dilation = 1 for any stride > 1 */ +/* Inputs : Input Data Tile, Coeff Data Tile, Bias Array, */ +/* Output scale array, CNN convolution params structure */ +/* Outputs : XAI Error Code (XAI_ERR_TYPE) */ +/* InOuts : Accumulator Tile, Output Tile */ +/* Assumptions : InData, CoeffData are S8 */ +/* biasArray is signed 32b, value not exceeding signed 24b (type is checked only when the input flag is set) */ +/* Output scale array is U16 (VQ variant only) */ +/* OutData is S8 / U8 / S16 */ +/* Kernel Size is MxNxDxNk. M and N sizes are less than or */ +/* equal to 64 (enforced by the kernel-size check below). */ +/* Input and Output are in DWH format */ +/* Coeff is in NDWH format */ +/* CoeffDim1Pitch is aligned to 2N (Ca2) */ +/* Accumulated value will be within 24bit range */ +/****************************************************************************/ +#ifdef DILATED_VQ_CONV_PARTIAL +XAI_ERR_TYPE xaiPartialConvolvedVQ3D_S_MxN_S8S8IXCa2_noUnrollH_MOD_DWH( + const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D accTile, + xai_pTile3D outTile, + const xai_cnn_conv_params *param + ) +#else +XAI_ERR_TYPE xaiPartialConvolved3D_S_MxN_S8S8IXCa2_noUnrollH_MOD_DWH( + const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D accTile, + xai_pTile3D outTile, + const xai_cnn_conv_params *param + ) +#endif +{ + /* Error Checks: tile types and placement, kernel size, strides, dilation, data order, shifts, ReLU limits and flag-dependent tiles */ + XAI_ERROR_CHECKS() + { + XAI_CHECK_TILE3D_S8(inTile); + XAI_CHECK_CONV_OUTPUT_TILE3D(outTile); + XAI_CHECK_TILE4D_S8(coeffTile); + XAI_CHECK_POINTER(biasArray); + XAI_CHECK_POINTER(param); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile); + XAI_CHECK_TILE4D_IN_DRAM_BOUNDARY(coeffTile); + XAI_CHECK_ERROR((XAI_TILE4D_GET_DIM3(coeffTile) <= 64) && 
(XAI_TILE4D_GET_DIM4(coeffTile) <= 64), XAI_ERR_KSIZE, \ + "\nKernel height = %d and width = %d\nKernel width and height should be less than or equal to 64", \ + XAI_TILE4D_GET_DIM4(coeffTile), XAI_TILE4D_GET_DIM3(coeffTile)); + XAI_CHECK_EDGES_MOD_DWH(inTile, coeffTile, param); + XAI_CHECK_ERROR(((XAI_CNN_CONV_GET_STRIDEX(param) > 0) && (XAI_CNN_CONV_GET_STRIDEY(param) > 0)) && \ + ((XAI_CNN_CONV_GET_STRIDEX(param) <= 64) && (XAI_CNN_CONV_GET_STRIDEY(param) <= 64)), XAI_ERR_BADARG, \ + "\nStrideX = %hhu, StrideY = %hhu\nStride along width and height should be greater than 0 and less than or equal to 64", \ + XAI_CNN_CONV_GET_STRIDEX(param), XAI_CNN_CONV_GET_STRIDEY(param)); + XAI_CHECK_ERROR((XAI_CNN_CONV_GET_DILATIONX(param) == 1) || \ + ((XAI_CNN_CONV_GET_DILATIONX(param) >= 1) && \ + (XAI_CNN_CONV_GET_STRIDEX(param) == 1)), XAI_ERR_BADARG, \ + "\nDilationX = %hhu\nDilationX should be 1. It can be greater than 1 only when strideX is equal to 1", \ + XAI_CNN_CONV_GET_DILATIONX(param)); + XAI_CHECK_ERROR((XAI_CNN_CONV_GET_DILATIONY(param) == 1) || \ + ((XAI_CNN_CONV_GET_DILATIONY(param) >= 1) && \ + (XAI_CNN_CONV_GET_STRIDEY(param) == 1)), XAI_ERR_BADARG, \ + "\nDilationY = %hhu\nDilationY should be 1. 
It can be greater than 1 only when strideY is equal to 1", \ + XAI_CNN_CONV_GET_DILATIONY(param)); + XAI_CHECK_TILE4D_IALIGNMENT_2NX8(coeffTile); + XAI_CHECK_TILE3D_DATA_ORDER(inTile, XAI_DWH); + XAI_CHECK_TILE4D_DATA_ORDER(coeffTile, XAI_NDWH); + XAI_CHECK_TILE3D_DATA_ORDER(outTile, XAI_DWH); + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_ACCUM_SHIFT(param) < 24, \ + XAI_ERR_NORM, "\nThe accumulator shift = %hhu, value should be less than 24", \ + XAI_CNN_CONV_GET_ACCUM_SHIFT(param)); + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_OUTPUT_SHIFT(param) < 32, \ + XAI_ERR_NORM, "\nThe output shift = %hhu, value should be less than 32", \ + XAI_CNN_CONV_GET_OUTPUT_SHIFT(param)); + XAI_CHECK_CONV_RELU_LIMITS_IX(param, outTile); +#ifdef DILATED_VQ_CONV_PARTIAL + XAI_CHECK_ARRAY_U16(outputScaleArray); + XAI_CHECK_ERROR(XAI_ARRAY_GET_WIDTH(outputScaleArray) >= XAI_TILE4D_GET_DIM1(coeffTile), XAI_ERR_DATASIZE, \ + "\nWidth of Output Scale Array = %d, Number of Kernels = %d\nWidth of Output Scale Array should be greater than or equal to Number of Kernels", \ + XAI_ARRAY_GET_WIDTH(outputScaleArray), XAI_TILE4D_GET_DIM1(coeffTile)); +#endif + + XAI_CHECK_CONSISTENCY_MOD_DWH(inTile, coeffTile, biasArray, outTile, param); + + if (XAI_CNN_CONV_GET_FLAG_INPUT(param)) + { /* bias array element type is validated only when the input flag is set */ + XAI_CHECK_ARRAY_S32(biasArray); + } + if (!(XAI_CNN_CONV_GET_FLAG_INPUT(param) && XAI_CNN_CONV_GET_FLAG_OUTPUT(param))) + { /* accTile is validated unless both the input and the output flags are set */ + XAI_CHECK_TILE3D_S32(accTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(accTile); + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, accTile); + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(coeffTile, accTile); + XAI_CHECK_TILE3D_DATA_ORDER(accTile, XAI_DWH); + XAI_CHECK_TILE3D_SIZE_EQ(accTile, outTile); + } + if (XAI_CNN_CONV_GET_FLAG_OUTPUT(param)) + { /* outTile placement and overlap are validated only when the output flag is set */ + XAI_CHECK_TILE3D(outTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile); + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTile); + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(coeffTile, outTile); + if (!(XAI_CNN_CONV_GET_FLAG_INPUT(param))) + { + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(accTile, 
outTile); + } + } + } +#ifndef DILATED_VQ_CONV_PARTIAL + if ((XAI_CNN_CONV_GET_OUTPUT_SCALE(param) == 0) && \ + XAI_CNN_CONV_GET_FLAG_OUTPUT(param)) + { /* Non-VQ build, zero output scale with the output flag set: fill outTile with 0 (clamped to the ReLU limits when ReLU is enabled) and return without running a kernel */ + int32_t fillValue; + int32_t reluFlag = XAI_CNN_CONV_GET_FLAG_RELU(param); + fillValue = reluFlag ? (CLAMP(0, XAI_CNN_CONV_GET_RELU_MIN(param), XAI_CNN_CONV_GET_RELU_MAX(param))) : 0; + return(xaiFillTile3D(outTile, fillValue, 0)); + } +#endif + /* Call a further optimized function if dilationX = 1 and (input dim1 equals its dim1 pitch, i.e. no edges along depth, or kernelWidth = 1); only dilationX is tested here */ + if ((XAI_CNN_CONV_GET_DILATIONX(param) == 1) && \ + ((XAI_TILE3D_GET_DIM1(inTile) == XAI_TILE3D_GET_DIM1_PITCH(inTile)) || \ + (XAI_TILE4D_GET_DIM3(coeffTile) == 1))) + { + if ((XAI_TILE3D_GET_DIM1(inTile) * XAI_TILE4D_GET_DIM3(coeffTile)) % 4 == 0) + { /* input depth * kernelWidth is a multiple of 4: use the _x4 variant */ +#ifdef DILATED_VQ_CONV_PARTIAL + partialConvolvedVQ3D_S_MxNd1_S8S8IXCa2_noUnrollH_MOD_DWH_contiguous_depth_x4(inTile, \ + coeffTile, biasArray, outputScaleArray, accTile, outTile, param); +#else + partialConvolved3D_S_MxNd1_S8S8IXCa2_noUnrollH_MOD_DWH_contiguous_depth_x4(inTile, \ + coeffTile, biasArray, accTile, outTile, param); +#endif + } + else + { /* otherwise use the contiguous-depth variant without the x4 requirement */ +#ifdef DILATED_VQ_CONV_PARTIAL + partialConvolvedVQ3D_S_MxNd1_S8S8IXCa2_noUnrollH_MOD_DWH_contiguous_depth(inTile, \ + coeffTile, biasArray, outputScaleArray, accTile, outTile, param); +#else + partialConvolved3D_S_MxNd1_S8S8IXCa2_noUnrollH_MOD_DWH_contiguous_depth(inTile, \ + coeffTile, biasArray, accTile, outTile, param); +#endif + } + } + else + { /* fall back to the generic MxN implementation */ +#ifdef DILATED_VQ_CONV_PARTIAL + partialConvolvedVQ3D_S_MxN_S8S8IXCa2_noUnrollH_MOD_DWH(inTile, \ + coeffTile, biasArray, outputScaleArray, accTile, outTile, param); +#else + partialConvolved3D_S_MxN_S8S8IXCa2_noUnrollH_MOD_DWH(inTile, \ + coeffTile, biasArray, accTile, outTile, param); +#endif + } + return(XAI_ERROR_STATUS()); +} +#endif /*#if ((XCHAL_VISION_TYPE >= 6))*/ diff --git a/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_partial_MOD_S16.c 
b/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_partial_MOD_S16.c new file mode 100644 index 00000000000..2adb107cd07 --- /dev/null +++ b/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_partial_MOD_S16.c @@ -0,0 +1,22 @@ +/* + * Copyright (c) 2023 by Cadence Design Systems, Inc. ALL RIGHTS RESERVED. + * These coded instructions, statements, and computer programs are the + * copyrighted works and confidential proprietary information of + * Cadence Design Systems Inc. They may be adapted and modified by bona fide + * purchasers for internal use, but neither the original nor any adapted + * or modified version may be disclosed or distributed to third parties + * in any manner, medium, or form, in whole or in part, without the prior + * written consent of Cadence Design Systems Inc. This software and its + * derivatives are to be executed solely on products incorporating a Cadence + * Design Systems processor. + */ + +#include "xai_cnn.h" +#include "xai_intrin.h" +#include "limits.h" + +#if ((XCHAL_VISION_TYPE >= 6)) + +#undef DILATED_VQ_CONV_PARTIAL +#include "cnn_dilated_conv_partial_MOD_S16.h" +#endif /*#if ((XCHAL_VISION_TYPE >= 6))*/ diff --git a/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_partial_MOD_S16.h b/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_partial_MOD_S16.h new file mode 100644 index 00000000000..f2ce98ced0d --- /dev/null +++ b/backends/cadence/vision/third-party/libxai/cnn/src/cnn_dilated_conv_partial_MOD_S16.h @@ -0,0 +1,878 @@ +/* + * Copyright (c) 2023 by Cadence Design Systems, Inc. ALL RIGHTS RESERVED. + * These coded instructions, statements, and computer programs are the + * copyrighted works and confidential proprietary information of + * Cadence Design Systems Inc. 
They may be adapted and modified by bona fide + * purchasers for internal use, but neither the original nor any adapted + * or modified version may be disclosed or distributed to third parties + * in any manner, medium, or form, in whole or in part, without the prior + * written consent of Cadence Design Systems Inc. This software and its + * derivatives are to be executed solely on products incorporating a Cadence + * Design Systems processor. + */ + +#include "xai_cnn.h" +#include "xai_intrin.h" +#include "limits.h" + +#if ((XCHAL_VISION_TYPE >= 6)) + +/********* partialConvolvedVQ3D_S_MxN_S16S16I16_MOD_DWH_contiguous_depth ***********/ +/********** partialConvolved3D_S_MxN_S16S16I16_MOD_DWH_contiguous_depth ************/ +/***********************************************************************************/ +/* Description : Specialized optimized implementation for partial 3D convolution */ +/* Inputs : Input Data Tile, Coeff Data Tile, Bias Array, Output Scale Array, */ +/* CNN convolution params structure */ +/* Outputs : */ +/* InOuts : Accumulator Tile, Output Tile */ +/* Assumptions : InData, CoeffData are S16 */ +/* OutData is U16 / S16 */ +/* Input is in DWH and Output is in DWH format */ +/* Coeff is in NDWH format */ +/* Input does not have edges along the depth dimension */ +/* dilationX = dilationY = 1 always */ +/* Accumulated value will be within 48-bit range */ +/***********************************************************************************/ +#ifdef DILATED_VQ_CONV_PARTIAL +static _XAI_INLINE_ void partialConvolvedVQ3D_S_MxN_S16S16I16_MOD_DWH_contiguous_depth(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D accTile, + xai_pTile3D outTile, + const xai_cnn_conv_params *param) +#else +static _XAI_INLINE_ void partialConvolved3D_S_MxN_S16S16I16_MOD_DWH_contiguous_depth(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + 
xai_pTile3D accTile, + xai_pTile3D outTile, + const xai_cnn_conv_params *param) +#endif +{ + /* Getting parameters from the tile structures */ + const int32_t numInCh = XAI_TILE3D_GET_DIM1(inTile); + const int32_t numOutCh = XAI_TILE3D_GET_DIM1(outTile); + const int32_t outWidth = XAI_TILE3D_GET_DIM2(outTile); + const int32_t outHeight = XAI_TILE3D_GET_DIM3(outTile); + const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile); + const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile); + const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile); + const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile); + const int32_t coeffDataPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTile); + const int32_t coeffDataPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile); + const int32_t kWidthU = XAI_TILE4D_GET_DIM3(coeffTile); + const int32_t kHeightU = XAI_TILE4D_GET_DIM4(coeffTile); + + /* Convolution params */ + const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param); + const uint8_t outShiftU = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param); + const uint8_t enableReLu = XAI_CNN_CONV_GET_FLAG_RELU(param); + const uint8_t strideX = XAI_CNN_CONV_GET_STRIDEX(param); + const uint8_t strideY = XAI_CNN_CONV_GET_STRIDEY(param); + const uint8_t leftEdgeFlag = XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param); + const uint8_t topEdgeFlag = XAI_CNN_CONV_GET_FLAG_TOPEDGE(param); + const uint8_t inputFlag = XAI_CNN_CONV_GET_FLAG_INPUT(param); + const uint8_t outputFlag = XAI_CNN_CONV_GET_FLAG_OUTPUT(param); + +#ifdef DILATED_VQ_CONV_PARTIAL + const uint16_t *pOutputScaleData = (uint16_t *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray); +#else + const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param); +#endif + + /* Data Pointers of input, coefficient, biasData */ + const int16_t *pInData = (int16_t *) XAI_TILE3D_GET_DATA_PTR(inTile); + const int16_t *pCoeffData = (int16_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile); + const int64_t *pBiasData = (int64_t *) 
XAI_ARRAY_GET_DATA_PTR(biasArray); + + /* Data Pointers of output and scratch buffer data */ + int16_t *pOutData = (int16_t *) XAI_TILE3D_GET_DATA_PTR(outTile); + int64_t *pAccData = NULL; + + int32_t accDataPitch1 = 0; + int32_t accDataPitch2 = 0; + + if (!(XAI_CNN_CONV_GET_FLAG_INPUT(param) && XAI_CNN_CONV_GET_FLAG_OUTPUT(param))) + { + pAccData = (int64_t *) XAI_TILE3D_GET_DATA_PTR(accTile); + accDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(accTile); + accDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(accTile); + } + + int32_t leftEdge, topEdge; + + if ((kWidthU % 2) != 0) + { + leftEdge = kWidthU / 2; + } + else + { + leftEdge = leftEdgeFlag ? (kWidthU / 2) : ((kWidthU / 2) - 1); + } + + if ((kHeightU % 2) != 0) + { + topEdge = kHeightU / 2; + } + else + { + topEdge = topEdgeFlag ? (kHeightU / 2) : ((kHeightU / 2) - 1); + } + + /* move to start of edge data only when input is already padded. */ + pInData = &pInData[-(int32_t) ((topEdge) * inDataPitch2 + (leftEdge) * inDataPitch1)]; + + /* Setting the limits for output data according to ReLu is enabled or not*/ + int32_t minLim, maxLim; + + if (enableReLu) + { + minLim = XAI_CNN_CONV_GET_RELU_MIN(param); + maxLim = XAI_CNN_CONV_GET_RELU_MAX(param); + } + else + { + minLim = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MIN : 0); + maxLim = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? 
SHRT_MAX : USHRT_MAX); + } + + int32_t outCh, x, y, ky, numIter, iter; + + numIter = (numInCh * kWidthU); + + xb_vecN_2x32v *restrict phvecIn1; + xb_vecN_2x32v *restrict phvecIn2; + xb_vecN_2x32v *restrict phvecIn3; + xb_vecN_2x32v *restrict phvecIn4; + xb_vecNx16 *restrict pvecCoeff; + xb_vec2Nx8 *restrict pdvecBias; + xb_vec2Nx8 *restrict pdvecAccData; + xb_vecNx16 *restrict pvecOut; + + xb_vecNx48 vecAcc1 = 0, vecAcc2 = 0, vecAcc3 = 0, vecAcc4 = 0, vecBias = 0; + xb_vecN_2x32v hvecIn1, hvecIn2, hvecIn3, hvecIn4; + xb_vecNx16 vecCoeff1, vecCoeff2; + xb_vecNx16 vecOut1, vecOut2, vecOut3, vecOut4; + xb_vec2Nx8 dvecAccLL, dvecAccLH, dvecAccHL, dvecAccHH; + + valign vaIn1, vaIn2, vaIn3, vaIn4, vaBias, vaAcc; + +#ifdef DILATED_VQ_CONV_PARTIAL + xb_vecNx16U vecOutScaleU; + xb_vecNx16U *restrict pvecOutScaleData = (xb_vecNx16U *) (pOutputScaleData); + valign vaScale = IVP_LANX16U_PP(pvecOutScaleData); +#endif + + pdvecBias = (xb_vec2Nx8 *) (pBiasData); + vaBias = IVP_LA2NX8_PP(pdvecBias); + valign vaOut = IVP_ZALIGN(); + + for (outCh = 0; outCh < numOutCh; outCh += XCHAL_IVPN_SIMD_WIDTH) + { + int32_t remOutCh = (numOutCh - outCh); + /* Initially the accumulators with the 48-bit bias values */ + if (inputFlag) // Biases will be loaded only when "inputFlag" is set + { + ACC_INIT_BIAS64_MOD_ONEACC(pdvecBias, vaBias, remOutCh, vecBias); + } + +#ifdef DILATED_VQ_CONV_PARTIAL + IVP_LAVNX16U_XP(vecOutScaleU, vaScale, pvecOutScaleData, 2 * remOutCh); +#endif + + for (y = 0; y < outHeight; y += 2) + { + // Calculating "remY" for integrated tail-handling purpose + int32_t remY = XT_MIN(1, outHeight - y - 1); + for (x = 0; x < outWidth; x += 2) + { + // Calculating "remX" for integrated tail-handling purpose + int32_t remX = XT_MIN(1, outWidth - x - 1); + int16_t *pData1 = (int16_t *) (pInData + (x * strideX * inDataPitch1) + (y * strideY * inDataPitch2)); + int64_t *pAcc = (int64_t *) (pAccData + outCh + (x * accDataPitch1) + (y * accDataPitch2)); + + if (inputFlag) // if 
"inputFlag" is set, then initialize the accumulators with the bias values + { + /* Initializing all the 4 accumulators with bias values before accumulating for every spatial location */ + vecAcc4 = vecAcc3 = vecAcc2 = vecAcc1 = vecBias; + } + else // if "inputFlag" is not-set, then initialize the accumulators with the values stored in the accTile + { + // Loading accumulated values from W = 0, H = 0 spatial location initially + pdvecAccData = (xb_vec2Nx8 *) (pAcc); + vaAcc = IVP_LA2NX8_PP(pdvecAccData); + IVP_LAV2NX8_XP(dvecAccLL, vaAcc, pdvecAccData, (8 * remOutCh)); + IVP_LAV2NX8_XP(dvecAccLH, vaAcc, pdvecAccData, (8 * remOutCh) - (2 * XCHAL_IVPN_SIMD_WIDTH)); + IVP_LAV2NX8_XP(dvecAccHL, vaAcc, pdvecAccData, (8 * remOutCh) - (4 * XCHAL_IVPN_SIMD_WIDTH)); + IVP_LAV2NX8_XP(dvecAccHH, vaAcc, pdvecAccData, (8 * remOutCh) - (6 * XCHAL_IVPN_SIMD_WIDTH)); + vecAcc1 = IVP_CVT48UN_2X64L(dvecAccLH, dvecAccLL); + IVP_CVT48UN_2X64H(vecAcc1, dvecAccHH, dvecAccHL); + + // Loading accumulated values form W = 1, H = 0 spatial location initially + pdvecAccData = (xb_vec2Nx8 *) (pAcc + (remX * accDataPitch1)); + vaAcc = IVP_LA2NX8_PP(pdvecAccData); + IVP_LAV2NX8_XP(dvecAccLL, vaAcc, pdvecAccData, (8 * remOutCh)); + IVP_LAV2NX8_XP(dvecAccLH, vaAcc, pdvecAccData, (8 * remOutCh) - (2 * XCHAL_IVPN_SIMD_WIDTH)); + IVP_LAV2NX8_XP(dvecAccHL, vaAcc, pdvecAccData, (8 * remOutCh) - (4 * XCHAL_IVPN_SIMD_WIDTH)); + IVP_LAV2NX8_XP(dvecAccHH, vaAcc, pdvecAccData, (8 * remOutCh) - (6 * XCHAL_IVPN_SIMD_WIDTH)); + vecAcc2 = IVP_CVT48UN_2X64L(dvecAccLH, dvecAccLL); + IVP_CVT48UN_2X64H(vecAcc2, dvecAccHH, dvecAccHL); + + // Loading accumulated values form W = 0, H = 1 spatial location initially + pdvecAccData = (xb_vec2Nx8 *) (pAcc + (remY * accDataPitch2)); + vaAcc = IVP_LA2NX8_PP(pdvecAccData); + IVP_LAV2NX8_XP(dvecAccLL, vaAcc, pdvecAccData, (8 * remOutCh)); + IVP_LAV2NX8_XP(dvecAccLH, vaAcc, pdvecAccData, (8 * remOutCh) - (2 * XCHAL_IVPN_SIMD_WIDTH)); + IVP_LAV2NX8_XP(dvecAccHL, vaAcc, 
pdvecAccData, (8 * remOutCh) - (4 * XCHAL_IVPN_SIMD_WIDTH)); + IVP_LAV2NX8_XP(dvecAccHH, vaAcc, pdvecAccData, (8 * remOutCh) - (6 * XCHAL_IVPN_SIMD_WIDTH)); + vecAcc3 = IVP_CVT48UN_2X64L(dvecAccLH, dvecAccLL); + IVP_CVT48UN_2X64H(vecAcc3, dvecAccHH, dvecAccHL); + + // Loading accumulated values form W = 1, H = 1 spatial location initially + pdvecAccData = (xb_vec2Nx8 *) (pAcc + (remX * accDataPitch1) + (remY * accDataPitch2)); + vaAcc = IVP_LA2NX8_PP(pdvecAccData); + IVP_LAV2NX8_XP(dvecAccLL, vaAcc, pdvecAccData, (8 * remOutCh)); + IVP_LAV2NX8_XP(dvecAccLH, vaAcc, pdvecAccData, (8 * remOutCh) - (2 * XCHAL_IVPN_SIMD_WIDTH)); + IVP_LAV2NX8_XP(dvecAccHL, vaAcc, pdvecAccData, (8 * remOutCh) - (4 * XCHAL_IVPN_SIMD_WIDTH)); + IVP_LAV2NX8_XP(dvecAccHH, vaAcc, pdvecAccData, (8 * remOutCh) - (6 * XCHAL_IVPN_SIMD_WIDTH)); + vecAcc4 = IVP_CVT48UN_2X64L(dvecAccLH, dvecAccLL); + IVP_CVT48UN_2X64H(vecAcc4, dvecAccHH, dvecAccHL); + } + + for (ky = 0; ky < kHeightU; ky++) + { + // Adjusting the coefficient data pointer + pvecCoeff = (xb_vecNx16 *) (pCoeffData + outCh + (ky * coeffDataPitch3)); + int16_t *pData2 = (int16_t *) (pData1 + (ky * inDataPitch2)); + // phvecIn1 initially points to W = 0, H = 0 spatial location + phvecIn1 = (xb_vecN_2x32v *) (pData2); + // phvecIn2 initially points to W = 1, H = 0 spatial location + phvecIn2 = (xb_vecN_2x32v *) (pData2 + (remX * strideX * inDataPitch1)); + // phvecIn3 initially points to W = 0, H = 1 spatial location + phvecIn3 = (xb_vecN_2x32v *) (pData2 + (remY * strideY * inDataPitch2)); + // phvecIn4 initially points to W = 1, H = 1 spatial location + phvecIn4 = (xb_vecN_2x32v *) (pData2 + (remX * strideX * inDataPitch1) + (remY * strideY * inDataPitch2)); + + vaIn1 = IVP_LAN_2X32_PP(phvecIn1); + vaIn2 = IVP_LAN_2X32_PP(phvecIn2); + vaIn3 = IVP_LAN_2X32_PP(phvecIn3); + vaIn4 = IVP_LAN_2X32_PP(phvecIn4); + + for (iter = 0; iter < (numIter - 1); iter += 2) + { + // hvecIn1 contains 4 bytes or 2 elements along D from W = 0, H = 0 spatial 
location initially + IVP_LAVN_2X32_XP(hvecIn1, vaIn1, phvecIn1, 4); + // hvecIn2 contains 4 bytes or 2 elements along D from W = 1, H = 0 spatial location initially + IVP_LAVN_2X32_XP(hvecIn2, vaIn2, phvecIn2, 4); + // hvecIn2 contains 4 bytes or 2 elements along D from W = 0, H = 1 spatial location initially + IVP_LAVN_2X32_XP(hvecIn3, vaIn3, phvecIn3, 4); + // hvecIn2 contains 4 bytes or 2 elements along D from W = 1, H = 1 spatial location initially + IVP_LAVN_2X32_XP(hvecIn4, vaIn4, phvecIn4, 4); + + // vecCoeff1 contains 64 bytes or 32 elements along output depth (N) from initial input depth (D = 0) initially + IVP_L2UNX16_XP(vecCoeff1, pvecCoeff, (2 * coeffDataPitch1)); + // vecCoeff2 contains 64 bytes or 32 elements along output depth (N) from next input depth (D = 1) initially + IVP_L2UNX16_XP(vecCoeff2, pvecCoeff, (2 * coeffDataPitch1)); + + // vecAcc1 contains 64 bytes or 32 elements along output depth (N) from W = 0, H = 0 spatial location initially + IVP_MULPAN16XR16(vecAcc1, vecCoeff2, vecCoeff1, IVP_EXTRN_2X32(hvecIn1, 0)); + // vecAcc2 contains 64 bytes or 32 elements along output depth (N) from W = 1, H = 0 spatial location initially + IVP_MULPAN16XR16(vecAcc2, vecCoeff2, vecCoeff1, IVP_EXTRN_2X32(hvecIn2, 0)); + // vecAcc3 contains 64 bytes or 32 elements along output depth (N) from W = 0, H = 1 spatial location initially + IVP_MULPAN16XR16(vecAcc3, vecCoeff2, vecCoeff1, IVP_EXTRN_2X32(hvecIn3, 0)); + // vecAcc4 contains 64 bytes or 32 elements along output depth (N) from W = 1, H = 1 spatial location initially + IVP_MULPAN16XR16(vecAcc4, vecCoeff2, vecCoeff1, IVP_EXTRN_2X32(hvecIn4, 0)); + } // End of for (iter = 0; iter < (numIter - 1); iter += 2) + if (iter < numIter) + { + // hvecIn1 contains 2 bytes or 1 element along D from W = 0, H = 0 spatial location initially + IVP_LAVN_2X32_XP(hvecIn1, vaIn1, phvecIn1, 2); + // hvecIn2 contains 2 bytes or 1 element along D from W = 1, H = 0 spatial location initially + IVP_LAVN_2X32_XP(hvecIn2, vaIn2, 
phvecIn2, 2); + // hvecIn2 contains 2 bytes or 1 element along D from W = 0, H = 1 spatial location initially + IVP_LAVN_2X32_XP(hvecIn3, vaIn3, phvecIn3, 2); + // hvecIn2 contains 2 bytes or 1 element along D from W = 1, H = 1 spatial location initially + IVP_LAVN_2X32_XP(hvecIn4, vaIn4, phvecIn4, 2); + + // vecCoeff1 contains 64 bytes or 32 elements along output depth (N) from initial input depth (D = 0) + IVP_L2UNX16_XP(vecCoeff1, pvecCoeff, (2 * coeffDataPitch1)); + + // vecAcc1 contains 64 bytes or 32 elements along output depth (N) from W = 0, H = 0 spatial location initially + IVP_MULPAN16XR16(vecAcc1, 0, vecCoeff1, IVP_EXTRN_2X32(hvecIn1, 0)); + // vecAcc2 contains 64 bytes or 32 elements along output depth (N) from W = 1, H = 0 spatial location initially + IVP_MULPAN16XR16(vecAcc2, 0, vecCoeff1, IVP_EXTRN_2X32(hvecIn2, 0)); + // vecAcc3 contains 64 bytes or 32 elements along output depth (N) from W = 0, H = 1 spatial location initially + IVP_MULPAN16XR16(vecAcc3, 0, vecCoeff1, IVP_EXTRN_2X32(hvecIn3, 0)); + // vecAcc4 contains 64 bytes or 32 elements along output depth (N) from W = 1, H = 1 spatial location initially + IVP_MULPAN16XR16(vecAcc4, 0, vecCoeff1, IVP_EXTRN_2X32(hvecIn4, 0)); + } + } // End of for (ky = 0; ky < kHeightU; ky++) + + if (outputFlag) // if "outputFlag" is set, apply pack, scale, shift, clamp logic on accumulated values and store the output + { + /* Pack, scale, shift, clamp logic to follow */ +#ifdef DILATED_VQ_CONV_PARTIAL + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ_S16(vecOut1, vecAcc1, packShiftAccU, vecOutScaleU, outShiftU, minLim, maxLim); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ_S16(vecOut2, vecAcc2, packShiftAccU, vecOutScaleU, outShiftU, minLim, maxLim); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ_S16(vecOut3, vecAcc3, packShiftAccU, vecOutScaleU, outShiftU, minLim, maxLim); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ_S16(vecOut4, vecAcc4, packShiftAccU, vecOutScaleU, outShiftU, minLim, maxLim); +#else + PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut1, vecAcc1, 
packShiftAccU, outScale, outShiftU, minLim, maxLim); + PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut2, vecAcc2, packShiftAccU, outScale, outShiftU, minLim, maxLim); + PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut3, vecAcc3, packShiftAccU, outScale, outShiftU, minLim, maxLim); + PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut4, vecAcc4, packShiftAccU, outScale, outShiftU, minLim, maxLim); +#endif + // Storing 64 bytes or 32 elements along output depth (N) at W = 0, H = 0 initially + pvecOut = (xb_vecNx16 *) (pOutData + outCh + (x * outDataPitch1) + (y * outDataPitch2)); + IVP_SAVNX16_XP(vecOut1, vaOut, pvecOut, (2 * remOutCh)); + IVP_SAPOSNX16_FP(vaOut, pvecOut); + + // Storing 64 bytes or 32 elements along output depth (N) at W = 1, H = 0 initially + pvecOut = (xb_vecNx16 *) (pOutData + outCh + ((x + remX) * outDataPitch1) + (y * outDataPitch2)); + IVP_SAVNX16_XP(vecOut2, vaOut, pvecOut, (2 * remOutCh) * remX); + IVP_SAPOSNX16_FP(vaOut, pvecOut); + + // Storing 64 bytes or 32 elements along output depth (N) at W = 0, H = 1 initially + pvecOut = (xb_vecNx16 *) (pOutData + outCh + (x * outDataPitch1) + ((y + remY) * outDataPitch2)); + IVP_SAVNX16_XP(vecOut3, vaOut, pvecOut, (2 * remOutCh) * remY); + IVP_SAPOSNX16_FP(vaOut, pvecOut); + + // Storing 64 bytes or 32 elements along output depth (N) at W = 1, H = 1 initially + pvecOut = (xb_vecNx16 *) (pOutData + outCh + ((x + remX) * outDataPitch1) + ((y + remY) * outDataPitch2)); + IVP_SAVNX16_XP(vecOut4, vaOut, pvecOut, (2 * remOutCh) * remX * remY); + IVP_SAPOSNX16_FP(vaOut, pvecOut); + } + else // if "outputFlag" is not-set, store the accumulated values to the accTile + { + vaAcc = IVP_ZALIGN(); + dvecAccLL = IVP_CVT64SNX48LL(vecAcc1); + dvecAccLH = IVP_CVT64SNX48LH(vecAcc1); + dvecAccHL = IVP_CVT64SNX48HL(vecAcc1); + dvecAccHH = IVP_CVT64SNX48HH(vecAcc1); + // Storing 32 elements at W = 0, H = 0 initially + pdvecAccData = (xb_vec2Nx8 *) (pAcc); + IVP_SAV2NX8_XP(dvecAccLL, vaAcc, pdvecAccData, (8 * remOutCh)); + 
IVP_SAV2NX8_XP(dvecAccLH, vaAcc, pdvecAccData, (8 * remOutCh) - (2 * XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAV2NX8_XP(dvecAccHL, vaAcc, pdvecAccData, (8 * remOutCh) - (4 * XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAV2NX8_XP(dvecAccHH, vaAcc, pdvecAccData, (8 * remOutCh) - (6 * XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaAcc, pdvecAccData); + + dvecAccLL = IVP_CVT64SNX48LL(vecAcc2); + dvecAccLH = IVP_CVT64SNX48LH(vecAcc2); + dvecAccHL = IVP_CVT64SNX48HL(vecAcc2); + dvecAccHH = IVP_CVT64SNX48HH(vecAcc2); + // Storing 32 elements at W = 1, H = 0 initially + pdvecAccData = (xb_vec2Nx8 *) (pAcc + (remX * accDataPitch1)); + IVP_SAV2NX8_XP(dvecAccLL, vaAcc, pdvecAccData, (8 * remOutCh)); + IVP_SAV2NX8_XP(dvecAccLH, vaAcc, pdvecAccData, (8 * remOutCh) - (2 * XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAV2NX8_XP(dvecAccHL, vaAcc, pdvecAccData, (8 * remOutCh) - (4 * XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAV2NX8_XP(dvecAccHH, vaAcc, pdvecAccData, (8 * remOutCh) - (6 * XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaAcc, pdvecAccData); + + dvecAccLL = IVP_CVT64SNX48LL(vecAcc3); + dvecAccLH = IVP_CVT64SNX48LH(vecAcc3); + dvecAccHL = IVP_CVT64SNX48HL(vecAcc3); + dvecAccHH = IVP_CVT64SNX48HH(vecAcc3); + // Storing 32 elements at W = 0, H = 1 initially + pdvecAccData = (xb_vec2Nx8 *) (pAcc + (remY * accDataPitch2)); + IVP_SAV2NX8_XP(dvecAccLL, vaAcc, pdvecAccData, (8 * remOutCh)); + IVP_SAV2NX8_XP(dvecAccLH, vaAcc, pdvecAccData, (8 * remOutCh) - (2 * XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAV2NX8_XP(dvecAccHL, vaAcc, pdvecAccData, (8 * remOutCh) - (4 * XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAV2NX8_XP(dvecAccHH, vaAcc, pdvecAccData, (8 * remOutCh) - (6 * XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaAcc, pdvecAccData); + + dvecAccLL = IVP_CVT64SNX48LL(vecAcc4); + dvecAccLH = IVP_CVT64SNX48LH(vecAcc4); + dvecAccHL = IVP_CVT64SNX48HL(vecAcc4); + dvecAccHH = IVP_CVT64SNX48HH(vecAcc4); + // Storing 32 elements at W = 1, H = 1 initially + pdvecAccData = (xb_vec2Nx8 *) (pAcc + (remX * accDataPitch1) + (remY * accDataPitch2)); + 
IVP_SAV2NX8_XP(dvecAccLL, vaAcc, pdvecAccData, (8 * remOutCh)); + IVP_SAV2NX8_XP(dvecAccLH, vaAcc, pdvecAccData, (8 * remOutCh) - (2 * XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAV2NX8_XP(dvecAccHL, vaAcc, pdvecAccData, (8 * remOutCh) - (4 * XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAV2NX8_XP(dvecAccHH, vaAcc, pdvecAccData, (8 * remOutCh) - (6 * XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaAcc, pdvecAccData); + } + } // End of for (x = 0; x < outWidth; x += 2) + } // End of for (y = 0; y < outHeight; y += 2) + } // End of for (outCh = 0; outCh < numOutCh; outCh += XCHAL_IVPN_SIMD_WIDTH) +} + +/***************** xaiPartialConvolvedVQ3D_S_MxN_S16S16I16_MOD_DWH *****************/ +/****************** xaiPartialConvolved3D_S_MxN_S16S16I16_MOD_DWH ******************/ +/***********************************************************************************/ +/* Description : Optimized implementation for partial 3D convolution */ +/* Inputs : Input Data Tile, Coeff Data Tile, Bias Array, Output Scale Array, */ +/* CNN convolution params structure */ +/* Outputs : XI Error Code */ +/* InOuts : Accumulator Tile, Output Tile */ +/* Assumptions : InData, CoeffData are S16 */ +/* OutData is U16 / S16 */ +/* Input is in DWH and Output is in DWH format */ +/* Coeff is in NDWH format */ +/* Accumulated value will be within 48-bit range */ +/***********************************************************************************/ +#ifdef DILATED_VQ_CONV_PARTIAL +XAI_ERR_TYPE xaiPartialConvolvedVQ3D_S_MxN_S16S16I16_MOD_DWH(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D accTile, + xai_pTile3D outTile, + const xai_cnn_conv_params *param) +#else +XAI_ERR_TYPE xaiPartialConvolved3D_S_MxN_S16S16I16_MOD_DWH(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D accTile, + xai_pTile3D outTile, + const xai_cnn_conv_params *param) +#endif +{ + XAI_ERROR_CHECKS() + { + 
XAI_CHECK_TILE3D_S16(inTile); + XAI_CHECK_TILE4D_S16(coeffTile); + XAI_CHECK_POINTER(biasArray); + XAI_CHECK_POINTER(param); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile); + XAI_CHECK_TILE4D_IN_DRAM_BOUNDARY(coeffTile); + XAI_CHECK_ERROR((XAI_TILE4D_GET_DIM3(coeffTile) <= 64) && (XAI_TILE4D_GET_DIM4(coeffTile) <= 64), XAI_ERR_KSIZE, \ + "\nKernel height = %d and width = %d\nKernel width and height should be less than or equal to 64", \ + XAI_TILE4D_GET_DIM4(coeffTile), XAI_TILE4D_GET_DIM3(coeffTile)); + XAI_CHECK_EDGES_MOD_DWH(inTile, coeffTile, param); + XAI_CHECK_ERROR(((XAI_CNN_CONV_GET_STRIDEX(param) > 0) && (XAI_CNN_CONV_GET_STRIDEY(param) > 0)) && \ + ((XAI_CNN_CONV_GET_STRIDEX(param) <= 64) && (XAI_CNN_CONV_GET_STRIDEY(param) <= 64)), XAI_ERR_BADARG, \ + "\nStrideX = %hhu, StrideY = %hhu\nStride along width and height should be greater than 0 and less than or equal to 64", \ + XAI_CNN_CONV_GET_STRIDEX(param), XAI_CNN_CONV_GET_STRIDEY(param)); + XAI_CHECK_ERROR((XAI_CNN_CONV_GET_DILATION(param) == 1) || \ + ((XAI_CNN_CONV_GET_DILATION(param) >= 1) && \ + (XAI_CNN_CONV_GET_STRIDEX(param) == 1) && (XAI_CNN_CONV_GET_STRIDEY(param) == 1)), XAI_ERR_BADARG, \ + "\nDilation = %hhu\nDilation should be 1. 
It can be greater than 1 only when stride is equal to 1", \ + XAI_CNN_CONV_GET_DILATION(param)); + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_DILATIONX(param) == XAI_CNN_CONV_GET_DILATIONY(param), \ + XAI_ERR_BADARG, "\nDilation along width = %hhu and height = %hhu\nDilation along width and height should be equal", \ + XAI_CNN_CONV_GET_DILATIONX(param), XAI_CNN_CONV_GET_DILATIONY(param)); + XAI_CHECK_TILE3D_DATA_ORDER(inTile, XAI_DWH); + XAI_CHECK_TILE4D_DATA_ORDER(coeffTile, XAI_NDWH); + XAI_CHECK_TILE3D_DATA_ORDER(outTile, XAI_DWH); + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_ACCUM_SHIFT(param) < 32, \ + XAI_ERR_NORM, "\nThe accumulator shift = %hhu, value should be less than 32", \ + XAI_CNN_CONV_GET_ACCUM_SHIFT(param)); + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_OUTPUT_SHIFT(param) < 32, \ + XAI_ERR_NORM, "\nThe output shift = %hhu, value should be less than 32", \ + XAI_CNN_CONV_GET_OUTPUT_SHIFT(param)); + XAI_CHECK_CONV_RELU_LIMITS_IX(param, outTile); +#ifdef DILATED_VQ_CONV_PARTIAL + XAI_CHECK_ARRAY_U16(outputScaleArray); + XAI_CHECK_ERROR(XAI_ARRAY_GET_WIDTH(outputScaleArray) >= XAI_TILE4D_GET_DIM1(coeffTile), XAI_ERR_DATASIZE, \ + "\nWidth of Output Scale Array = %d, Number of Kernels = %d\nWidth of Output Scale Array should be greater than or equal to Number of Kernels", \ + XAI_ARRAY_GET_WIDTH(outputScaleArray), XAI_TILE4D_GET_DIM1(coeffTile)); +#endif + XAI_CHECK_CONSISTENCY_MOD_DWH(inTile, coeffTile, biasArray, outTile, param); + + if (XAI_CNN_CONV_GET_FLAG_INPUT(param)) + { + XAI_CHECK_ARRAY_S64(biasArray); + } + if (!(XAI_CNN_CONV_GET_FLAG_INPUT(param) && XAI_CNN_CONV_GET_FLAG_OUTPUT(param))) + { + XAI_CHECK_TILE3D_S64(accTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(accTile); + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, accTile); + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(coeffTile, accTile); + XAI_CHECK_TILE3D_DATA_ORDER(accTile, XAI_DWH); + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM1(accTile) >= XAI_TILE3D_GET_DIM1(outTile)), XAI_ERR_DATASIZE, \ + "\ndim1Size of accTile = %d, should be 
greater than or equal to %d(dim1Size of outTile)", \ + XAI_TILE3D_GET_DIM1(accTile), XAI_TILE3D_GET_DIM1(outTile)); + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2(accTile) >= XAI_TILE3D_GET_DIM2(outTile)), XAI_ERR_DATASIZE, \ + "\ndim2Size of accTile = %d, should be greater than or equal to %d(dim2Size of outTile)", \ + XAI_TILE3D_GET_DIM2(accTile), XAI_TILE3D_GET_DIM2(outTile)); + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM3(accTile) >= XAI_TILE3D_GET_DIM3(outTile)), XAI_ERR_DATASIZE, \ + "\ndim3Size of accTile = %d, should be greater than or equal to %d(dim3Size of outTile)", \ + XAI_TILE3D_GET_DIM3(accTile), XAI_TILE3D_GET_DIM3(outTile)); + } + if (XAI_CNN_CONV_GET_FLAG_OUTPUT(param)) + { + XAI_CHECK_ERROR(XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) || XAI_TILE3D_CHECK_TYPE(outTile, XAI_U16), \ + XAI_ERR_DATATYPE, "\nOutTile data type need to be either XAI_S16 or XAI_U16"); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile); + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTile); + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(coeffTile, outTile); + if (!(XAI_CNN_CONV_GET_FLAG_INPUT(param))) + { + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(accTile, outTile); + } + } + } + + const uint8_t dilationU = XAI_CNN_CONV_GET_DILATION(param); + + /* Dispatch to the contiguous-depth variant when the input depth is unpadded (dim1 == dim1 pitch) and dilation is 1 */ + if ((XAI_TILE3D_GET_DIM1(inTile) == XAI_TILE3D_GET_DIM1_PITCH(inTile)) && (dilationU == 1)) + { +#ifdef DILATED_VQ_CONV_PARTIAL + partialConvolvedVQ3D_S_MxN_S16S16I16_MOD_DWH_contiguous_depth(inTile, coeffTile, biasArray, outputScaleArray, \ + accTile, outTile, param); +#else + partialConvolved3D_S_MxN_S16S16I16_MOD_DWH_contiguous_depth(inTile, coeffTile, biasArray, \ + accTile, outTile, param); +#endif + + return(XAI_ERROR_STATUS()); + } + + /* Getting parameters from the tile structures */ + const int32_t numInCh = XAI_TILE3D_GET_DIM1(inTile); + const int32_t numOutCh = XAI_TILE3D_GET_DIM1(outTile); + const int32_t outWidth = XAI_TILE3D_GET_DIM2(outTile); + const 
int32_t outHeight = 
XAI_TILE3D_GET_DIM3(outTile); + const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile); + const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile); + const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile); + const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile); + const int32_t coeffDataPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTile); + const int32_t coeffDataPitch2 = XAI_TILE4D_GET_DIM2_PITCH(coeffTile); + const int32_t coeffDataPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile); + const int32_t kWidthU = XAI_TILE4D_GET_DIM3(coeffTile); + const int32_t kHeightU = XAI_TILE4D_GET_DIM4(coeffTile); + + /* Convolution params */ + const uint8_t packShiftAccU = XAI_CNN_CONV_GET_ACCUM_SHIFT(param); + const uint8_t outShiftU = XAI_CNN_CONV_GET_OUTPUT_SHIFT(param); + const uint8_t enableReLu = XAI_CNN_CONV_GET_FLAG_RELU(param); + const uint8_t strideX = XAI_CNN_CONV_GET_STRIDEX(param); + const uint8_t strideY = XAI_CNN_CONV_GET_STRIDEY(param); + const uint8_t leftEdgeFlag = XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param); + const uint8_t topEdgeFlag = XAI_CNN_CONV_GET_FLAG_TOPEDGE(param); + const uint8_t inputFlag = XAI_CNN_CONV_GET_FLAG_INPUT(param); + const uint8_t outputFlag = XAI_CNN_CONV_GET_FLAG_OUTPUT(param); + +#ifdef DILATED_VQ_CONV_PARTIAL + const uint16_t *pOutputScaleData = (uint16_t *) XAI_ARRAY_GET_DATA_PTR(outputScaleArray); +#else + const uint16_t outScale = XAI_CNN_CONV_GET_OUTPUT_SCALE(param); +#endif + + /* Data Pointers of input, coefficient, biasData */ + const int16_t *pInData = (int16_t *) XAI_TILE3D_GET_DATA_PTR(inTile); + const int16_t *pCoeffData = (int16_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile); + const int64_t *pBiasData = (int64_t *) XAI_ARRAY_GET_DATA_PTR(biasArray); + + /* Data Pointers of output and scratch buffer data */ + int16_t *pOutData = (int16_t *) XAI_TILE3D_GET_DATA_PTR(outTile); + int64_t *pAccData = NULL; + + int32_t accDataPitch1 = 0; + int32_t accDataPitch2 = 0; + + if (!(XAI_CNN_CONV_GET_FLAG_INPUT(param) && 
XAI_CNN_CONV_GET_FLAG_OUTPUT(param))) + { + pAccData = (int64_t *) XAI_TILE3D_GET_DATA_PTR(accTile); + accDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(accTile); + accDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(accTile); + } + + int32_t dilatedKWidthU = dilationU * (kWidthU - 1) + 1; + int32_t dilatedKHeightU = dilationU * (kHeightU - 1) + 1; + int32_t leftEdge, topEdge; + + if ((dilatedKWidthU % 2) != 0) + { + leftEdge = dilatedKWidthU / 2; + } + else + { + leftEdge = leftEdgeFlag ? (dilatedKWidthU / 2) : ((dilatedKWidthU / 2) - 1); + } + + if ((dilatedKHeightU % 2) != 0) + { + topEdge = dilatedKHeightU / 2; + } + else + { + topEdge = topEdgeFlag ? (dilatedKHeightU / 2) : ((dilatedKHeightU / 2) - 1); + } + + /* Move pInData back by topEdge rows and leftEdge columns to the start of the edge data; this assumes the input is already padded. */ + pInData = &pInData[-(int32_t) ((topEdge) * inDataPitch2 + (leftEdge) * inDataPitch1)]; + + /* Set the output clamp limits: the ReLU bounds when ReLU is enabled, otherwise the full range of the output data type */ + int32_t minLim, maxLim; + + if (enableReLu) + { + minLim = XAI_CNN_CONV_GET_RELU_MIN(param); + maxLim = XAI_CNN_CONV_GET_RELU_MAX(param); + } + else + { + minLim = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? SHRT_MIN : 0); + maxLim = (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16) ? 
SHRT_MAX : USHRT_MAX); + } + + int32_t inCh, outCh, x, y, k; + + xb_vecN_2x32v *restrict phvecIn1; + xb_vecN_2x32v *restrict phvecIn2; + xb_vecN_2x32v *restrict phvecIn3; + xb_vecN_2x32v *restrict phvecIn4; + xb_vecNx16 *restrict pvecCoeff; + xb_vec2Nx8 *restrict pdvecBias; + xb_vec2Nx8 *restrict pdvecAccData; + xb_vecNx16 *restrict pvecOut; + + xb_vecNx48 vecAcc1 = 0, vecAcc2 = 0, vecAcc3 = 0, vecAcc4 = 0, vecBias = 0; + xb_vecN_2x32v hvecIn1, hvecIn2, hvecIn3, hvecIn4; + xb_vecNx16 vecCoeff1, vecCoeff2; + xb_vecNx16 vecOut1, vecOut2, vecOut3, vecOut4; + xb_vec2Nx8 dvecAccLL, dvecAccLH, dvecAccHL, dvecAccHH; + + valign vaIn1, vaIn2, vaIn3, vaIn4, vaBias, vaAcc; + +#ifdef DILATED_VQ_CONV_PARTIAL + xb_vecNx16U vecOutScaleU; + xb_vecNx16U *restrict pvecOutScaleData = (xb_vecNx16U *) (pOutputScaleData); + valign vaScale = IVP_LANX16U_PP(pvecOutScaleData); +#endif + + pdvecBias = (xb_vec2Nx8 *) (pBiasData); + vaBias = IVP_LA2NX8_PP(pdvecBias); + valign vaOut = IVP_ZALIGN(); + + for (outCh = 0; outCh < numOutCh; outCh += XCHAL_IVPN_SIMD_WIDTH) + { + int32_t remOutCh = (numOutCh - outCh); + /* Initialize the bias accumulator with the 48-bit bias values */ + if (inputFlag) // Biases will be loaded only when "inputFlag" is set + { + ACC_INIT_BIAS64_MOD_ONEACC(pdvecBias, vaBias, remOutCh, vecBias); + } + +#ifdef DILATED_VQ_CONV_PARTIAL + IVP_LAVNX16U_XP(vecOutScaleU, vaScale, pvecOutScaleData, 2 * remOutCh); +#endif + + for (y = 0; y < outHeight; y += 2) + { + // Calculating "remY" for integrated tail-handling purpose + int32_t remY = XT_MIN(1, outHeight - y - 1); + for (x = 0; x < outWidth; x += 2) + { + // Calculating "remX" for integrated tail-handling purpose + int32_t remX = XT_MIN(1, outWidth - x - 1); + int16_t *pData1 = (int16_t *) (pInData + (x * strideX * inDataPitch1) + (y * strideY * inDataPitch2)); + int64_t *pAcc = (int64_t *) (pAccData + outCh + (x * accDataPitch1) + (y * accDataPitch2)); + + if (inputFlag) // if "inputFlag" is set, then initialize the 
accumulators with the bias values + { + /* Initializing all the 4 accumulators with bias values before accumulating for every spatial location */ + vecAcc4 = vecAcc3 = vecAcc2 = vecAcc1 = vecBias; + } + else // if "inputFlag" is not-set, then initialize the accumulators with the values stored in the accTile + { + // Loading accumulated values from W = 0, H = 0 spatial location initially + pdvecAccData = (xb_vec2Nx8 *) (pAcc); + vaAcc = IVP_LA2NX8_PP(pdvecAccData); + IVP_LAV2NX8_XP(dvecAccLL, vaAcc, pdvecAccData, (8 * remOutCh)); + IVP_LAV2NX8_XP(dvecAccLH, vaAcc, pdvecAccData, (8 * remOutCh) - (2 * XCHAL_IVPN_SIMD_WIDTH)); + IVP_LAV2NX8_XP(dvecAccHL, vaAcc, pdvecAccData, (8 * remOutCh) - (4 * XCHAL_IVPN_SIMD_WIDTH)); + IVP_LAV2NX8_XP(dvecAccHH, vaAcc, pdvecAccData, (8 * remOutCh) - (6 * XCHAL_IVPN_SIMD_WIDTH)); + vecAcc1 = IVP_CVT48UN_2X64L(dvecAccLH, dvecAccLL); + IVP_CVT48UN_2X64H(vecAcc1, dvecAccHH, dvecAccHL); + + // Loading accumulated values from W = 1, H = 0 spatial location initially + pdvecAccData = (xb_vec2Nx8 *) (pAcc + (remX * accDataPitch1)); + vaAcc = IVP_LA2NX8_PP(pdvecAccData); + IVP_LAV2NX8_XP(dvecAccLL, vaAcc, pdvecAccData, (8 * remOutCh)); + IVP_LAV2NX8_XP(dvecAccLH, vaAcc, pdvecAccData, (8 * remOutCh) - (2 * XCHAL_IVPN_SIMD_WIDTH)); + IVP_LAV2NX8_XP(dvecAccHL, vaAcc, pdvecAccData, (8 * remOutCh) - (4 * XCHAL_IVPN_SIMD_WIDTH)); + IVP_LAV2NX8_XP(dvecAccHH, vaAcc, pdvecAccData, (8 * remOutCh) - (6 * XCHAL_IVPN_SIMD_WIDTH)); + vecAcc2 = IVP_CVT48UN_2X64L(dvecAccLH, dvecAccLL); + IVP_CVT48UN_2X64H(vecAcc2, dvecAccHH, dvecAccHL); + + // Loading accumulated values from W = 0, H = 1 spatial location initially + pdvecAccData = (xb_vec2Nx8 *) (pAcc + (remY * accDataPitch2)); + vaAcc = IVP_LA2NX8_PP(pdvecAccData); + IVP_LAV2NX8_XP(dvecAccLL, vaAcc, pdvecAccData, (8 * remOutCh)); + IVP_LAV2NX8_XP(dvecAccLH, vaAcc, pdvecAccData, (8 * remOutCh) - (2 * XCHAL_IVPN_SIMD_WIDTH)); + IVP_LAV2NX8_XP(dvecAccHL, vaAcc, pdvecAccData, (8 * remOutCh) - (4 * 
XCHAL_IVPN_SIMD_WIDTH)); + IVP_LAV2NX8_XP(dvecAccHH, vaAcc, pdvecAccData, (8 * remOutCh) - (6 * XCHAL_IVPN_SIMD_WIDTH)); + vecAcc3 = IVP_CVT48UN_2X64L(dvecAccLH, dvecAccLL); + IVP_CVT48UN_2X64H(vecAcc3, dvecAccHH, dvecAccHL); + + // Loading accumulated values from W = 1, H = 1 spatial location initially + pdvecAccData = (xb_vec2Nx8 *) (pAcc + (remX * accDataPitch1) + (remY * accDataPitch2)); + vaAcc = IVP_LA2NX8_PP(pdvecAccData); + IVP_LAV2NX8_XP(dvecAccLL, vaAcc, pdvecAccData, (8 * remOutCh)); + IVP_LAV2NX8_XP(dvecAccLH, vaAcc, pdvecAccData, (8 * remOutCh) - (2 * XCHAL_IVPN_SIMD_WIDTH)); + IVP_LAV2NX8_XP(dvecAccHL, vaAcc, pdvecAccData, (8 * remOutCh) - (4 * XCHAL_IVPN_SIMD_WIDTH)); + IVP_LAV2NX8_XP(dvecAccHH, vaAcc, pdvecAccData, (8 * remOutCh) - (6 * XCHAL_IVPN_SIMD_WIDTH)); + vecAcc4 = IVP_CVT48UN_2X64L(dvecAccLH, dvecAccLL); + IVP_CVT48UN_2X64H(vecAcc4, dvecAccHH, dvecAccHL); + } + + for (k = 0; k < kWidthU * kHeightU; k++) + { + // Adjusting the coefficient data pointer + pvecCoeff = (xb_vecNx16 *) (pCoeffData + outCh + ((k % kWidthU) * coeffDataPitch2) + ((k / kWidthU) * coeffDataPitch3)); + int16_t *pData2 = (int16_t *) (pData1 + (((k % kWidthU) * dilationU) * inDataPitch1) + (((k / kWidthU) * dilationU) * inDataPitch2)); + // phvecIn1 initially points to W = 0, H = 0 spatial location + phvecIn1 = (xb_vecN_2x32v *) (pData2); + // phvecIn2 initially points to W = 1, H = 0 spatial location + phvecIn2 = (xb_vecN_2x32v *) (pData2 + (remX * strideX * inDataPitch1)); + // phvecIn3 initially points to W = 0, H = 1 spatial location + phvecIn3 = (xb_vecN_2x32v *) (pData2 + (remY * strideY * inDataPitch2)); + // phvecIn4 initially points to W = 1, H = 1 spatial location + phvecIn4 = (xb_vecN_2x32v *) (pData2 + (remX * strideX * inDataPitch1) + (remY * strideY * inDataPitch2)); + + vaIn1 = IVP_LAN_2X32_PP(phvecIn1); + vaIn2 = IVP_LAN_2X32_PP(phvecIn2); + vaIn3 = IVP_LAN_2X32_PP(phvecIn3); + vaIn4 = IVP_LAN_2X32_PP(phvecIn4); + + for (inCh = 0; inCh < (numInCh - 1); 
inCh += 2) + { + // hvecIn1 contains 4 bytes or 2 elements along D from W = 0, H = 0 spatial location initially + IVP_LAVN_2X32_XP(hvecIn1, vaIn1, phvecIn1, 4); + // hvecIn2 contains 4 bytes or 2 elements along D from W = 1, H = 0 spatial location initially + IVP_LAVN_2X32_XP(hvecIn2, vaIn2, phvecIn2, 4); + // hvecIn3 contains 4 bytes or 2 elements along D from W = 0, H = 1 spatial location initially + IVP_LAVN_2X32_XP(hvecIn3, vaIn3, phvecIn3, 4); + // hvecIn4 contains 4 bytes or 2 elements along D from W = 1, H = 1 spatial location initially + IVP_LAVN_2X32_XP(hvecIn4, vaIn4, phvecIn4, 4); + + // vecCoeff1 contains 64 bytes or 32 elements along output depth (N) from initial input depth (D = 0) initially + IVP_L2UNX16_XP(vecCoeff1, pvecCoeff, (2 * coeffDataPitch1)); + // vecCoeff2 contains 64 bytes or 32 elements along output depth (N) from next input depth (D = 1) initially + IVP_L2UNX16_XP(vecCoeff2, pvecCoeff, (2 * coeffDataPitch1)); + + // vecAcc1 contains 64 bytes or 32 elements along output depth (N) from W = 0, H = 0 spatial location initially + IVP_MULPAN16XR16(vecAcc1, vecCoeff2, vecCoeff1, IVP_EXTRN_2X32(hvecIn1, 0)); + // vecAcc2 contains 64 bytes or 32 elements along output depth (N) from W = 1, H = 0 spatial location initially + IVP_MULPAN16XR16(vecAcc2, vecCoeff2, vecCoeff1, IVP_EXTRN_2X32(hvecIn2, 0)); + // vecAcc3 contains 64 bytes or 32 elements along output depth (N) from W = 0, H = 1 spatial location initially + IVP_MULPAN16XR16(vecAcc3, vecCoeff2, vecCoeff1, IVP_EXTRN_2X32(hvecIn3, 0)); + // vecAcc4 contains 64 bytes or 32 elements along output depth (N) from W = 1, H = 1 spatial location initially + IVP_MULPAN16XR16(vecAcc4, vecCoeff2, vecCoeff1, IVP_EXTRN_2X32(hvecIn4, 0)); + } // End of for (inCh = 0; inCh < (numInCh - 1); inCh += 2) + + if (inCh < numInCh) + { + // hvecIn1 contains 2 bytes or 1 element along D from W = 0, H = 0 spatial location initially + IVP_LAVN_2X32_XP(hvecIn1, vaIn1, phvecIn1, 2); + // hvecIn2 contains 2 bytes or 1 element 
along D from W = 1, H = 0 spatial location initially + IVP_LAVN_2X32_XP(hvecIn2, vaIn2, phvecIn2, 2); + // hvecIn3 contains 2 bytes or 1 element along D from W = 0, H = 1 spatial location initially + IVP_LAVN_2X32_XP(hvecIn3, vaIn3, phvecIn3, 2); + // hvecIn4 contains 2 bytes or 1 element along D from W = 1, H = 1 spatial location initially + IVP_LAVN_2X32_XP(hvecIn4, vaIn4, phvecIn4, 2); + + // vecCoeff1 contains 64 bytes or 32 elements along output depth (N) from initial input depth (D = 0) + IVP_L2UNX16_XP(vecCoeff1, pvecCoeff, (2 * coeffDataPitch1)); + + // vecAcc1 contains 64 bytes or 32 elements along output depth (N) from W = 0, H = 0 spatial location initially + IVP_MULPAN16XR16(vecAcc1, 0, vecCoeff1, IVP_EXTRN_2X32(hvecIn1, 0)); + // vecAcc2 contains 64 bytes or 32 elements along output depth (N) from W = 1, H = 0 spatial location initially + IVP_MULPAN16XR16(vecAcc2, 0, vecCoeff1, IVP_EXTRN_2X32(hvecIn2, 0)); + // vecAcc3 contains 64 bytes or 32 elements along output depth (N) from W = 0, H = 1 spatial location initially + IVP_MULPAN16XR16(vecAcc3, 0, vecCoeff1, IVP_EXTRN_2X32(hvecIn3, 0)); + // vecAcc4 contains 64 bytes or 32 elements along output depth (N) from W = 1, H = 1 spatial location initially + IVP_MULPAN16XR16(vecAcc4, 0, vecCoeff1, IVP_EXTRN_2X32(hvecIn4, 0)); + } + } // End of for (k = 0; k < kWidthU * kHeightU; k++) + + if (outputFlag) // if "outputFlag" is set, apply pack, scale, shift, clamp logic on accumulated values and store the output + { + /* Pack, scale, shift, clamp logic to follow */ +#ifdef DILATED_VQ_CONV_PARTIAL + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ_S16(vecOut1, vecAcc1, packShiftAccU, vecOutScaleU, outShiftU, minLim, maxLim); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ_S16(vecOut2, vecAcc2, packShiftAccU, vecOutScaleU, outShiftU, minLim, maxLim); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ_S16(vecOut3, vecAcc3, packShiftAccU, vecOutScaleU, outShiftU, minLim, maxLim); + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ_S16(vecOut4, vecAcc4, packShiftAccU, 
vecOutScaleU, outShiftU, minLim, maxLim); +#else + PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut1, vecAcc1, packShiftAccU, outScale, outShiftU, minLim, maxLim); + PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut2, vecAcc2, packShiftAccU, outScale, outShiftU, minLim, maxLim); + PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut3, vecAcc3, packShiftAccU, outScale, outShiftU, minLim, maxLim); + PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut4, vecAcc4, packShiftAccU, outScale, outShiftU, minLim, maxLim); +#endif + // Storing 64 bytes or 32 elements along output depth (N) at W = 0, H = 0 initially + pvecOut = (xb_vecNx16 *) (pOutData + outCh + (x * outDataPitch1) + (y * outDataPitch2)); + IVP_SAVNX16_XP(vecOut1, vaOut, pvecOut, (2 * remOutCh)); + IVP_SAPOSNX16_FP(vaOut, pvecOut); + + // Storing 64 bytes or 32 elements along output depth (N) at W = 1, H = 0 initially + pvecOut = (xb_vecNx16 *) (pOutData + outCh + ((x + remX) * outDataPitch1) + (y * outDataPitch2)); + IVP_SAVNX16_XP(vecOut2, vaOut, pvecOut, (2 * remOutCh) * remX); + IVP_SAPOSNX16_FP(vaOut, pvecOut); + + // Storing 64 bytes or 32 elements along output depth (N) at W = 0, H = 1 initially + pvecOut = (xb_vecNx16 *) (pOutData + outCh + (x * outDataPitch1) + ((y + remY) * outDataPitch2)); + IVP_SAVNX16_XP(vecOut3, vaOut, pvecOut, (2 * remOutCh) * remY); + IVP_SAPOSNX16_FP(vaOut, pvecOut); + + // Storing 64 bytes or 32 elements along output depth (N) at W = 1, H = 1 initially + pvecOut = (xb_vecNx16 *) (pOutData + outCh + ((x + remX) * outDataPitch1) + ((y + remY) * outDataPitch2)); + IVP_SAVNX16_XP(vecOut4, vaOut, pvecOut, (2 * remOutCh) * remX * remY); + IVP_SAPOSNX16_FP(vaOut, pvecOut); + } + else // if "outputFlag" is not-set, store the accumulated values to the accTile + { + vaAcc = IVP_ZALIGN(); + dvecAccLL = IVP_CVT64SNX48LL(vecAcc1); + dvecAccLH = IVP_CVT64SNX48LH(vecAcc1); + dvecAccHL = IVP_CVT64SNX48HL(vecAcc1); + dvecAccHH = IVP_CVT64SNX48HH(vecAcc1); + // Storing 32 elements at W = 0, H = 0 initially + pdvecAccData = 
(xb_vec2Nx8 *) (pAcc); + IVP_SAV2NX8_XP(dvecAccLL, vaAcc, pdvecAccData, (8 * remOutCh)); + IVP_SAV2NX8_XP(dvecAccLH, vaAcc, pdvecAccData, (8 * remOutCh) - (2 * XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAV2NX8_XP(dvecAccHL, vaAcc, pdvecAccData, (8 * remOutCh) - (4 * XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAV2NX8_XP(dvecAccHH, vaAcc, pdvecAccData, (8 * remOutCh) - (6 * XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaAcc, pdvecAccData); + + dvecAccLL = IVP_CVT64SNX48LL(vecAcc2); + dvecAccLH = IVP_CVT64SNX48LH(vecAcc2); + dvecAccHL = IVP_CVT64SNX48HL(vecAcc2); + dvecAccHH = IVP_CVT64SNX48HH(vecAcc2); + // Storing 32 elements at W = 1, H = 0 initially + pdvecAccData = (xb_vec2Nx8 *) (pAcc + (remX * accDataPitch1)); + IVP_SAV2NX8_XP(dvecAccLL, vaAcc, pdvecAccData, (8 * remOutCh)); + IVP_SAV2NX8_XP(dvecAccLH, vaAcc, pdvecAccData, (8 * remOutCh) - (2 * XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAV2NX8_XP(dvecAccHL, vaAcc, pdvecAccData, (8 * remOutCh) - (4 * XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAV2NX8_XP(dvecAccHH, vaAcc, pdvecAccData, (8 * remOutCh) - (6 * XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaAcc, pdvecAccData); + + dvecAccLL = IVP_CVT64SNX48LL(vecAcc3); + dvecAccLH = IVP_CVT64SNX48LH(vecAcc3); + dvecAccHL = IVP_CVT64SNX48HL(vecAcc3); + dvecAccHH = IVP_CVT64SNX48HH(vecAcc3); + // Storing 32 elements at W = 0, H = 1 initially + pdvecAccData = (xb_vec2Nx8 *) (pAcc + (remY * accDataPitch2)); + IVP_SAV2NX8_XP(dvecAccLL, vaAcc, pdvecAccData, (8 * remOutCh)); + IVP_SAV2NX8_XP(dvecAccLH, vaAcc, pdvecAccData, (8 * remOutCh) - (2 * XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAV2NX8_XP(dvecAccHL, vaAcc, pdvecAccData, (8 * remOutCh) - (4 * XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAV2NX8_XP(dvecAccHH, vaAcc, pdvecAccData, (8 * remOutCh) - (6 * XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaAcc, pdvecAccData); + + dvecAccLL = IVP_CVT64SNX48LL(vecAcc4); + dvecAccLH = IVP_CVT64SNX48LH(vecAcc4); + dvecAccHL = IVP_CVT64SNX48HL(vecAcc4); + dvecAccHH = IVP_CVT64SNX48HH(vecAcc4); + // Storing 32 elements at W = 1, H = 1 initially + 
pdvecAccData = (xb_vec2Nx8 *) (pAcc + (remX * accDataPitch1) + (remY * accDataPitch2)); + IVP_SAV2NX8_XP(dvecAccLL, vaAcc, pdvecAccData, (8 * remOutCh)); + IVP_SAV2NX8_XP(dvecAccLH, vaAcc, pdvecAccData, (8 * remOutCh) - (2 * XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAV2NX8_XP(dvecAccHL, vaAcc, pdvecAccData, (8 * remOutCh) - (4 * XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAV2NX8_XP(dvecAccHH, vaAcc, pdvecAccData, (8 * remOutCh) - (6 * XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAPOS2NX8_FP(vaAcc, pdvecAccData); + } + } // End of for (x = 0; x < outWidth; x += 2) + } // End of for (y = 0; y < outHeight; y += 2) + } // End of for (outCh = 0; outCh < numOutCh; outCh += XCHAL_IVPN_SIMD_WIDTH) + + return(XAI_ERROR_STATUS()); +} +#endif // #if ((XCHAL_VISION_TYPE >= 6)) diff --git a/backends/cadence/vision/third-party/libxai/cnn/src/cnn_extend_edge.h b/backends/cadence/vision/third-party/libxai/cnn/src/cnn_extend_edge.h new file mode 100644 index 00000000000..fe3bb154328 --- /dev/null +++ b/backends/cadence/vision/third-party/libxai/cnn/src/cnn_extend_edge.h @@ -0,0 +1,1517 @@ +/* + * Copyright (c) 2022 by Cadence Design Systems, Inc. ALL RIGHTS RESERVED. + * These coded instructions, statements, and computer programs are the + * copyrighted works and confidential proprietary information of + * Cadence Design Systems Inc. They may be adapted and modified by bona fide + * purchasers for internal use, but neither the original nor any adapted + * or modified version may be disclosed or distributed to third parties + * in any manner, medium, or form, in whole or in part, without the prior + * written consent of Cadence Design Systems Inc. This software and its + * derivatives are to be executed solely on products incorporating a Cadence + * Design Systems processor. 
+ */ + +#include "xai_cnn.h" +#include "xai_intrin.h" + +#if ((XCHAL_VISION_TYPE >= 6)) + +#define MAKE_NAME_IMPL(name, MORPH_FNAME_SPECIFIER_IDT) name ## _ ## MORPH_FNAME_SPECIFIER_IDT +#define MAKE_NAME_IMPL_1(name, MORPH_FNAME_SPECIFIER_IDT, dataOrder) name ## _ ## MORPH_FNAME_SPECIFIER_IDT ## _ ## dataOrder + +#if INPUT_DATA_TYPE == INTEGER8BIT + +#undef MAKE_ARGUMENTS +#undef MAKE_ARGUMENTS2 +#undef MAKE_NAME +#undef MAKE_NAME_1 +#undef MORPH_OP_FUNCTION +#undef MORPH_OP_FUNCTION_CONST +#undef MORPH_IDT_CHECK +#undef MORPH_ADT_CHECK +#undef MORPH_IDT_SCALAR +#undef MORPH_IDT_FILLTILE +#undef MORPH_OP_LOAD +#undef MORPH_OP_AND +#undef MORPH_OP_SEQ +#undef MORPH_OP_SEL +#undef MORPH_OP_STORE +#undef MORPH_OP_PRIME +#undef MORPH_IDT_VEC +#undef MORPH_OP_FLUSH +#undef MORPH_VECTORIZATIONWIDTH + +#define MAKE_ARGUMENTS(a, b, c) (xai_pTile3D a, const int32_t b, xai_size3D c) +#define MAKE_ARGUMENTS2(a, b, c) (xai_pTile3D a, const int8_t * b, xai_size3D c) +#define MORPH_OP_FUNCTION extendWHEdges3D_I8 +#define MORPH_OP_FUNCTION_CONST extendEdgesConst3D_I8 +#define MAKE_NAME(name) MAKE_NAME_IMPL(name, I8) +#define MAKE_NAME_1(name, dataOrder) MAKE_NAME_IMPL_1(name, I8, dataOrder) +#define MORPH_IDT_CHECK XAI_CHECK_TILE3D_I8 +#define MORPH_ADT_CHECK XAI_CHECK_ARRAY_I8 +#define MORPH_IDT_SCALAR int8_t +#define MORPH_IDT_FILLTILE xaiFillTile3D_I8 +#define MORPH_OP_LOAD IVP_LAV2NX8_XP +#define MORPH_OP_AND IVP_AND2NX8 +#define MORPH_OP_SEQ IVP_SEQ2NX8 +#define MORPH_OP_SEL IVP_SEL2NX8 +#define MORPH_OP_STORE IVP_SAV2NX8_XP +#define MORPH_OP_PRIME IVP_LA2NX8_PP +#define MORPH_IDT_VEC xb_vec2Nx8 +#define MORPH_OP_FLUSH IVP_SAPOS2NX8_FP +#define MORPH_VECTORIZATIONWIDTH 2 * XCHAL_IVPN_SIMD_WIDTH + +#elif INPUT_DATA_TYPE == INTEGER16BIT + +#undef MAKE_ARGUMENTS +#undef MAKE_ARGUMENTS2 +#undef MAKE_NAME +#undef MAKE_NAME_1 +#undef MORPH_OP_FUNCTION +#undef MORPH_OP_FUNCTION_CONST +#undef MORPH_IDT_CHECK +#undef MORPH_ADT_CHECK +#undef MORPH_IDT_SCALAR +#undef 
MORPH_IDT_FILLTILE +#undef MORPH_OP_LOAD +#undef MORPH_OP_AND +#undef MORPH_OP_SEQ +#undef MORPH_OP_SEL +#undef MORPH_OP_STORE +#undef MORPH_OP_PRIME +#undef MORPH_IDT_VEC +#undef MORPH_OP_FLUSH +#undef MORPH_VECTORIZATIONWIDTH + +#define MAKE_ARGUMENTS(a, b, c) (xai_pTile3D a, const int32_t b, xai_size3D c) +#define MAKE_ARGUMENTS2(a, b, c) (xai_pTile3D a, const int16_t * b, xai_size3D c) +#define MAKE_NAME(name) MAKE_NAME_IMPL(name, I16) +#define MAKE_NAME_1(name, dataOrder) MAKE_NAME_IMPL_1(name, I16, dataOrder) +#define MORPH_OP_FUNCTION extendWHEdges3D_I16 +#define MORPH_OP_FUNCTION_CONST extendEdgesConst3D_I16 +#define MORPH_IDT_CHECK XAI_CHECK_TILE3D_I16 +#define MORPH_ADT_CHECK XAI_CHECK_ARRAY_I16 +#define MORPH_IDT_SCALAR int16_t +#define MORPH_IDT_FILLTILE xaiFillTile3D_I16 +#define MORPH_OP_LOAD IVP_LAVNX16_XP +#define MORPH_OP_AND IVP_ANDNX16 +#define MORPH_OP_SEQ IVP_SEQNX16 +#define MORPH_OP_SEL IVP_SELNX16 +#define MORPH_OP_STORE IVP_SAVNX16_XP +#define MORPH_OP_PRIME IVP_LANX16_PP +#define MORPH_IDT_VEC xb_vecNx16 +#define MORPH_OP_FLUSH IVP_SAPOSNX16_FP +#define MORPH_VECTORIZATIONWIDTH XCHAL_IVPN_SIMD_WIDTH + +#elif INPUT_DATA_TYPE == FLOAT16BIT +#undef MAKE_ARGUMENTS +#undef MAKE_ARGUMENTS2 +#undef MAKE_NAME +#undef MAKE_NAME_1 +#undef MORPH_OP_FUNCTION +#undef MORPH_OP_FUNCTION_CONST +#undef MORPH_IDT_CHECK +#undef MORPH_ADT_CHECK +#undef MORPH_IDT_SCALAR +#undef MORPH_IDT_FILLTILE +#undef MORPH_OP_LOAD +#undef MORPH_OP_STORE +#undef MORPH_OP_PRIME +#undef MORPH_IDT_VEC +#undef MORPH_OP_FLUSH +#undef MORPH_VECTORIZATIONWIDTH + +#if (XCHAL_HAVE_VISION_HP_VFPU == 1) +#define MAKE_ARGUMENTS(a, b, c) (xai_pTile3D a, const xb_f16 b, xai_size3D c) +#define MAKE_ARGUMENTS2(a, b, c) (xai_pTile3D a, const xb_f16 * b, xai_size3D c) +#define MORPH_OP_FUNCTION extendWHEdges3D_F16 +#define MORPH_OP_FUNCTION_CONST extendEdgesConst3D_F16 +#define MAKE_NAME(name) MAKE_NAME_IMPL(name, F16) +#define MAKE_NAME_1(name, dataOrder) MAKE_NAME_IMPL_1(name, F16, 
dataOrder) +#define MORPH_IDT_CHECK XAI_CHECK_TILE3D_F16 +#define MORPH_ADT_CHECK XAI_CHECK_ARRAY_F16 +#define MORPH_IDT_SCALAR xb_f16 +#define MORPH_IDT_FILLTILE xaiFillTile3D_F16 +#define MORPH_OP_LOAD IVP_LAVNXF16_XP +#define MORPH_OP_STORE IVP_SAVNXF16_XP +#define MORPH_OP_PRIME IVP_LANXF16_PP +#define MORPH_IDT_VEC xb_vecNxf16 +#define MORPH_OP_FLUSH IVP_SAPOSNXF16_FP +#define MORPH_VECTORIZATIONWIDTH XCHAL_IVPN_SIMD_WIDTH +#endif + +#elif INPUT_DATA_TYPE == FLOAT32BIT +#undef MAKE_ARGUMENTS +#undef MAKE_ARGUMENTS2 +#undef MAKE_NAME +#undef MAKE_NAME_1 +#undef MORPH_OP_FUNCTION +#undef MORPH_OP_FUNCTION_CONST +#undef MORPH_IDT_CHECK +#undef MORPH_ADT_CHECK +#undef MORPH_IDT_SCALAR +#undef MORPH_IDT_FILLTILE +#undef MORPH_OP_LOAD +#undef MORPH_OP_STORE +#undef MORPH_OP_PRIME +#undef MORPH_IDT_VEC +#undef MORPH_OP_FLUSH +#undef MORPH_VECTORIZATIONWIDTH + +#if (XCHAL_HAVE_VISION_SP_VFPU == 1) +#define MAKE_ARGUMENTS(a, b, c) (xai_pTile3D a, const float b, xai_size3D c) +#define MAKE_ARGUMENTS2(a, b, c) (xai_pTile3D a, const float * b, xai_size3D c) +#define MORPH_OP_FUNCTION extendWHEdges3D_F32 +#define MORPH_OP_FUNCTION_CONST extendEdgesConst3D_F32 +#define MAKE_NAME(name) MAKE_NAME_IMPL(name, F32) +#define MAKE_NAME_1(name, dataOrder) MAKE_NAME_IMPL_1(name, F32, dataOrder) +#define MORPH_IDT_CHECK XAI_CHECK_TILE3D_F32 +#define MORPH_ADT_CHECK XAI_CHECK_ARRAY_F32 +#define MORPH_IDT_SCALAR float +#define MORPH_IDT_FILLTILE xaiFillTile3D_F32 +#define MORPH_OP_LOAD IVP_LAVN_2XF32_XP +#define MORPH_OP_STORE IVP_SAVN_2XF32_XP +#define MORPH_OP_PRIME IVP_LAN_2XF32_PP +#define MORPH_IDT_VEC xb_vecN_2xf32 +#define MORPH_OP_FLUSH IVP_SAPOSN_2XF32_FP +#define MORPH_VECTORIZATIONWIDTH XCHAL_IVPN_SIMD_WIDTH / 2 +#endif +#endif + + +/*====================================================================================*/ +/*============= START of xaiExtendEdgesConst3D_* routines ============================*/ 
+/*====================================================================================*/
+
+/*************************** extendEdgesConst3D_I8 *************************/
+/*************************** extendEdgesConst3D_I16 *************************/
+/*************************** extendEdgesConst3D_F16 *************************/
+/*************************** extendEdgesConst3D_F32 *************************/
+/* Description : P6 implementation for extending the edges of a 3D tile     */
+/*               with a constant value. Only the dimension 1 (W) and        */
+/*               dimension 2 (H) edge regions outside the frame are filled. */
+/* Inputs      : constValue (fill value), frame3DSize (frame extents)       */
+/* Outputs     : None (static helper, returns void)                         */
+/* InOuts      : Destination Tile                                           */
+/* Assumptions : dstData is signed 8/16 bit Integer or half precision       */
+/*               float(FP16) or single precision float(FP32)                */
+/*               based on MORPH specifier.                                  */
+/****************************************************************************/
+static _XAI_INLINE_ void MAKE_NAME(extendEdgesConst3D) MAKE_ARGUMENTS(dstTile, constValue, frame3DSize)
+{
+  /* Getting parameters from the tile structures */
+  const int32_t dim1Size  = XAI_TILE3D_GET_DIM1(dstTile);
+  const int32_t dim2Size  = XAI_TILE3D_GET_DIM2(dstTile);
+  const int32_t dim1Edge1 = XAI_TILE3D_GET_DIM1_EDGE1(dstTile);
+  const int32_t dim1Edge2 = XAI_TILE3D_GET_DIM1_EDGE2(dstTile);
+  const int32_t dim2Edge1 = XAI_TILE3D_GET_DIM2_EDGE1(dstTile);
+  const int32_t dim2Edge2 = XAI_TILE3D_GET_DIM2_EDGE2(dstTile);
+  int32_t dim3Size        = XAI_TILE3D_GET_DIM3(dstTile);
+
+  const int32_t dstDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(dstTile);
+  const int32_t dstDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(dstTile);
+  int32_t frame_dim1          = frame3DSize.dim1Size;
+  int32_t frame_dim2          = frame3DSize.dim2Size;
+  int32_t dim1ExtendEdgeSize  = dim1Size + dim1Edge1 + dim1Edge2; /* row length including both dim1 edges */
+  int32_t dim2ExtendEdgeSize  = (dim2Size + dim2Edge1 + dim2Edge2) * dstDataPitch1; /* elements spanned by all rows (edges included) of one plane */
+
+  int32_t start_x = XAI_TILE3D_GET_DIM1_COORD(dstTile);
+  int32_t start_y = XAI_TILE3D_GET_DIM2_COORD(dstTile);
+
+  MORPH_IDT_SCALAR *restrict pDst3D = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(dstTile);
+  int32_t ixmin = MAX2(start_x - dim1Edge1, 0); /* ixmin..iymax: tile-plus-edge rectangle clipped to the frame [0, frame_dim - 1] */
+  int32_t ixmax = MIN2(start_x + dim1Size + dim1Edge2 - 1, frame_dim1 - 1);
+  int32_t iymin = MAX2(start_y - dim2Edge1, 0);
+  int32_t iymax = MIN2(start_y + dim2Size + dim2Edge2 - 1, frame_dim2 - 1);
+
+  int x, y, z; /* Loop variables */
+  const MORPH_IDT_SCALAR value = constValue; /* fill value converted to the element type */
+
+  // horizontal top: full-width band of rows above the frame; X/Y cords below are offsets from the tile data pointer
+  int32_t horTopXcord  = -dim1Edge1;
+  int32_t horTopYcord  = -dim2Edge1;
+  int32_t horTopWidth  = dim1Size + dim1Edge1 + dim1Edge2;
+  int32_t horTopHeight = iymin - (start_y - dim2Edge1); /* rows removed by clipping at the top (0 when nothing is clipped) */
+
+  // horizontal bottom: full-width band of rows below the frame
+  int32_t horBottomXcord  = -dim1Edge1;
+  int32_t horBottomYcord  = iymax + 1 - start_y;
+  int32_t horBottomWidth  = dim1Size + dim1Edge1 + dim1Edge2;
+  int32_t horBottomHeight = start_y + dim2Size + dim2Edge2 - 1 - iymax; /* rows removed by clipping at the bottom */
+
+  // vertical left: columns left of the frame, restricted to the rows between the top and bottom bands
+  int32_t verLeftXcord  = -dim1Edge1;
+  int32_t verLeftYcord  = horTopYcord + horTopHeight;
+  int32_t verLeftWidth  = ixmin - (start_x - dim1Edge1); /* columns removed by clipping on the left */
+  int32_t verLeftHeight = iymax - iymin + 1;
+
+  // vertical right: columns right of the frame, same row range as vertical left
+  int32_t verRightXcord  = ixmax + 1 - start_x;
+  int32_t verRightYcord  = horTopYcord + horTopHeight;
+  int32_t verRightWidth  = start_x + dim1Size + dim1Edge2 - 1 - ixmax; /* columns removed by clipping on the right */
+  int32_t verRightHeight = iymax - iymin + 1;
+
+  valign vaOutData1 = IVP_ZALIGN();
+
+  MORPH_IDT_VEC *restrict pdvecOut1, *restrict pdvecOut2;
+  MORPH_IDT_SCALAR *restrict pDst1, *restrict pDst2;
+  /* Most optimal case is when -
+     i. dim1 (including edges) has no extra padding
+     ii. Each plane, i.e. dim1 * dim2 (including edges in both dimensions) has no extra padding
+   */
+  if ((dstDataPitch1 == dim1ExtendEdgeSize) && (dstDataPitch2 == dim2ExtendEdgeSize))
+  {
+    int numIter = horTopWidth * horTopHeight + horBottomWidth * horBottomHeight; /* bottom band of plane z plus top band of plane z + 1, adjacent in memory in this layout */
+
+    // horizontal top first(z = 0) plane
+    if (horTopHeight > 0)
+    {
+      pDst1 = (MORPH_IDT_SCALAR *) pDst3D + \
+              ((horTopYcord * dstDataPitch1) + horTopXcord);
+      pdvecOut1 = (MORPH_IDT_VEC *) (pDst1);
+      for (x = 0; x < horTopWidth * horTopHeight; x += MORPH_VECTORIZATIONWIDTH) /* byte count passed is everything still to write; presumably the store clamps it to one vector - TODO confirm against the ISA reference */
+      {
+        MORPH_OP_STORE(value, vaOutData1, pdvecOut1,
+                       sizeof(MORPH_IDT_SCALAR) * (horTopWidth * horTopHeight - x));
+        MORPH_OP_FLUSH(vaOutData1, pdvecOut1);
+      }
+    } //if( horTopHeight > 0)
+    z = 0; /* z stays 0 when dim3Size <= 1, so the "last plane" code below addresses plane 0 */
+    if (dim3Size > 1)
+    {
+      for (; z < dim3Size - 1; z++) // In one loop, "horizontal bottom z plane" and "horizontal top (z + 1)" plane is covered
+      {
+        pDst1 = (MORPH_IDT_SCALAR *) pDst3D + (z * dstDataPitch2) + \
+                ((horBottomYcord * dstDataPitch1) + horBottomXcord);
+
+        pdvecOut1 = (MORPH_IDT_VEC *) (pDst1);
+        for (x = 0; x < numIter; x += MORPH_VECTORIZATIONWIDTH)
+        {
+          MORPH_OP_STORE(value, vaOutData1, pdvecOut1,
+                         sizeof(MORPH_IDT_SCALAR) * (numIter - x));
+          MORPH_OP_FLUSH(vaOutData1, pdvecOut1);
+        }
+      }
+    }
+
+    // horizontal bottom last(z = dim3Size - 1) plane
+    if (horBottomHeight > 0)
+    {
+      pDst1 = (MORPH_IDT_SCALAR *) pDst3D + (z * dstDataPitch2) + \
+              ((horBottomYcord * dstDataPitch1) + horBottomXcord);
+      pdvecOut1 = (MORPH_IDT_VEC *) (pDst1);
+      for (x = 0; x < horBottomWidth * horBottomHeight; x += MORPH_VECTORIZATIONWIDTH)
+      {
+        MORPH_OP_STORE(value, vaOutData1, pdvecOut1,
+                       sizeof(MORPH_IDT_SCALAR) * (horBottomWidth * horBottomHeight - x));
+        MORPH_OP_FLUSH(vaOutData1, pdvecOut1);
+      }
+    }
+  }
+  else /* padded layout: fill the bands row by row, two planes (z and z + remZ) per iteration */
+  {
+    for (z = 0; z < dim3Size; z += 2)
+    {
+      int32_t remZ = XT_SALT(1, dim3Size - z); //remaining (dim3Size - z) greater than 1, then remZ = 1, else 0
+
+      // horizontal top
+      pDst1 = (MORPH_IDT_SCALAR *) pDst3D + (z * dstDataPitch2) + \
+              ((horTopYcord * dstDataPitch1) + horTopXcord);
+      pDst2 = (MORPH_IDT_SCALAR *) pDst3D + ((z + remZ) * dstDataPitch2) + \
+              ((horTopYcord * dstDataPitch1) + horTopXcord);
+      if (horTopHeight > 0)
+      {
+        for (x = 0; x < horTopWidth; x += MORPH_VECTORIZATIONWIDTH)
+        {
+          int32_t remX = XT_MIN((horTopWidth - x), MORPH_VECTORIZATIONWIDTH); /* elements left in this row chunk, at most one vector step */
+          for (y = 0; y < horTopHeight; y++)
+          {
+            pdvecOut1 = (MORPH_IDT_VEC *) (pDst1 + (y * dstDataPitch1) + x);
+            pdvecOut2 = (MORPH_IDT_VEC *) (pDst2 + (y * dstDataPitch1) + x);
+            MORPH_OP_STORE(value, vaOutData1, pdvecOut1, sizeof(MORPH_IDT_SCALAR) * remX);
+            MORPH_OP_FLUSH(vaOutData1, pdvecOut1);
+            MORPH_OP_STORE(value, vaOutData1, pdvecOut2, sizeof(MORPH_IDT_SCALAR) * remX * remZ); /* byte count is 0 when remZ == 0 (no second plane left) */
+            MORPH_OP_FLUSH(vaOutData1, pdvecOut2);
+          }
+        }
+      } //if( horTopHeight > 0)
+
+      // horizontal bottom
+      pDst1 = (MORPH_IDT_SCALAR *) pDst3D + (z * dstDataPitch2) + \
+              ((horBottomYcord * dstDataPitch1) + horBottomXcord);
+      pDst2 = (MORPH_IDT_SCALAR *) pDst3D + ((z + remZ) * dstDataPitch2) + \
+              ((horBottomYcord * dstDataPitch1) + horBottomXcord);
+      if (horBottomHeight > 0)
+      {
+        for (x = 0; x < horBottomWidth; x += MORPH_VECTORIZATIONWIDTH)
+        {
+          int32_t remX = XT_MIN((horBottomWidth - x), MORPH_VECTORIZATIONWIDTH);
+          for (y = 0; y < horBottomHeight; y++)
+          {
+            pdvecOut1 = (MORPH_IDT_VEC *) (pDst1 + (y * dstDataPitch1) + x);
+            pdvecOut2 = (MORPH_IDT_VEC *) (pDst2 + (y * dstDataPitch1) + x);
+            MORPH_OP_STORE(value, vaOutData1, pdvecOut1, sizeof(MORPH_IDT_SCALAR) * remX);
+            MORPH_OP_FLUSH(vaOutData1, pdvecOut1);
+            MORPH_OP_STORE(value, vaOutData1, pdvecOut2, sizeof(MORPH_IDT_SCALAR) * remX * remZ);
+            MORPH_OP_FLUSH(vaOutData1, pdvecOut2);
+          }
+        }
+      }
+    }
+  }
+
+  for (z = 0; z < dim3Size; z += 2) /* left/right bands are never contiguous across rows, so both layouts share this row-wise path */
+  {
+    int remZ = XT_SALT(1, dim3Size - z); //remaining (dim3Size - z) greater than 1, then remZ = 1, else 0
+
+    // vertical left
+    pDst1 = (MORPH_IDT_SCALAR *) pDst3D + (z * dstDataPitch2) + \
+            ((verLeftYcord * dstDataPitch1) + verLeftXcord);
+    pDst2 = (MORPH_IDT_SCALAR *) pDst3D + ((z + remZ) * dstDataPitch2) + \
+            ((verLeftYcord * dstDataPitch1) + verLeftXcord);
+
+    for (x = 0; x < verLeftWidth; x += MORPH_VECTORIZATIONWIDTH) /* loop body is skipped when verLeftWidth is 0 */
+    {
+      int32_t remX = XT_MIN((verLeftWidth - x), MORPH_VECTORIZATIONWIDTH);
+      for (y = 0; y < verLeftHeight; y++)
+      {
+        pdvecOut1 = (MORPH_IDT_VEC *) (pDst1 + (y * dstDataPitch1) + x);
+        pdvecOut2 = (MORPH_IDT_VEC *) (pDst2 + (y * dstDataPitch1) + x);
+        MORPH_OP_STORE(value, vaOutData1, pdvecOut1, sizeof(MORPH_IDT_SCALAR) * remX);
+        MORPH_OP_FLUSH(vaOutData1, pdvecOut1);
+        MORPH_OP_STORE(value, vaOutData1, pdvecOut2, sizeof(MORPH_IDT_SCALAR) * remX * remZ);
+        MORPH_OP_FLUSH(vaOutData1, pdvecOut2);
+      }
+    }
+
+    // vertical right
+    pDst1 = (MORPH_IDT_SCALAR *) pDst3D + (z * dstDataPitch2) + \
+            ((verRightYcord * dstDataPitch1) + verRightXcord);
+    pDst2 = (MORPH_IDT_SCALAR *) pDst3D + ((z + remZ) * dstDataPitch2) + \
+            ((verRightYcord * dstDataPitch1) + verRightXcord);
+
+    for (x = 0; x < verRightWidth; x += MORPH_VECTORIZATIONWIDTH) /* loop body is skipped when verRightWidth is 0 */
+    {
+      int32_t remX = XT_MIN((verRightWidth - x), MORPH_VECTORIZATIONWIDTH);
+      for (y = 0; y < verRightHeight; y++)
+      {
+        pdvecOut1 = (MORPH_IDT_VEC *) (pDst1 + (y * dstDataPitch1) + x);
+        pdvecOut2 = (MORPH_IDT_VEC *) (pDst2 + (y * dstDataPitch1) + x);
+        MORPH_OP_STORE(value, vaOutData1, pdvecOut1, sizeof(MORPH_IDT_SCALAR) * remX);
+        MORPH_OP_FLUSH(vaOutData1, pdvecOut1);
+        MORPH_OP_STORE(value, vaOutData1, pdvecOut2, sizeof(MORPH_IDT_SCALAR) * remX * remZ);
+        MORPH_OP_FLUSH(vaOutData1, pdvecOut2);
+      }
+    }
+  }
+}
+
+/************************** xaiExtendEdgesConst3D_I8 ********************************/
+/************************** xaiExtendEdgesConst3D_I16 *******************************/
+/************************** xaiExtendEdgesConst3D_F16 *******************************/
+/************************** xaiExtendEdgesConst3D_F32 *******************************/
+/* Description : P6 optimized generic implementation of xaiExtendEdgesConst 3D */
+/* function. 
Based on MORPH pre-processor specifiers, code */
+/*               implementation is generated during preprocessing stage. This       */
+/*               method implements xaiExtendEdgesConst3D_I8, xaiExtendEdgesConst3D_I16, */
+/*               xaiExtendEdgesConst3D_F16 & xaiExtendEdgesConst3D_F32 functionality */
+/* Inputs      : value (constant to fill the edges), frame3DSize (all dims > 0)     */
+/* Outputs     : XI Error Code                                                      */
+/* InOuts      : Destination Tile                                                   */
+/* Assumptions : OutData is signed 8/16 bit Integer or half precision float(FP16)   */
+/*               single precision float(FP32) based on MORPH specifier              */
+/************************************************************************************/
+
+XAI_ERR_TYPE MAKE_NAME(xaiExtendEdgesConst3D) MAKE_ARGUMENTS(dstTile, value, frame3DSize)
+{
+  /* Error Checks */
+  XAI_ERROR_CHECKS()
+  {
+    MORPH_IDT_CHECK(dstTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(dstTile);
+    XAI_CHECK_ERROR((frame3DSize.dim1Size > 0) && (frame3DSize.dim2Size > 0) && \
+                    (frame3DSize.dim3Size > 0), XAI_ERR_DATASIZE, \
+                    "\nframe3DSize.dim1Size = %d, frame3DSize.dim2Size = %d, frame3DSize.dim3Size = %d\nDimensions should be greater than 0", \
+                    frame3DSize.dim1Size, frame3DSize.dim2Size, frame3DSize.dim3Size);
+  }
+
+  /* Getting parameters from the tile structures */
+  const int32_t dim1Size      = XAI_TILE3D_GET_DIM1(dstTile);
+  const int32_t dim2Size      = XAI_TILE3D_GET_DIM2(dstTile);
+  const int32_t dim3Size      = XAI_TILE3D_GET_DIM3(dstTile);
+  const int32_t dim1Edge1     = XAI_TILE3D_GET_DIM1_EDGE1(dstTile);
+  const int32_t dim1Edge2     = XAI_TILE3D_GET_DIM1_EDGE2(dstTile);
+  const int32_t dim2Edge1     = XAI_TILE3D_GET_DIM2_EDGE1(dstTile);
+  const int32_t dim2Edge2     = XAI_TILE3D_GET_DIM2_EDGE2(dstTile);
+  const int32_t dim3Edge1     = XAI_TILE3D_GET_DIM3_EDGE1(dstTile);
+  const int32_t dim3Edge2     = XAI_TILE3D_GET_DIM3_EDGE2(dstTile);
+  const int32_t dstDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(dstTile);
+  const int32_t dstDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(dstTile);
+
+  MORPH_IDT_SCALAR *pDst = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(dstTile);
+
+  int32_t frame_dim1 = frame3DSize.dim1Size;
+  int32_t frame_dim2 = frame3DSize.dim2Size;
+  int32_t frame_dim3 = frame3DSize.dim3Size;
+  int32_t start_x    = XAI_TILE3D_GET_DIM1_COORD(dstTile);
+  int32_t start_y    = XAI_TILE3D_GET_DIM2_COORD(dstTile);
+  int32_t start_z    = XAI_TILE3D_GET_DIM3_COORD(dstTile);
+
+  int32_t ixmin = MAX2(start_x - dim1Edge1, 0); /* ixmin..izmax: tile-plus-edge box clipped to the frame [0, frame_dim - 1] */
+  int32_t ixmax = MIN2(start_x + dim1Size + dim1Edge2 - 1, frame_dim1 - 1);
+  int32_t iymin = MAX2(start_y - dim2Edge1, 0);
+  int32_t iymax = MIN2(start_y + dim2Size + dim2Edge2 - 1, frame_dim2 - 1);
+  int32_t izmin = MAX2(start_z - dim3Edge1, 0);
+  int32_t izmax = MIN2(start_z + dim3Size + dim3Edge2 - 1, frame_dim3 - 1);
+
+  /* nothing to extend, because tile and frame intersection is empty */
+  if ((ixmin > ixmax) || (iymin > iymax) || (izmin > izmax))
+  {
+    return(MORPH_IDT_FILLTILE(dstTile, value, 1)); /* presumably the third argument 1 makes the fill cover the edges too - TODO confirm against xaiFillTile3D */
+  }
+
+  /*******************************************************************************/
+  /* P6 implementation of xaiExtendEdgesConst3D is split into 3 parts.           */
+  /* If pitch is equal to stride, memory location to be updated across 3rd       */
+  /* dimension edges is contiguous. Hence processing across edge can be          */
+  /* implemented using FillTile3D functionality. Processing across 3rd dimension */
+  /* is split as front end and rear end processing. Processing across 3rd        */
+  /* dimension excluding the edge is implemented similar to 2D implementation of */
+  /* ExtendEdges functionality.                                                  */
+  /*******************************************************************************/
+
+  MORPH_IDT_SCALAR *pDst1;
+
+  /* Number of 2D tiles to be processed across edge1 3rd dimension */
+  int32_t dim3SizeFrontEnd = izmin - (start_z - dim3Edge1);
+  /* Offset calculation for Extend Edge across 3rd dimension excluding edges */
+  int32_t dim3CordMiddle = izmin - start_z;
+  /* Number of 2D tiles to be processed across 3rd dimension excluding edges */
+  int32_t dim3SizeMiddle = izmax - izmin + 1;
+  /* Offset calculation for Extend Edge across 3rd dimension edge 2 */
+  int32_t dim3CordRearEnd = izmax + 1 - start_z;
+  /* Number of 2D tiles processing to Extend Edge across 3rd edge2 dimension */
+  int32_t dim3SizeRearEnd = start_z + dim3Size + dim3Edge2 - 1 - izmax;
+
+  /* Update local 3D tile structure with dstTile structure parameters. Local    */
+  /* 3D tile structure is used as parameter to implement fillTile functionality */
+  xai_tile3D dst_t;
+  /* Update parameters for local 3D tile */
+  XAI_TILE3D_SET_DIM1(&dst_t, dim1Size);
+  XAI_TILE3D_SET_DIM1_PITCH(&dst_t, dstDataPitch1);
+  XAI_TILE3D_SET_DIM1_EDGE1(&dst_t, dim1Edge1);
+  XAI_TILE3D_SET_DIM1_EDGE2(&dst_t, dim1Edge2);
+  XAI_TILE3D_SET_DIM2(&dst_t, dim2Size);
+  XAI_TILE3D_SET_DIM2_PITCH(&dst_t, dstDataPitch2);
+  XAI_TILE3D_SET_DIM2_EDGE1(&dst_t, dim2Edge1);
+  XAI_TILE3D_SET_DIM2_EDGE2(&dst_t, dim2Edge2);
+  XAI_TILE3D_SET_DIM3_EDGE1(&dst_t, 0); /* dim3 edges are zeroed: the front/rear planes are passed as ordinary planes via DIM3 and the data pointer below */
+  XAI_TILE3D_SET_DIM3_EDGE2(&dst_t, 0);
+  XAI_TILE3D_SET_DIM1_COORD(&dst_t, start_x);
+  XAI_TILE3D_SET_DIM2_COORD(&dst_t, start_y);
+  XAI_TILE3D_SET_DIM3_COORD(&dst_t, start_z);
+  XAI_TILE3D_SET_BUFF_PTR(&dst_t, XAI_TILE3D_GET_BUFF_PTR(dstTile));
+  XAI_TILE3D_SET_BUFF_SIZE(&dst_t, XAI_TILE3D_GET_BUFF_SIZE(dstTile));
+  XAI_TILE3D_SET_TYPE(&dst_t, XAI_TILE3D_GET_TYPE(dstTile)); /* DIM3 and the data pointer are set separately for each of the three parts below */
+
+  /***********************************************************************************/
+  /* Processing across the 3rd dimension edges (edge1 and edge2)                     */
+  /* Processing across 3rd dimension edge 1 is referred as Front End Processing      */
+  /* Processing across 3rd dimension edge 2 is referred as Rear End Processing       */
+  /* Local copy of 3D tile is declared and updated with destination tile parameters. */
+  /* Size parameter across third dimension is updated based on number of 2D tiles    */
+  /* to be processed across front and rear end. In order to effectively use the      */
+  /* SIMD capabilities xaiFillTile3D implementation is utilized.                     */
+  /***********************************************************************************/
+  if (dim3SizeFrontEnd > 0)
+  {
+    /***********************************************************************************/
+    /* Front end processing : Processing along the 3rd dimension edge 1.               */
+    /***********************************************************************************/
+
+    /* update destination data pointer */
+    pDst1 = &pDst[((-dim3Edge1) * dstDataPitch2)];
+    XAI_TILE3D_SET_DATA_PTR(&dst_t, pDst1);
+    XAI_TILE3D_SET_DIM3(&dst_t, dim3SizeFrontEnd);
+    MORPH_IDT_FILLTILE(&dst_t, value, 1); /* NOTE(review): return status is ignored here, unlike the early-return path above */
+  }
+  if (dim3SizeRearEnd > 0)
+  {
+    /***********************************************************************************/
+    /* Rear end processing : Processing along the 3rd dimension edge 2.                */
+    /***********************************************************************************/
+
+    /* update destination data pointer */
+    pDst1 = &pDst[dim3CordRearEnd * dstDataPitch2];
+    XAI_TILE3D_SET_DATA_PTR(&dst_t, pDst1);
+    XAI_TILE3D_SET_DIM3(&dst_t, dim3SizeRearEnd);
+    MORPH_IDT_FILLTILE(&dst_t, value, 1); /* NOTE(review): return status is ignored here as well */
+  }
+
+  /* Update destination data pointer */
+  pDst1 = &pDst[(dim3CordMiddle * dstDataPitch2)];
+  XAI_TILE3D_SET_DIM3(&dst_t, dim3SizeMiddle);
+  XAI_TILE3D_SET_DATA_PTR(&dst_t, pDst1);
+
+  MORPH_OP_FUNCTION_CONST(&dst_t, value, frame3DSize); /* fills the dim1/dim2 edges of the planes that lie inside the frame along dim3 */
+  return(XAI_ERROR_STATUS());
+}
+
+
+/*====================================================================================*/
+/*============= END of xaiExtendEdgesConst3D_* routines ==============================*/
+/*====================================================================================*/
+
+
+
+
+/*====================================================================================*/
+/*============= START of xaiExtendEdges3D_* routines =================================*/
+/*====================================================================================*/
+
+/************************** extendWHEdges3D_I8 *****************************/
+/************************** extendWHEdges3D_I16 *****************************/
+/************************** extendWHEdges3D_F16 *****************************/
+/************************** extendWHEdges3D_F32 *****************************/
+/* Description : P6 implementation for extending the edges of a 3D tile     */
+/*               by filling different edge values for different depths and  */
+/*               extends the edges along dimension 1(W) and dimension 2(H)  */
+/*               3D tile                                                    */
+/* Inputs      : pValue(array of edge values)                               */
+/* Outputs     : None (static helper, returns void)                         */
+/* InOuts      : Destination Tile                                           */
+/* Assumptions : dstData is signed 8/16 bit Integer or half precision       */
+/*               float(FP16) or single precision float(FP32)                */
+/*               based on MORPH specifier. 
*/ +/****************************************************************************/ +static _XAI_INLINE_ void MAKE_NAME(extendWHEdges3D) MAKE_ARGUMENTS2(dstTile, pValue, frame3DSize) +{ + /* Getting parameters from the tile structures */ + const int32_t dim1Size = XAI_TILE3D_GET_DIM1(dstTile); + const int32_t dim2Size = XAI_TILE3D_GET_DIM2(dstTile); + const int32_t dim1Edge1 = XAI_TILE3D_GET_DIM1_EDGE1(dstTile); + const int32_t dim1Edge2 = XAI_TILE3D_GET_DIM1_EDGE2(dstTile); + const int32_t dim2Edge1 = XAI_TILE3D_GET_DIM2_EDGE1(dstTile); + const int32_t dim2Edge2 = XAI_TILE3D_GET_DIM2_EDGE2(dstTile); + int32_t dim3Size = XAI_TILE3D_GET_DIM3(dstTile); + + const int32_t dstDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(dstTile); + const int32_t dstDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(dstTile); + int32_t frame_dim1 = frame3DSize.dim1Size; + int32_t frame_dim2 = frame3DSize.dim2Size; + int32_t dim1ExtendEdgeSize = dim1Size + dim1Edge1 + dim1Edge2; + + int32_t start_x = XAI_TILE3D_GET_DIM1_COORD(dstTile); + int32_t start_y = XAI_TILE3D_GET_DIM2_COORD(dstTile); + + MORPH_IDT_SCALAR *restrict pDst3D = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(dstTile); + int32_t ixmin = MAX2(start_x - dim1Edge1, 0); + int32_t ixmax = MIN2(start_x + dim1Size + dim1Edge2 - 1, frame_dim1 - 1); + int32_t iymin = MAX2(start_y - dim2Edge1, 0); + int32_t iymax = MIN2(start_y + dim2Size + dim2Edge2 - 1, frame_dim2 - 1); + + int x, y, z; /* Loop variables */ + + // horizontal top + int32_t horTopXcord = -dim1Edge1; + int32_t horTopYcord = -dim2Edge1; + int32_t horTopWidth = dim1Size + dim1Edge1 + dim1Edge2; + int32_t horTopHeight = iymin - (start_y - dim2Edge1); + + // horizontal bottom + int32_t horBottomXcord = -dim1Edge1; + int32_t horBottomYcord = iymax + 1 - start_y; + int32_t horBottomWidth = dim1Size + dim1Edge1 + dim1Edge2; + int32_t horBottomHeight = start_y + dim2Size + dim2Edge2 - 1 - iymax; + + // vertical left + int32_t verLeftXcord = -dim1Edge1; + int32_t verLeftYcord = horTopYcord + 
horTopHeight; + int32_t verLeftWidth = ixmin - (start_x - dim1Edge1); + int32_t verLeftHeight = iymax - iymin + 1; + + // vertical right + int32_t verRightXcord = ixmax + 1 - start_x; + int32_t verRightYcord = horTopYcord + horTopHeight; + int32_t verRightWidth = start_x + dim1Size + dim1Edge2 - 1 - ixmax; + int32_t verRightHeight = iymax - iymin + 1; + + valign vaOutData1 = IVP_ZALIGN(); + valign vaOutData2 = IVP_ZALIGN(); + + MORPH_IDT_VEC *restrict pdvecOut1, *restrict pdvecOut2; + MORPH_IDT_SCALAR *restrict pDst1, *restrict pDst2; + + if (dstDataPitch1 == dim1ExtendEdgeSize) + { + for (z = 0; z < dim3Size; z += 2) + { + int32_t remZ = XT_SALT(1, dim3Size - z); //remaining (dim3Size - z) greater than 1, then remZ = 1, else 0 + + const MORPH_IDT_SCALAR value1 = pValue[z]; + const MORPH_IDT_SCALAR value2 = pValue[z + remZ]; + + // horizontal top + pDst1 = (MORPH_IDT_SCALAR *) pDst3D + (z * dstDataPitch2) + \ + ((horTopYcord * dstDataPitch1) + horTopXcord); + pDst2 = (MORPH_IDT_SCALAR *) pDst3D + ((z + remZ) * dstDataPitch2) + \ + ((horTopYcord * dstDataPitch1) + horTopXcord); + if (horTopHeight > 0) + { + pdvecOut1 = (MORPH_IDT_VEC *) (pDst1); + pdvecOut2 = (MORPH_IDT_VEC *) (pDst2); + for (x = 0; x < horTopWidth * horTopHeight; x += MORPH_VECTORIZATIONWIDTH) + { + MORPH_OP_STORE(value1, vaOutData1, pdvecOut1, + sizeof(MORPH_IDT_SCALAR) * (horTopWidth * horTopHeight - x)); + MORPH_OP_FLUSH(vaOutData1, pdvecOut1); + + MORPH_OP_STORE(value2, vaOutData2, pdvecOut2, + sizeof(MORPH_IDT_SCALAR) * (horTopWidth * horTopHeight - x) * remZ); + MORPH_OP_FLUSH(vaOutData2, pdvecOut2); + } + } + + // horizontal bottom + pDst1 = (MORPH_IDT_SCALAR *) pDst3D + (z * dstDataPitch2) + \ + ((horBottomYcord * dstDataPitch1) + horBottomXcord); + pDst2 = (MORPH_IDT_SCALAR *) pDst3D + ((z + remZ) * dstDataPitch2) + \ + ((horBottomYcord * dstDataPitch1) + horBottomXcord); + if (horBottomHeight > 0) + { + pdvecOut1 = (MORPH_IDT_VEC *) (pDst1); + pdvecOut2 = (MORPH_IDT_VEC *) (pDst2); + for 
(x = 0; x < horBottomWidth * horBottomHeight; x += MORPH_VECTORIZATIONWIDTH) + { + MORPH_OP_STORE(value1, vaOutData1, pdvecOut1, + sizeof(MORPH_IDT_SCALAR) * (horBottomWidth * horBottomHeight - x)); + MORPH_OP_FLUSH(vaOutData1, pdvecOut1); + + MORPH_OP_STORE(value2, vaOutData2, pdvecOut2, + sizeof(MORPH_IDT_SCALAR) * (horBottomWidth * horBottomHeight - x) * remZ); + MORPH_OP_FLUSH(vaOutData2, pdvecOut2); + } + } + } + } + else + { + for (z = 0; z < dim3Size; z += 2) + { + int32_t remZ = XT_SALT(1, dim3Size - z); //remaining (dim3Size - z) greater than 1, then remZ = 1, else 0 + + const MORPH_IDT_SCALAR value1 = pValue[z]; + const MORPH_IDT_SCALAR value2 = pValue[z + remZ]; + + // horizontal top + pDst1 = (MORPH_IDT_SCALAR *) pDst3D + (z * dstDataPitch2) + \ + ((horTopYcord * dstDataPitch1) + horTopXcord); + pDst2 = (MORPH_IDT_SCALAR *) pDst3D + ((z + remZ) * dstDataPitch2) + \ + ((horTopYcord * dstDataPitch1) + horTopXcord); + + if (horTopHeight > 0) + { + for (x = 0; x < horTopWidth; x += MORPH_VECTORIZATIONWIDTH) + { + int32_t remX = XT_MIN((horTopWidth - x), MORPH_VECTORIZATIONWIDTH); + for (y = 0; y < horTopHeight; y++) + { + pdvecOut1 = (MORPH_IDT_VEC *) (pDst1 + (y * dstDataPitch1) + x); + pdvecOut2 = (MORPH_IDT_VEC *) (pDst2 + (y * dstDataPitch1) + x); + MORPH_OP_STORE(value1, vaOutData1, pdvecOut1, sizeof(MORPH_IDT_SCALAR) * remX); + MORPH_OP_FLUSH(vaOutData1, pdvecOut1); + MORPH_OP_STORE(value2, vaOutData1, pdvecOut2, sizeof(MORPH_IDT_SCALAR) * remX * remZ); + MORPH_OP_FLUSH(vaOutData1, pdvecOut2); + } + } + } //if( horTopHeight > 0) + + // horizontal bottom + pDst1 = (MORPH_IDT_SCALAR *) pDst3D + (z * dstDataPitch2) + \ + ((horBottomYcord * dstDataPitch1) + horBottomXcord); + pDst2 = (MORPH_IDT_SCALAR *) pDst3D + ((z + remZ) * dstDataPitch2) + \ + ((horBottomYcord * dstDataPitch1) + horBottomXcord); + + if (horBottomHeight > 0) + { + for (x = 0; x < horBottomWidth; x += MORPH_VECTORIZATIONWIDTH) + { + int32_t remX = XT_MIN((horBottomWidth - x), 
MORPH_VECTORIZATIONWIDTH); + for (y = 0; y < horBottomHeight; y++) + { + pdvecOut1 = (MORPH_IDT_VEC *) (pDst1 + (y * dstDataPitch1) + x); + pdvecOut2 = (MORPH_IDT_VEC *) (pDst2 + (y * dstDataPitch1) + x); + MORPH_OP_STORE(value1, vaOutData1, pdvecOut1, sizeof(MORPH_IDT_SCALAR) * remX); + MORPH_OP_FLUSH(vaOutData1, pdvecOut1); + MORPH_OP_STORE(value2, vaOutData1, pdvecOut2, sizeof(MORPH_IDT_SCALAR) * remX * remZ); + MORPH_OP_FLUSH(vaOutData1, pdvecOut2); + } + } + } + } + } + + + for (z = 0; z < dim3Size; z += 2) + { + int32_t remZ = XT_SALT(1, dim3Size - z); //remaining (dim3Size - z) greater than 1, then remZ = 1, else 0 + + const MORPH_IDT_SCALAR value1 = pValue[z]; + const MORPH_IDT_SCALAR value2 = pValue[z + remZ]; + + // vertical left + pDst1 = (MORPH_IDT_SCALAR *) pDst3D + (z * dstDataPitch2) + \ + ((verLeftYcord * dstDataPitch1) + verLeftXcord); + pDst2 = (MORPH_IDT_SCALAR *) pDst3D + ((z + remZ) * dstDataPitch2) + \ + ((verLeftYcord * dstDataPitch1) + verLeftXcord); + + for (x = 0; x < verLeftWidth; x += MORPH_VECTORIZATIONWIDTH) + { + int32_t remX = XT_MIN((verLeftWidth - x), MORPH_VECTORIZATIONWIDTH); + for (y = 0; y < verLeftHeight; y++) + { + pdvecOut1 = (MORPH_IDT_VEC *) (pDst1 + (y * dstDataPitch1) + x); + pdvecOut2 = (MORPH_IDT_VEC *) (pDst2 + (y * dstDataPitch1) + x); + MORPH_OP_STORE(value1, vaOutData1, pdvecOut1, sizeof(MORPH_IDT_SCALAR) * remX); + MORPH_OP_FLUSH(vaOutData1, pdvecOut1); + MORPH_OP_STORE(value2, vaOutData1, pdvecOut2, sizeof(MORPH_IDT_SCALAR) * remX * remZ); + MORPH_OP_FLUSH(vaOutData1, pdvecOut2); + } + } + + // vertical right + pDst1 = (MORPH_IDT_SCALAR *) pDst3D + (z * dstDataPitch2) + \ + ((verRightYcord * dstDataPitch1) + verRightXcord); + pDst2 = (MORPH_IDT_SCALAR *) pDst3D + ((z + remZ) * dstDataPitch2) + \ + ((verRightYcord * dstDataPitch1) + verRightXcord); + + for (x = 0; x < verRightWidth; x += MORPH_VECTORIZATIONWIDTH) + { + int32_t remX = XT_MIN((verRightWidth - x), MORPH_VECTORIZATIONWIDTH); + + for (y = 0; y < 
verRightHeight; y++) + { + pdvecOut1 = (MORPH_IDT_VEC *) (pDst1 + (y * dstDataPitch1) + x); + pdvecOut2 = (MORPH_IDT_VEC *) (pDst2 + (y * dstDataPitch1) + x); + MORPH_OP_STORE(value1, vaOutData1, pdvecOut1, sizeof(MORPH_IDT_SCALAR) * remX); + MORPH_OP_FLUSH(vaOutData1, pdvecOut1); + MORPH_OP_STORE(value2, vaOutData1, pdvecOut2, sizeof(MORPH_IDT_SCALAR) * remX * remZ); + MORPH_OP_FLUSH(vaOutData1, pdvecOut2); + } + } + } +} + + +/***************************** extendEdges3D_I8_WHD ******************************/ +/***************************** extendEdges3D_I16_WHD *****************************/ +/***************************** extendEdges3D_F16_WHD *****************************/ +/***************************** extendEdges3D_F32_WHD *****************************/ +/* Description : P6 optimized generic implementation of xaiExtendEdges 3D */ +/* function. Based on MORPH pre-processor specifiers, code */ +/* implementation is generated during preprocessing stage. This */ +/* method implements extendEdges3D_I8_WHD, extendEdges3D_I16_WHD, */ +/* extendEdges3D_F16_WHD and extendEdges3D_F32_WHD functionality */ +/* Inputs : pArray (fill value for each plane along dim3), frame3DSize */ +/* Outputs : None */ +/* InOuts : Destination Tile */ +/* Assumptions : OutData is signed/unsigned 8/16 bit Integer or */ +/* half precision float(FP16) or single precision float(FP32) */ +/* based on MORPH specifier */ +/*********************************************************************************/ + +static _XAI_INLINE_ void MAKE_NAME_1(extendEdges3D, WHD) (xai_pTile3D dstTile, + const xai_pArray pArray, + xai_size3D frame3DSize) +{ + /* Getting parameters from the tile structures */ + const int32_t dim1Size = XAI_TILE3D_GET_DIM1(dstTile); + const int32_t dim2Size = XAI_TILE3D_GET_DIM2(dstTile); + const int32_t dim3Size = XAI_TILE3D_GET_DIM3(dstTile); + const int32_t dim1Edge1 = XAI_TILE3D_GET_DIM1_EDGE1(dstTile); + const int32_t dim1Edge2 = XAI_TILE3D_GET_DIM1_EDGE2(dstTile); + const int32_t 
dim2Edge1 = XAI_TILE3D_GET_DIM2_EDGE1(dstTile); + const int32_t dim2Edge2 = XAI_TILE3D_GET_DIM2_EDGE2(dstTile); + const int32_t dim3Edge1 = XAI_TILE3D_GET_DIM3_EDGE1(dstTile); + const int32_t dim3Edge2 = XAI_TILE3D_GET_DIM3_EDGE2(dstTile); + const int32_t dstDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(dstTile); + const int32_t dstDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(dstTile); + + int32_t frame_dim1 = frame3DSize.dim1Size; + int32_t frame_dim2 = frame3DSize.dim2Size; + int32_t frame_dim3 = frame3DSize.dim3Size; + int32_t start_x = XAI_TILE3D_GET_DIM1_COORD(dstTile); + int32_t start_y = XAI_TILE3D_GET_DIM2_COORD(dstTile); + int32_t start_z = XAI_TILE3D_GET_DIM3_COORD(dstTile); + + int32_t ixmin = MAX2(start_x - dim1Edge1, 0); + int32_t ixmax = MIN2(start_x + dim1Size + dim1Edge2 - 1, frame_dim1 - 1); + int32_t iymin = MAX2(start_y - dim2Edge1, 0); + int32_t iymax = MIN2(start_y + dim2Size + dim2Edge2 - 1, frame_dim2 - 1); + int32_t izmin = MAX2(start_z - dim3Edge1, 0); + int32_t izmax = MIN2(start_z + dim3Size + dim3Edge2 - 1, frame_dim3 - 1); + + /* Update local 3D tile structure with dstTile structure parameters. 
Local */ + /* 3D tile structure is used as parameter to implement fillTile functionality */ + xai_tile3D dst_t; + XAI_TILE3D_SET_DIM1(&dst_t, dim1Size); + XAI_TILE3D_SET_DIM1_PITCH(&dst_t, dstDataPitch1); + XAI_TILE3D_SET_DIM1_EDGE1(&dst_t, dim1Edge1); + XAI_TILE3D_SET_DIM1_EDGE2(&dst_t, dim1Edge2); + XAI_TILE3D_SET_DIM2(&dst_t, dim2Size); + XAI_TILE3D_SET_DIM2_PITCH(&dst_t, dstDataPitch2); + XAI_TILE3D_SET_DIM2_EDGE1(&dst_t, dim2Edge1); + XAI_TILE3D_SET_DIM2_EDGE2(&dst_t, dim2Edge2); + XAI_TILE3D_SET_DIM3_EDGE1(&dst_t, 0); + XAI_TILE3D_SET_DIM3_EDGE2(&dst_t, 0); + XAI_TILE3D_SET_DIM1_COORD(&dst_t, start_x); + XAI_TILE3D_SET_DIM2_COORD(&dst_t, start_y); + XAI_TILE3D_SET_DIM3_COORD(&dst_t, start_z); + XAI_TILE3D_SET_BUFF_PTR(&dst_t, XAI_TILE3D_GET_BUFF_PTR(dstTile)); + XAI_TILE3D_SET_BUFF_SIZE(&dst_t, XAI_TILE3D_GET_BUFF_SIZE(dstTile)); + XAI_TILE3D_SET_TYPE(&dst_t, XAI_TILE3D_GET_TYPE(dstTile)); + + MORPH_IDT_SCALAR *pDst = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(dstTile); + const MORPH_IDT_SCALAR *pValue = (MORPH_IDT_SCALAR *) XAI_ARRAY_GET_DATA_PTR(pArray); + int32_t z; /* Loop variable */ + MORPH_IDT_SCALAR *pDst1; + MORPH_IDT_SCALAR value; + + /* Validation for Tile and Frame intersection */ + int32_t frameIntersectionFlag = ((ixmin > ixmax) || (iymin > iymax) || (izmin > izmax)); + + /*********************************************************************************/ + /* P6 implementation of xaiExtendEdges3D is similar to xaiExtendEdgesConst3D */ + /* implementation. xaiExtendEdgesConst3D extends the edges with one single */ + /* value, whereas xaiExtendEdges3D fills the edges of each 2D tile with its */ + /* own value taken from the xai_array, indexed by the tile's co-ordinate */ + /* position across the third dimension. For the 2D tiles lying in the 3rd */ + /* dimension edges, the fill has to be performed over the entire 2D tile. */ + /* xaiExtendEdges3D processing is split into 3 parts. 
ExtendEdges processing */ + /* across 3rd dimension edges is split as front end and rear end processing. */ + /* Processing across 3rd dimension excluding the edge is implemented similar to */ + /* 2D implementation of extendEdges functionality. */ + /*********************************************************************************/ + + if (frameIntersectionFlag) + { + /* If frameIntersectionFlag is enabled the tile exists outside frame boundary */ + /* and ExtendEdges need to be done on the entire 3D tile. */ + + const int32_t dim3FillSize = dim3Size + dim3Edge1 + dim3Edge2; + pDst1 = &pDst[((-dim3Edge1) * dstDataPitch2)]; + for (z = 0; z < dim3FillSize; z++) /* Loop across dim3 */ + { + value = pValue[z]; + /* update destination data pointer */ + MORPH_IDT_SCALAR *pDst2 = pDst1 + (z * dstDataPitch2); + XAI_TILE3D_SET_DATA_PTR(&dst_t, pDst2); + XAI_TILE3D_SET_DIM3(&dst_t, 1); + MORPH_IDT_FILLTILE(&dst_t, value, 1); + } + return; + } + + /* Number of 2D tiles to be processed across edge1 3rd dimension */ + int32_t dim3SizeFrontEnd = izmin - (start_z - dim3Edge1); + /* Offset calculation for Extend Edge across 3rd dimension excluding edges */ + int32_t dim3CordMiddle = izmin - start_z; + /* Number of 2D tiles to be processed across 3rd dimension excluding edges */ + int32_t dim3SizeMiddle = izmax - izmin + 1; + /* Offset calculation for Extend Edge across edge 2 3rd */ + int32_t dim3CordRearEnd = izmax + 1 - start_z; + /* Number of 2D tiles processing to Extend Edge across 3rd edge2 dimension */ + int32_t dim3SizeRearEnd = start_z + dim3Size + dim3Edge2 - 1 - izmax; + + /***********************************************************************************/ + /* Processing across the 3rd dimension edges (edge1 and edge2) */ + /* Processing across 3rd dimension edge 1 is referred as Front End Processing */ + /* Processing across 3rd dimension edge 2 is referred as Rear End Processing */ + /* Local copy of 3D tile is declared and updated with destination tile parameters. 
*/ + /* Size parameter across third dimension is updated based on number of 2D tiles */ + /* to be processed across front and rear end. In order to effectively use the */ + /* SIMD capabilities xaiFillTile3D implementation is utilized. */ + /***********************************************************************************/ + + if (dim3SizeFrontEnd > 0) + { + /***********************************************************************************/ + /* Front end processing : Processing along the 3rd dimension edge 1. */ + /***********************************************************************************/ + + /* Update destination data pointer */ + pDst1 = &pDst[((-dim3Edge1) * dstDataPitch2)]; + XAI_TILE3D_SET_DIM3(&dst_t, 1); + for (z = 0; z < dim3SizeFrontEnd; z++) /* Loop across dim3 */ + { + value = pValue[z]; + /* update destination data pointer */ + MORPH_IDT_SCALAR *pDst2 = pDst1 + (z * dstDataPitch2); + XAI_TILE3D_SET_DATA_PTR(&dst_t, pDst2); + MORPH_IDT_FILLTILE(&dst_t, value, 1); + } + } + if (dim3SizeRearEnd > 0) + { + /***********************************************************************************/ + /* Rear end processing : Processing along the 3rd dimension edge 2. 
*/ + /***********************************************************************************/ + + /* Update destination data pointer */ + pDst1 = &pDst[(dim3CordRearEnd * dstDataPitch2)]; + XAI_TILE3D_SET_DIM3(&dst_t, 1); + for (z = 0; z < dim3SizeRearEnd; z++) /* Loop across dim3 */ + { + /* update destination data pointer */ + MORPH_IDT_SCALAR *pDst2 = pDst1 + (z * dstDataPitch2); + value = pValue[z + dim3CordRearEnd + dim3Edge1]; + XAI_TILE3D_SET_DATA_PTR(&dst_t, pDst2); + MORPH_IDT_FILLTILE(&dst_t, value, 1); + } + } + + /* Update destination data pointer */ + pDst1 = &pDst[(dim3CordMiddle * dstDataPitch2)]; + XAI_TILE3D_SET_DIM3(&dst_t, dim3SizeMiddle); + + XAI_TILE3D_SET_DATA_PTR(&dst_t, pDst1); + MORPH_OP_FUNCTION(&dst_t, pValue + dim3CordMiddle + dim3Edge1, frame3DSize); +} + +/*************************** extendEdges3D_I8_DWH *********************************/ +/*************************** extendEdges3D_I16_DWH ********************************/ +/*************************** extendEdges3D_F16_DWH ********************************/ +/*************************** extendEdges3D_F32_DWH ********************************/ +/* Description : P6 optimized generic implementation of xaiExtendEdgesConst 3D */ +/* function. Based on MORPH pre-processor specifiers, code */ +/* implementation is generated during preprocessing stage. This */ +/* method implements extendEdges3D_I8_DWH and extendEdges3D_I16_DWH */ +/* extendEdges3D_F16_DWH and extendEdges3D_F32_DWH functionality. 
*/ +/* Inputs : pArray (fill values along dim1, i.e. depth), frame3DSize */ +/* Outputs : None */ +/* InOuts : Destination Tile */ +/* Assumptions : OutData is signed/unsigned 8/16 bit Integer or */ +/* half precision float(FP16) or single precision float(FP32) */ +/* based on MORPH specifier */ +/**********************************************************************************/ + +static _XAI_INLINE_ void MAKE_NAME_1(extendEdges3D, DWH) (xai_pTile3D dstTile, + const xai_pArray pArray, + xai_size3D frame3DSize) +{ + /* Getting parameters from the tile structures */ + const int32_t dim1Size = XAI_TILE3D_GET_DIM1(dstTile); + const int32_t dim2Size = XAI_TILE3D_GET_DIM2(dstTile); + const int32_t dim3Size = XAI_TILE3D_GET_DIM3(dstTile); + const int32_t dim1Edge1 = XAI_TILE3D_GET_DIM1_EDGE1(dstTile); + const int32_t dim1Edge2 = XAI_TILE3D_GET_DIM1_EDGE2(dstTile); + const int32_t dim2Edge1 = XAI_TILE3D_GET_DIM2_EDGE1(dstTile); + const int32_t dim2Edge2 = XAI_TILE3D_GET_DIM2_EDGE2(dstTile); + const int32_t dim3Edge1 = XAI_TILE3D_GET_DIM3_EDGE1(dstTile); + const int32_t dim3Edge2 = XAI_TILE3D_GET_DIM3_EDGE2(dstTile); + const int32_t dstDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(dstTile); + const int32_t dstDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(dstTile); + const int32_t bytesPerPixel = XAI_TILE3D_GET_ELEMENT_SIZE(dstTile); + + int32_t frame_dim1 = frame3DSize.dim1Size; + int32_t frame_dim2 = frame3DSize.dim2Size; + int32_t frame_dim3 = frame3DSize.dim3Size; + int32_t start_x = XAI_TILE3D_GET_DIM1_COORD(dstTile); // along Depth + int32_t start_y = XAI_TILE3D_GET_DIM2_COORD(dstTile); // along Width + int32_t start_z = XAI_TILE3D_GET_DIM3_COORD(dstTile); // along Height + + int32_t ixmin = MAX2(start_x - dim1Edge1, 0); + int32_t ixmax = MIN2(start_x + dim1Size + dim1Edge2 - 1, frame_dim1 - 1); + int32_t iymin = MAX2(start_y - dim2Edge1, 0); + int32_t iymax = MIN2(start_y + dim2Size + dim2Edge2 - 1, frame_dim2 - 1); + int32_t izmin = MAX2(start_z - dim3Edge1, 0); + int32_t izmax = MIN2(start_z 
+ dim3Size + dim3Edge2 - 1, frame_dim3 - 1); + + // horizontal top + int32_t horTopXcord = -dim1Edge1; + int32_t horTopYcord = -dim2Edge1; + int32_t horTopWidth = dim1Size + dim1Edge1 + dim1Edge2; + int32_t horTopHeight = iymin - (start_y - dim2Edge1); + + // horizontal bottom + int32_t horBottomXcord = -dim1Edge1; + int32_t horBottomYcord = iymax + 1 - start_y; + int32_t horBottomWidth = dim1Size + dim1Edge1 + dim1Edge2; + int32_t horBottomHeight = start_y + dim2Size + dim2Edge2 - 1 - iymax; + + // vertical left + int32_t verLeftXcord = -dim1Edge1; + int32_t verLeftYcord = horTopYcord + horTopHeight; + int32_t verLeftWidth = ixmin - (start_x - dim1Edge1); + int32_t verLeftHeight = iymax - iymin + 1; + + // vertical right + int32_t verRightXcord = ixmax + 1 - start_x; + int32_t verRightYcord = horTopYcord + horTopHeight; + int32_t verRightWidth = start_x + dim1Size + dim1Edge2 - 1 - ixmax; + int32_t verRightHeight = iymax - iymin + 1; + + // front + int32_t frontXcord = -dim1Edge1; + int32_t frontYcord = horTopYcord + horTopHeight; + int32_t frontZcord = -dim3Edge1; + int32_t frontDepth = izmin - (start_z - dim3Edge1); + int32_t frontWidth = horTopWidth; + int32_t frontHeight = iymax - iymin + 1; + + // rear + int32_t rearXcord = -dim1Edge1; + int32_t rearYcord = horTopYcord + horTopHeight; + int32_t rearZcord = izmax + 1 - start_z; + int32_t rearDepth = start_z + dim3Size + dim3Edge2 - 1 - izmax; + int32_t rearWidth = horTopWidth; + int32_t rearHeight = iymax - iymin + 1; + + int x, y, z; /* Loop variables */ + valign vaOutData = IVP_ZALIGN(); + valign vaArray; + int32_t vectorizationWidth = MORPH_VECTORIZATIONWIDTH; + + MORPH_IDT_SCALAR *restrict pDst3D = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(dstTile); + MORPH_IDT_SCALAR *restrict pArr = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(pArray) + dim1Edge1; + + MORPH_IDT_VEC *restrict pdvecArr, *restrict pdvecDst; + MORPH_IDT_VEC dvecArrData; + + /* Tile and frame intersection is empty,fill entire tile with 
edge values */ + if ((ixmin > ixmax) || (iymin > iymax) || (izmin > izmax)) + { + pdvecArr = (MORPH_IDT_VEC *) (pArr - dim1Edge1); + + /* priming of pArray */ + vaArray = MORPH_OP_PRIME(pdvecArr); + + for (x = 0; x < (dim1Size + dim1Edge1 + dim1Edge2); x += vectorizationWidth) + { + /* Load pArray */ + MORPH_OP_LOAD(dvecArrData, vaArray, pdvecArr, (dim1Size + dim1Edge1 + dim1Edge2 - x) * bytesPerPixel); + + for (z = 0; z < (dim3Size + dim3Edge1 + dim3Edge2); z++) + { + for (y = 0; y < (dim2Size + dim2Edge1 + dim2Edge2); y++) + { + pdvecDst = (MORPH_IDT_VEC *) (pDst3D + (z - dim3Edge1) * dstDataPitch2 + \ + (y - dim2Edge1) * dstDataPitch1 + (-dim1Edge1) + x); + + /* store array value in destination */ + MORPH_OP_STORE(dvecArrData, vaOutData, pdvecDst, (dim1Size + dim1Edge1 + dim1Edge2 - x) * bytesPerPixel); + + MORPH_OP_FLUSH(vaOutData, pdvecDst); + } + } + } + } + else + { + /* Front Height Edge */ + if (frontDepth > 0) + { + pdvecArr = (MORPH_IDT_VEC *) (pArr + frontXcord); + + /* priming of pArray */ + vaArray = MORPH_OP_PRIME(pdvecArr); + + for (x = 0; x < frontWidth; x += vectorizationWidth) + { + /* Load pArray */ + MORPH_OP_LOAD(dvecArrData, vaArray, pdvecArr, (frontWidth - x) * bytesPerPixel); + + for (z = 0; z < frontDepth; z++) + { + for (y = 0; y < frontHeight; y++) + { + pdvecDst = (MORPH_IDT_VEC *) (pDst3D + (frontZcord + z) * dstDataPitch2 + \ + (y + frontYcord) * dstDataPitch1 + frontXcord + x); + + /* store array value in destination */ + MORPH_OP_STORE(dvecArrData, vaOutData, pdvecDst, (frontWidth - x) * bytesPerPixel); + + MORPH_OP_FLUSH(vaOutData, pdvecDst); + } + } + } + } + + /* Rear Height Edge */ + if (rearDepth > 0) + { + pdvecArr = (MORPH_IDT_VEC *) (pArr + rearXcord); + + /* priming of pArray */ + vaArray = MORPH_OP_PRIME(pdvecArr); + + for (x = 0; x < rearWidth; x += vectorizationWidth) + { + /* Load pArray */ + MORPH_OP_LOAD(dvecArrData, vaArray, pdvecArr, (rearWidth - x) * bytesPerPixel); + + for (z = 0; z < rearDepth; z++) + { + for (y 
= 0; y < rearHeight; y++) + { + pdvecDst = (MORPH_IDT_VEC *) (pDst3D + (rearZcord + z) * dstDataPitch2 + \ + (y + rearYcord) * dstDataPitch1 + rearXcord + x); + + /* store array value in destination */ + MORPH_OP_STORE(dvecArrData, vaOutData, pdvecDst, (rearWidth - x) * bytesPerPixel); + + MORPH_OP_FLUSH(vaOutData, pdvecDst); + } + } + } + } + + /* Top Width Edge */ + if (horTopHeight > 0) + { + pdvecArr = (MORPH_IDT_VEC *) (pArr + horTopXcord); + + /* priming of pArray */ + vaArray = MORPH_OP_PRIME(pdvecArr); + + for (x = 0; x < horTopWidth; x += vectorizationWidth) + { + /* Load pArray */ + MORPH_OP_LOAD(dvecArrData, vaArray, pdvecArr, (horTopWidth - x) * bytesPerPixel); + + for (z = 0; z < (dim3Size + dim3Edge1 + dim3Edge2); z++) + { + for (y = 0; y < horTopHeight; y++) + { + pdvecDst = (MORPH_IDT_VEC *) (pDst3D + (z - dim3Edge1) * dstDataPitch2 + \ + (horTopYcord + y) * dstDataPitch1 + horTopXcord + x); + + /* store array value in destination */ + MORPH_OP_STORE(dvecArrData, vaOutData, pdvecDst, (horTopWidth - x) * bytesPerPixel); + + MORPH_OP_FLUSH(vaOutData, pdvecDst); + } + } + } + } + + /* Bottom Width Edge */ + if (horBottomHeight > 0) + { + pdvecArr = (MORPH_IDT_VEC *) (pArr + horBottomXcord); + + /* priming of pArray */ + vaArray = MORPH_OP_PRIME(pdvecArr); + + for (x = 0; x < horBottomWidth; x += vectorizationWidth) + { + /* Load pArray */ + MORPH_OP_LOAD(dvecArrData, vaArray, pdvecArr, (horBottomWidth - x) * bytesPerPixel); + + for (z = 0; z < (dim3Size + dim3Edge1 + dim3Edge2); z++) + { + for (y = 0; y < horBottomHeight; y++) + { + pdvecDst = (MORPH_IDT_VEC *) (pDst3D + (z - dim3Edge1) * dstDataPitch2 + \ + (horBottomYcord + y) * dstDataPitch1 + horBottomXcord + x); + + /* store array value in destination */ + MORPH_OP_STORE(dvecArrData, vaOutData, pdvecDst, (horBottomWidth - x) * bytesPerPixel); + + MORPH_OP_FLUSH(vaOutData, pdvecDst); + } + } + } + } + + /* Left Depth Edge */ + if (verLeftWidth > 0) + { + pdvecArr = (MORPH_IDT_VEC *) (pArr + 
verLeftXcord); + + /* priming of pArray */ + vaArray = MORPH_OP_PRIME(pdvecArr); + + for (x = 0; x < verLeftWidth; x += vectorizationWidth) + { + /* Load pArray */ + MORPH_OP_LOAD(dvecArrData, vaArray, pdvecArr, (verLeftWidth - x) * bytesPerPixel); + + for (z = 0; z < (dim3Size + dim3Edge1 + dim3Edge2); z++) + { + for (y = 0; y < verLeftHeight; y++) + { + pdvecDst = (MORPH_IDT_VEC *) (pDst3D + (z - dim3Edge1) * dstDataPitch2 + \ + (verLeftYcord + y) * dstDataPitch1 + verLeftXcord + x); + + /* store array value in destination */ + MORPH_OP_STORE(dvecArrData, vaOutData, pdvecDst, (verLeftWidth - x) * bytesPerPixel); + + MORPH_OP_FLUSH(vaOutData, pdvecDst); + } + } + } + } + + /* Right Depth Edge */ + if (verRightWidth > 0) + { + pdvecArr = (MORPH_IDT_VEC *) (pArr + verRightXcord); + + /* priming of pArray */ + vaArray = MORPH_OP_PRIME(pdvecArr); + + for (x = 0; x < verRightWidth; x += vectorizationWidth) + { + /* Load pArray */ + MORPH_OP_LOAD(dvecArrData, vaArray, pdvecArr, (verRightWidth - x) * bytesPerPixel); + + for (z = 0; z < (dim3Size + dim3Edge1 + dim3Edge2); z++) + { + for (y = 0; y < verRightHeight; y++) + { + pdvecDst = (MORPH_IDT_VEC *) (pDst3D + (z - dim3Edge1) * dstDataPitch2 + \ + (verRightYcord + y) * dstDataPitch1 + verRightXcord + x); + + /* store array value in destination */ + MORPH_OP_STORE(dvecArrData, vaOutData, pdvecDst, (verRightWidth - x) * bytesPerPixel); + + MORPH_OP_FLUSH(vaOutData, pdvecDst); + } + } + } + } + } +} + +#if INPUT_DATA_TYPE == INTEGER8BIT +/*********************** xaiExtendEdges3D_I8 *****************************/ +/* Description : General API for ExtendEdges3D optimized implementation */ +/* Calls one of the ExtendEdges3D functions based */ +/* on the parameters */ +/* Inputs : pArray, frame3DSize */ +/* Outputs : XI Error Code */ +/* InOuts : Input Tile */ +/****************************************************************************/ +XAI_ERR_TYPE xaiExtendEdges3D_I8(xai_pTile3D dstTile, + const xai_pArray pArray, + 
xai_size3D frame3DSize) +{ + /* Error Checks */ + XAI_ERROR_CHECKS() + { + XAI_CHECK_TILE3D_I8(dstTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(dstTile); + XAI_CHECK_ERROR( + ((XAI_TILE3D_GET_DATA_ORDER(dstTile) == XAI_WHD) || (XAI_TILE3D_GET_DATA_ORDER(dstTile) == XAI_DWH)), \ + XAI_ERR_BADARG, "Provided Data Order not supported."); + XAI_CHECK_POINTER(pArray); + XAI_CHECK_ERROR(XAI_ARRAY_IS_CONSISTENT(pArray), XAI_ERR_BADARG, "The argument pArray is invalid"); + XAI_CHECK_ERROR((frame3DSize.dim1Size > 0) && (frame3DSize.dim2Size > 0) && \ + (frame3DSize.dim3Size > 0), XAI_ERR_DATASIZE, \ + "\nframe3DSize.dim1Size = %d, frame3DSize.dim2Size = %d, frame3DSize.dim3Size = %d\nDimensions must be greater than 0", \ + frame3DSize.dim1Size, frame3DSize.dim2Size, frame3DSize.dim3Size); + } + if (XAI_TILE3D_GET_DATA_ORDER(dstTile) == XAI_WHD) + { + XAI_ERROR_CHECKS_CONTINUE() + { + XAI_CHECK_ERROR( + ((XAI_ARRAY_GET_WIDTH(pArray) >= (XAI_TILE3D_GET_DIM3(dstTile) \ + + XAI_TILE3D_GET_DIM3_EDGE1(dstTile) + XAI_TILE3D_GET_DIM3_EDGE2(dstTile)))), XAI_ERR_BADARG, \ + "pArray width parameter is not set as required"); + } + extendEdges3D_I8_WHD(dstTile, pArray, frame3DSize); + } + else if (XAI_TILE3D_GET_DATA_ORDER(dstTile) == XAI_DWH) + { + XAI_ERROR_CHECKS_CONTINUE() + { + XAI_CHECK_ERROR( + ((XAI_ARRAY_GET_WIDTH(pArray) >= (XAI_TILE3D_GET_DIM1(dstTile) \ + + XAI_TILE3D_GET_DIM1_EDGE1(dstTile) + XAI_TILE3D_GET_DIM1_EDGE2(dstTile)))), XAI_ERR_BADARG, \ + "pArray width parameter is not set as required"); + } + extendEdges3D_I8_DWH(dstTile, pArray, frame3DSize); + } + else + { + return(XAI_ERR_NO_VARIANT); + } + + return(XAI_ERROR_STATUS()); +} + +#elif INPUT_DATA_TYPE == INTEGER16BIT +/*********************** xaiExtendEdges3D_I16 ****************************/ +/* Description : General API for ExtendEdges3D optimized implementation */ +/* Calls one of the ExtendEdges3D functions based */ +/* on the parameters */ +/* Inputs : pArray, frame3DSize */ +/* Outputs : XI Error Code */ +/* 
InOuts : Input Tile */ +/****************************************************************************/ +XAI_ERR_TYPE xaiExtendEdges3D_I16(xai_pTile3D dstTile, + const xai_pArray pArray, + xai_size3D frame3DSize) +{ + /* Error Checks */ + XAI_ERROR_CHECKS() + { + XAI_CHECK_TILE3D_X16(dstTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(dstTile); + XAI_CHECK_ERROR( + ((XAI_TILE3D_GET_DATA_ORDER(dstTile) == XAI_WHD) || (XAI_TILE3D_GET_DATA_ORDER(dstTile) == XAI_DWH)), \ + XAI_ERR_BADARG, "Provided Data Order not supported."); + XAI_CHECK_POINTER(pArray); + XAI_CHECK_ERROR(XAI_ARRAY_IS_CONSISTENT(pArray), XAI_ERR_BADARG, "The argument pArray is invalid"); + XAI_CHECK_ERROR((frame3DSize.dim1Size > 0) && (frame3DSize.dim2Size > 0) && \ + (frame3DSize.dim3Size > 0), XAI_ERR_DATASIZE, \ + "\nframe3DSize.dim1Size = %d, frame3DSize.dim2Size = %d, frame3DSize.dim3Size = %d\nDimensions must be greater than 0", \ + frame3DSize.dim1Size, frame3DSize.dim2Size, frame3DSize.dim3Size); + } + if (XAI_TILE3D_GET_DATA_ORDER(dstTile) == XAI_WHD) + { + XAI_ERROR_CHECKS_CONTINUE() + { + XAI_CHECK_ERROR( + ((XAI_ARRAY_GET_WIDTH(pArray) >= (XAI_TILE3D_GET_DIM3(dstTile) \ + + XAI_TILE3D_GET_DIM3_EDGE1(dstTile) + XAI_TILE3D_GET_DIM3_EDGE2(dstTile)))), XAI_ERR_BADARG, \ + "pArray width parameter is not set as required"); + } + extendEdges3D_I16_WHD(dstTile, pArray, frame3DSize); + } + else if (XAI_TILE3D_GET_DATA_ORDER(dstTile) == XAI_DWH) + { + XAI_ERROR_CHECKS_CONTINUE() + { + XAI_CHECK_ERROR( + ((XAI_ARRAY_GET_WIDTH(pArray) >= (XAI_TILE3D_GET_DIM1(dstTile) \ + + XAI_TILE3D_GET_DIM1_EDGE1(dstTile) + XAI_TILE3D_GET_DIM1_EDGE2(dstTile)))), XAI_ERR_BADARG, \ + "pArray width parameter is not set as required"); + } + extendEdges3D_I16_DWH(dstTile, pArray, frame3DSize); + } + else + { + return(XAI_ERR_NO_VARIANT); + } + + return(XAI_ERROR_STATUS()); +} + +#elif INPUT_DATA_TYPE == FLOAT16BIT +#if (XCHAL_HAVE_VISION_HP_VFPU == 1) +/*********************** xaiExtendEdges3D_F16 ****************************/ 
+/* Description : General API for ExtendEdges3D optimized implementation */ +/* Calls one of the ExtendEdges3D functions based */ +/* on the parameters */ +/* Inputs : pArray, frame3DSize */ +/* Outputs : XI Error Code */ +/* InOuts : Input Tile */ +/****************************************************************************/ +XAI_ERR_TYPE xaiExtendEdges3D_F16(xai_pTile3D dstTile, + const xai_pArray pArray, + xai_size3D frame3DSize) +{ + /* Error Checks */ + XAI_ERROR_CHECKS() + { + XAI_CHECK_TILE3D_F16(dstTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(dstTile); + XAI_CHECK_ERROR( + ((XAI_TILE3D_GET_DATA_ORDER(dstTile) == XAI_WHD) || (XAI_TILE3D_GET_DATA_ORDER(dstTile) == XAI_DWH)), \ + XAI_ERR_BADARG, "Provided Data Order not supported."); + XAI_CHECK_POINTER(pArray); + XAI_CHECK_ERROR(XAI_ARRAY_IS_CONSISTENT(pArray), XAI_ERR_BADARG, "The argument pArray is invalid"); + XAI_CHECK_ERROR((frame3DSize.dim1Size > 0) && (frame3DSize.dim2Size > 0) && \ + (frame3DSize.dim3Size > 0), XAI_ERR_DATASIZE, \ + "\nframe3DSize.dim1Size = %d, frame3DSize.dim2Size = %d, frame3DSize.dim3Size = %d\nDimensions must be greater than 0", \ + frame3DSize.dim1Size, frame3DSize.dim2Size, frame3DSize.dim3Size); + } + if (XAI_TILE3D_GET_DATA_ORDER(dstTile) == XAI_WHD) + { + XAI_ERROR_CHECKS_CONTINUE() + { + XAI_CHECK_ERROR( + ((XAI_ARRAY_GET_WIDTH(pArray) >= (XAI_TILE3D_GET_DIM3(dstTile) \ + + XAI_TILE3D_GET_DIM3_EDGE1(dstTile) + XAI_TILE3D_GET_DIM3_EDGE2(dstTile)))), XAI_ERR_BADARG, \ + "pArray width parameter is not set as required"); + } + extendEdges3D_F16_WHD(dstTile, pArray, frame3DSize); + } + else if (XAI_TILE3D_GET_DATA_ORDER(dstTile) == XAI_DWH) + { + XAI_ERROR_CHECKS_CONTINUE() + { + XAI_CHECK_ERROR( + ((XAI_ARRAY_GET_WIDTH(pArray) >= (XAI_TILE3D_GET_DIM1(dstTile) \ + + XAI_TILE3D_GET_DIM1_EDGE1(dstTile) + XAI_TILE3D_GET_DIM1_EDGE2(dstTile)))), XAI_ERR_BADARG, \ + "pArray width parameter is not set as required"); + } + extendEdges3D_F16_DWH(dstTile, pArray, frame3DSize); + } + 
return(XAI_ERROR_STATUS()); +} +#endif //#if (XCHAL_HAVE_VISION_HP_VFPU == 1) + +#elif INPUT_DATA_TYPE == FLOAT32BIT +#if (XCHAL_HAVE_VISION_SP_VFPU == 1) +/*********************** xaiExtendEdges3D_F32 ****************************/ +/* Description : General API for ExtendEdges3D optimized implementation */ +/* Calls one of the ExtendEdges3D functions based */ +/* on the parameters */ +/* Inputs : pArray, frame3DSize */ +/* Outputs : XI Error Code */ +/* InOuts : Input Tile */ +/****************************************************************************/ +XAI_ERR_TYPE xaiExtendEdges3D_F32(xai_pTile3D dstTile, + const xai_pArray pArray, + xai_size3D frame3DSize) +{ + /* Error Checks */ + XAI_ERROR_CHECKS() + { + XAI_CHECK_TILE3D_F32(dstTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(dstTile); + XAI_CHECK_ERROR( + ((XAI_TILE3D_GET_DATA_ORDER(dstTile) == XAI_WHD) || (XAI_TILE3D_GET_DATA_ORDER(dstTile) == XAI_DWH)), \ + XAI_ERR_BADARG, "Provided Data Order not supported."); + XAI_CHECK_POINTER(pArray); + XAI_CHECK_ERROR(XAI_ARRAY_IS_CONSISTENT(pArray), XAI_ERR_BADARG, "The argument pArray is invalid"); + XAI_CHECK_ERROR((frame3DSize.dim1Size > 0) && (frame3DSize.dim2Size > 0) && \ + (frame3DSize.dim3Size > 0), XAI_ERR_DATASIZE, \ + "\nframe3DSize.dim1Size = %d, frame3DSize.dim2Size = %d, frame3DSize.dim3Size = %d\nDimensions must be greater than 0", \ + frame3DSize.dim1Size, frame3DSize.dim2Size, frame3DSize.dim3Size); + } + if (XAI_TILE3D_GET_DATA_ORDER(dstTile) == XAI_WHD) + { + XAI_ERROR_CHECKS_CONTINUE() + { + XAI_CHECK_ERROR( + ((XAI_ARRAY_GET_WIDTH(pArray) >= (XAI_TILE3D_GET_DIM3(dstTile) \ + + XAI_TILE3D_GET_DIM3_EDGE1(dstTile) + XAI_TILE3D_GET_DIM3_EDGE2(dstTile)))), XAI_ERR_BADARG, \ + "pArray width parameter is not set as required"); + } + extendEdges3D_F32_WHD(dstTile, pArray, frame3DSize); + } + else if (XAI_TILE3D_GET_DATA_ORDER(dstTile) == XAI_DWH) + { + XAI_ERROR_CHECKS_CONTINUE() + { + XAI_CHECK_ERROR( + ((XAI_ARRAY_GET_WIDTH(pArray) >= 
(XAI_TILE3D_GET_DIM1(dstTile) \ + + XAI_TILE3D_GET_DIM1_EDGE1(dstTile) + XAI_TILE3D_GET_DIM1_EDGE2(dstTile)))), XAI_ERR_BADARG, \ + "pArray width parameter is not set as required"); + } + extendEdges3D_F32_DWH(dstTile, pArray, frame3DSize); + } + return(XAI_ERROR_STATUS()); +} +#endif //#if (XCHAL_HAVE_VISION_SP_VFPU == 1) +#endif //INPUT_DATA_TYPE + +/*====================================================================================*/ +/*=============== END of xaiExtendEdges3D_* routines =================================*/ +/*====================================================================================*/ +#endif //if ((XCHAL_VISION_TYPE >= 6)) diff --git a/backends/cadence/vision/third-party/libxai/cnn/src/cnn_fill_tile.h b/backends/cadence/vision/third-party/libxai/cnn/src/cnn_fill_tile.h new file mode 100644 index 00000000000..dcabdd096f6 --- /dev/null +++ b/backends/cadence/vision/third-party/libxai/cnn/src/cnn_fill_tile.h @@ -0,0 +1,309 @@ +/* + * Copyright (c) 2021 by Cadence Design Systems, Inc. ALL RIGHTS RESERVED. + * These coded instructions, statements, and computer programs are the + * copyrighted works and confidential proprietary information of + * Cadence Design Systems Inc. They may be adapted and modified by bona fide + * purchasers for internal use, but neither the original nor any adapted + * or modified version may be disclosed or distributed to third parties + * in any manner, medium, or form, in whole or in part, without the prior + * written consent of Cadence Design Systems Inc. This software and its + * derivatives are to be executed solely on products incorporating a Cadence + * Design Systems processor. 
+ */
+
+#if ((XCHAL_VISION_TYPE >= 6))
+/* Everything down to the matching #endif is compiled only when XCHAL_VISION_TYPE >= 6.      */
+/* MAKE_NAME_IMPL pastes "_<suffix>" onto a base name; each branch below supplies the suffix. */
+
+#define MAKE_NAME_IMPL(name, MORPH_FNAME_SPECIFIER_IDT) name ## _ ## MORPH_FNAME_SPECIFIER_IDT
+
+#if INPUT_DATA_TYPE == INTEGER8BIT
+/* Every branch first clears the MORPH_* names and then redefines them for its own data type. */
+#undef MAKE_ARGUMENTS
+#undef MAKE_NAME
+#undef MORPH_IDT_CHECK
+#undef MORPH_IDT_SCALAR
+#undef MORPH_IDT_VECTOR
+#undef MORPH_VECTORIZATION_WIDTH
+#undef MORPH_OP_STORE_IP
+#undef MORPH_OP_VAR_STORE_XP
+#undef MORPH_OP_PRIME
+#undef MORPH_OP_FLUSH
+#undef MORPH_BYTES_PER_PIXEL
+
+#define MAKE_ARGUMENTS(a, b, c)    (xai_pTile3D a, const int32_t b, xai_bool c) /* parameter list: tile, fill value, edge flag */
+#define MAKE_NAME(name)            MAKE_NAME_IMPL(name, I8)                     /* e.g. xaiFillTile3D -> xaiFillTile3D_I8 */
+#define MORPH_IDT_CHECK            XAI_CHECK_TILE3D_I8                          /* tile check used in the error-check block */
+#define MORPH_IDT_SCALAR           int8_t                                       /* element type addressed through pDst */
+#define MORPH_IDT_VECTOR           xb_vec2Nx8                                   /* vector type used for the stores */
+#define MORPH_VECTORIZATION_WIDTH  (2 * XCHAL_IVPN_SIMD_WIDTH)                  /* elements advanced per full-vector store */
+#define MORPH_OP_STORE_IP          IVP_SA2NX8_IP                                /* store issued once per full vector in the loops below */
+#define MORPH_OP_VAR_STORE_XP      IVP_SAV2NX8_XP                               /* tail store; last argument is a byte count */
+#define MORPH_OP_PRIME             IVP_LA2NX8_PP                                /* defined for every type but not referenced by xaiFillTile3D */
+#define MORPH_OP_FLUSH             IVP_SAPOS2NX8_FP                             /* issued after every variable-length store */
+#define MORPH_BYTES_PER_PIXEL      1                                            /* scales element counts to byte counts */
+
+#elif INPUT_DATA_TYPE == INTEGER16BIT
+/* Same set of definitions as above, for 16-bit integer tiles (suffix I16). */
+#undef MAKE_ARGUMENTS
+#undef MAKE_NAME
+#undef MORPH_IDT_CHECK
+#undef MORPH_IDT_SCALAR
+#undef MORPH_IDT_VECTOR
+#undef MORPH_VECTORIZATION_WIDTH
+#undef MORPH_OP_STORE_IP
+#undef MORPH_OP_VAR_STORE_XP
+#undef MORPH_OP_PRIME
+#undef MORPH_OP_FLUSH
+#undef MORPH_BYTES_PER_PIXEL
+
+#define MAKE_ARGUMENTS(a, b, c)    (xai_pTile3D a, const int32_t b, xai_bool c)
+#define MAKE_NAME(name)            MAKE_NAME_IMPL(name, I16)
+#define MORPH_IDT_CHECK            XAI_CHECK_TILE3D_I16
+#define MORPH_IDT_SCALAR           int16_t
+#define MORPH_IDT_VECTOR           xb_vecNx16
+#define MORPH_VECTORIZATION_WIDTH  (XCHAL_IVPN_SIMD_WIDTH)
+#define MORPH_OP_STORE_IP          IVP_SANX16_IP
+#define MORPH_OP_VAR_STORE_XP      IVP_SAVNX16_XP
+#define MORPH_OP_PRIME             IVP_LANX16_PP
+#define MORPH_OP_FLUSH             IVP_SAPOSNX16_FP
+#define MORPH_BYTES_PER_PIXEL      2
+
+#elif INPUT_DATA_TYPE == FLOAT16BIT
+/* Same set of definitions, for half-precision float tiles (suffix F16). */
+#undef MAKE_ARGUMENTS
+#undef MAKE_NAME
+#undef MORPH_IDT_CHECK
+#undef MORPH_IDT_SCALAR
+#undef MORPH_IDT_VECTOR
+#undef MORPH_VECTORIZATION_WIDTH
+#undef MORPH_OP_STORE_IP
+#undef MORPH_OP_VAR_STORE_XP
+#undef MORPH_OP_PRIME
+#undef MORPH_OP_FLUSH
+#undef MORPH_BYTES_PER_PIXEL
+/* NOTE(review): if the guard below is false no MORPH_* macro is defined in this branch, while the function further down is not guarded the same way - presumably the including file prevents that combination; confirm. */
+#if (XCHAL_HAVE_VISION_HP_VFPU == 1)
+#define MAKE_ARGUMENTS(a, b, c)    (xai_pTile3D a, const xb_f16 b, xai_bool c) /* fill value is passed as xb_f16 here */
+#define MAKE_NAME(name)            MAKE_NAME_IMPL(name, F16)
+#define MORPH_IDT_CHECK            XAI_CHECK_TILE3D_F16
+#define MORPH_IDT_SCALAR           xb_f16
+#define MORPH_IDT_VECTOR           xb_vecNxf16
+#define MORPH_VECTORIZATION_WIDTH  (XCHAL_IVPN_SIMD_WIDTH)
+#define MORPH_OP_STORE_IP          IVP_SANXF16_IP
+#define MORPH_OP_VAR_STORE_XP      IVP_SAVNXF16_XP
+#define MORPH_OP_PRIME             IVP_LANXF16_PP
+#define MORPH_OP_FLUSH             IVP_SAPOSNXF16_FP
+#define MORPH_BYTES_PER_PIXEL      2
+#endif
+
+#elif INPUT_DATA_TYPE == FLOAT32BIT
+/* Same set of definitions, for single-precision float tiles (suffix F32). */
+#undef MAKE_ARGUMENTS
+#undef MAKE_NAME
+#undef MORPH_IDT_CHECK
+#undef MORPH_IDT_SCALAR
+#undef MORPH_IDT_VECTOR
+#undef MORPH_VECTORIZATION_WIDTH
+#undef MORPH_OP_STORE_IP
+#undef MORPH_OP_VAR_STORE_XP
+#undef MORPH_OP_PRIME
+#undef MORPH_OP_FLUSH
+#undef MORPH_BYTES_PER_PIXEL
+/* NOTE(review): same remark as for the F16 branch - nothing is defined when the guard below is false; confirm the includer handles it. */
+#if (XCHAL_HAVE_VISION_SP_VFPU == 1)
+#define MAKE_ARGUMENTS(a, b, c)    (xai_pTile3D a, const float b, xai_bool c) /* fill value is passed as float here */
+#define MAKE_NAME(name)            MAKE_NAME_IMPL(name, F32)
+#define MORPH_IDT_CHECK            XAI_CHECK_TILE3D_F32
+#define MORPH_IDT_SCALAR           float
+#define MORPH_IDT_VECTOR           xb_vecN_2xf32
+#define MORPH_VECTORIZATION_WIDTH  (XCHAL_IVPN_SIMD_WIDTH / 2)
+#define MORPH_OP_STORE_IP          IVP_SAN_2XF32_IP
+#define MORPH_OP_VAR_STORE_XP      IVP_SAVN_2XF32_XP
+#define MORPH_OP_PRIME             IVP_LAN_2XF32_PP
+#define MORPH_OP_FLUSH             IVP_SAPOSN_2XF32_FP
+#define MORPH_BYTES_PER_PIXEL      4
+#endif
+#endif
+
+/**************************************************************************************/
+/*                            MAKE_NAME(xaiFillTile3D)                                */
+/**************************************************************************************/
+
+/*******************************   xaiFillTile3D   *************************************/
+/* Description : P6 optimized generic implementation of FillTile 3D function.         */
+/*               Based on MORPH pre-processor specifiers, code implementation         */
+/*               is generated during pre-processing stage. This method implements     */
+/*               xaiFillTile3D_I8, xaiFillTile3D_I16, xaiFillTile3D_F16 and           */
+/*               xaiFillTile3D_F32 functionality.                                     */
+/* Inputs      : value - constant written to every filled element;                    */
+/*               fill_edge_extension - when non-zero, the edge1/edge2 regions of all   */
+/*               three dimensions are filled in addition to the DIM1 x DIM2 x DIM3     */
+/*               region, and filling starts edge1 elements before the data pointer.    */
+/* Outputs     : XI Error Code                                                        */
+/* InOuts      : Output Tile (only the memory behind its data pointer is written)      */
+/* Assumptions : OutData is signed 8/16 bit Integer or half precision float(FP16) or  */
+/*               single precision float(FP32) based on MORPH specifier                */
+/**************************************************************************************/
+
+/******************************   xaiFillTile3D_I8   ***************************************/
+/******************************   xaiFillTile3D_I16  **************************************/
+/******************************   xaiFillTile3D_F16  **************************************/
+/******************************   xaiFillTile3D_F32  **************************************/
+
+XAI_ERR_TYPE MAKE_NAME(xaiFillTile3D) MAKE_ARGUMENTS(dstTile, value, fill_edge_extension)
+{
+  /* Error Checks */
+  XAI_ERROR_CHECKS()
+  {
+    MORPH_IDT_CHECK(dstTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(dstTile);
+  }
+
+  /* Getting parameters from the tile structures */
+  const int32_t dim1Size      = XAI_TILE3D_GET_DIM1(dstTile);
+  const int32_t dim2Size      = XAI_TILE3D_GET_DIM2(dstTile);
+  const int32_t dim1Edge1     = XAI_TILE3D_GET_DIM1_EDGE1(dstTile);
+  const int32_t dim1Edge2     = XAI_TILE3D_GET_DIM1_EDGE2(dstTile);
+  const int32_t dim2Edge1     = XAI_TILE3D_GET_DIM2_EDGE1(dstTile);
+  const int32_t dim2Edge2     = XAI_TILE3D_GET_DIM2_EDGE2(dstTile);
+  const int32_t dim3Edge1     = XAI_TILE3D_GET_DIM3_EDGE1(dstTile);
+  const int32_t dim3Edge2     = XAI_TILE3D_GET_DIM3_EDGE2(dstTile);
+  const int32_t dstDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(dstTile); /* element step between consecutive dim2 rows (used as y * dstDataPitch1) */
+  const int32_t dstDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(dstTile); /* element step between consecutive dim3 planes (used as z * dstDataPitch2) */
+  const int32_t dim3Size      = XAI_TILE3D_GET_DIM3(dstTile);
+  MORPH_IDT_SCALAR *pDst      = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(dstTile);
+
+  int32_t z, x, y;
+  /* Vectorization for xaiFillTile3D function is always done across the first dimension */
+  int32_t vectorizationWidth = MORPH_VECTORIZATION_WIDTH;
+  int32_t dim1FillSize       = dim1Size;
+  int32_t dim2FillSize       = dim2Size;
+  int32_t dim3FillSize       = dim3Size;
+  int32_t maxLoopCount; /* number of contiguous elements filled per outer iteration of the contiguous path */
+
+  MORPH_IDT_VECTOR* restrict pdvecOut;
+  valign vaOutData         = IVP_ZALIGN(); /* alignment register handed to every store and flush below */
+  MORPH_IDT_VECTOR vecValue = value;       /* scalar assigned to a vector - presumably replicated to every lane; confirm */
+
+  /* If fill_edge_extension flag is enabled update destination data pointer */
+  /* and data fill size across all 3 dimensions.                            */
+
+  if (fill_edge_extension)
+  {
+    dim1FillSize = dim1Size + dim1Edge1 + dim1Edge2;
+    dim2FillSize = dim2Size + dim2Edge1 + dim2Edge2;
+    dim3FillSize = dim3Size + dim3Edge1 + dim3Edge2;
+    pDst         = &pDst[-dim1Edge1 + ((-dim2Edge1) * dstDataPitch1) + ((-dim3Edge1) * dstDataPitch2)]; /* step back to the first edge element in all three dimensions */
+  }
+
+  /******************************************************************************/
+  /* The overall design approach is split into 2 parts                          */
+  /* 1. When destination tile pitch is equal to destination tile fill size.     */
+  /*    - If above condition holds good, memory location to be filled           */
+  /*      with constant value is contiguous. Hence vectorization can be         */
+  /*      utilized effectively                                                  */
+  /* 2. When destination tile pitch is greater than destination tile fill size. */
+  /*    - If above condition holds good, memory location to be filled           */
+  /*      with constant value is not contiguous. In order to do                 */
+  /*      vectorization across first dimension, destination data pointers       */
+  /*      need to be updated based on destination tile fill size and            */
+  /*      destination tile pitch                                                */
+  /******************************************************************************/
+  if (dstDataPitch1 == dim1FillSize)
+  {
+    /* Data to be filled exist in contiguous memory location with respect to */
+    /* first dimension                                                       */
+
+    /* Initialize max loop counter */
+    int32_t dim3MaxLoopCount = dim3FillSize;
+    maxLoopCount = dim1FillSize * dim2FillSize; /* one whole dim3 plane is a single contiguous run */
+    if (dstDataPitch2 == maxLoopCount)
+    {
+      /* Data to be filled exist in contiguous memory location with respect to */
+      /* first and second dimension                                            */
+
+      /* Update max loop counter */
+      maxLoopCount    *= dim3FillSize; /* the whole fill region is one contiguous run ... */
+      dim3MaxLoopCount = 1;            /* ... so the outer loop executes once */
+    }
+    for (z = 0; z < dim3MaxLoopCount; z++)
+    {
+      /* initialize destination data pointer */
+      pdvecOut = (MORPH_IDT_VECTOR *) (pDst + (z * dstDataPitch2));
+      for (x = 0; x < maxLoopCount - vectorizationWidth; x += vectorizationWidth) /* full vectors; stops while elements still remain */
+      {
+        MORPH_OP_STORE_IP(vecValue, vaOutData, pdvecOut);
+      }
+      /* Tail: the remaining (maxLoopCount - x) elements are written by one variable-length store. */
+      MORPH_OP_VAR_STORE_XP(vecValue, vaOutData, pdvecOut,
+                            (maxLoopCount - x) * MORPH_BYTES_PER_PIXEL);
+      MORPH_OP_FLUSH(vaOutData, pdvecOut);
+    }
+  }
+  else
+  {
+    /* else block execute if destination tile pitch is */
+    /* greater than destination tile fill size         */
+    for (z = 0; z < dim3FillSize; z++) /* Loop across dim3 */
+    {
+      x = 0;
+      /* Loop across dimension 1 */
+      /* Condition check added to maximize vectorization across dimension 1*/
+      /* Loop across dim1 */
+      for (; x < (dim1FillSize - 3 * vectorizationWidth); x += 4 * vectorizationWidth) /* column strips of up to four vectors */
+      {
+        /* initialize destination data pointer */
+        MORPH_IDT_SCALAR *pDst1 = pDst + x + (z * dstDataPitch2);
+        for (y = 0; y < dim2FillSize; y++) /* Loop across dim2 */
+        {
+          pdvecOut = (MORPH_IDT_VECTOR *) (pDst1 + (y * dstDataPitch1));
+          MORPH_OP_STORE_IP(vecValue, vaOutData, pdvecOut);
+          MORPH_OP_STORE_IP(vecValue, vaOutData, pdvecOut);
+          MORPH_OP_STORE_IP(vecValue, vaOutData, pdvecOut);
+          MORPH_OP_VAR_STORE_XP(vecValue, vaOutData, pdvecOut, /* NOTE(review): the byte count may exceed one vector here; presumably the store caps it at one vector - confirm */
+                                (dim1FillSize - (x + 3 * vectorizationWidth)) * MORPH_BYTES_PER_PIXEL);
+          MORPH_OP_FLUSH(vaOutData, pdvecOut);
+        }
+      }
+      if (x < (dim1FillSize - 2 * vectorizationWidth)) /* leftover strip: two full vectors plus a tail */
+      {
+        /* initialize destination data pointer */
+        MORPH_IDT_SCALAR *pDst1 = pDst + x + (z * dstDataPitch2);
+        for (y = 0; y < dim2FillSize; y++) /* Loop across dim2 */
+        {
+          pdvecOut = (MORPH_IDT_VECTOR *) (pDst1 + (y * dstDataPitch1));
+          MORPH_OP_STORE_IP(vecValue, vaOutData, pdvecOut);
+          MORPH_OP_STORE_IP(vecValue, vaOutData, pdvecOut);
+          MORPH_OP_VAR_STORE_XP(vecValue, vaOutData, pdvecOut,
+                                (dim1FillSize - (x + 2 * vectorizationWidth)) * MORPH_BYTES_PER_PIXEL);
+          MORPH_OP_FLUSH(vaOutData, pdvecOut);
+        }
+      }
+      else if (x < (dim1FillSize - vectorizationWidth)) /* leftover strip: one full vector plus a tail */
+      {
+        /* initialize destination data pointer */
+        MORPH_IDT_SCALAR *pDst1 = pDst + x + (z * dstDataPitch2);
+        for (y = 0; y < dim2FillSize; y++) /* Loop across dim2 */
+        {
+          pdvecOut = (MORPH_IDT_VECTOR *) (pDst1 + (y * dstDataPitch1));
+          MORPH_OP_STORE_IP(vecValue, vaOutData, pdvecOut);
+          MORPH_OP_VAR_STORE_XP(vecValue, vaOutData, pdvecOut,
+                                (dim1FillSize - (x + vectorizationWidth)) * MORPH_BYTES_PER_PIXEL);
+          MORPH_OP_FLUSH(vaOutData, pdvecOut);
+        }
+      }
+      else if (x < dim1FillSize) /* leftover strip: a tail of at most one vector */
+      {
+        /* initialize destination data pointer */
+        MORPH_IDT_SCALAR *pDst1 = pDst + x + (z * dstDataPitch2);
+        for (y = 0; y < dim2FillSize; y++) /* Loop across dim2 */
+        {
+          pdvecOut = (MORPH_IDT_VECTOR *) (pDst1 + (y * dstDataPitch1));
+          MORPH_OP_VAR_STORE_XP(vecValue, vaOutData, pdvecOut,
+                                (dim1FillSize - x) * MORPH_BYTES_PER_PIXEL);
+          MORPH_OP_FLUSH(vaOutData, pdvecOut);
+        }
+      }
+    }
+  }
+  return(XAI_ERROR_STATUS());
+}
+#endif //if ((XCHAL_VISION_TYPE >= 6)) diff --git a/backends/cadence/vision/third-party/libxai/cnn/src/cnn_helper.c b/backends/cadence/vision/third-party/libxai/cnn/src/cnn_helper.c new file mode 100644 index 00000000000..e024b12440a
--- /dev/null +++ b/backends/cadence/vision/third-party/libxai/cnn/src/cnn_helper.c @@ -0,0 +1,2141 @@ +/* + * Copyright (c) 2018 by Cadence Design Systems, Inc. ALL RIGHTS RESERVED. + * These coded instructions, statements, and computer programs are the + * copyrighted works and confidential proprietary information of + * Cadence Design Systems Inc. They may be adapted and modified by bona fide + * purchasers for internal use, but neither the original nor any adapted + * or modified version may be disclosed or distributed to third parties + * in any manner, medium, or form, in whole or in part, without the prior + * written consent of Cadence Design Systems Inc. This software and its + * derivatives are to be executed solely on products incorporating a Cadence + * Design Systems processor. + */ + +#include "xai_cnn.h" +#include "xai_intrin.h" + +#if ((XCHAL_VISION_TYPE >= 6)) + +#define S24_MIN (-(((int32_t) 1) << 23)) +#define S24_MAX ((((int32_t) 1) << 23) - 1) + +/****************************************************************************/ +/* Description : Implementation for getting the sub-kernel and */ +/* super kernel related information. */ +/* If getNumKernelsFlag is passed as 1, function returns the */ +/* number of sub-kernels. */ +/* If getNumKernelsFlag is passed as 0, function returns the */ +/* tile dimension for the sub-kernels. */ +/* Inputs : Input Coeff Tile, stride along X & Y directions, */ +/* getNumKernelsFlag */ +/* Outputs : XI Error Code */ +/* InOuts : Array of Coeff Sub, numSubKernels. 
*/ +/* Assumptions : Coeff is in WHDN format */ +/****************************************************************************/ +XAI_ERR_TYPE xaiDeConvGetDim4D_WHDN(const xai_pTile4D coeffTile, + xai_pTile4D subCoeffInfo[], + uint16_t *numSubKernels, + const uint8_t strideX, + const uint8_t strideY, + const uint8_t getNumKernelsFlag) +{ + /* Error Checks */ + XAI_ERROR_CHECKS() + { + if (getNumKernelsFlag) + { + XAI_CHECK_POINTER(numSubKernels); + } + XAI_CHECK_ERROR((strideX > 0) && (strideY > 0), \ + XAI_ERR_BADARG, "strideX = %hhu, strideY = %hhu\nStride has to be >= 1", \ + strideX, strideY); + } + if (getNumKernelsFlag) + { + *numSubKernels = strideX * strideY; + return(XAI_ERROR_STATUS()); + } + + int32_t kIdx, kIdy; + int32_t kernelIdx; + + XAI_ERROR_CHECKS_CONTINUE() + { + XAI_CHECK_TILE4D(coeffTile); + XAI_CHECK_TILE4D_DATA_ORDER(coeffTile, XAI_WHDN); + for (kIdy = 0; kIdy < strideY; kIdy++) + { + for (kIdx = 0; kIdx < strideX; kIdx++) + { + kernelIdx = kIdy * strideX + kIdx; + XAI_CHECK_POINTER(subCoeffInfo[kernelIdx]); + } + } + } + + const int32_t kWidth = XAI_TILE4D_GET_DIM1(coeffTile); + const int32_t kHeight = XAI_TILE4D_GET_DIM2(coeffTile); + + XAI_ERROR_CHECKS_CONTINUE() + { + XAI_CHECK_ERROR((strideX <= kWidth) && (strideY <= kHeight), XAI_ERR_BADARG, \ + "\nstrideX = %hhu, kWidth = %d and strideY = %hhu, kHeight = %d\nStride should be less than corresponding Kernel Dimension", \ + strideX, kWidth, strideY, kHeight); + } + + for (kIdy = 0; kIdy < strideY; kIdy++) + { + for (kIdx = 0; kIdx < strideX; kIdx++) + { + kernelIdx = kIdy * strideX + kIdx; + + XAI_TILE4D_SET_DIM1(subCoeffInfo[kernelIdx], \ + (kWidth + strideX - kIdx - 1) / strideX); + XAI_TILE4D_SET_DIM2(subCoeffInfo[kernelIdx], \ + (kHeight + strideY - kIdy - 1) / strideY); + XAI_TILE4D_SET_DIM3(subCoeffInfo[kernelIdx], \ + XAI_TILE4D_GET_DIM4(coeffTile)); + XAI_TILE4D_SET_DIM4(subCoeffInfo[kernelIdx], \ + XAI_TILE4D_GET_DIM3(coeffTile)); + } + } + return(XAI_ERROR_STATUS()); +} + 
+/****************************************************************************/ +/* Description : Implementation for getting the sub-kernel */ +/* related information. */ +/* If getNumKernelsFlag is passed as 1, function returns the */ +/* number of sub-kernels. */ +/* If getNumKernelsFlag is passed as 0, function returns the */ +/* tile dimension for the sub-kernels. */ +/* Inputs : Input Coeff Tile, stride along X & Y directions, */ +/* getNumKernelsFlag */ +/* Outputs : XI Error Code */ +/* InOuts : Array of Coeff Sub, numSubKernels. */ +/* Assumptions : Coeff is in WHD format */ +/****************************************************************************/ +XAI_ERR_TYPE xaiDeConvGetDim3D_WHD(const xai_pTile3D coeffTile, + xai_pTile3D subCoeffInfo[], + uint16_t *numSubKernels, + const uint8_t strideX, + const uint8_t strideY, + const uint8_t getNumKernelsFlag) +{ + /* Error Checks */ + XAI_ERROR_CHECKS() + { + if (getNumKernelsFlag) + { + XAI_CHECK_POINTER(numSubKernels); + } + XAI_CHECK_ERROR((strideX > 0) && (strideY > 0), \ + XAI_ERR_BADARG, "strideX = %hhu, strideY = %hhu\nStride has to be >= 1", \ + strideX, strideY); + } + if (getNumKernelsFlag) + { + *numSubKernels = strideX * strideY; + return(XAI_ERROR_STATUS()); + } + + int32_t kIdx, kIdy; + int32_t kernelIdx; + + XAI_ERROR_CHECKS_CONTINUE() + { + XAI_CHECK_TILE3D(coeffTile); + XAI_CHECK_TILE3D_DATA_ORDER(coeffTile, XAI_WHD); + for (kIdy = 0; kIdy < strideY; kIdy++) + { + for (kIdx = 0; kIdx < strideX; kIdx++) + { + kernelIdx = kIdy * strideX + kIdx; + XAI_CHECK_POINTER(subCoeffInfo[kernelIdx]); + } + } + } + + const int32_t kWidth = XAI_TILE3D_GET_DIM1(coeffTile); + const int32_t kHeight = XAI_TILE3D_GET_DIM2(coeffTile); + + XAI_ERROR_CHECKS_CONTINUE() + { + XAI_CHECK_ERROR((strideX <= kWidth) && (strideY <= kHeight), \ + XAI_ERR_BADARG, \ + "\nstrideX = %hhu, kWidth = %d and strideY = %hhu, kHeight = %d\nStride should be less than corresponding Kernel Dimension", \ + strideX, kWidth, strideY, 
kHeight); + } + + for (kIdy = 0; kIdy < strideY; kIdy++) + { + for (kIdx = 0; kIdx < strideX; kIdx++) + { + kernelIdx = kIdy * strideX + kIdx; + + XAI_TILE3D_SET_DIM1(subCoeffInfo[kernelIdx], \ + (kWidth + strideX - kIdx - 1) / strideX); + XAI_TILE3D_SET_DIM2(subCoeffInfo[kernelIdx], \ + (kHeight + strideY - kIdy - 1) / strideY); + XAI_TILE3D_SET_DIM3(subCoeffInfo[kernelIdx], \ + XAI_TILE4D_GET_DIM3(coeffTile)); + } + } + return(XAI_ERROR_STATUS()); +} + +/****************************************************************************/ +/* Description : Implementation for coefficient reordering */ +/* The functions does the following: */ +/* - Convert from WHDN->WHND */ +/* - Flips the coefficients across width and height which is */ +/* controlled by transposeCoeffsFlag. */ +/* - Breaks the kernel into sub-kernels. */ +/* Inputs : Input Coeff Tile, CNN convolution params structure, */ +/* transposeCoeffsFlag */ +/* Outputs : XI Error Code */ +/* InOuts : Array of Coeff Sub & Super Tiles */ +/* Assumptions : CoeffData is S8/U8 */ +/* Coeff is in WHDN format */ +/****************************************************************************/ +XAI_ERR_TYPE xaiDeConvReOrder4D_I8_WHDN(const xai_pTile4D inTile, + xai_pTile4D subCoeffs[], + const xai_cnn_conv_params *param, + const uint8_t transposeCoeffsFlag) +{ + /* Error Checks */ + XAI_ERROR_CHECKS() + { + XAI_CHECK_TILE4D_I8(inTile); + XAI_CHECK_TILE4D_DATA_ORDER(inTile, XAI_WHDN); + XAI_CHECK_POINTER(param); + XAI_CHECK_POINTER(subCoeffs); + XAI_CHECK_ERROR(((XAI_CNN_CONV_GET_STRIDEX(param) >= 1) && \ + (XAI_CNN_CONV_GET_STRIDEX(param) <= XAI_TILE4D_GET_DIM1(inTile))) && \ + ((XAI_CNN_CONV_GET_STRIDEY(param) >= 1) && \ + (XAI_CNN_CONV_GET_STRIDEY(param) <= XAI_TILE4D_GET_DIM2(inTile))), XAI_ERR_BADARG, \ + "StrideX = %hhu, value must be greater than or equal to 1 and less than or equal to %d(inTile Width) \ + \nStrideY = %hhu, value must be greater than or equal to 1 and less than or equal to %d(inTile Height)", \ + 
XAI_CNN_CONV_GET_STRIDEX(param), XAI_TILE4D_GET_DIM1(inTile), \ + XAI_CNN_CONV_GET_STRIDEY(param), XAI_TILE4D_GET_DIM2(inTile)); + XAI_CHECK_ERROR((XAI_CNN_CONV_GET_DILATION(param) == 1), \ + XAI_ERR_BADARG, "\nDilation parameter is %hhu\nDilation parameter should be equal to 1", XAI_CNN_CONV_GET_DILATION(param)); + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_DILATIONX(param) == XAI_CNN_CONV_GET_DILATIONY(param), \ + XAI_ERR_BADARG, "\nDilation along width is %hhu and dilation along height is %hhu are not same", \ + XAI_CNN_CONV_GET_DILATIONX(param), XAI_CNN_CONV_GET_DILATIONY(param)); + } + + int32_t kIdx, kIdy; + int32_t kernelIdx; + + XAI_ERROR_CHECKS_CONTINUE() + { + for (kIdy = 0; kIdy < XAI_CNN_CONV_GET_STRIDEY(param); kIdy++) + { + for (kIdx = 0; kIdx < XAI_CNN_CONV_GET_STRIDEX(param); kIdx++) + { + kernelIdx = kIdy * XAI_CNN_CONV_GET_STRIDEX(param) + kIdx; + XAI_CHECK_TILE4D_I8(subCoeffs[kernelIdx]); + XAI_CHECK_TILE4D_DATA_ORDER(subCoeffs[kernelIdx], XAI_WHDN); + } + } + } + int8_t *pInCoeff = (int8_t *) XAI_TILE4D_GET_DATA_PTR(inTile); + + const int32_t kWidth = XAI_TILE4D_GET_DIM1(inTile); /* W */ + const int32_t kHeight = XAI_TILE4D_GET_DIM2(inTile); /* H */ + const int32_t numInCh = XAI_TILE4D_GET_DIM3(inTile); /* D */ + const int32_t numOutCh = XAI_TILE4D_GET_DIM4(inTile); /* N */ + + const uint8_t strideX = XAI_CNN_CONV_GET_STRIDEX(param); + const uint8_t strideY = XAI_CNN_CONV_GET_STRIDEY(param); + + int32_t inCoeffPitch1 = XAI_TILE4D_GET_DIM1_PITCH(inTile); + int32_t inCoeffPitch2 = XAI_TILE4D_GET_DIM2_PITCH(inTile); + int32_t inCoeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(inTile); + + int32_t kx, ky, inCh, outCh, inIdx, outIdx = 0; + int8_t *pSubCoeff; + int32_t kxStart, kyStart; + + + if (transposeCoeffsFlag) + { + /* Conversion from WHDN -> WHND, */ + /* transposing of kernels and formation of sub-kernels */ + for (kIdy = 0; kIdy < strideY; kIdy++) + { + for (kIdx = 0; kIdx < strideX; kIdx++) + { + kernelIdx = kIdy * strideX + kIdx; + int8_t *pSubCoeff = \ + 
(int8_t *) XAI_TILE4D_GET_DATA_PTR(subCoeffs[kernelIdx]); + + outIdx = 0; + kyStart = kHeight - 1 - ((kHeight + strideY - kIdy - 1) % strideY); + kxStart = kWidth - 1 - ((kWidth + strideX - kIdx - 1) % strideX); + + for (inCh = 0; inCh < numInCh; inCh++) /* D */ + { + for (outCh = 0; outCh < numOutCh; outCh++) /* N */ + { + for (ky = kyStart; ky >= 0; ky -= strideY) /* H */ + { + for (kx = kxStart; kx >= 0; kx -= strideX) /* W */ + { + inIdx = outCh * inCoeffPitch3 + inCh * inCoeffPitch2 + \ + ky * inCoeffPitch1 + kx; + pSubCoeff[outIdx++] = pInCoeff[inIdx]; + } + } + } + } + } + } + } + else + { + /* Conversion from WHDN -> WHND and formation of sub-kernels */ + for (kIdy = 0; kIdy < strideY; kIdy++) + { + for (kIdx = 0; kIdx < strideX; kIdx++) + { + kernelIdx = kIdy * strideX + kIdx; + pSubCoeff = (int8_t *) XAI_TILE4D_GET_DATA_PTR(subCoeffs[kernelIdx]); + + outIdx = 0; + kyStart = ((kHeight + strideY - kIdy - 1) % strideY); + kxStart = ((kWidth + strideX - kIdx - 1) % strideX); + + for (inCh = 0; inCh < numInCh; inCh++) /* D */ + { + for (outCh = 0; outCh < numOutCh; outCh++) /* N */ + { + for (ky = kyStart; ky < kHeight; ky += strideY) /* H */ + { + for (kx = kxStart; kx < kWidth; kx += strideX) /* W */ + { + inIdx = outCh * inCoeffPitch3 + inCh * inCoeffPitch2 + \ + ky * inCoeffPitch1 + kx; + pSubCoeff[outIdx++] = pInCoeff[inIdx]; + } + } + } + } + } + } + } + + return(XAI_ERROR_STATUS()); +} + +/****************************************************************************/ +/* Description : Implementation for coefficient reordering */ +/* The functions does the following: */ +/* - Flips the coefficients across width and height which is */ +/* controlled by transposeCoeffsFlag. */ +/* - Breaks the kernel into sub-kernels. 
*/ +/* Inputs : Input Coeff Tile, CNN convolution params structure, */ +/* transposeCoeffsFlag */ +/* Outputs : XI Error Code */ +/* InOuts : Array of Coeff Sub & Super Tiles */ +/* Assumptions : CoeffData is S8/U8 */ +/* Coeff is in WHD format */ +/****************************************************************************/ +XAI_ERR_TYPE xaiDeConvReOrder3D_I8_WHD(const xai_pTile3D inTile, + xai_pTile3D subCoeffs[], + const xai_cnn_depthwiseDilatedConv_params *param, + const uint8_t transposeCoeffsFlag) +{ + /* Error Checks */ + XAI_ERROR_CHECKS() + { + XAI_CHECK_TILE3D_I8(inTile); + XAI_CHECK_TILE3D_DATA_ORDER(inTile, XAI_WHD); + XAI_CHECK_POINTER(param); + XAI_CHECK_POINTER(subCoeffs); + XAI_CHECK_ERROR(((XAI_CNN_DEPTHWISE_DILATED_CONV_GET_STRIDEX(param) >= 1) && \ + (XAI_CNN_DEPTHWISE_DILATED_CONV_GET_STRIDEX(param) <= XAI_TILE3D_GET_DIM1(inTile))) && \ + ((XAI_CNN_DEPTHWISE_DILATED_CONV_GET_STRIDEY(param) >= 1) && \ + (XAI_CNN_DEPTHWISE_DILATED_CONV_GET_STRIDEY(param) <= XAI_TILE3D_GET_DIM2(inTile))), XAI_ERR_BADARG, \ + "StrideX = %hhu, value must be greater than or equal to 1 and less than or equal to %d(inTile Width) \ + \nStrideY = %hhu, value must be greater than or equal to 1 and less than or equal to %d(inTile Height)", \ + XAI_CNN_DEPTHWISE_DILATED_CONV_GET_STRIDEX(param), XAI_TILE3D_GET_DIM1(inTile), \ + XAI_CNN_DEPTHWISE_DILATED_CONV_GET_STRIDEY(param), XAI_TILE3D_GET_DIM2(inTile)); + XAI_CHECK_ERROR((XAI_CNN_DEPTHWISE_DILATED_CONV_GET_DILATION(param) == 1), \ + XAI_ERR_BADARG, "\nDilation is %hhu\nDilation parameter should be equal to 1", \ + XAI_CNN_DEPTHWISE_DILATED_CONV_GET_DILATION(param)); + XAI_CHECK_ERROR(XAI_CNN_DEPTHWISE_DILATED_CONV_GET_DILATIONX(param) == XAI_CNN_DEPTHWISE_DILATED_CONV_GET_DILATIONY(param), \ + XAI_ERR_BADARG, "\nDilation along width is %hhu and dilation along height is %hhu are not same", \ + XAI_CNN_DEPTHWISE_DILATED_CONV_GET_DILATIONX(param), XAI_CNN_DEPTHWISE_DILATED_CONV_GET_DILATIONY(param)); + } + + int32_t kIdx, 
kIdy; + int32_t kernelIdx; + + XAI_ERROR_CHECKS_CONTINUE() + { + for (kIdy = 0; kIdy < XAI_CNN_DEPTHWISE_DILATED_CONV_GET_STRIDEY(param); kIdy++) + { + for (kIdx = 0; kIdx < XAI_CNN_DEPTHWISE_DILATED_CONV_GET_STRIDEX(param); kIdx++) + { + kernelIdx = kIdy * XAI_CNN_DEPTHWISE_DILATED_CONV_GET_STRIDEX(param) + kIdx; + XAI_CHECK_TILE3D_I8(subCoeffs[kernelIdx]); + XAI_CHECK_TILE3D_DATA_ORDER(subCoeffs[kernelIdx], XAI_WHD); + } + } + } + int8_t *pInCoeff = (int8_t *) XAI_TILE4D_GET_DATA_PTR(inTile); + + const int32_t kWidth = XAI_TILE4D_GET_DIM1(inTile); /* W */ + const int32_t kHeight = XAI_TILE4D_GET_DIM2(inTile); /* H */ + const int32_t numInCh = XAI_TILE4D_GET_DIM3(inTile); /* D */ + + + const uint8_t strideX = XAI_CNN_DEPTHWISE_DILATED_CONV_GET_STRIDEX(param); + const uint8_t strideY = XAI_CNN_DEPTHWISE_DILATED_CONV_GET_STRIDEY(param); + + int32_t inCoeffPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile); + int32_t inCoeffPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile); + + int32_t kx, ky, inCh, inIdx, outIdx = 0; + int8_t *pSubCoeff; + int32_t kxStart, kyStart; + + + if (transposeCoeffsFlag) + { + /* transposing of kernels and formation of sub-kernels */ + for (kIdy = 0; kIdy < strideY; kIdy++) + { + for (kIdx = 0; kIdx < strideX; kIdx++) + { + kernelIdx = kIdy * strideX + kIdx; + int8_t *pSubCoeff = \ + (int8_t *) XAI_TILE4D_GET_DATA_PTR(subCoeffs[kernelIdx]); + + outIdx = 0; + kyStart = kHeight - 1 - ((kHeight + strideY - kIdy - 1) % strideY); + kxStart = kWidth - 1 - ((kWidth + strideX - kIdx - 1) % strideX); + + for (inCh = 0; inCh < numInCh; inCh++) /* D */ + { + for (ky = kyStart; ky >= 0; ky -= strideY) /* H */ + { + for (kx = kxStart; kx >= 0; kx -= strideX) /* W */ + { + inIdx = inCh * inCoeffPitch2 + \ + ky * inCoeffPitch1 + kx; + pSubCoeff[outIdx++] = pInCoeff[inIdx]; + } + } + } + } + } + } + else + { + for (kIdy = 0; kIdy < strideY; kIdy++) + { + for (kIdx = 0; kIdx < strideX; kIdx++) + { + kernelIdx = kIdy * strideX + kIdx; + pSubCoeff = (int8_t *) 
XAI_TILE3D_GET_DATA_PTR(subCoeffs[kernelIdx]); + + outIdx = 0; + kyStart = ((kHeight + strideY - kIdy - 1) % strideY); + kxStart = ((kWidth + strideX - kIdx - 1) % strideX); + + for (inCh = 0; inCh < numInCh; inCh++) /* D */ + { + for (ky = kyStart; ky < kHeight; ky += strideY) /* H */ + { + for (kx = kxStart; kx < kWidth; kx += strideX) /* W */ + { + inIdx = inCh * inCoeffPitch2 + \ + ky * inCoeffPitch1 + kx; + pSubCoeff[outIdx++] = pInCoeff[inIdx]; + } + } + } + } + } + } + + return(XAI_ERROR_STATUS()); +} + +/****************************************************************************/ +/* Description : Implementation for extending the bias array in */ +/* case of MOD deconvolution using superkernels. */ +/* Inputs : Input Bias array, */ +/* Outputs : XI Error Code */ +/* InOuts : Output Bias array */ +/****************************************************************************/ +XAI_ERR_TYPE xaiBiasExtend_S32_MOD(const xai_pArray inBiasArray, + xai_pArray outBiasArray) +{ + /* Error Checks */ + XAI_ERROR_CHECKS() + { + XAI_CHECK_ARRAY_S32(inBiasArray); + XAI_CHECK_ARRAY_S32(outBiasArray); + } + + int32_t inWidth = XAI_ARRAY_GET_WIDTH(inBiasArray); + int32_t outWidth = XAI_ARRAY_GET_WIDTH(outBiasArray); + int32_t strideX = outWidth / inWidth; + + int32_t* pInBias = (int32_t *) XAI_ARRAY_GET_DATA_PTR(inBiasArray); + int32_t* pOutBias = (int32_t *) XAI_ARRAY_GET_DATA_PTR(outBiasArray); + + int32_t numX, inW; + for (numX = 0; numX < strideX; numX++) + { + for (inW = 0; inW < inWidth; inW++) + { + pOutBias[inW + inWidth * numX] = pInBias[inW]; + } + } + return(XAI_ERROR_STATUS()); +} + +/*****************************************************************************/ +/* Description : Implementation for extending the outputscale array */ +/* in case of MOD deconvolution using superkernels. 
*/
+/*               The input array is copied outWidth / inWidth times,         */
+/*               back to back, into the extended array; any remainder of     */
+/*               the extended array is left untouched.                       */
+/* Inputs      : outputScale array,                                          */
+/* Outputs     : XI Error Code                                               */
+/* InOuts      : extended outputScale array                                  */
+/*****************************************************************************/
+XAI_ERR_TYPE xaiOutScaleExtend_U16_MOD(const xai_pArray outScaleArray,
+                                       xai_pArray extendedOutScaleArray)
+{
+  /* Error Checks */
+  XAI_ERROR_CHECKS()
+  {
+    XAI_CHECK_ARRAY_U16(outScaleArray);
+    XAI_CHECK_ARRAY_U16(extendedOutScaleArray);
+    /* inWidth is used as a divisor below */
+    XAI_CHECK_ERROR(XAI_ARRAY_GET_WIDTH(outScaleArray) > 0, XAI_ERR_BADARG, \
+                    "\nOutput scale array width = %d, value must be greater than 0", \
+                    XAI_ARRAY_GET_WIDTH(outScaleArray));
+  }
+
+  int32_t inWidth  = XAI_ARRAY_GET_WIDTH(outScaleArray);
+  int32_t outWidth = XAI_ARRAY_GET_WIDTH(extendedOutScaleArray);
+  /* Number of copies; guarded so a zero width never reaches the division, */
+  /* whether or not the error checks above are active in this build        */
+  int32_t strideX  = (inWidth > 0) ? (outWidth / inWidth) : 0;
+
+  uint16_t* pInScale  = (uint16_t *) XAI_ARRAY_GET_DATA_PTR(outScaleArray);
+  uint16_t* pOutScale = (uint16_t *) XAI_ARRAY_GET_DATA_PTR(extendedOutScaleArray);
+
+  int32_t numX, inW;
+  for (numX = 0; numX < strideX; numX++)
+  {
+    for (inW = 0; inW < inWidth; inW++)
+    {
+      pOutScale[inW + inWidth * numX] = pInScale[inW];
+    }
+  }
+  return(XAI_ERROR_STATUS());
+}
+
+/****************************************************************************/
+/* Description : Implementation for getting the sub-kernel and              */
+/*               super kernel related information.                          */
+/*               If getNumKernelsFlag is passed as 1, function returns the  */
+/*               number of sub-kernels and super kernels.                   */
+/*               If getNumKernelsFlag is passed as 0, function returns the  */
+/*               tile dimension for the sub-kernels and super kernels.
*/ +/* Inputs : Input Coeff Tile, stride along X & Y directions, */ +/* getNumKernelsFlag */ +/* Outputs : XI Error Code */ +/* InOuts : Array of Coeff Sub & Super Tiles, numSubKernels and */ +/* numSuperKernels */ +/* Assumptions : Coeff is in NDWH format */ +/****************************************************************************/ +XAI_ERR_TYPE xaiDeConvGetDim4D_NDWH(const xai_pTile4D coeffTile, + xai_pTile4D subCoeffInfo[], + xai_pTile4D superCoeffInfo[], + uint16_t *numSubKernels, + uint16_t *numSuperKernels, + const uint8_t strideX, + const uint8_t strideY, + const uint8_t getNumKernelsFlag) +{ + /* Error Checks */ + XAI_ERROR_CHECKS() + { + if (getNumKernelsFlag) + { + XAI_CHECK_POINTER(numSubKernels); + XAI_CHECK_POINTER(numSuperKernels); + } + XAI_CHECK_ERROR((strideX > 0) && (strideY > 0), \ + XAI_ERR_BADARG, "strideX = %hhu, strideY = %hhu\nStride has to be >= 1", \ + strideX, strideY); + } + if (getNumKernelsFlag) + { + *numSubKernels = strideX * strideY; + *numSuperKernels = strideY; + return(XAI_ERROR_STATUS()); + } + + int32_t kIdx, kIdy; + int32_t kernelIdx; + + XAI_ERROR_CHECKS_CONTINUE() + { + XAI_CHECK_TILE4D(coeffTile); + XAI_CHECK_TILE4D_DATA_ORDER(coeffTile, XAI_NDWH); + XAI_CHECK_POINTER(subCoeffInfo); + XAI_CHECK_POINTER(superCoeffInfo); + for (kIdy = 0; kIdy < strideY; kIdy++) + { + for (kIdx = 0; kIdx < strideX; kIdx++) + { + kernelIdx = kIdy * strideX + kIdx; + XAI_CHECK_POINTER(subCoeffInfo[kernelIdx]); + } + XAI_CHECK_POINTER(superCoeffInfo[kIdy]); + } + } + + const int32_t kWidth = XAI_TILE4D_GET_DIM3(coeffTile); + const int32_t kHeight = XAI_TILE4D_GET_DIM4(coeffTile); + + XAI_ERROR_CHECKS_CONTINUE() + { + XAI_CHECK_ERROR((strideX <= kWidth) && (strideY <= kHeight), XAI_ERR_BADARG, \ + "StrideX = %hhu, value must be less than or equal to %d(kernel Width) \ + \nStrideY = %hhu, value must be ess than or equal to %d(kernel Height)", \ + strideX, kWidth, strideY, kHeight); + } + + for (kIdy = 0; kIdy < strideY; kIdy++) + { + for (kIdx 
= 0; kIdx < strideX; kIdx++) + { + kernelIdx = kIdy * strideX + kIdx; + + XAI_TILE4D_SET_DIM1(subCoeffInfo[kernelIdx], XAI_TILE4D_GET_DIM2(coeffTile)); + XAI_TILE4D_SET_DIM2(subCoeffInfo[kernelIdx], XAI_TILE4D_GET_DIM1(coeffTile)); + XAI_TILE4D_SET_DIM3(subCoeffInfo[kernelIdx], (kWidth + strideX - kIdx - 1) / strideX); + XAI_TILE4D_SET_DIM4(subCoeffInfo[kernelIdx], (kHeight + strideY - kIdy - 1) / strideY); + } + XAI_TILE4D_SET_DIM1(superCoeffInfo[kIdy], XAI_TILE4D_GET_DIM1(subCoeffInfo[kIdy * strideX]) * strideX); + XAI_TILE4D_SET_DIM2(superCoeffInfo[kIdy], XAI_TILE4D_GET_DIM2(subCoeffInfo[kIdy * strideX])); + XAI_TILE4D_SET_DIM3(superCoeffInfo[kIdy], XAI_TILE4D_GET_DIM3(subCoeffInfo[kIdy * strideX])); + XAI_TILE4D_SET_DIM4(superCoeffInfo[kIdy], XAI_TILE4D_GET_DIM4(subCoeffInfo[kIdy * strideX])); + } + return(XAI_ERROR_STATUS()); +} + +/****************************************************************************/ +/* Description : Implementation for getting the sub-kernel */ +/* related information. */ +/* If getNumKernelsFlag is passed as 1, function returns the */ +/* number of sub-kernels . */ +/* If getNumKernelsFlag is passed as 0, function returns the */ +/* tile dimension for the sub-kernels . 
*/ +/* Inputs : Input Coeff Tile, stride along X & Y directions, */ +/* getNumKernelsFlag */ +/* Outputs : XI Error Code */ +/* InOuts : Array of Coeff Sub Tiles and numSubKernels */ +/* Assumptions : Coeff is in DWH format */ +/****************************************************************************/ +XAI_ERR_TYPE xaiDeConvGetDim3D_DWH(const xai_pTile3D coeffTile, + xai_pTile3D subCoeffInfo[], + uint16_t *numSubKernels, + const uint8_t strideX, + const uint8_t strideY, + const uint8_t getNumKernelsFlag) +{ + /* Error Checks */ + XAI_ERROR_CHECKS() + { + if (getNumKernelsFlag) + { + XAI_CHECK_POINTER(numSubKernels); + } + XAI_CHECK_ERROR((strideX > 0) && (strideY > 0), XAI_ERR_BADARG, \ + "strideX = %hhu, strideY = %hhu\nStride has to be >= 1", \ + strideX, strideY); + } + if (getNumKernelsFlag) + { + *numSubKernels = strideX * strideY; + return(XAI_ERROR_STATUS()); + } + + int32_t kIdx, kIdy; + int32_t kernelIdx; + + XAI_ERROR_CHECKS_CONTINUE() + { + XAI_CHECK_TILE3D(coeffTile); + XAI_CHECK_TILE3D_DATA_ORDER(coeffTile, XAI_DWH); + XAI_CHECK_POINTER(subCoeffInfo); + for (kIdy = 0; kIdy < strideY; kIdy++) + { + for (kIdx = 0; kIdx < strideX; kIdx++) + { + kernelIdx = kIdy * strideX + kIdx; + XAI_CHECK_POINTER(subCoeffInfo[kernelIdx]); + } + } + } + + const int32_t kWidth = XAI_TILE3D_GET_DIM2(coeffTile); + const int32_t kHeight = XAI_TILE3D_GET_DIM3(coeffTile); + + XAI_ERROR_CHECKS_CONTINUE() + { + XAI_CHECK_ERROR((strideX <= kWidth) && (strideY <= kHeight), XAI_ERR_BADARG, \ + "StrideX = %hhu, value must be less than or equal to %d(kernel Width) \ + \nStrideY = %hhu, value must be ess than or equal to %d(kernel Height)", \ + strideX, kWidth, strideY, kHeight); + } + + for (kIdy = 0; kIdy < strideY; kIdy++) + { + for (kIdx = 0; kIdx < strideX; kIdx++) + { + kernelIdx = kIdy * strideX + kIdx; + + XAI_TILE3D_SET_DIM1(subCoeffInfo[kernelIdx], XAI_TILE3D_GET_DIM1(coeffTile)); + XAI_TILE3D_SET_DIM2(subCoeffInfo[kernelIdx], (kWidth + strideX - kIdx - 1) / strideX); + 
XAI_TILE3D_SET_DIM3(subCoeffInfo[kernelIdx], (kHeight + strideY - kIdy - 1) / strideY); + } + } + return(XAI_ERROR_STATUS()); +} + +/****************************************************************************/ +/* Description : Implementation for coefficient reordering */ +/* The functions does the following: */ +/* - Convert from NDWH->DNWH */ +/* - Flips the coefficients across width and height which is */ +/* controlled by transposeCoeffsFlag. */ +/* - Breaks the kernel into sub-kernels. */ +/* - Stacks sub-kernels to form super kernels. */ +/* Inputs : Input Coeff Tile, CNN convolution params structure, */ +/* transposeCoeffsFlag */ +/* Outputs : XI Error Code */ +/* InOuts : Array of Coeff Sub & Super Tiles */ +/* Assumptions : CoeffData is S8/U8 */ +/* Coeff is in NDWH format */ +/****************************************************************************/ +XAI_ERR_TYPE xaiDeConvReOrder4D_I8_NDWH(const xai_pTile4D inTile, + xai_pTile4D subCoeffs[], + xai_pTile4D superCoeffs[], + const xai_cnn_conv_params *param, + const uint8_t transposeCoeffsFlag) +{ + /* Error Checks */ + XAI_ERROR_CHECKS() + { + XAI_CHECK_TILE4D_I8(inTile); + XAI_CHECK_TILE4D_DATA_ORDER(inTile, XAI_NDWH); + XAI_CHECK_POINTER(param); + XAI_CHECK_POINTER(subCoeffs); + XAI_CHECK_POINTER(superCoeffs); + XAI_CHECK_ERROR(((XAI_CNN_CONV_GET_STRIDEX(param) >= 1) && \ + (XAI_CNN_CONV_GET_STRIDEX(param) <= XAI_TILE4D_GET_DIM3(inTile))) && \ + ((XAI_CNN_CONV_GET_STRIDEY(param) >= 1) && \ + (XAI_CNN_CONV_GET_STRIDEY(param) <= XAI_TILE4D_GET_DIM4(inTile))), XAI_ERR_BADARG, \ + "StrideX = %hhu, value must be greater than or equal to 1 and less than or equal to %d(inTile Width) \ + \nStrideY = %hhu, value must be greater than or equal to 1 and less than or equal to %d(inTile Height)", \ + XAI_CNN_CONV_GET_STRIDEX(param), XAI_TILE4D_GET_DIM3(inTile), \ + XAI_CNN_CONV_GET_STRIDEY(param), XAI_TILE4D_GET_DIM4(inTile)); + XAI_CHECK_ERROR((XAI_CNN_CONV_GET_DILATION(param) == 1), \ + XAI_ERR_BADARG, 
"\nDilation is %hhu\nDilation parameter should be equal to 1", XAI_CNN_CONV_GET_DILATION(param)); + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_DILATIONX(param) == XAI_CNN_CONV_GET_DILATIONY(param), \ + XAI_ERR_BADARG, "\nDilation along width is %hhu and dilation along height is %hhu are not same", \ + XAI_CNN_CONV_GET_DILATIONX(param), XAI_CNN_CONV_GET_DILATIONY(param)); + } + + int32_t kIdx, kIdy; + int32_t kernelIdx; + + XAI_ERROR_CHECKS_CONTINUE() + { + for (kIdy = 0; kIdy < XAI_CNN_CONV_GET_STRIDEY(param); kIdy++) + { + for (kIdx = 0; kIdx < XAI_CNN_CONV_GET_STRIDEX(param); kIdx++) + { + kernelIdx = kIdy * XAI_CNN_CONV_GET_STRIDEX(param) + kIdx; + XAI_CHECK_TILE4D_I8(subCoeffs[kernelIdx]); + XAI_CHECK_TILE4D_DATA_ORDER(subCoeffs[kernelIdx], XAI_NDWH); + } + XAI_CHECK_TILE4D_I8(superCoeffs[kIdy]); + XAI_CHECK_TILE4D_DATA_ORDER(superCoeffs[kIdy], XAI_NDWH); + } + } + + int8_t *pInCoeff = (int8_t *) XAI_TILE4D_GET_DATA_PTR(inTile); + + const int32_t numOutCh = XAI_TILE4D_GET_DIM1(inTile); /* N */ + const int32_t numInCh = XAI_TILE4D_GET_DIM2(inTile); /* D */ + const int32_t kWidth = XAI_TILE4D_GET_DIM3(inTile); /* W */ + const int32_t kHeight = XAI_TILE4D_GET_DIM4(inTile); /* H */ + + const uint8_t strideX = XAI_CNN_CONV_GET_STRIDEX(param); + const uint8_t strideY = XAI_CNN_CONV_GET_STRIDEY(param); + + int32_t inCoeffPitch1 = XAI_TILE4D_GET_DIM1_PITCH(inTile); + int32_t inCoeffPitch2 = XAI_TILE4D_GET_DIM2_PITCH(inTile); + int32_t inCoeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(inTile); + + int32_t kx, ky, inCh, outCh, inIdx, outIdx = 0; + int8_t *pSuperCoeff; + int8_t *pSubCoeff; + int32_t subKPitch1, subKPitch2, subKPitch3; + int32_t superKPitch1, superKPitch2; + int32_t kW, kH, subkW; + int32_t numInChSubCoeff; + int32_t subKIdx; + + int32_t kxStart, kyStart; + + if (transposeCoeffsFlag) + { + /* Conversion from NDWH -> DNWH, */ + /* transposing of kernels and formation of sub-kernels */ + for (kIdy = 0; kIdy < strideY; kIdy++) + { + for (kIdx = 0; kIdx < strideX; kIdx++) + { + 
kernelIdx = kIdy * strideX + kIdx; + int8_t *pSubCoeff = (int8_t *) XAI_TILE4D_GET_DATA_PTR(subCoeffs[kernelIdx]); + + outIdx = 0; + kyStart = kHeight - 1 - ((kHeight + strideY - kIdy - 1) % strideY); + + for (ky = kyStart; ky >= 0; ky -= strideY) /* H */ + { + kxStart = kWidth - 1 - ((kWidth + strideX - kIdx - 1) % strideX); + + for (kx = kxStart; kx >= 0; kx -= strideX) /* W */ + { + for (outCh = 0; outCh < numOutCh; outCh++) /* N */ + { + for (inCh = 0; inCh < numInCh; inCh++) /* D */ + { + inIdx = ky * inCoeffPitch3 + kx * inCoeffPitch2 + \ + inCh * inCoeffPitch1 + outCh; + pSubCoeff[outIdx++] = pInCoeff[inIdx]; + } + /* For stride alignment */ + outIdx += (outIdx % (2 * XCHAL_IVPN_SIMD_WIDTH)) ? ((2 * XCHAL_IVPN_SIMD_WIDTH) - (outIdx % (2 * XCHAL_IVPN_SIMD_WIDTH))) : 0; + } + } + } + } + } + } + else + { + /* Conversion from NDWH -> DNWH and formation of sub-kernels */ + for (kIdy = 0; kIdy < strideY; kIdy++) + { + for (kIdx = 0; kIdx < strideX; kIdx++) + { + kernelIdx = kIdy * strideX + kIdx; + int8_t *pSubCoeff = (int8_t *) XAI_TILE4D_GET_DATA_PTR(subCoeffs[kernelIdx]); + + outIdx = 0; + kyStart = ((kHeight + strideY - kIdy - 1) % strideY); + + for (ky = kyStart; ky < kHeight; ky += strideY) /* H */ + { + kxStart = ((kWidth + strideX - kIdx - 1) % strideX); + + for (kx = kxStart; kx < kWidth; kx += strideX) /* W */ + { + for (outCh = 0; outCh < numOutCh; outCh++) /* N */ + { + for (inCh = 0; inCh < numInCh; inCh++) /* D */ + { + inIdx = ky * inCoeffPitch3 + kx * inCoeffPitch2 + \ + inCh * inCoeffPitch1 + outCh; + pSubCoeff[outIdx++] = pInCoeff[inIdx]; + } + /* For stride alignment */ + outIdx += (outIdx % (2 * XCHAL_IVPN_SIMD_WIDTH)) ? 
((2 * XCHAL_IVPN_SIMD_WIDTH) - (outIdx % (2 * XCHAL_IVPN_SIMD_WIDTH))) : 0; + } + } + } + } + } + } + + /* Form super-kernels by stacking sub-kernels */ + for (kernelIdx = 0; kernelIdx < strideY; kernelIdx++) + { + pSuperCoeff = (int8_t *) XAI_TILE4D_GET_DATA_PTR(superCoeffs[kernelIdx]); + + kW = XAI_TILE4D_GET_DIM3(superCoeffs[kernelIdx]); + kH = XAI_TILE4D_GET_DIM4(superCoeffs[kernelIdx]); + + numInChSubCoeff = XAI_TILE4D_GET_DIM1(subCoeffs[kernelIdx * strideX]); + superKPitch1 = XAI_TILE4D_GET_DIM1_PITCH(superCoeffs[kernelIdx]); + superKPitch2 = XAI_TILE4D_GET_DIM2_PITCH(superCoeffs[kernelIdx]); + + for (subKIdx = 0; subKIdx < strideX; subKIdx++) + { + pSubCoeff = (int8_t *) XAI_TILE4D_GET_DATA_PTR(subCoeffs[kernelIdx * strideX + subKIdx]); + + subkW = XAI_TILE4D_GET_DIM3(subCoeffs[kernelIdx * strideX + subKIdx]); + + subKPitch1 = XAI_TILE4D_GET_DIM1_PITCH(subCoeffs[kernelIdx * strideX + subKIdx]); + subKPitch2 = XAI_TILE4D_GET_DIM2_PITCH(subCoeffs[kernelIdx * strideX + subKIdx]); + subKPitch3 = XAI_TILE4D_GET_DIM3_PITCH(subCoeffs[kernelIdx * strideX + subKIdx]); + + outIdx = numInChSubCoeff * subKIdx; + + for (ky = 0, kIdy = 0; ky < kH; ky++, kIdy++) /* H */ + { + for (kx = 0, kIdx = 0; kx < kW; kx++, kIdx++) /* W */ + { + /*In case of super kernels we have the first sub kernel width/height as the width/height of the superkernel */ + /*In case the widths of the subkernel are not equal then we skip by differnce and start filling */ + /*Once the convolution is done the output junk data apprears at the end of the outtile. */ + /*In case of unequal heights this is handled using pointers in test app. 
*/ + if ((subkW < kW) && (kx == 0)) + { + outIdx += superKPitch2; + kIdx--; + continue; + } + for (outCh = 0; outCh < numOutCh; outCh++) /* N */ + { + for (inCh = 0; inCh < numInChSubCoeff; inCh++) /* D */ + { + inIdx = kIdy * subKPitch3 + kIdx * subKPitch2 + \ + outCh * subKPitch1 + inCh; + pSuperCoeff[outIdx++] = pSubCoeff[inIdx]; + } + outIdx += (superKPitch1 - numInChSubCoeff); + } + } + } + } + } + return(XAI_ERROR_STATUS()); +} + +/****************************************************************************/ +/* Description : Implementation for coefficient reordering */ +/* The functions does the following: */ +/* - Flips the coefficients across width and height which is */ +/* controlled by transposeCoeffsFlag. */ +/* - Breaks the kernel into sub-kernels. */ +/* Inputs : Input Coeff Tile, CNN convolution params structure, */ +/* transposeCoeffsFlag */ +/* Outputs : XI Error Code */ +/* InOuts : Array of Coeff Sub Tiles */ +/* Assumptions : CoeffData is S8/U8 */ +/* Coeff is in DWH format */ +/****************************************************************************/ +XAI_ERR_TYPE xaiDeConvReOrder3D_I8_DWH(const xai_pTile3D inTile, + xai_pTile3D subCoeffs[], + const xai_cnn_depthwiseDilatedConv_params *param, + const uint8_t transposeCoeffsFlag) +{ + /* Error Checks */ + XAI_ERROR_CHECKS() + { + XAI_CHECK_TILE3D_I8(inTile); + XAI_CHECK_TILE3D_DATA_ORDER(inTile, XAI_DWH); + XAI_CHECK_POINTER(param); + XAI_CHECK_POINTER(subCoeffs); + XAI_CHECK_ERROR(((XAI_CNN_DEPTHWISE_DILATED_CONV_GET_STRIDEX(param) >= 1) && \ + (XAI_CNN_DEPTHWISE_DILATED_CONV_GET_STRIDEX(param) <= XAI_TILE3D_GET_DIM2(inTile))) && \ + ((XAI_CNN_DEPTHWISE_DILATED_CONV_GET_STRIDEY(param) >= 1) && \ + (XAI_CNN_DEPTHWISE_DILATED_CONV_GET_STRIDEY(param) <= XAI_TILE3D_GET_DIM3(inTile))), XAI_ERR_BADARG, \ + "StrideX = %hhu, value must be greater than or equal to 1 and less than or equal to %d(inTile Width) \ + \nStrideY = %hhu, value must be greater than or equal to 1 and less than or equal to 
%d(inTile Height)", \ + XAI_CNN_DEPTHWISE_DILATED_CONV_GET_STRIDEX(param), XAI_TILE3D_GET_DIM2(inTile), \ + XAI_CNN_DEPTHWISE_DILATED_CONV_GET_STRIDEY(param), XAI_TILE3D_GET_DIM3(inTile)); + XAI_CHECK_ERROR((XAI_CNN_DEPTHWISE_DILATED_CONV_GET_DILATION(param) == 1), \ + XAI_ERR_BADARG, "\nDilation is %hhu\nDilation parameter should be equal to 1", \ + XAI_CNN_DEPTHWISE_DILATED_CONV_GET_DILATION(param)); + XAI_CHECK_ERROR(XAI_CNN_DEPTHWISE_DILATED_CONV_GET_DILATIONX(param) == XAI_CNN_DEPTHWISE_DILATED_CONV_GET_DILATIONY(param), \ + XAI_ERR_BADARG, "\nDilation along width is %hhu and dilation along height is %hhu are not same", \ + XAI_CNN_DEPTHWISE_DILATED_CONV_GET_DILATIONX(param), XAI_CNN_DEPTHWISE_DILATED_CONV_GET_DILATIONY(param)); + } + + int32_t kIdx, kIdy; + int32_t kernelIdx; + + XAI_ERROR_CHECKS_CONTINUE() + { + for (kIdy = 0; kIdy < XAI_CNN_DEPTHWISE_DILATED_CONV_GET_STRIDEY(param); kIdy++) + { + for (kIdx = 0; kIdx < XAI_CNN_DEPTHWISE_DILATED_CONV_GET_STRIDEX(param); kIdx++) + { + kernelIdx = kIdy * XAI_CNN_DEPTHWISE_DILATED_CONV_GET_STRIDEX(param) + kIdx; + XAI_CHECK_TILE3D_I8(subCoeffs[kernelIdx]); + XAI_CHECK_TILE3D_DATA_ORDER(subCoeffs[kernelIdx], XAI_DWH); + } + } + } + + int8_t *pInCoeff = (int8_t *) XAI_TILE4D_GET_DATA_PTR(inTile); + + + const int32_t numInCh = XAI_TILE3D_GET_DIM1(inTile); /* D */ + const int32_t kWidth = XAI_TILE3D_GET_DIM2(inTile); /* W */ + const int32_t kHeight = XAI_TILE3D_GET_DIM3(inTile); /* H */ + + const uint8_t strideX = XAI_CNN_DEPTHWISE_DILATED_CONV_GET_STRIDEX(param); + const uint8_t strideY = XAI_CNN_DEPTHWISE_DILATED_CONV_GET_STRIDEY(param); + + int32_t inCoeffPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile); + int32_t inCoeffPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile); + + int32_t kx, ky, inCh, inIdx, outIdx = 0; + int32_t kxStart, kyStart; + + if (transposeCoeffsFlag) + { + /* transposing of kernels and formation of sub-kernels */ + for (kIdy = 0; kIdy < strideY; kIdy++) + { + for (kIdx = 0; kIdx < strideX; kIdx++) + { + 
kernelIdx = kIdy * strideX + kIdx; + int8_t *pSubCoeff = (int8_t *) XAI_TILE3D_GET_DATA_PTR(subCoeffs[kernelIdx]); + + outIdx = 0; + kyStart = kHeight - 1 - ((kHeight + strideY - kIdy - 1) % strideY); + + for (ky = kyStart; ky >= 0; ky -= strideY) /* H */ + { + kxStart = kWidth - 1 - ((kWidth + strideX - kIdx - 1) % strideX); + + for (kx = kxStart; kx >= 0; kx -= strideX) /* W */ + { + for (inCh = 0; inCh < numInCh; inCh++) /* D */ + { + inIdx = ky * inCoeffPitch2 + kx * inCoeffPitch1 + inCh; + pSubCoeff[outIdx++] = pInCoeff[inIdx]; + } + /* For stride alignment */ + outIdx += (outIdx % (2 * XCHAL_IVPN_SIMD_WIDTH)) ? ((2 * XCHAL_IVPN_SIMD_WIDTH) - (outIdx % (2 * XCHAL_IVPN_SIMD_WIDTH))) : 0; + } + } + } + } + } + else + { + /* Formation of sub-kernels */ + for (kIdy = 0; kIdy < strideY; kIdy++) + { + for (kIdx = 0; kIdx < strideX; kIdx++) + { + kernelIdx = kIdy * strideX + kIdx; + int8_t *pSubCoeff = (int8_t *) XAI_TILE3D_GET_DATA_PTR(subCoeffs[kernelIdx]); + + outIdx = 0; + kyStart = ((kHeight + strideY - kIdy - 1) % strideY); + + for (ky = kyStart; ky < kHeight; ky += strideY) /* H */ + { + kxStart = ((kWidth + strideX - kIdx - 1) % strideX); + + for (kx = kxStart; kx < kWidth; kx += strideX) /* W */ + { + for (inCh = 0; inCh < numInCh; inCh++) /* D */ + { + inIdx = ky * inCoeffPitch2 + kx * inCoeffPitch1 + inCh; + pSubCoeff[outIdx++] = pInCoeff[inIdx]; + } + /* For stride alignment */ + outIdx += (outIdx % (2 * XCHAL_IVPN_SIMD_WIDTH)) ? 
((2 * XCHAL_IVPN_SIMD_WIDTH) - (outIdx % (2 * XCHAL_IVPN_SIMD_WIDTH))) : 0; + } + } + } + } + } + + return(XAI_ERROR_STATUS()); +} + +/****************************************************************************/ +/* Description : Vision P6 implementation for interleaving the outputs */ +/* generated by convolution functions using the sub-kernels */ +/* Inputs : array of output tiles passed as input, CNN convolution */ +/* params structure, output tile */ +/* Outputs : XI Error Code */ +/* InOuts : output tile */ +/* Assumptions : Input Tile Data is S8/U8 */ +/****************************************************************************/ +XAI_ERR_TYPE xaiDeConvInterleave3D_I8_WHD(const xai_pTile3D inTile[], + xai_pTile3D outTile, + const xai_cnn_conv_params *convParams) +{ + /* Error Checks */ + XAI_ERROR_CHECKS() + { + XAI_CHECK_POINTER(inTile); + XAI_CHECK_POINTER(convParams); + XAI_CHECK_TILE3D_I8(outTile); + XAI_CHECK_TILE3D_DATA_ORDER(outTile, XAI_WHD); + XAI_CHECK_TILE3D_FITS_IN_SINGLE_DRAM(outTile); + } + /* Getting parameters from the tile structures */ + const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile); + const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile); + + const uint8_t strideX = XAI_CNN_CONV_GET_STRIDEX(convParams); + const uint8_t strideY = XAI_CNN_CONV_GET_STRIDEY(convParams); + + const int32_t outDataPitch1Offset = (outDataPitch1 * strideY); + + int8_t *pOutput = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile); + int32_t ch, x, y, numX, numY, idx, remX; + int8_t *pSubKernelOutput; + int8_t *pOutput1; + int8_t *pOutput2; + int8_t *pInput1; + int8_t *pInput2; + + XAI_ERROR_CHECKS_CONTINUE() + { + XAI_CHECK_ERROR(((strideX > 0) && (strideY > 0)), XAI_ERR_BADARG, \ + "strideX = %hhu, strideY = %hhu\nStride has to be >= 1", \ + strideX, strideY); + + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM1(outTile) >= strideX) && \ + (XAI_TILE3D_GET_DIM2(outTile) >= strideY), XAI_ERR_BADARG, \ + "\nOutTile width = %d, value must be greater than or 
equal to %hhu(strideX) \ + \nOutTile height = %d, value must be greater than or equal to %hhu(strideY)", \ + XAI_TILE3D_GET_DIM1(outTile), strideX, XAI_TILE3D_GET_DIM2(outTile), strideY); + + for (numY = 0; numY < strideY; numY++) + { + for (numX = 0; numX < strideX; numX++) + { + idx = numX + numY * strideX; + XAI_CHECK_POINTER(inTile[idx]); + XAI_CHECK_TILE3D_I8(inTile[idx]); + XAI_CHECK_TILE3D_DATA_ORDER(inTile[idx], XAI_WHD); + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile[idx], outTile); + XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM3(inTile[idx]) == XAI_TILE3D_GET_DIM3(outTile), XAI_ERR_BADARG, \ + "\nNumber of channels of each subkernel output = %d, final output = %d \ + \nNumber of channels of each subkernel output and final output should be the same", \ + XAI_TILE3D_GET_DIM3(inTile[idx]), XAI_TILE3D_GET_DIM3(outTile)); + } + } + } + + /* Scatter Index Calculations */ + /* Sequence - 0 1 2 3 4 ... 30 31 */ + xb_vecNx16U vecSelIdx1 = IVP_SEQNX16U(); + /* Sequence - 0 strideX 2*strideX 3*strideX 4*strideX .... 30*strideX 31*strideX*/ + xb_vecNx16U vecScatterOff1 = IVP_MULNX16UPACKL(vecSelIdx1, \ + (uint16_t) strideX); + + xb_vecNx16U vecScatterOff2; + /* Sequence - (32*strideX) (33*strideX) (34*strideX) ....(62*strideX) (63*strideX)*/ + vecScatterOff2 = IVP_ADDNX16(vecScatterOff1, (XCHAL_IVPN_SIMD_WIDTH * strideX)); + + xb_vec2Nx8* restrict pdvecIn1; + xb_vec2Nx8 dvecData1; + valign vaInData1; + vbool2N vecMsk; + vboolN vecOffsetMsk1; + vboolN vecOffsetMsk2; + /* Sequence - 0 1 2 3 4 ... 62 63 */ + xb_vec2Nx8 vecCmp = IVP_SEQ2NX8U(); + /* Sequence - 0 1 2 3 4 ... 
30 31 */ + xb_vecNx16U vecOffsetCmp = IVP_SEQNX16U(); + + const int32_t vectorizationWidth = 2 * XCHAL_IVPN_SIMD_WIDTH; + + for (numY = 0; numY < strideY; numY++) + { + for (numX = 0; numX < strideX; numX++) + { + idx = numX + numY * strideX; + int8_t *pInput = (int8_t *) XAI_TILE3D_GET_DATA_PTR(inTile[idx]); + const int32_t inDataWidth = XAI_TILE3D_GET_DIM1(inTile[idx]); + const int32_t inDataHeight = XAI_TILE3D_GET_DIM2(inTile[idx]); + const int32_t inChanNum = XAI_TILE3D_GET_DIM3(inTile[idx]); + const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile[idx]); + const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile[idx]); + pSubKernelOutput = (pOutput + numX + (numY * outDataPitch1)); + for (ch = 0; ch < inChanNum; ch++) + { + pOutput1 = (pSubKernelOutput + (ch * outDataPitch2)); + pInput1 = (pInput + (ch * inDataPitch2)); + for (x = 0; x <= (inDataWidth - vectorizationWidth); x += (vectorizationWidth)) + { + pInput2 = (pInput1 + x); + pOutput2 = (pOutput1 + (x * strideX)); + pdvecIn1 = (xb_vec2Nx8 *) pInput2; + for (y = 0; y < inDataHeight; y++) + { + vaInData1 = IVP_LA2NX8_PP(pdvecIn1); + IVP_LA2NX8_XP(dvecData1, vaInData1, pdvecIn1, inDataPitch1); + IVP_SCATTER2NX8_L(dvecData1, pOutput2, vecScatterOff1); + IVP_SCATTER2NX8_H(dvecData1, pOutput2, vecScatterOff2); + pOutput2 += outDataPitch1Offset; + } + } + /*To perform Interleaving for inputData widths that are less than the vectorization width*/ + if (inDataWidth - x) + { + pInput2 = (pInput1 + x); + pOutput2 = ((pOutput1 + (x * strideX))); + pdvecIn1 = (xb_vec2Nx8 *) pInput2; + remX = (inDataWidth - x); + /*Creating Mask to scatter only the availble valid inputs that should be interleaved*/ + vecMsk = IVP_LT2NX8(vecCmp, remX); + /*Creating Mask for the scatter operation to have only valid offsets based on the available inputs*/ + vecOffsetMsk1 = IVP_LTNX16(vecOffsetCmp, remX); + vecOffsetMsk2 = IVP_LTNX16(vecOffsetCmp, (remX - XCHAL_IVPN_SIMD_WIDTH)); + for (y = 0; y < inDataHeight; y++) + { + 
vaInData1 = IVP_LA2NX8_PP(pdvecIn1); + IVP_LA2NX8_XP(dvecData1, vaInData1, pdvecIn1, inDataPitch1); + IVP_SCATTER2NX8T_L(dvecData1, pOutput2, IVP_MOVNX16T(vecScatterOff1, 0, vecOffsetMsk1), (vecMsk)); + IVP_SCATTER2NX8T_H(dvecData1, pOutput2, IVP_MOVNX16T(vecScatterOff2, 0, vecOffsetMsk2), (vecMsk)); + pOutput2 += outDataPitch1Offset; + } + } + } + } + } + + IVP_SCATTERW(); /* Adding Memory Wait until all the scatter and store operations are completed */ + + return(XAI_ERROR_STATUS()); +} + +/****************************************************************************/ +/* Description : Vision P6 implementation for interleaving the outputs */ +/* generated by convolution functions using the sub-kernels */ +/* Inputs : array of output tiles passed as input, CNN convolution */ +/* params structure, output tile */ +/* Outputs : XI Error Code */ +/* InOuts : output tile */ +/* Assumptions : Input Tile Data is S8/U8 */ +/****************************************************************************/ +XAI_ERR_TYPE xaiDepthwiseDeConvInterleave3D_I8_WHD(const xai_pTile3D inTile[], + xai_pTile3D outTile, + const xai_cnn_depthwiseDilatedConv_params *convParams) +{ + /* Error Checks */ + XAI_ERROR_CHECKS() + { + XAI_CHECK_POINTER(inTile); + XAI_CHECK_POINTER(convParams); + XAI_CHECK_TILE3D_I8(outTile); + XAI_CHECK_TILE3D_DATA_ORDER(outTile, XAI_WHD); + XAI_CHECK_TILE3D_FITS_IN_SINGLE_DRAM(outTile); + } + /* Getting parameters from the tile structures */ + const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile); + const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile); + + const uint8_t strideX = XAI_CNN_DEPTHWISE_DILATED_CONV_GET_STRIDEX(convParams); + const uint8_t strideY = XAI_CNN_DEPTHWISE_DILATED_CONV_GET_STRIDEY(convParams); + + const int32_t outDataPitch1Offset = (outDataPitch1 * strideY); + + int8_t *pOutput = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile); + int32_t ch, x, y, numX, numY, idx, remX; + int8_t *pSubKernelOutput; + int8_t *pOutput1; + 
int8_t *pOutput2; + int8_t *pInput1; + int8_t *pInput2; + + XAI_ERROR_CHECKS_CONTINUE() + { + XAI_CHECK_ERROR(((strideX > 0) && (strideY > 0)), + XAI_ERR_BADARG, "strideX = %hhu, strideY = %hhu\nStride has to be >= 1", \ + strideX, strideY); + + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM1(outTile) >= strideX) && \ + (XAI_TILE3D_GET_DIM2(outTile) >= strideY), XAI_ERR_BADARG, \ + "\nOutTile width = %d, value must be greater than or equal to %hhu(strideX) \ + \nOutTile height = %d, value must be greater than or equal to %hhu(strideY)", \ + XAI_TILE3D_GET_DIM1(outTile), strideX, XAI_TILE3D_GET_DIM2(outTile), strideY); + + for (numY = 0; numY < strideY; numY++) + { + for (numX = 0; numX < strideX; numX++) + { + idx = numX + numY * strideX; + XAI_CHECK_POINTER(inTile[idx]); + XAI_CHECK_TILE3D_I8(inTile[idx]); + XAI_CHECK_TILE3D_DATA_ORDER(inTile[idx], XAI_WHD); + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile[idx], outTile); + XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM3(inTile[idx]) == XAI_TILE3D_GET_DIM3(outTile), XAI_ERR_BADARG, \ + "\nNumber of channels of each subkernel output = %d, final output = %d \ + \nNumber of channels of each subkernel output and final output should be the same", \ + XAI_TILE3D_GET_DIM3(inTile[idx]), XAI_TILE3D_GET_DIM3(outTile)); + } + } + } + + /* Scatter Index Calculations */ + /* Sequence - 0 1 2 3 4 ... 30 31 */ + xb_vecNx16U vecSelIdx1 = IVP_SEQNX16U(); + /* Sequence - 0 strideX 2*strideX 3*strideX 4*strideX .... 30*strideX 31*strideX*/ + xb_vecNx16U vecScatterOff1 = IVP_MULNX16UPACKL(vecSelIdx1, \ + (uint16_t) strideX); + + xb_vecNx16U vecScatterOff2; + /* Sequence - (32*strideX) (33*strideX) (34*strideX) ....(62*strideX) (63*strideX)*/ + vecScatterOff2 = IVP_ADDNX16(vecScatterOff1, (XCHAL_IVPN_SIMD_WIDTH * strideX)); + + xb_vec2Nx8* restrict pdvecIn1; + xb_vec2Nx8 dvecData1; + valign vaInData1; + vbool2N vecMsk; + vboolN vecOffsetMsk1; + vboolN vecOffsetMsk2; + /* Sequence - 0 1 2 3 4 ... 
62 63 */ + xb_vec2Nx8 vecCmp = IVP_SEQ2NX8U(); + /* Sequence - 0 1 2 3 4 ... 30 31 */ + xb_vecNx16U vecOffsetCmp = IVP_SEQNX16U(); + + const int32_t vectorizationWidth = 2 * XCHAL_IVPN_SIMD_WIDTH; + + for (numY = 0; numY < strideY; numY++) + { + for (numX = 0; numX < strideX; numX++) + { + idx = numX + numY * strideX; + int8_t *pInput = (int8_t *) XAI_TILE3D_GET_DATA_PTR(inTile[idx]); + const int32_t inDataWidth = XAI_TILE3D_GET_DIM1(inTile[idx]); + const int32_t inDataHeight = XAI_TILE3D_GET_DIM2(inTile[idx]); + const int32_t inChanNum = XAI_TILE3D_GET_DIM3(inTile[idx]); + const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile[idx]); + const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile[idx]); + pSubKernelOutput = (pOutput + numX + (numY * outDataPitch1)); + for (ch = 0; ch < inChanNum; ch++) + { + pOutput1 = (pSubKernelOutput + (ch * outDataPitch2)); + pInput1 = (pInput + (ch * inDataPitch2)); + for (x = 0; x <= (inDataWidth - vectorizationWidth); x += (vectorizationWidth)) + { + pInput2 = (pInput1 + x); + pOutput2 = (pOutput1 + (x * strideX)); + pdvecIn1 = (xb_vec2Nx8 *) pInput2; + for (y = 0; y < inDataHeight; y++) + { + vaInData1 = IVP_LA2NX8_PP(pdvecIn1); + IVP_LA2NX8_XP(dvecData1, vaInData1, pdvecIn1, inDataPitch1); + IVP_SCATTER2NX8_L(dvecData1, pOutput2, vecScatterOff1); + IVP_SCATTER2NX8_H(dvecData1, pOutput2, vecScatterOff2); + pOutput2 += outDataPitch1Offset; + } + } + /*To perform Interleaving for inputData widths that are less than the vectorization width*/ + if (inDataWidth - x) + { + pInput2 = (pInput1 + x); + pOutput2 = ((pOutput1 + (x * strideX))); + pdvecIn1 = (xb_vec2Nx8 *) pInput2; + remX = (inDataWidth - x); + /*Creating Mask to scatter only the availble valid inputs that should be interleaved*/ + vecMsk = IVP_LT2NX8(vecCmp, remX); + /*Creating Mask for the scatter operation to have only valid offsets based on the available inputs*/ + vecOffsetMsk1 = IVP_LTNX16(vecOffsetCmp, remX); + vecOffsetMsk2 = IVP_LTNX16(vecOffsetCmp, 
(remX - XCHAL_IVPN_SIMD_WIDTH)); + for (y = 0; y < inDataHeight; y++) + { + vaInData1 = IVP_LA2NX8_PP(pdvecIn1); + IVP_LA2NX8_XP(dvecData1, vaInData1, pdvecIn1, inDataPitch1); + IVP_SCATTER2NX8T_L(dvecData1, pOutput2, IVP_MOVNX16T(vecScatterOff1, 0, vecOffsetMsk1), (vecMsk)); + IVP_SCATTER2NX8T_H(dvecData1, pOutput2, IVP_MOVNX16T(vecScatterOff2, 0, vecOffsetMsk2), (vecMsk)); + pOutput2 += outDataPitch1Offset; + } + } + } + } + } + + IVP_SCATTERW(); /* Adding Memory Wait until all the scatter and store operations are completed */ + + return(XAI_ERROR_STATUS()); +} + +/****************************************************************************/ +/* Description : Vision P6 implementation for interleaving the outputs */ +/* generated by convolution functions using the sub-kernels */ +/* Inputs : array of output tiles passed as input, CNN convolution */ +/* params structure, output tile */ +/* Outputs : XI Error Code */ +/* InOuts : output tile */ +/* Assumptions : Input Tile Data is I16 */ +/****************************************************************************/ + +XAI_ERR_TYPE xaiDeConvInterleave3D_I16_WHD(const xai_pTile3D inTile[], + xai_pTile3D outTile, + const xai_cnn_conv_params *convParams) +{ + /* Error Checks */ + XAI_ERROR_CHECKS() + { + XAI_CHECK_POINTER(inTile); + XAI_CHECK_POINTER(convParams); + XAI_CHECK_TILE3D_I16(outTile); + XAI_CHECK_TILE3D_DATA_ORDER(outTile, XAI_WHD); + XAI_CHECK_TILE3D_FITS_IN_SINGLE_DRAM(outTile); + } + + /* Getting parameters from the tile structures */ + const int32_t outDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile); + const int32_t outDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile); + + const uint8_t strideX = XAI_CNN_CONV_GET_STRIDEX(convParams); + const uint8_t strideY = XAI_CNN_CONV_GET_STRIDEY(convParams); + const int32_t outDataPitch1Offset = (outDataPitch1 * strideY); + + int16_t *pOutput = (int16_t *) XAI_TILE3D_GET_DATA_PTR(outTile); + int32_t ch, x, y, numX, numY, idx, remX; + int16_t *pSubKernelOutput; + 
int16_t *pOutput1; + int16_t *pOutput2; + int16_t *pInput1; + int16_t *pInput2; + + + XAI_ERROR_CHECKS_CONTINUE() + { + XAI_CHECK_ERROR(((strideX > 0) && (strideY > 0)), + XAI_ERR_BADARG, "strideX = %hhu, strideY = %hhu\nStride has to be >= 1", \ + strideX, strideY); + + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM1(outTile) >= strideX) && \ + (XAI_TILE3D_GET_DIM2(outTile) >= strideY), XAI_ERR_BADARG, \ + "\nOutTile width = %d, value must be greater than or equal to %hhu(strideX) \ + \nOutTile height = %d, value must be greater than or equal to %hhu(strideY)", \ + XAI_TILE3D_GET_DIM1(outTile), strideX, XAI_TILE3D_GET_DIM2(outTile), strideY); + + for (numY = 0; numY < strideY; numY++) + { + for (numX = 0; numX < strideX; numX++) + { + idx = numX + numY * strideX; + XAI_CHECK_POINTER(inTile[idx]); + XAI_CHECK_TILE3D_I16(inTile[idx]); + XAI_CHECK_TILE3D_DATA_ORDER(inTile[idx], XAI_WHD); + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile[idx], outTile); + XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM3(inTile[idx]) == XAI_TILE3D_GET_DIM3(outTile), XAI_ERR_BADARG, \ + "\nNumber of channels of each subkernel output = %d, final output = %d \ + \nNumber of channels of each subkernel output and final output should be the same", \ + XAI_TILE3D_GET_DIM3(inTile[idx]), XAI_TILE3D_GET_DIM3(outTile)); + } + } + } + + /* Scatter Index Calculations */ + /* Sequence - 0 1 2 3 4 ... 30 31 */ + xb_vecNx16U vecSelIdx1 = IVP_SEQNX16U(); + /* Sequence - 0 strideX 2*strideX 3*strideX 4*strideX .... 30*strideX 31*strideX*/ + xb_vecNx16U vecScatterOff1 = IVP_MULNX16UPACKL(vecSelIdx1, \ + (uint16_t) strideX * 2); + + xb_vecNx16* restrict pdvecIn1; + xb_vecNx16 dvecData1; + valign vaInData1; + vboolN vecMsk; + vboolN vecOffsetMsk1; + /* Sequence - 0 1 2 3 4 ... 
30 31 */ + xb_vecNx16U vecCmp = IVP_SEQNX16U(); + xb_vecNx16U vecOffsetCmp = IVP_SEQNX16U(); + + + const int32_t vectorizationWidth = XCHAL_IVPN_SIMD_WIDTH; + + for (numY = 0; numY < strideY; numY++) + { + for (numX = 0; numX < strideX; numX++) + { + idx = numX + numY * strideX; + int16_t *pInput = (int16_t *) XAI_TILE3D_GET_DATA_PTR(inTile[idx]); + const int32_t inDataWidth = XAI_TILE3D_GET_DIM1(inTile[idx]); + const int32_t inDataHeight = XAI_TILE3D_GET_DIM2(inTile[idx]); + const int32_t inChanNum = XAI_TILE3D_GET_DIM3(inTile[idx]); + const int32_t inDataPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile[idx]); + const int32_t inDataPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile[idx]); + pSubKernelOutput = (pOutput + numX + (numY * outDataPitch1)); + for (ch = 0; ch < inChanNum; ch++) + { + pOutput1 = (pSubKernelOutput + (ch * outDataPitch2)); + pInput1 = (pInput + (ch * inDataPitch2)); + for (x = 0; x <= (inDataWidth - vectorizationWidth); x += (vectorizationWidth)) + { + pInput2 = (pInput1 + x); + pOutput2 = (pOutput1 + (x * strideX)); + pdvecIn1 = (xb_vecNx16 *) pInput2; + for (y = 0; y < inDataHeight; y++) + { + vaInData1 = IVP_LANX16_PP(pdvecIn1); + IVP_LANX16_XP(dvecData1, vaInData1, pdvecIn1, (inDataPitch1 << 1)); + IVP_SCATTERNX16(dvecData1, pOutput2, vecScatterOff1); + pOutput2 += outDataPitch1Offset; + } + } + /*To perform Interleaving for inputData widths that are less than the vectorization width*/ + if (inDataWidth - x) + { + pInput2 = (pInput1 + x); + pOutput2 = ((pOutput1 + (x * strideX))); + pdvecIn1 = (xb_vecNx16 *) (pInput2); + remX = (inDataWidth - x); + /*Creating Mask to scatter only the availble valid inputs that should be interleaved*/ + vecMsk = IVP_LTNX16(vecCmp, remX); + /*Creating Mask for the scatter operation to have only valid offsets based on the available inputs*/ + vecOffsetMsk1 = IVP_LTNX16(vecOffsetCmp, remX); + for (y = 0; y < inDataHeight; y++) + { + vaInData1 = IVP_LANX16_PP(pdvecIn1); + IVP_LANX16_XP(dvecData1, vaInData1, pdvecIn1, 
(inDataPitch1 << 1)); + IVP_SCATTERNX16T(dvecData1, pOutput2, IVP_MOVNX16T(vecScatterOff1, 0, vecOffsetMsk1), (vecMsk)); + pOutput2 += outDataPitch1Offset; + } + } + } + } + } + + IVP_SCATTERW(); /* Adding Memory Wait until all the scatter and store operations are completed */ + + return(XAI_ERROR_STATUS()); +} + +/**********************xaiConvolvedBiasUpdate_S8S32*************************/ +/* Description : Implementation of BiasUpdate calculation for */ +/* It modifies the bias value by adding a fixup */ +/* term to it. This function is called along with, */ +/* Convolved3D_MOD functions which accepts U8 input tile */ +/* and converts to S8 and also S8 coeff tile */ +/* Inputs : Coeff Tile */ +/* InOuts : biasArray */ +/* Assumptions : coeffData is S8 and biasData is S32 */ +/* Coefficient tile is in NDWH format */ +/**************************************************************************/ +XAI_ERR_TYPE xaiConvolvedBiasUpdate_S8S32(const xai_pTile4D coeffTile, + xai_pArray biasArray + ) +{ + /* Error Checks */ + XAI_ERROR_CHECKS() + { + XAI_CHECK_TILE4D_S8(coeffTile); + XAI_CHECK_ARRAY_S32(biasArray); + XAI_CHECK_TILE4D_DATA_ORDER(coeffTile, XAI_NDWH); + XAI_CHECK_ERROR((XAI_TILE4D_GET_DIM1(coeffTile) <= XAI_ARRAY_GET_WIDTH(biasArray)), XAI_ERR_BADARG, \ + "\nNumber of Kernels = %d, Width of Bias Array = %d\nNumber of Kernels must be less than or equal to Width of Bias Array", \ + XAI_TILE4D_GET_DIM1(coeffTile), XAI_ARRAY_GET_WIDTH(biasArray)); + } +#ifndef IVP_MULSUQA2N8XR8 + /* Data Pointers of input, output, coefficient and bias data */ + int8_t *pCoeff = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTile); + int32_t *pBias = (int32_t *) XAI_ARRAY_GET_DATA_PTR(biasArray); + + /* Vector Pointers */ + xb_vec2Nx8* restrict pdvecCoeff; + xb_vecN_2x32v* restrict phvecBias = (xb_vecN_2x32v *) (pBias); + xb_vecN_2x32v* phvecBiasIn = phvecBias; + xb_vecN_2x32v* phvecBiasOut = phvecBias; + valign vaInBias = IVP_LAN_2X32_PP(phvecBiasIn); + valign vaOutBias = IVP_ZALIGN(); + 
+ /* Getting parameters from the tile structures */ + const int32_t outChanNum = XAI_TILE4D_GET_DIM1(coeffTile); + const int32_t inChanNum = XAI_TILE4D_GET_DIM2(coeffTile); + const uint16_t kWidthU = XAI_TILE4D_GET_DIM3(coeffTile); + const uint16_t kHeightU = XAI_TILE4D_GET_DIM4(coeffTile); + const int32_t coeffDataPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTile); + const int32_t coeffDataPitch2 = XAI_TILE4D_GET_DIM2_PITCH(coeffTile); + const int32_t coeffDataPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTile); + int32_t accOverflowFlag = 0; + + int32_t outCh, kx, ky, inCh; + /* + IF inputdata is S8 + convolutionS8 = summation(InputData * CoeffData) + IF inputdata is U8 + convolutionU8 = summation((InputData - 128) * CoeffData) + summation(128 * CoeffData) + = convolutionS8 + summation(128 * CoeffData) + = convolutionS8 + 128 * summation( CoeffData) + 128 * summation( CoeffData) is performed below + */ + + const int32_t vectorizationWidth = (XCHAL_IVPN_SIMD_WIDTH << 1); + + /* Iterate Over OutChannels */ + for (outCh = 0; outCh < outChanNum; outCh += vectorizationWidth) + { + /* Calculate remaining output channels */ + int32_t remOutCh = (outChanNum - outCh); + + /* Initialize Accumulator Vector */ + xb_vec2Nx24 daccSum = IVP_ZERO2NX24(); + + /* Computes the sum of coeffs corresponding to the same outChannel */ + for (ky = 0; ky < kHeightU; ky++) + { + for (kx = 0; kx < kWidthU; kx++) + { + int32_t coeffIdx = outCh + kx * coeffDataPitch2 + ky * coeffDataPitch3; + pdvecCoeff = (xb_vec2Nx8 *) (pCoeff + coeffIdx); + + for (inCh = 0; inCh < inChanNum - 3; inCh += 4) + { + xb_vec2Nx8 dvecCoeff1, dvecCoeff2, dvecCoeff3, dvecCoeff4; + + IVP_L2U2NX8_XP(dvecCoeff1, pdvecCoeff, coeffDataPitch1); + IVP_L2U2NX8_XP(dvecCoeff2, pdvecCoeff, coeffDataPitch1); + IVP_L2U2NX8_XP(dvecCoeff3, pdvecCoeff, coeffDataPitch1); + IVP_L2U2NX8_XP(dvecCoeff4, pdvecCoeff, coeffDataPitch1); + + IVP_ADDWA2NX8(daccSum, dvecCoeff2, dvecCoeff1); + IVP_ADDWA2NX8(daccSum, dvecCoeff4, dvecCoeff3); + } + for (; 
inCh < inChanNum - 1; inCh += 2) + { + xb_vec2Nx8 dvecCoeff1, dvecCoeff2; + + IVP_L2U2NX8_XP(dvecCoeff1, pdvecCoeff, coeffDataPitch1); + IVP_L2U2NX8_XP(dvecCoeff2, pdvecCoeff, coeffDataPitch1); + + IVP_ADDWA2NX8(daccSum, dvecCoeff2, dvecCoeff1); + } + if (inCh < inChanNum) + { + xb_vec2Nx8 dvecCoeff; + + IVP_L2U2NX8_XP(dvecCoeff, pdvecCoeff, coeffDataPitch1); + + IVP_ADDWA2NX8(daccSum, (xb_vec2Nx8) 0, dvecCoeff); + } + } + } + + /* Add Adjustment for Bias to Bias Vectors */ + xb_vecN_2x32v hvecBiasLL, hvecBiasLH, hvecBiasHL, hvecBiasHH; + int32_t remBiasBytes = remOutCh * 4; + + /* Number of channels processed by N_2-way 32-bit vector */ + const int32_t numProcessCh = XCHAL_IVPN_SIMD_WIDTH >> 1; + + /* Convert Accumulated Double Accumulator Values to 4 Half Vectors */ + xb_vecN_2x32v hvecAccLL, hvecAccLH, hvecAccHL, hvecAccHH; + hvecAccLL = IVP_CVT32S2NX24LL(daccSum); hvecAccLL = IVP_SLAN_2X32(hvecAccLL, 7); + hvecAccLH = IVP_CVT32S2NX24LH(daccSum); hvecAccLH = IVP_SLAN_2X32(hvecAccLH, 7); + hvecAccHL = IVP_CVT32S2NX24HL(daccSum); hvecAccHL = IVP_SLAN_2X32(hvecAccHL, 7); + hvecAccHH = IVP_CVT32S2NX24HH(daccSum); hvecAccHH = IVP_SLAN_2X32(hvecAccHH, 7); + + hvecAccLL = IVP_MOVN_2X32T(hvecAccLL, (xb_vecN_2x32v) 0, \ + IVP_LTN_2X32(IVP_SEQN_2X32(), (xb_vecN_2x32v) (remOutCh))); + hvecAccLH = IVP_MOVN_2X32T(hvecAccLH, (xb_vecN_2x32v) 0, \ + IVP_LTN_2X32(IVP_SEQN_2X32(), (xb_vecN_2x32v) (remOutCh - (numProcessCh)))); + hvecAccHL = IVP_MOVN_2X32T(hvecAccHL, (xb_vecN_2x32v) 0, \ + IVP_LTN_2X32(IVP_SEQN_2X32(), (xb_vecN_2x32v) (remOutCh - (2 * numProcessCh)))); + hvecAccHH = IVP_MOVN_2X32T(hvecAccHH, (xb_vecN_2x32v) 0, \ + IVP_LTN_2X32(IVP_SEQN_2X32(), (xb_vecN_2x32v) (remOutCh - (3 * numProcessCh)))); + + IVP_LAVN_2X32_XP(hvecBiasLL, vaInBias, phvecBiasIn, remBiasBytes); + IVP_LAVN_2X32_XP(hvecBiasLH, vaInBias, phvecBiasIn, remBiasBytes - (2 * XCHAL_IVPN_SIMD_WIDTH)); + IVP_LAVN_2X32_XP(hvecBiasHL, vaInBias, phvecBiasIn, remBiasBytes - (4 * XCHAL_IVPN_SIMD_WIDTH)); + 
IVP_LAVN_2X32_XP(hvecBiasHH, vaInBias, phvecBiasIn, remBiasBytes - (6 * XCHAL_IVPN_SIMD_WIDTH)); + + /* Add Bias and its Adjustment */ + hvecBiasLL = IVP_ADDN_2X32(hvecBiasLL, hvecAccLL); + hvecBiasLH = IVP_ADDN_2X32(hvecBiasLH, hvecAccLH); + hvecBiasHL = IVP_ADDN_2X32(hvecBiasHL, hvecAccHL); + hvecBiasHH = IVP_ADDN_2X32(hvecBiasHH, hvecAccHH); + + /* Check If Overflow is present and perform shifts as per requirement*/ + vboolN_2 hvbOverflow; + + /* hvecBiasLL */ + hvbOverflow = IVP_ORBN_2(IVP_LTN_2X32(hvecBiasLL, S24_MIN), IVP_LTN_2X32(S24_MAX, hvecBiasLL)); + accOverflowFlag += (int32_t) IVP_RADDN_2X32T((xb_vecN_2x32v) 1, hvbOverflow); + hvecBiasLL = IVP_SLAN_2X32(hvecBiasLL, IVP_MOVN_2X32T((xb_vecN_2x32v) (8), (xb_vecN_2x32v) 0, hvbOverflow)); + hvecBiasLL = IVP_SLAN_2X32(hvecBiasLL, IVP_MOVN_2X32T((xb_vecN_2x32v) (-8), (xb_vecN_2x32v) 0, hvbOverflow)); + + /* hvecBiasLH */ + hvbOverflow = IVP_ORBN_2(IVP_LTN_2X32(hvecBiasLH, S24_MIN), IVP_LTN_2X32(S24_MAX, hvecBiasLH)); + accOverflowFlag += (int32_t) IVP_RADDN_2X32T((xb_vecN_2x32v) 1, hvbOverflow); + hvecBiasLH = IVP_SLAN_2X32(hvecBiasLH, IVP_MOVN_2X32T((xb_vecN_2x32v) (8), (xb_vecN_2x32v) 0, hvbOverflow)); + hvecBiasLH = IVP_SLAN_2X32(hvecBiasLH, IVP_MOVN_2X32T((xb_vecN_2x32v) (-8), (xb_vecN_2x32v) 0, hvbOverflow)); + + /* hvecBiasHL */ + hvbOverflow = IVP_ORBN_2(IVP_LTN_2X32(hvecBiasHL, S24_MIN), IVP_LTN_2X32(S24_MAX, hvecBiasHL)); + accOverflowFlag += (int32_t) IVP_RADDN_2X32T((xb_vecN_2x32v) 1, hvbOverflow); + hvecBiasHL = IVP_SLAN_2X32(hvecBiasHL, IVP_MOVN_2X32T((xb_vecN_2x32v) (8), (xb_vecN_2x32v) 0, hvbOverflow)); + hvecBiasHL = IVP_SLAN_2X32(hvecBiasHL, IVP_MOVN_2X32T((xb_vecN_2x32v) (-8), (xb_vecN_2x32v) 0, hvbOverflow)); + + /* hvecBiasHH */ + hvbOverflow = IVP_ORBN_2(IVP_LTN_2X32(hvecBiasHH, S24_MIN), IVP_LTN_2X32(S24_MAX, hvecBiasHH)); + accOverflowFlag += (int32_t) IVP_RADDN_2X32T((xb_vecN_2x32v) 1, hvbOverflow); + hvecBiasHH = IVP_SLAN_2X32(hvecBiasHH, IVP_MOVN_2X32T((xb_vecN_2x32v) (8), 
(xb_vecN_2x32v) 0, hvbOverflow)); + hvecBiasHH = IVP_SLAN_2X32(hvecBiasHH, IVP_MOVN_2X32T((xb_vecN_2x32v) (-8), (xb_vecN_2x32v) 0, hvbOverflow)); + + /* Store Updated Bias */ + IVP_SAVN_2X32_XP(hvecBiasLL, vaOutBias, phvecBiasOut, remBiasBytes); + IVP_SAVN_2X32_XP(hvecBiasLH, vaOutBias, phvecBiasOut, remBiasBytes - (2 * XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAVN_2X32_XP(hvecBiasHL, vaOutBias, phvecBiasOut, remBiasBytes - (4 * XCHAL_IVPN_SIMD_WIDTH)); + IVP_SAVN_2X32_XP(hvecBiasHH, vaOutBias, phvecBiasOut, remBiasBytes - (6 * XCHAL_IVPN_SIMD_WIDTH)); + } + + IVP_SAPOSN_2X32_FP(vaOutBias, phvecBiasOut); + + if (accOverflowFlag) + { + return(XAI_ERR_OVERFLOW); + } +#endif + return(XAI_ERROR_STATUS()); +} + +/************************ xaiReOrder4DToIN32DWH_I16 ***********************/ +/* Description : C-code implementation to reorder a tile from WHDN, */ +/* DWHN or NDWH into IN32DWH format */ +/* Inputs : Coeff Tile in WHDN or DWHN or NDWH format */ +/* Outputs : Coeff Array in IN32DWH format */ +/* Assumptions : The width and height of the coefficient tile are 1 */ +/* Input and Output tiles can be S16 / U16 */ +/***************************************************************************/ +XAI_ERR_TYPE xaiReOrder4DToIN32DWH_I16(xai_pTile4D coeffTileIn, xai_pTile4D coeffTileOut) +{ + /* Error Checks */ + XAI_ERROR_CHECKS() + { + XAI_CHECK_TILE4D_I16(coeffTileIn); + XAI_CHECK_TILE4D_I16(coeffTileOut); + XAI_CHECK_ERROR((XAI_TILE4D_GET_DATA_ORDER(coeffTileIn) == XAI_WHDN) || \ + (XAI_TILE4D_GET_DATA_ORDER(coeffTileIn) == XAI_DWHN) || (XAI_TILE4D_GET_DATA_ORDER(coeffTileIn) == XAI_NDWH), \ + XAI_ERR_BADARG, "The Data Order of the input is not supported by this function"); + XAI_CHECK_TILE4D_DATA_ORDER(coeffTileOut, XAI_IN32DWH); + XAI_CHECK_DIM_IN32DWH(coeffTileIn, coeffTileOut); + XAI_CHECK_ERROR((XAI_TILE4D_GET_DATA_PTR(coeffTileIn) != XAI_ARRAY_GET_DATA_PTR(coeffTileOut)), XAI_ERR_INPLACE, "The input and output tile pointers overlap"); + } + + int32_t numInCh, numOutCh,
minCh, coeffInPitch1, coeffInPitch3; + + if (XAI_TILE4D_GET_DATA_ORDER(coeffTileIn) == XAI_WHDN) + { + numInCh = XAI_TILE4D_GET_DIM3(coeffTileIn); + numOutCh = XAI_TILE4D_GET_DIM4(coeffTileIn); + coeffInPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTileIn); + coeffInPitch1 = 1; + } + else if (XAI_TILE4D_GET_DATA_ORDER(coeffTileIn) == XAI_DWHN) + { + numInCh = XAI_TILE4D_GET_DIM1(coeffTileIn); + numOutCh = XAI_TILE4D_GET_DIM4(coeffTileIn); + coeffInPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTileIn); + coeffInPitch1 = 1; + } + else /* If coeff tile NDWH */ + { + numInCh = XAI_TILE4D_GET_DIM2(coeffTileIn); + numOutCh = XAI_TILE4D_GET_DIM1(coeffTileIn); + coeffInPitch3 = 1; + coeffInPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTileIn); + } + + int16_t *pCoeff = (int16_t *) XAI_TILE4D_GET_DATA_PTR(coeffTileIn); + int16_t *pCoeffOut = (int16_t *) XAI_ARRAY_GET_DATA_PTR(coeffTileOut); + int32_t i, j, k; + + /* Reorder Coeff tile: output channels are written in groups of XCHAL_IVPN_SIMD_WIDTH, one group per input channel */ + /* + The coefficient tile is reordered in the format IN32DWH (indices below assume a group of 32 output channels): + d0_0,....d0_31, d1_0,...d1_31, ....dN_0,...dN_31, d0_32,....d0_63, d1_32,...d1_63, ....dN_32,...dN_63, + d0_64,....d0_95, d1_64,...d1_95, ....dN_64,...dN_95,... + + Here, d0, d1,....dN are input channels. + where 'N' is the total input channels. + d0_0 => 0_0 => inputChNumber_outputChNumber; only minCh entries are written for a partial last group + */ + + for (i = 0; i < numOutCh; i += XCHAL_IVPN_SIMD_WIDTH) + { + for (j = 0; j < numInCh; j++) + { + minCh = (numOutCh - i) >= XCHAL_IVPN_SIMD_WIDTH ?
XCHAL_IVPN_SIMD_WIDTH : (numOutCh - i); + for (k = 0; k < minCh; k++) + { + int16_t val = *(pCoeff + (k + i) * coeffInPitch3 + j * coeffInPitch1); + *(pCoeffOut + k + (j * XCHAL_IVPN_SIMD_WIDTH) + i * numInCh) = val; + } + } + } + return(XAI_ERROR_STATUS()); +} + +/*********************** xaiReOrder4DToIN64DWH_I8 ***************************/ +/* Description : C-code implementation to reorder a tile from WHDN, */ +/* DWHN or NDWH into IN64DWH format */ +/* Inputs : Coeff Tile in WHDN or DWHN or NDWH format */ +/* Outputs : Coeff Array in IN64DWH format */ +/* Assumptions : The width and height of the coefficient tile are 1 */ +/* Input and Output tiles can be S8 / U8 */ +/***************************************************************************/ +XAI_ERR_TYPE xaiReOrder4DToIN64DWH_I8(xai_pTile4D coeffTileIn, xai_pTile4D coeffTileOut) +{ + /* Error Checks */ + XAI_ERROR_CHECKS() + { + XAI_CHECK_TILE4D_I8(coeffTileIn); + XAI_CHECK_TILE4D_I8(coeffTileOut); + XAI_CHECK_ERROR((XAI_TILE4D_GET_DATA_ORDER(coeffTileIn) == XAI_WHDN) || \ + (XAI_TILE4D_GET_DATA_ORDER(coeffTileIn) == XAI_DWHN) || (XAI_TILE4D_GET_DATA_ORDER(coeffTileIn) == XAI_NDWH), \ + XAI_ERR_BADARG, "The Data Order of the input is not supported by this function"); + XAI_CHECK_TILE4D_DATA_ORDER(coeffTileOut, XAI_IN64DWH); + XAI_CHECK_DIM_IN64DWH(coeffTileIn, coeffTileOut); + XAI_CHECK_ERROR((XAI_TILE4D_GET_DATA_PTR(coeffTileIn) != XAI_ARRAY_GET_DATA_PTR(coeffTileOut)), XAI_ERR_INPLACE, "The input and output tile pointers overlap"); + } + int32_t numInCh, numOutCh, minCh, coeffInPitch1, coeffInPitch3; + + if (XAI_TILE4D_GET_DATA_ORDER(coeffTileIn) == XAI_WHDN) + { + numInCh = XAI_TILE4D_GET_DIM3(coeffTileIn); + numOutCh = XAI_TILE4D_GET_DIM4(coeffTileIn); + coeffInPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTileIn); + coeffInPitch1 = 1; + } + else if (XAI_TILE4D_GET_DATA_ORDER(coeffTileIn) == XAI_DWHN) + { + numInCh = XAI_TILE4D_GET_DIM1(coeffTileIn); + numOutCh = XAI_TILE4D_GET_DIM4(coeffTileIn); +
coeffInPitch3 = XAI_TILE4D_GET_DIM3_PITCH(coeffTileIn); + coeffInPitch1 = 1; + } + else /* If coeff tile NDWH */ + { + numInCh = XAI_TILE4D_GET_DIM2(coeffTileIn); + numOutCh = XAI_TILE4D_GET_DIM1(coeffTileIn); + coeffInPitch3 = 1; + coeffInPitch1 = XAI_TILE4D_GET_DIM1_PITCH(coeffTileIn); + } + + int8_t *pCoeff = (int8_t *) XAI_TILE4D_GET_DATA_PTR(coeffTileIn); + int8_t *pCoeffOut = (int8_t *) XAI_ARRAY_GET_DATA_PTR(coeffTileOut); + int32_t i, j, k; + + /* Reorder Coeff tile: output channels are written in groups of 2 * XCHAL_IVPN_SIMD_WIDTH, one group per input channel */ + /* + The coefficient tile is reordered in the format IN64DWH (indices below assume a group of 64 output channels): + d0_0,....d0_63, d1_0,...d1_63, ....dN_0,...dN_63, d0_64,....d0_127, d1_64,...d1_127, ....dN_64,...dN_127, + d0_128,....d0_191, d1_128,...d1_191, ....dN_128,...dN_191,... + + Here, d0, d1,....dN are input channels. + where 'N' is the total input channels. + d0_0 => 0_0 => inputChNumber_outputChNumber; only minCh entries are written for a partial last group + */ + + for (i = 0; i < numOutCh; i += 2 * XCHAL_IVPN_SIMD_WIDTH) + { + for (j = 0; j < numInCh; j++) + { + minCh = (numOutCh - i) >= (2 * XCHAL_IVPN_SIMD_WIDTH) ? (2 * XCHAL_IVPN_SIMD_WIDTH) : (numOutCh - i); + for (k = 0; k < minCh; k++) + { + int8_t val = *(pCoeff + (k + i) * coeffInPitch3 + j * coeffInPitch1); + *(pCoeffOut + k + (j * 2 * XCHAL_IVPN_SIMD_WIDTH) + i * numInCh) = val; + } + } + } + return(XAI_ERROR_STATUS()); +} + +#if 0 //(XCHAL_HAVE_VISION_HP_VFPU == 1) // Disabled the F16 helper APIs which are not used anywhere + +/****************************************************************************/ +/* Description : Implementation for extending the bias array in */ +/* case of MOD deconvolution using superkernels.
*/ +/* Inputs : Input Bias array, */ +/* Outputs : XI Error Code */ +/* InOuts : Output Bias array */ +/****************************************************************************/ +XAI_ERR_TYPE xaiBiasExtend_F16_MOD(const xai_pArray inBiasArray, + xai_pArray outBiasArray) +{ + /* Error Checks */ + XAI_ERROR_CHECKS() + { + XAI_CHECK_ARRAY_F16(inBiasArray); + XAI_CHECK_ARRAY_F16(outBiasArray); + } + + int32_t inWidth = XAI_ARRAY_GET_WIDTH(inBiasArray); + int32_t outWidth = XAI_ARRAY_GET_WIDTH(outBiasArray); + int32_t strideX = outWidth / inWidth; + + xb_f16* pInBias = (xb_f16 *) XAI_ARRAY_GET_DATA_PTR(inBiasArray); + xb_f16* pOutBias = (xb_f16 *) XAI_ARRAY_GET_DATA_PTR(outBiasArray); + + int32_t numX, inW; + for (numX = 0; numX < strideX; numX++) + { + for (inW = 0; inW < inWidth; inW++) + { + pOutBias[inW + inWidth * numX] = pInBias[inW]; + } + } + return(XAI_ERROR_STATUS()); +} + +/*****************************************************************************/ +/* Description : Implementation for extending the outputscale array */ +/* in case of MOD deconvolution using superkernels.
*/ +/* Inputs : outputScale array, */ +/* Outputs : XI Error Code */ +/* InOuts : extended outputScale array */ +/*****************************************************************************/ +XAI_ERR_TYPE xaiOutScaleExtend_F16_MOD(const xai_pArray outScaleArray, + xai_pArray extendedOutScaleArray) +{ + /* Error Checks */ + XAI_ERROR_CHECKS() + { + XAI_CHECK_ARRAY_F16(outScaleArray); + XAI_CHECK_ARRAY_F16(extendedOutScaleArray); + } + + int32_t inWidth = XAI_ARRAY_GET_WIDTH(outScaleArray); + int32_t outWidth = XAI_ARRAY_GET_WIDTH(extendedOutScaleArray); + int32_t strideX = outWidth / inWidth; + + xb_f16* pInScale = (xb_f16 *) XAI_ARRAY_GET_DATA_PTR(outScaleArray); + xb_f16* pOutScale = (xb_f16 *) XAI_ARRAY_GET_DATA_PTR(extendedOutScaleArray); + + int32_t numX, inW; + for (numX = 0; numX < strideX; numX++) + { + for (inW = 0; inW < inWidth; inW++) + { + pOutScale[inW + inWidth * numX] = pInScale[inW]; + } + } + return(XAI_ERROR_STATUS()); +} + +/****************************************************************************/ +/* Description : Implementation for coefficient reordering */ +/* The function does the following: */ +/* - Convert from NDWH->DNWH */ +/* - Flips the coefficients across width and height which is */ +/* controlled by transposeCoeffsFlag. */ +/* - Breaks the kernel into sub-kernels. */ +/* - Stacks sub-kernels to form super kernels.
*/ +/* Inputs : Input Coeff Tile, CNN convolution params structure, */ +/* transposeCoeffsFlag */ +/* Outputs : XI Error Code */ +/* InOuts : Array of Coeff Sub & Super Tiles */ +/* Assumptions : CoeffData is F16 */ +/* Coeff is in NDWH format */ +/****************************************************************************/ +XAI_ERR_TYPE xaiDeConvReOrder4D_F16_NDWH(const xai_pTile4D inTile, + xai_pTile4D subCoeffs[], + xai_pTile4D superCoeffs[], + const xai_cnn_conv_params *param, + const uint8_t transposeCoeffsFlag) +{ + /* Error Checks */ + XAI_ERROR_CHECKS() + { + XAI_CHECK_TILE4D_F16(inTile); + XAI_CHECK_TILE4D_DATA_ORDER(inTile, XAI_NDWH); + XAI_CHECK_POINTER(param); + XAI_CHECK_POINTER(subCoeffs); + XAI_CHECK_POINTER(superCoeffs); + XAI_CHECK_ERROR(((XAI_CNN_CONV_GET_STRIDEX(param) >= 1) && \ + (XAI_CNN_CONV_GET_STRIDEX(param) <= XAI_TILE4D_GET_DIM3(inTile))) && \ + ((XAI_CNN_CONV_GET_STRIDEY(param) >= 1) && \ + (XAI_CNN_CONV_GET_STRIDEY(param) <= XAI_TILE4D_GET_DIM4(inTile))), XAI_ERR_BADARG, \ + "StrideX = %hhu, value must be greater than or equal to 1 and less than or equal to %d(inTile Width) \ + \nStrideY = %hhu, value must be greater than or equal to 1 and less than or equal to %d(inTile Height)", \ + XAI_CNN_CONV_GET_STRIDEX(param), XAI_TILE4D_GET_DIM3(inTile), \ + XAI_CNN_CONV_GET_STRIDEY(param), XAI_TILE4D_GET_DIM4(inTile)); + XAI_CHECK_ERROR((XAI_CNN_CONV_GET_DILATION(param) == 1), \ + XAI_ERR_BADARG, "\nDilation is %hhu\nDilation parameter should be equal to 1", XAI_CNN_CONV_GET_DILATION(param)); + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_DILATIONX(param) == XAI_CNN_CONV_GET_DILATIONY(param), \ + XAI_ERR_BADARG, "\nDilation along width is %hhu and dilation along height is %hhu are not same", \ + XAI_CNN_CONV_GET_DILATIONX(param), XAI_CNN_CONV_GET_DILATIONY(param)); + } + + int32_t kIdx, kIdy; + int32_t kernelIdx; + + XAI_ERROR_CHECKS_CONTINUE() + { + for (kIdy = 0; kIdy < XAI_CNN_CONV_GET_STRIDEY(param); kIdy++) + { + for (kIdx = 0; kIdx <
XAI_CNN_CONV_GET_STRIDEX(param); kIdx++) + { + kernelIdx = kIdy * XAI_CNN_CONV_GET_STRIDEX(param) + kIdx; + XAI_CHECK_TILE4D_F16(subCoeffs[kernelIdx]); + XAI_CHECK_TILE4D_DATA_ORDER(subCoeffs[kernelIdx], XAI_NDWH); + } + XAI_CHECK_TILE4D_F16(superCoeffs[kIdy]); + XAI_CHECK_TILE4D_DATA_ORDER(superCoeffs[kIdy], XAI_NDWH); + } + } + + xb_f16 *pInCoeff = (xb_f16 *) XAI_TILE4D_GET_DATA_PTR(inTile); + + const int32_t numOutCh = XAI_TILE4D_GET_DIM1(inTile); /* N */ + const int32_t numInCh = XAI_TILE4D_GET_DIM2(inTile); /* D */ + const int32_t kWidth = XAI_TILE4D_GET_DIM3(inTile); /* W */ + const int32_t kHeight = XAI_TILE4D_GET_DIM4(inTile); /* H */ + + const uint8_t strideX = XAI_CNN_CONV_GET_STRIDEX(param); + const uint8_t strideY = XAI_CNN_CONV_GET_STRIDEY(param); + + int32_t inCoeffPitch1 = XAI_TILE4D_GET_DIM1_PITCH(inTile); + int32_t inCoeffPitch2 = XAI_TILE4D_GET_DIM2_PITCH(inTile); + int32_t inCoeffPitch3 = XAI_TILE4D_GET_DIM3_PITCH(inTile); + + int32_t kx, ky, inCh, outCh, inIdx, outIdx = 0; + xb_f16 *pSuperCoeff; + xb_f16 *pSubCoeff; + int32_t subKPitch1, subKPitch2, subKPitch3; + int32_t superKPitch1, superKPitch2; + int32_t kW, kH, subkW; + int32_t numInChSubCoeff; + int32_t subKIdx; + + int32_t kxStart, kyStart; + + if (transposeCoeffsFlag) + { + /* Conversion from NDWH -> DNWH, */ + /* transposing of kernels and formation of sub-kernels */ + for (kIdy = 0; kIdy < strideY; kIdy++) + { + for (kIdx = 0; kIdx < strideX; kIdx++) + { + kernelIdx = kIdy * strideX + kIdx; + xb_f16 *pSubCoeff = (xb_f16 *) XAI_TILE4D_GET_DATA_PTR(subCoeffs[kernelIdx]); + + outIdx = 0; + kyStart = kHeight - 1 - ((kHeight + strideY - kIdy - 1) % strideY); + + for (ky = kyStart; ky >= 0; ky -= strideY) /* H */ + { + kxStart = kWidth - 1 - ((kWidth + strideX - kIdx - 1) % strideX); + + for (kx = kxStart; kx >= 0; kx -= strideX) /* W */ + { + for (outCh = 0; outCh < numOutCh; outCh++) /* N */ + { + for (inCh = 0; inCh < numInCh; inCh++) /* D */ + { + inIdx = ky * inCoeffPitch3 + kx *
inCoeffPitch2 + \ + inCh * inCoeffPitch1 + outCh; + pSubCoeff[outIdx++] = pInCoeff[inIdx]; + } + /* For stride alignment */ + outIdx += (outIdx % (XCHAL_IVPN_SIMD_WIDTH)) ? ((XCHAL_IVPN_SIMD_WIDTH) -(outIdx % (XCHAL_IVPN_SIMD_WIDTH))) : 0; + } + } + } + } + } + } + else + { + /* Conversion from NDWH -> DNWH and formation of sub-kernels */ + for (kIdy = 0; kIdy < strideY; kIdy++) + { + for (kIdx = 0; kIdx < strideX; kIdx++) + { + kernelIdx = kIdy * strideX + kIdx; + xb_f16 *pSubCoeff = (xb_f16 *) XAI_TILE4D_GET_DATA_PTR(subCoeffs[kernelIdx]); + + outIdx = 0; + kyStart = ((kHeight + strideY - kIdy - 1) % strideY); + + for (ky = kyStart; ky < kHeight; ky += strideY) /* H */ + { + kxStart = ((kWidth + strideX - kIdx - 1) % strideX); + + for (kx = kxStart; kx < kWidth; kx += strideX) /* W */ + { + for (outCh = 0; outCh < numOutCh; outCh++) /* N */ + { + for (inCh = 0; inCh < numInCh; inCh++) /* D */ + { + inIdx = ky * inCoeffPitch3 + kx * inCoeffPitch2 + \ + inCh * inCoeffPitch1 + outCh; + pSubCoeff[outIdx++] = pInCoeff[inIdx]; + } + /* For stride alignment */ + outIdx += (outIdx % (XCHAL_IVPN_SIMD_WIDTH)) ?
((XCHAL_IVPN_SIMD_WIDTH) -(outIdx % (XCHAL_IVPN_SIMD_WIDTH))) : 0; + } + } + } + } + } + } + + /* Form super-kernels by stacking sub-kernels */ + for (kernelIdx = 0; kernelIdx < strideY; kernelIdx++) + { + pSuperCoeff = (xb_f16 *) XAI_TILE4D_GET_DATA_PTR(superCoeffs[kernelIdx]); + + kW = XAI_TILE4D_GET_DIM3(superCoeffs[kernelIdx]); + kH = XAI_TILE4D_GET_DIM4(superCoeffs[kernelIdx]); + + numInChSubCoeff = XAI_TILE4D_GET_DIM1(subCoeffs[kernelIdx * strideX]); + superKPitch1 = XAI_TILE4D_GET_DIM1_PITCH(superCoeffs[kernelIdx]); + superKPitch2 = XAI_TILE4D_GET_DIM2_PITCH(superCoeffs[kernelIdx]); + + for (subKIdx = 0; subKIdx < strideX; subKIdx++) + { + pSubCoeff = (xb_f16 *) XAI_TILE4D_GET_DATA_PTR(subCoeffs[kernelIdx * strideX + subKIdx]); + + subkW = XAI_TILE4D_GET_DIM3(subCoeffs[kernelIdx * strideX + subKIdx]); + + subKPitch1 = XAI_TILE4D_GET_DIM1_PITCH(subCoeffs[kernelIdx * strideX + subKIdx]); + subKPitch2 = XAI_TILE4D_GET_DIM2_PITCH(subCoeffs[kernelIdx * strideX + subKIdx]); + subKPitch3 = XAI_TILE4D_GET_DIM3_PITCH(subCoeffs[kernelIdx * strideX + subKIdx]); + + outIdx = numInChSubCoeff * subKIdx; + + for (ky = 0, kIdy = 0; ky < kH; ky++, kIdy++) /* H */ + { + for (kx = 0, kIdx = 0; kx < kW; kx++, kIdx++) /* W */ + { + /* In case of super kernels we have the first sub kernel width/height as the width/height of the superkernel */ + /* In case the widths of the subkernel are not equal then we skip by the difference and start filling */ + /* Once the convolution is done the output junk data appears at the end of the output tile. */ + /* In case of unequal heights this is handled using pointers in test app.
*/ + if ((subkW < kW) && (kx == 0)) + { + outIdx += superKPitch2; + kIdx--; + continue; + } + for (outCh = 0; outCh < numOutCh; outCh++) /* N */ + { + for (inCh = 0; inCh < numInChSubCoeff; inCh++) /* D */ + { + inIdx = kIdy * subKPitch3 + kIdx * subKPitch2 + \ + outCh * subKPitch1 + inCh; + pSuperCoeff[outIdx++] = pSubCoeff[inIdx]; + } + outIdx += (superKPitch1 - numInChSubCoeff); + } + } + } + } + } + return(XAI_ERROR_STATUS()); +} +#endif //if (XCHAL_HAVE_VISION_HP_VFPU == 1) +#endif //if ((XCHAL_VISION_TYPE >= 6)) + diff --git a/backends/cadence/vision/third-party/libxai/include/xai_cnn.h b/backends/cadence/vision/third-party/libxai/include/xai_cnn.h new file mode 100644 index 00000000000..2ba56fe0e98 --- /dev/null +++ b/backends/cadence/vision/third-party/libxai/include/xai_cnn.h @@ -0,0 +1,267 @@ +/* + * Copyright (c) 2018 by Cadence Design Systems, Inc. ALL RIGHTS RESERVED. + * These coded instructions, statements, and computer programs are the + * copyrighted works and confidential proprietary information of + * Cadence Design Systems Inc. They may be adapted and modified by bona fide + * purchasers for internal use, but neither the original nor any adapted + * or modified version may be disclosed or distributed to third parties + * in any manner, medium, or form, in whole or in part, without the prior + * written consent of Cadence Design Systems Inc. This software and its + * derivatives are to be executed solely on products incorporating a Cadence + * Design Systems processor.
+ */ + +#ifndef __XAI_CNN_H__ +#define __XAI_CNN_H__ + +#include "xai_cnn_api.h" +#include "xai_cnn_common.h" +#include "xai_tile_manager.h" +#include "xai_core.h" +#include "limits.h" + +/****************************************************************************/ +/* MACROS : */ +/* Macro for Packing the accumulator output after convolution, scaling it, */ +/* shifting and clamping the final output between min and max limits */ +/****************************************************************************/ +#define PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut1, dvecOut2, daccSum, packSA, outScaleDataEven, outScaleDataOdd, outSh, min, max, flag) { \ + xb_vecNx16 m_outEven = IVP_PACKVR2NX24_0(daccSum, packSA); \ + xb_vecNx16 m_outOdd = IVP_PACKVR2NX24_1(daccSum, packSA); \ + xb_vecNx48 m_wvec = IVP_MULUSNX16(outScaleDataEven, m_outEven); \ + m_outEven = IVP_PACKVRNX48(m_wvec, outSh); \ + m_wvec = IVP_MULUSNX16(outScaleDataOdd, m_outOdd); \ + m_outOdd = IVP_PACKVRNX48(m_wvec, outSh); \ + m_outEven = IVP_MAXNX16(IVP_MINNX16(m_outEven, (xb_vecNx16) max), (xb_vecNx16) min); \ + m_outOdd = IVP_MAXNX16(IVP_MINNX16(m_outOdd, (xb_vecNx16) max), (xb_vecNx16) min); \ + xb_vec2Nx8 m_dvec = IVP_SEL2NX8I(IVP_MOV2NX8_FROMNX16(m_outOdd), \ + IVP_MOV2NX8_FROMNX16(m_outEven), \ + IVP_SELI_8B_INTERLEAVE_1_EVEN); \ + IVP_DSEL2NX8I(dvecOut2, dvecOut1, IVP_MOV2NX8_FROMNX16(m_outOdd), \ + IVP_MOV2NX8_FROMNX16(m_outEven), \ + IVP_DSELI_INTERLEAVE_1); \ + dvecOut1 = IVP_MOV2NX8T(dvecOut1, m_dvec, IVP_EQ2NX8((xb_vec2Nx8) flag, 1)); \ +} + +#define PACK_SCALE_SHIFT_CLAMP_LIMITS(dvecOut1, dvecOut2, daccSum, packSA, outSc, outSh, min, max, flag) \ + PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ(dvecOut1, dvecOut2, daccSum, packSA, outSc, outSc, outSh, min, max, flag) + +/****************************************************************************/ +/* MACROS : */ +/* Macro for Packing the accumulator output after convolution, scaling it, */ +/* shifting and clamping the final output between min and max 
limits */ +/****************************************************************************/ +#define PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut, accSum, packSA, outSc, outSh, min, max) { \ + vecOut = IVP_PACKVRNX48(accSum, packSA); \ + xb_vecNx48 m_wvec = IVP_MULUSNX16(outSc, vecOut); \ + xb_vecN_2x32v m_outEven = IVP_PACKVRNX48_0(m_wvec, outSh); \ + xb_vecN_2x32v m_outOdd = IVP_PACKVRNX48_1(m_wvec, outSh); \ + m_outEven = IVP_MAXN_2X32(IVP_MINN_2X32(m_outEven, (xb_vecN_2x32v) max), (xb_vecN_2x32v) min); \ + m_outOdd = IVP_MAXN_2X32(IVP_MINN_2X32(m_outOdd, (xb_vecN_2x32v) max), (xb_vecN_2x32v) min); \ + vecOut = IVP_SELNX16I(IVP_MOVNX16_FROMN_2X32(m_outOdd), \ + IVP_MOVNX16_FROMN_2X32(m_outEven), \ + IVP_SELI_INTERLEAVE_1_EVEN); \ +} + +#define PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ_S16(vecOut, accSum, packSA, vecScaleData, outSh, min, max) \ + PACK_SCALE_SHIFT_CLAMP_LIMITS_S16(vecOut, accSum, packSA, vecScaleData, outSh, min, max) + +#define PACK_SCALE_SHIFT_CLAMP_LIMITS_QM32(dvecOut1, dvecOut2, hvecSumLL, hvecSumLH, hvecSumHL, hvecSumHH, packSA, outSc, outSh, min, max, flag) { \ + xb_vecNx48 vecSumL = IVP_CVT48SNX32(hvecSumLH, hvecSumLL); \ + xb_vecNx48 vecSumH = IVP_CVT48SNX32(hvecSumHH, hvecSumHL); \ + xb_vecNx16 m_outL = IVP_PACKVRNX48(vecSumL, packSA); \ + xb_vecNx16 m_outH = IVP_PACKVRNX48(vecSumH, packSA); \ + xb_vecNx48 m_wvec = IVP_MULUSNX16((xb_vecNx16U) outSc, m_outL); \ + m_outL = IVP_PACKVRNX48(m_wvec, outSh); \ + m_outL = IVP_MAXNX16(IVP_MINNX16(m_outL, (xb_vecNx16) maxLim), (xb_vecNx16) minLim); \ + m_wvec = IVP_MULUSNX16((xb_vecNx16U) outSc, m_outH); \ + m_outH = IVP_PACKVRNX48(m_wvec, outSh); \ + m_outH = IVP_MAXNX16(IVP_MINNX16(m_outH, (xb_vecNx16) maxLim), (xb_vecNx16) minLim); \ + xb_vec2Nx8 m_dvec = IVP_SEL2NX8I(IVP_MOV2NX8_FROMNX16(m_outH), IVP_MOV2NX8_FROMNX16(m_outL), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0); \ + dvecOut1 = IVP_MOV2NX8_FROMNX16(m_outL); \ + dvecOut1 = IVP_MOV2NX8T(dvecOut1, m_dvec, IVP_EQ2NX8(flag, 1)); \ + dvecOut2 = 
IVP_MOV2NX8_FROMNX16(m_outH); \ +} + +#define PACK_SCALE_SHIFT_CLAMP_LIMITS_VQ_QM32(dvecOut1, dvecOut2, hvecSumLL, hvecSumLH, hvecSumHL, hvecSumHH, packSA, outScaleDataL, outScaleDataH, outSh, min, max, flag) { \ + xb_vecNx48 vecSumL = IVP_CVT48SNX32(hvecSumLH, hvecSumLL); \ + xb_vecNx48 vecSumH = IVP_CVT48SNX32(hvecSumHH, hvecSumHL); \ + xb_vecNx16 m_outL = IVP_PACKVRNX48(vecSumL, packSA); \ + xb_vecNx16 m_outH = IVP_PACKVRNX48(vecSumH, packSA); \ + xb_vecNx48 m_wvec = IVP_MULUSNX16((xb_vecNx16U) outScaleDataL, m_outL); \ + m_outL = IVP_PACKVRNX48(m_wvec, outSh); \ + m_outL = IVP_MAXNX16(IVP_MINNX16(m_outL, (xb_vecNx16) maxLim), (xb_vecNx16) minLim); \ + m_wvec = IVP_MULUSNX16((xb_vecNx16U) outScaleDataH, m_outH); \ + m_outH = IVP_PACKVRNX48(m_wvec, outSh); \ + m_outH = IVP_MAXNX16(IVP_MINNX16(m_outH, (xb_vecNx16) maxLim), (xb_vecNx16) minLim); \ + xb_vec2Nx8 m_dvec = IVP_SEL2NX8I(IVP_MOV2NX8_FROMNX16(m_outH), IVP_MOV2NX8_FROMNX16(m_outL), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0); \ + dvecOut1 = IVP_MOV2NX8_FROMNX16(m_outL); \ + dvecOut1 = IVP_MOV2NX8T(dvecOut1, m_dvec, IVP_EQ2NX8(flag, 1)); \ + dvecOut2 = IVP_MOV2NX8_FROMNX16(m_outH); \ +} + +/****************************************************************************/ +/* MACROS : */ +/* Macro for Packing the accumulator output after convolution, scaling it, */ +/* shifting and clamping the final output between min and max limits */ +/****************************************************************************/ +#define PACK_SCALE_SHIFT_CLAMP_LIMITS_IXS16(dvecOut1, dvecOut2, hvecSumLL, hvecSumLH, hvecSumHL, hvecSumHH, packSA, outScaleDataL, outScaleDataH, outSh, min, max, flag, sel) { \ + xb_vecNx16 hvecSum1, hvecSum2, hvecSum3, hvecSum4; \ + IVP_DSELNX16(hvecSum3, hvecSum1, IVP_MOVNX16_FROMN_2X32(hvecSumLH), IVP_MOVNX16_FROMN_2X32(hvecSumLL), sel); \ + IVP_DSELNX16(hvecSum4, hvecSum2, IVP_MOVNX16_FROMN_2X32(hvecSumHH), IVP_MOVNX16_FROMN_2X32(hvecSumHL), sel); \ + xb_vecNx48 vecSumL = 
IVP_CVT48SNX32(IVP_MOVN_2X32_FROMNX16(hvecSum2), IVP_MOVN_2X32_FROMNX16(hvecSum1)); \ + xb_vecNx48 vecSumH = IVP_CVT48SNX32(IVP_MOVN_2X32_FROMNX16(hvecSum4), IVP_MOVN_2X32_FROMNX16(hvecSum3)); \ + xb_vecNx16 m_outL = IVP_PACKVRNX48(vecSumL, packSA); \ + xb_vecNx16 m_outH = IVP_PACKVRNX48(vecSumH, packSA); \ + xb_vecNx48 m_wvec = IVP_MULUSNX16((xb_vecNx16U) outScaleDataL, m_outL); \ + m_outL = IVP_PACKVRNX48(m_wvec, outSh); \ + m_outL = IVP_MAXNX16(IVP_MINNX16(m_outL, (xb_vecNx16) maxLim), (xb_vecNx16) minLim); \ + m_wvec = IVP_MULUSNX16((xb_vecNx16U) outScaleDataH, m_outH); \ + m_outH = IVP_PACKVRNX48(m_wvec, outSh); \ + m_outH = IVP_MAXNX16(IVP_MINNX16(m_outH, (xb_vecNx16) maxLim), (xb_vecNx16) minLim); \ + xb_vec2Nx8 m_dvec = IVP_SEL2NX8I(IVP_MOV2NX8_FROMNX16(m_outH), IVP_MOV2NX8_FROMNX16(m_outL), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0); \ + dvecOut1 = IVP_MOV2NX8_FROMNX16(m_outL); \ + dvecOut1 = IVP_MOV2NX8T(dvecOut1, m_dvec, IVP_EQ2NX8(flag, 1)); \ + dvecOut2 = IVP_MOV2NX8_FROMNX16(m_outH); \ +} + +#define PACK_SCALE_SHIFT_CLAMP_LIMITS_S16S8(dvecOut1, dvecOut2, hvecSumLL, hvecSumLH, hvecSumHL, hvecSumHH, packSA, outScaleDataL, outScaleDataH, outSh, min, max, flag) { \ + xb_vecNx48 vecSumL = IVP_CVT48SNX32(hvecSumLH, hvecSumLL); \ + xb_vecNx48 vecSumH = IVP_CVT48SNX32(hvecSumHH, hvecSumHL); \ + xb_vecNx16 m_outL = IVP_PACKVRNX48(vecSumL, packSA); \ + xb_vecNx16 m_outH = IVP_PACKVRNX48(vecSumH, packSA); \ + xb_vecNx48 m_wvec = IVP_MULUSNX16((xb_vecNx16U) outScaleDataL, m_outL); \ + m_outL = IVP_PACKVRNX48(m_wvec, outSh); \ + m_outL = IVP_MAXNX16(IVP_MINNX16(m_outL, (xb_vecNx16) maxLim), (xb_vecNx16) minLim); \ + m_wvec = IVP_MULUSNX16((xb_vecNx16U) outScaleDataH, m_outH); \ + m_outH = IVP_PACKVRNX48(m_wvec, outSh); \ + m_outH = IVP_MAXNX16(IVP_MINNX16(m_outH, (xb_vecNx16) maxLim), (xb_vecNx16) minLim); \ + xb_vec2Nx8 m_dvec = IVP_SEL2NX8I(IVP_MOV2NX8_FROMNX16(m_outH), IVP_MOV2NX8_FROMNX16(m_outL), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0); \ + dvecOut1 = 
IVP_MOV2NX8_FROMNX16(m_outL); \ + dvecOut1 = IVP_MOV2NX8T(dvecOut1, m_dvec, IVP_EQ2NX8(flag, 1)); \ + dvecOut2 = IVP_MOV2NX8_FROMNX16(m_outH); \ +} + +/****************************************************************************/ +/* MACROS : */ +/* Macro for Packing the 24- bit accumulator output to 16-bit */ +/* shifting and clamping the final output between min and max limits */ +/****************************************************************************/ + +#define PACK_SCALE_SHIFT_S24_S16(accdotProd, scale1, accShift1, \ + vecClampL, vecClampH, vecScale1L, vecScale1H, shift1) { \ + xb_vecN_2x32v vecaccHH = IVP_CVT32S2NX24HH(accdotProd); \ + xb_vecN_2x32v vecaccHL = IVP_CVT32S2NX24HL(accdotProd); \ + xb_vecN_2x32v vecaccLH = IVP_CVT32S2NX24LH(accdotProd); \ + xb_vecN_2x32v vecaccLL = IVP_CVT32S2NX24LL(accdotProd); \ + xb_vecN_2x64w haccA, haccB, haccC, haccD; \ + haccA = IVP_MULN_2X16X32_0(scale1, vecaccLL); \ + haccB = IVP_MULN_2X16X32_0(scale1, vecaccLH); \ + haccC = IVP_MULN_2X16X32_0(scale1, vecaccHL); \ + haccD = IVP_MULN_2X16X32_0(scale1, vecaccHH); \ + xb_vecN_2x32v hvec0LL = IVP_PACKVRN_2X64W(haccA, accShift1); \ + xb_vecN_2x32v hvec0LH = IVP_PACKVRN_2X64W(haccB, accShift1); \ + xb_vecN_2x32v hvec0HL = IVP_PACKVRN_2X64W(haccC, accShift1); \ + xb_vecN_2x32v hvec0HH = IVP_PACKVRN_2X64W(haccD, accShift1); \ + xb_vecNx48 accA = IVP_CVT48SNX32(hvec0LH, hvec0LL); \ + xb_vecNx48 accB = IVP_CVT48SNX32(hvec0HH, hvec0HL); \ + vecClampL = IVP_PACKVRNX48(accA, 0); \ + vecClampH = IVP_PACKVRNX48(accB, 0); \ + accdotProd = IVP_CVT24S2NX16(vecClampH, vecClampL); \ + xb_vecNx16U vecScaleLL = IVP_SELNX16UI(0, vecScale1L, IVP_SELI_INTERLEAVE_1_LO); \ + xb_vecNx16U vecScaleLH = IVP_SELNX16UI(0, vecScale1L, IVP_SELI_INTERLEAVE_1_HI); \ + xb_vecNx16U vecScaleHL = IVP_SELNX16UI(0, vecScale1H, IVP_SELI_INTERLEAVE_1_LO); \ + xb_vecNx16U vecScaleHH = IVP_SELNX16UI(0, vecScale1H, IVP_SELI_INTERLEAVE_1_HI); \ + vecaccHH = IVP_CVT32S2NX24HH(accdotProd); \ + vecaccHL = 
IVP_CVT32S2NX24HL(accdotProd); \ + vecaccLH = IVP_CVT32S2NX24LH(accdotProd); \ + vecaccLL = IVP_CVT32S2NX24LL(accdotProd); \ + haccA = IVP_MULUSN_2X16X32_0(vecScaleLL, vecaccLL); \ + haccB = IVP_MULUSN_2X16X32_0(vecScaleLH, vecaccLH); \ + haccC = IVP_MULUSN_2X16X32_0(vecScaleHL, vecaccHL); \ + haccD = IVP_MULUSN_2X16X32_0(vecScaleHH, vecaccHH); \ + hvec0LL = IVP_PACKVRN_2X64W(haccA, shift1); \ + hvec0LH = IVP_PACKVRN_2X64W(haccB, shift1); \ + hvec0HL = IVP_PACKVRN_2X64W(haccC, shift1); \ + hvec0HH = IVP_PACKVRN_2X64W(haccD, shift1); \ + accA = IVP_CVT48SNX32(hvec0LH, hvec0LL); \ + accB = IVP_CVT48SNX32(hvec0HH, hvec0HL); \ + vecClampL = IVP_PACKVRNX48(accA, 0); \ + vecClampH = IVP_PACKVRNX48(accB, 0); \ +} + +#define PACK_SCALE_SHIFT_S48_S8(accProd, accShift2, scale2L, shift2, vecRescale) { \ + xb_vecN_2x64w wvecAccL = IVP_CVT96UN_2X64(IVP_CVT64SNX48LH(accProd), IVP_CVT64SNX48LL(accProd)); \ + xb_vecN_2x64w wvecAccH = IVP_CVT96UN_2X64(IVP_CVT64SNX48HH(accProd), IVP_CVT64SNX48HL(accProd)); \ + accProd = IVP_CVT48SNX32(IVP_PACKVRN_2X64W(wvecAccH, accShift2), IVP_PACKVRN_2X64W(wvecAccL, accShift2)); \ + vecRescale = IVP_PACKVRNX48(accProd, 0); \ + accProd = IVP_MULUSNX16(scale2L, vecRescale); \ + vecRescale = IVP_PACKVRNX48(accProd, shift2); \ +} + +#define PACK_SCALE_SHIFT_S32_S8(inReg, inReg1, scale2, shift2, seq1, dvecOut) { \ + xb_vecN_2x64w m_wvec = IVP_MULUSN_2X16X32_0((xb_vecNx16U) scale2, inReg); \ + xb_vecN_2x32v m_outL = IVP_PACKVRN_2X64W(m_wvec, shift2); \ + m_outL = IVP_MAXN_2X32(IVP_MINN_2X32(m_outL, SCHAR_MAX), SCHAR_MIN); \ + m_wvec = IVP_MULUSN_2X16X32_1((xb_vecNx16U) scale2, inReg1); \ + xb_vecN_2x32v m_outH = IVP_PACKVRN_2X64W(m_wvec, shift2); \ + m_outH = IVP_MAXN_2X32(IVP_MINN_2X32(m_outH, SCHAR_MAX), SCHAR_MIN); \ + dvecOut = IVP_SEL2NX8(IVP_MOV2NX8_FROMNX16(IVP_MOVNX16_FROMN_2X32(m_outH)), \ + IVP_MOV2NX8_FROMNX16(IVP_MOVNX16_FROMN_2X32(m_outL)), seq1); \ +} + +#define PACK_SCALE_SHIFT_S32_S8(inReg, inReg1, scale2, shift2, seq1, dvecOut) { \ + 
xb_vecN_2x64w m_wvec = IVP_MULUSN_2X16X32_0((xb_vecNx16U) scale2, inReg); \ + xb_vecN_2x32v m_outL = IVP_PACKVRN_2X64W(m_wvec, shift2); \ + m_outL = IVP_MAXN_2X32(IVP_MINN_2X32(m_outL, SCHAR_MAX), SCHAR_MIN); \ + m_wvec = IVP_MULUSN_2X16X32_1((xb_vecNx16U) scale2, inReg1); \ + xb_vecN_2x32v m_outH = IVP_PACKVRN_2X64W(m_wvec, shift2); \ + m_outH = IVP_MAXN_2X32(IVP_MINN_2X32(m_outH, SCHAR_MAX), SCHAR_MIN); \ + dvecOut = IVP_SEL2NX8(IVP_MOV2NX8_FROMNX16(IVP_MOVNX16_FROMN_2X32(m_outH)), \ + IVP_MOV2NX8_FROMNX16(IVP_MOVNX16_FROMN_2X32(m_outL)), seq1); \ +} + +#define ACC_INIT_BIAS(phvecBias, numBias, daccSum0, daccSum1, daccSum2, daccSum3) { \ + xb_vecN_2x32v hvecBias1, hvecBias2; \ + valign vaBias = IVP_LAN_2X32_PP(phvecBias); \ + IVP_LAVN_2X32_XP(hvecBias1, vaBias, phvecBias, 4 * numBias); \ + IVP_LAVN_2X32_XP(hvecBias2, vaBias, phvecBias, 4 * numBias - 2 * XCHAL_IVPN_SIMD_WIDTH); \ + daccSum0 = IVP_CVT24UNX32L(hvecBias2, hvecBias1); \ + daccSum1 = IVP_CVT24UNX32L(hvecBias2, hvecBias1); \ + daccSum2 = IVP_CVT24UNX32L(hvecBias2, hvecBias1); \ + daccSum3 = IVP_CVT24UNX32L(hvecBias2, hvecBias1); \ + IVP_LAVN_2X32_XP(hvecBias1, vaBias, phvecBias, 4 * numBias - 4 * XCHAL_IVPN_SIMD_WIDTH); \ + IVP_LAVN_2X32_XP(hvecBias2, vaBias, phvecBias, 4 * numBias - 6 * XCHAL_IVPN_SIMD_WIDTH); \ + IVP_CVT24UNX32H(daccSum0, hvecBias2, hvecBias1); \ + IVP_CVT24UNX32H(daccSum1, hvecBias2, hvecBias1); \ + IVP_CVT24UNX32H(daccSum2, hvecBias2, hvecBias1); \ + IVP_CVT24UNX32H(daccSum3, hvecBias2, hvecBias1); \ +} + +#define ACC_INIT_BIAS64_MOD_ONEACC(pdvecBias, vaBias, numBias, accSum64) { \ + xb_vec2Nx8 m_dvecBias1, m_dvecBias2, m_dvecBias3, m_dvecBias4; \ + IVP_LAV2NX8_XP(m_dvecBias1, vaBias, pdvecBias, numBias * 8); \ + IVP_LAV2NX8_XP(m_dvecBias2, vaBias, pdvecBias, numBias * 8 - 2 * XCHAL_IVPN_SIMD_WIDTH); \ + IVP_LAV2NX8_XP(m_dvecBias3, vaBias, pdvecBias, numBias * 8 - 4 * XCHAL_IVPN_SIMD_WIDTH); \ + IVP_LAV2NX8_XP(m_dvecBias4, vaBias, pdvecBias, numBias * 8 - 6 * XCHAL_IVPN_SIMD_WIDTH); 
\ + accSum64 = IVP_CVT48UN_2X64L(m_dvecBias2, m_dvecBias1); \ + IVP_CVT48UN_2X64H(accSum64, m_dvecBias4, m_dvecBias3); \ +} + +#define ACC_INIT_BIAS64_MOW_ONEACC(pBias64, vaBias, wvecAcc, flag) /* Initializes the single accumulator wvecAcc from one variable-length load of flag * 8 through pBias64, using the caller-supplied vaBias. The loaded vector is shuffled with IVP_SHFLI_REP_0X4 and then used for both conversion operands. The load goes through the pBias64 argument, not through a name captured at the expansion site, so the macro works whatever the caller's pointer is called. NOTE(review): the _XP load presumably advances pBias64 - confirm against the ISA reference. */ \ + { \ + xb_vec2Nx8 m_dvecBias64; IVP_LAV2NX8_XP(m_dvecBias64, vaBias, pBias64, flag * 8); \ + m_dvecBias64 = IVP_SHFL2NX8I(m_dvecBias64, IVP_SHFLI_REP_0X4); \ + wvecAcc = IVP_CVT48UN_2X64L(m_dvecBias64, m_dvecBias64); \ + IVP_CVT48UN_2X64H(wvecAcc, m_dvecBias64, m_dvecBias64); \ + } + +#define VQ_INIT_OUTSCALE(pOutScale, numOutScale, vecDataEven, vecDataOdd) { /* Loads two vectors of output scales through pOutScale (counts 2 * numOutScale and 2 * numOutScale - 2 * XCHAL_IVPN_SIMD_WIDTH) after priming a local alignment register, then splits them: vecDataEven takes the EXTRACT_1_OF_2_OFF_0 selection and vecDataOdd the EXTRACT_1_OF_2_OFF_1 selection. */ \ + xb_vecNx16U vecDataL, vecDataH; \ + valign vaScale = IVP_LANX16U_PP(pOutScale); \ + IVP_LAVNX16_XP(vecDataL, vaScale, pOutScale, 2 * numOutScale); \ + IVP_LAVNX16_XP(vecDataH, vaScale, pOutScale, 2 * numOutScale - 2 * XCHAL_IVPN_SIMD_WIDTH); \ + vecDataEven = IVP_SELNX16UI(vecDataH, vecDataL, IVP_SELI_16B_EXTRACT_1_OF_2_OFF_0); \ + vecDataOdd = IVP_SELNX16UI(vecDataH, vecDataL, IVP_SELI_16B_EXTRACT_1_OF_2_OFF_1); \ +} +#endif diff --git a/backends/cadence/vision/third-party/libxai/include/xai_cnn_api.h b/backends/cadence/vision/third-party/libxai/include/xai_cnn_api.h new file mode 100644 index 00000000000..07bd4dedbf3 --- /dev/null +++ b/backends/cadence/vision/third-party/libxai/include/xai_cnn_api.h @@ -0,0 +1,7041 @@ +/* + * Copyright (c) 2018 by Cadence Design Systems, Inc. ALL RIGHTS RESERVED. + * These coded instructions, statements, and computer programs are the + * copyrighted works and confidential proprietary information of + * Cadence Design Systems Inc. They may be adapted and modified by bona fide + * purchasers for internal use, but neither the original nor any adapted + * or modified version may be disclosed or distributed to third parties + * in any manner, medium, or form, in whole or in part, without the prior + * written consent of Cadence Design Systems Inc. This software and its + * derivatives are to be executed solely on products incorporating a Cadence + * Design Systems processor.
+ */ + +#ifndef __XAI_CNN_API_H__ +#define __XAI_CNN_API_H__ + +#include "xai_cnn_api_params.h" +#include "xai_config_api.h" +#include "xai_core_api.h" +#include "xai_tile_manager.h" +#include +#include + + +#if ((XCHAL_VISION_TYPE >= 6)) +/***************************************************************************************************/ +/****************************** Fixed Point routines declaration *********************************/ +/***************************************************************************************************/ + +/* Convolution wrapper functions */ +_XAI_API_ XAI_ERR_TYPE xaiConvolve3D(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE *xaiGetConvolve3DVariant(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + xai_cnn_conv_params *param); + +/* Convolution MOW*/ +_XAI_API_ XAI_ERR_TYPE xaiConvolve3D_S_1x1j1_S8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolve3D_S_1x1j1_U8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolve3D_S_3x3j1_S8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolve3D_S_3x3j1_U8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolve3D_S_5x5j1_S8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + xai_cnn_conv_params *param);
+ +_XAI_API_ XAI_ERR_TYPE xaiConvolve3D_S_5x5j1_U8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolve3D_S_7x7j1_S8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolve3D_S_7x7j1_U8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolve3D_S_MxNj1_S8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolve3D_S_MxNj1_U8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolve3D_S_1x1j2_S8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolve3D_S_1x1j2_U8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolve3D_S_3x3j2_S8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolve3D_S_3x3j2_U8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolve3D_S_5x5j2_S8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D 
outTile, + xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolve3D_S_5x5j2_U8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolve3D_S_7x7j2_S8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolve3D_S_7x7j2_U8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolve3D_S_MxNj2_S8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolve3D_S_MxNj2_U8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolve3D_S_1x1j4_S8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolve3D_S_1x1j4_U8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolve3D_S_3x3j4_S8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolve3D_S_3x3j4_U8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolve3D_S_5x5j4_S8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + 
const xai_pArray biasArray, + xai_pTile3D outTile, + xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolve3D_S_5x5j4_U8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolve3D_S_7x7j4_S8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolve3D_S_7x7j4_U8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolve3D_S_MxNj4_S8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolve3D_S_MxNj4_U8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + xai_cnn_conv_params *param); + +/* Convolution MOD */ +_XAI_API_ XAI_ERR_TYPE xaiConvolve3D_S_1x1_S8S8IXCa2_MOD_WHD_DWH(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolve3D_S_3x3_S8S8IXCa2_MOD_WHD_DWH(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolve3D_S_5x5_S8S8IXCa2_MOD_WHD_DWH(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolve3D_S_7x7_S8S8IXCa2_MOD_WHD_DWH(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE 
xaiConvolve3D_S_MxN_S8S8IXCa2_MOD_WHD_DWH(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolve3D_S_1x1_S8S8IXCa2_MOD_DWH(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolve3D_S_1x1_U8S8IXCa2_MOD_DWH(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolve3D_S_3x3_S8S8IXCa2_MOD_DWH(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolve3D_S_3x3_U8S8IXCa2_MOD_DWH(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolve3D_S_5x5_S8S8IXCa2_MOD_DWH(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolve3D_S_7x7_S8S8IXCa2_MOD_DWH(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolve3D_S_MxN_S8S8IXCa2_MOD_DWH(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + xai_cnn_conv_params *param); + +/* Convolution SO */ +_XAI_API_ XAI_ERR_TYPE xaiConvolve3D_S_MxN_S8S8IX_SO_DWH(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolve3D_S_MxN_U8S8IX_SO_DWH(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + 
xai_pTile3D outTile, + xai_cnn_conv_params *param); + +/* Convolution Fully connected */ +_XAI_API_ XAI_ERR_TYPE xaiFullyConnected3D(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiFullyConnected3D_S_S8S8IX(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiFullyConnected3D_S_U8S8IX(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiFullyConnected3D_S_S16S16I16(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiPartialFullyConnected3D(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D accArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiPartialFullyConnected3D_S_S8S8IXCa2(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D accTile, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiPartialFullyConnected3D_S_U8S8IXCa2(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D accTile, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiPartialFullyConnected3D_S_S16S16I16Ca2(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D accTile, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiPartialFullyConnected3D_S_S8S8IXCa2_QM32(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D 
accTile, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiPartialFullyConnected3D_S_U8S8IXCa2_QM32(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D accTile, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiFullyConnected3DWithBatching(const xai_pTile4D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pArray accArray, + xai_pTile4D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiFullyConnected3DWithBatching_S_S8S8IXCa2(const xai_pTile4D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pArray accArray, + xai_pTile4D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiFullyConnected3DWithBatching_S_U8S8IXCa2(const xai_pTile4D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pArray accArray, + xai_pTile4D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiFullyConnected3DWithBatching_S_S8U8IXCa2(const xai_pTile4D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pArray accArray, + xai_pTile4D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiFullyConnected3DWithBatching_S_U8U8IXCa2(const xai_pTile4D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pArray accArray, + xai_pTile4D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiFullyConnected3DWithBatching_S_U8S8IXCa2_NoBU(const xai_pTile4D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pArray accArray, + xai_pTile4D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiFullyConnected3DWithBatching_S_S16S16I16(const xai_pTile4D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pArray accArray, + xai_pTile4D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ 
XAI_ERR_TYPE xaiFullyConnected3DWithBatching_S_U16S16I16(const xai_pTile4D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pArray accArray, + xai_pTile4D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiFullyConnected3DWithBatching_S_S16U16I16(const xai_pTile4D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pArray accArray, + xai_pTile4D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiFullyConnected3D2_S_S8(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiFullyConnected3D2_S_U8S8U8(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiFullyConnected3D2_S_U8(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +/* Dilated Convolution wrapper function */ +_XAI_API_ XAI_ERR_TYPE xaiConvolved3D(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE *xaiGetConvolved3DVariant(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +/* Dilated Convolution MOW, dilation = 1 */ +_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_1x1j1d1_S8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_1x1j1d1_U8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE 
xaiConvolved3D_S_2x2j1d1_S8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_2x2j1d1_U8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_3x3j1d1_S8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_3x3j1d1_U8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_4x4j1d1_S8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_4x4j1d1_U8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_5x5j1d1_S8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_5x5j1d1_U8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_7x7j1d1_S8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_7x7j1d1_U8S8IX_MOW_WHD(const xai_pTile3D inTile, + const 
xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_MxNj1d1_S8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_MxNj1d1_U8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_MxNj1d1_S16S16I16_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_1x1j2d1_S8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_1x1j2d1_U8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_3x3j2d1_S8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_3x3j2d1_U8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_5x5j2d1_S8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_5x5j2d1_U8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + 
const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_7x7j2d1_S8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_7x7j2d1_U8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_MxNj2d1_S8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_MxNj2d1_U8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_MxNj2d1_S16S16I16_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_1x1j4d1_S8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_1x1j4d1_U8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_3x3j4d1_S8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_3x3j4d1_U8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE 
xaiConvolved3D_S_5x5j4d1_S8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_5x5j4d1_U8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_7x7j4d1_S8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_7x7j4d1_U8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_MxNj4d1_S8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_MxNj4d1_U8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_MxNj4d1_S16S16I16_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +/* Dilated Convolution MOW, dilation = 2*/ +_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_3x3j1d2_S8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_3x3j1d2_U8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE 
xaiConvolved3D_S_5x5j1d2_S8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_5x5j1d2_U8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_7x7j1d2_S8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_7x7j1d2_U8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_MxNj1d2_S8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_MxNj1d2_U8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +/* Dilated Convolution MOW, dilation = 4 */ +_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_3x3j1d4_S8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_3x3j1d4_U8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_5x5j1d4_S8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE 
xaiConvolved3D_S_5x5j1d4_U8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_7x7j1d4_S8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_7x7j1d4_U8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_MxNj1d4_S8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); +_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_MxNj1d4_U8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +/* Dilated Convolution MOD*/ +_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_1x1_S8S8IXCa2_MOD_WHD_DWH(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_2x2_S8S8IXCa2_MOD_WHD_DWH(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_3x3_S8S8IXCa2_MOD_WHD_DWH(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_4x4_S8S8IXCa2_MOD_WHD_DWH(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE 
xaiConvolved3D_S_5x5_S8S8IXCa2_MOD_WHD_DWH(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_7x7_S8S8IXCa2_MOD_WHD_DWH(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_MxN_S8S8IXCa2_MOD_WHD_DWH(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_1x1_S8S8IXCa2_MOD_DWH(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_2x2_S8S8IXCa2_MOD_DWH(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_3x3_S8S8IXCa2_MOD_DWH(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_4x4_S8S8IXCa2_MOD_DWH(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_5x5_S8S8IXCa2_MOD_DWH(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_7x7_S8S8IXCa2_MOD_DWH(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_MxN_S8S8IXCa2_MOD_DWH(const xai_pTile3D inTile, + const 
xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_MxN_U8S8IXCa2_MOD_DWH(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_MxN_S16S16I16_MOD_DWH(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +/* Partial convolution */ +_XAI_API_ XAI_ERR_TYPE xaiPartialConvolved3D_S_MxN_S8S8IXCa2_MOD_DWH(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D accTile, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiPartialConvolved3D_S_MxN_U8S8IXCa2_MOD_DWH(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D accTile, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiPartialConvolved3D_S_MxN_S8S8IXCa2_MOD_DWH_QM32(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D accTile, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +/* MOD_DWH S16S16 Partial Convolution variant */ +_XAI_API_ XAI_ERR_TYPE xaiPartialConvolved3D_S_MxN_S16S16I16_MOD_DWH(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D accTile, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +/* Dilated Convolution SO*/ +_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_MxN_S8S8IX_SO_DWH(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_MxN_U8S8IX_SO_DWH(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + 
const xai_cnn_conv_params *param); + +/* Depthwise Convolution wrapper function */ +_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolve2D(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE *xaiGetDepthwiseConvolve2DVariant(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +/* Depthwise Convolutions MOW */ +_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolve2D_S_3x3j1_S8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolve2D_S_3x3j1_U8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolve2D_S_5x5j1_S8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolve2D_S_5x5j1_U8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolve2D_S_7x7j1_S8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolve2D_S_7x7j1_U8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolve2D_S_MxNj1_S8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + 
xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolve2D_S_MxNj1_U8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolve2D_S_3x3j2_S8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolve2D_S_3x3j2_U8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolve2D_S_5x5j2_S8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolve2D_S_5x5j2_U8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolve2D_S_7x7j2_S8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolve2D_S_7x7j2_U8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolve2D_S_MxNj2_S8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolve2D_S_MxNj2_U8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const 
xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolve2D_S_3x3j4_S8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolve2D_S_3x3j4_U8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolve2D_S_5x5j4_S8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolve2D_S_5x5j4_U8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolve2D_S_7x7j4_S8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolve2D_S_7x7j4_U8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolve2D_S_MxNj4_S8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolve2D_S_MxNj4_U8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +/* Depthwise MOW Convolution MOW 16-bit Variants */ + +_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolve2D_S_MxNj1_S16S16I16_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + 
xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolve2D_S_MxNj2_S16S16I16_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolve2D_S_MxNj4_S16S16I16_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); +/* Depthwise MOW Convolution VQ variants*/ + +_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolveVQ2D_S_3x3j1_S8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolveVQ2D_S_3x3j1_U8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolveVQ2D_S_5x5j1_S8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolveVQ2D_S_5x5j1_U8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolveVQ2D_S_7x7j1_S8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolveVQ2D_S_7x7j1_U8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + 
const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolveVQ2D_S_MxNj1_S8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolveVQ2D_S_MxNj1_U8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolveVQ2D_S_3x3j2_S8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolveVQ2D_S_3x3j2_U8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolveVQ2D_S_5x5j2_S8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolveVQ2D_S_5x5j2_U8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolveVQ2D_S_7x7j2_S8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolveVQ2D_S_7x7j2_U8S8IX_MOW_WHD(const xai_pTile3D 
inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolveVQ2D_S_MxNj2_S8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolveVQ2D_S_MxNj2_U8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolveVQ2D_S_3x3j4_S8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolveVQ2D_S_3x3j4_U8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolveVQ2D_S_5x5j4_S8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolveVQ2D_S_5x5j4_U8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolveVQ2D_S_7x7j4_S8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE 
xaiDepthwiseConvolveVQ2D_S_7x7j4_U8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolveVQ2D_S_MxNj4_S8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolveVQ2D_S_MxNj4_U8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +/* Depthwise MOW Convolution VQ 16-bit Variants */ + +_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolveVQ2D_S_MxNj1_S16S16I16_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolveVQ2D_S_MxNj2_S16S16I16_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolveVQ2D_S_MxNj4_S16S16I16_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +/* Depthwise Convolutions MOD */ +_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolve2D_S_3x3_S8S8IXCa2_MOD_DWH(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolve2D_S_3x3_U8S8IXCa2_MOD_DWH(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, 
+ xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolve2D_S_5x5_S8S8IXCa2_MOD_DWH(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolve2D_S_5x5_U8S8IXCa2_MOD_DWH(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolve2D_S_7x7_S8S8IXCa2_MOD_DWH(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolve2D_S_MxN_S8S8IXCa2_MOD_DWH(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolve2D_S_MxN_U8S8IXCa2_MOD_DWH(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); +/* Depthwise MOD16 Variants */ + +_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolve2D_S_MxN_S16S16I16_MOD_DWH(const xai_pTile3D pinTile, + const xai_pTile3D pcoeffTile, + const xai_pArray pbiasArray, + xai_pTile3D poutTile, + const xai_cnn_conv_params *pconvParam); + +/* Depthwise MOD VQ Convolution variants */ + +_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolveVQ2D_S_3x3_S8S8IXCa2_MOD_DWH(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolveVQ2D_S_3x3_U8S8IXCa2_MOD_DWH(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ 
XAI_ERR_TYPE xaiDepthwiseConvolveVQ2D_S_5x5_S8S8IXCa2_MOD_DWH(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolveVQ2D_S_5x5_U8S8IXCa2_MOD_DWH(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolveVQ2D_S_7x7_S8S8IXCa2_MOD_DWH(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolveVQ2D_S_MxN_S8S8IXCa2_MOD_DWH(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolveVQ2D_S_MxN_U8S8IXCa2_MOD_DWH(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); +/* Depthwise MOD16 Variants */ + +_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolveVQ2D_S_MxN_S16S16I16_MOD_DWH(const xai_pTile3D pinTile, + const xai_pTile3D pcoeffTile, + const xai_pArray pbiasArray, + const xai_pArray poutputScaleArray, + xai_pTile3D poutTile, + const xai_cnn_conv_params *pconvParam); + +_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolveVQ2D(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE *xaiGetDepthwiseConvolveVQ2DVariant(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + const xai_pArray 
outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolvedVQ2D(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_depthwiseDilatedConv_params *param); + +_XAI_API_ XAI_ERR_TYPE *xaiGetDepthwiseConvolvedVQ2DVariant(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_depthwiseDilatedConv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolvedVQ2D_S_MxNj1d4_S8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_depthwiseDilatedConv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolvedVQ2D_S_MxNj1d4_U8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_depthwiseDilatedConv_params *param); + + +/*Depthwise dilated wrapper function*/ +_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolved2D(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_depthwiseDilatedConv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolvedVQ2D_S_MxNj1d2_S8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_depthwiseDilatedConv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolvedVQ2D_S_MxNj1d2_U8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_depthwiseDilatedConv_params *param); + +_XAI_API_ XAI_ERR_TYPE 
*xaiGetDepthwiseConvolved2DVariant(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_depthwiseDilatedConv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolved2D_S_MxNj1d4_S8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_depthwiseDilatedConv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolved2D_S_MxNj1d4_U8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_depthwiseDilatedConv_params *param); + +/*_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolved2D_S_3x3j1d4_S8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_depthwiseDilatedConv_params *param); + + _XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolved2D_S_3x3j1d4_U8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_depthwiseDilatedConv_params *param); + */ +/*Depthwise dilated MOW convolution variants*/ +/*_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolved2D_S_3x3j1d2_S8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_depthwiseDilatedConv_params *param); + + _XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolved2D_S_3x3j1d2_U8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_depthwiseDilatedConv_params *param); + + _XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolved2D_S_5x5j1d2_S8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_depthwiseDilatedConv_params *param); + + _XAI_API_ XAI_ERR_TYPE 
xaiDepthwiseConvolved2D_S_5x5j1d2_U8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_depthwiseDilatedConv_params *param); + + _XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolved2D_S_7x7j1d2_S8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_depthwiseDilatedConv_params *param); + + _XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolved2D_S_7x7j1d2_U8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_depthwiseDilatedConv_params *param); + */ +_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolved2D_S_MxNj1d2_S8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_depthwiseDilatedConv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolved2D_S_MxNj1d2_U8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_depthwiseDilatedConv_params *param); + +/* + _XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolved2D_S_5x5j1d4_S8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_depthwiseDilatedConv_params *param); + + _XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolved2D_S_5x5j1d4_U8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_depthwiseDilatedConv_params *param); + + _XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolved2D_S_7x7j1d4_S8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_depthwiseDilatedConv_params *param); + + _XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolved2D_S_7x7j1d4_U8S8IX_MOW_WHD(const xai_pTile3D inTile, + const 
xai_pTile3D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_depthwiseDilatedConv_params *param); + + */ +/*Depthwise Dilated MOD convolution variants*/ +_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolved2D_S_3x3_S8S8IX_MOD_DWH(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_depthwiseDilatedConv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolved2D_S_5x5_S8S8IX_MOD_DWH(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_depthwiseDilatedConv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolved2D_S_7x7_S8S8IX_MOD_DWH(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_depthwiseDilatedConv_params *param); + + +_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolved2D_S_MxN_S8S8IX_MOD_DWH(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_depthwiseDilatedConv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolved2D_S_MxN_U8S8IX_MOD_DWH(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_depthwiseDilatedConv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolved2D_S_MxN_S16S16I16_MOD_DWH(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_depthwiseDilatedConv_params *param); +_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolved2D_S_5x5_S16S16I16_MOD_DWH(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_depthwiseDilatedConv_params *param); +_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolved2D_S_3x3_S16S16I16_MOD_DWH(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + xai_pTile3D 
outTile, + const xai_cnn_depthwiseDilatedConv_params *param); + +/* + _XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolvedVQ2D_S_3x3j1d2_S8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_depthwiseDilatedConv_params *param); + + _XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolvedVQ2D_S_3x3j1d2_U8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_depthwiseDilatedConv_params *param); + + _XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolvedVQ2D_S_5x5j1d2_S8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_depthwiseDilatedConv_params *param); + + _XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolvedVQ2D_S_5x5j1d2_U8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_depthwiseDilatedConv_params *param); + + _XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolvedVQ2D_S_7x7j1d2_S8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_depthwiseDilatedConv_params *param); + + _XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolvedVQ2D_S_7x7j1d2_U8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_depthwiseDilatedConv_params *param); + + _XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolvedVQ2D_S_3x3j1d4_S8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const 
xai_cnn_depthwiseDilatedConv_params *param); + + _XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolvedVQ2D_S_3x3j1d4_U8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_depthwiseDilatedConv_params *param); + + _XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolvedVQ2D_S_5x5j1d4_S8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_depthwiseDilatedConv_params *param); + + _XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolvedVQ2D_S_5x5j1d4_U8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_depthwiseDilatedConv_params *param); + + _XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolvedVQ2D_S_7x7j1d4_S8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_depthwiseDilatedConv_params *param); + + _XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolvedVQ2D_S_7x7j1d4_U8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_depthwiseDilatedConv_params *param); + */ + + +/* VQ variants */ +/*Depthwise Dilated MOD convolution variants*/ + +_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolvedVQ2D_S_3x3_S8S8IX_MOD_DWH(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_depthwiseDilatedConv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolvedVQ2D_S_5x5_S8S8IX_MOD_DWH(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D 
outTile, + const xai_cnn_depthwiseDilatedConv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolvedVQ2D_S_7x7_S8S8IX_MOD_DWH(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_depthwiseDilatedConv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolvedVQ2D_S_MxN_S8S8IX_MOD_DWH(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_depthwiseDilatedConv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolvedVQ2D_S_MxN_U8S8IX_MOD_DWH(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_depthwiseDilatedConv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolvedVQ2D_S_MxN_S16S16I16_MOD_DWH(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_depthwiseDilatedConv_params *param); +_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolvedVQ2D_S_3x3_S16S16I16_MOD_DWH(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_depthwiseDilatedConv_params *param); +_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolvedVQ2D_S_5x5_S16S16I16_MOD_DWH(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_depthwiseDilatedConv_params *param); + +/* Depthwise DM MOD convolve */ +_XAI_API_ XAI_ERR_TYPE xaiDepthwiseDMConvolved2D_S_MxN_S8S8IX_MOD_DWH(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_depthwiseDilatedConv_params *param); + +_XAI_API_ XAI_ERR_TYPE 
xaiDepthwiseDMConvolved2D_S_MxN_U8S8IX_MOD_DWH(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_depthwiseDilatedConv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiDepthwiseDMConvolved2D_S_MxN_S16S16I16_MOD_DWH(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_depthwiseDilatedConv_params *param); + +/* Depthwise DM MOD convolve VQ */ +_XAI_API_ XAI_ERR_TYPE xaiDepthwiseDMConvolvedVQ2D_S_MxN_S8S8IX_MOD_DWH(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_depthwiseDilatedConv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiDepthwiseDMConvolvedVQ2D_S_MxN_U8S8IX_MOD_DWH(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_depthwiseDilatedConv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiDepthwiseDMConvolvedVQ2D_S_MxN_S16S16I16_MOD_DWH(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_depthwiseDilatedConv_params *param); + +/*_XAI_API_ XAI_ERR_TYPE xaiDepthwiseDMConvolvedReorderCoeff2D_MOD(const xai_pTile3D srcTile, const xai_pArray biasArray, + xai_pTile3D dstTile, xai_pArray biasArrayReOrder, + const int32_t inDepth, const int32_t depthMultiplier); + */ + +/* VQ variants */ + +/* Dilated convolution wrapper */ + +_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE *xaiGetConvolvedVQ3DVariant(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray 
outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_3x3j1d1_S8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_3x3j1d1_U8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +/* Dilated MOD VQ Convolution variants */ +_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_1x1_S8S8IXCa2_MOD_WHD_DWH(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); +_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_2x2_S8S8IXCa2_MOD_WHD_DWH(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_3x3_S8S8IXCa2_MOD_WHD_DWH(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_4x4_S8S8IXCa2_MOD_WHD_DWH(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_5x5_S8S8IXCa2_MOD_WHD_DWH(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_7x7_S8S8IXCa2_MOD_WHD_DWH(const xai_pTile3D inTile, + 
const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_MxN_S8S8IXCa2_MOD_WHD_DWH(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +/* Dilated Convolution MOW, dilation = 1 */ + +_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_1x1j1d1_S8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_1x1j1d1_U8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_2x2j1d1_S8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_2x2j1d1_U8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + + + +_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_4x4j1d1_S8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_4x4j1d1_U8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE 
xaiConvolvedVQ3D_S_5x5j1d1_S8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_5x5j1d1_U8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_7x7j1d1_S8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_7x7j1d1_U8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_MxNj1d1_S8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_MxNj1d1_U8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_MxNj1d1_S16S16I16_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_1x1j2d1_S8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + 
+_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_1x1j2d1_U8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_3x3j2d1_S8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_3x3j2d1_U8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_5x5j2d1_S8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_5x5j2d1_U8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_7x7j2d1_S8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_7x7j2d1_U8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_MxNj2d1_S8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const 
xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_MxNj2d1_U8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_MxNj2d1_S16S16I16_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_1x1j4d1_S8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_1x1j4d1_U8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_3x3j4d1_S8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_3x3j4d1_U8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_5x5j4d1_S8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_5x5j4d1_U8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + 
xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_7x7j4d1_S8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_7x7j4d1_U8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_MxNj4d1_S8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_MxNj4d1_U8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_MxNj4d1_S16S16I16_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +/* Dilated Convolutions MOW, dilation = 2*/ + +_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_3x3j1d2_S8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_3x3j1d2_U8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_5x5j1d2_S8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + 
const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_5x5j1d2_U8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_7x7j1d2_S8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_7x7j1d2_U8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_MxNj1d2_S8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_MxNj1d2_U8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +/* Dilated Convolutions MOW, dilation = 4 */ + +_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_3x3j1d4_S8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_3x3j1d4_U8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE 
xaiConvolvedVQ3D_S_5x5j1d4_S8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_5x5j1d4_U8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_7x7j1d4_S8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_7x7j1d4_U8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_MxNj1d4_S8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_MxNj1d4_U8S8IX_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +/* Dilated Convolution MOD_DWH - VQ variants */ + +_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_1x1_S8S8IXCa2_MOD_DWH(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_2x2_S8S8IXCa2_MOD_DWH(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D 
outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_3x3_S8S8IXCa2_MOD_DWH(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_4x4_S8S8IXCa2_MOD_DWH(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_5x5_S8S8IXCa2_MOD_DWH(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_7x7_S8S8IXCa2_MOD_DWH(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_MxN_S8S8IXCa2_MOD_DWH(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_MxN_S8S8IXCa2_noUnrollH_MOD_DWH(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_MxN_S8S8IXCa2_noUnrollH_MOD_DWH(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_1x1_U8S8IXCa2_MOD_DWH(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + 
const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_1x1_U8S8IXCa2_MOD_DWH(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_3x3_U8S8IXCa2_MOD_DWH(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_3x3_U8S8IXCa2_MOD_DWH(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_MxN_U8S8IXCa2_MOD_DWH(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_MxN_U8S8IXCa2_noUnrollH_MOD_DWH(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_MxN_U8S8IXCa2_noUnrollH_MOD_DWH(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_MxN_S16S16I16_MOD_DWH(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +/*Bias update function*/ +_XAI_API_ XAI_ERR_TYPE xaiConvolvedBiasUpdate_S8S32(const xai_pTile4D coeffTile, + xai_pArray biasArray); + +/* Reorder a 4D tile to IN64DWH format */ +_XAI_API_ XAI_ERR_TYPE xaiReOrder4DToIN64DWH_I8(xai_pTile4D coeffTileIn, xai_pTile4D coeffTileOut); + +/* Reorder a 4D tile 
to IN32DWH format */ +_XAI_API_ XAI_ERR_TYPE xaiReOrder4DToIN32DWH_I16(xai_pTile4D coeffTileIn, xai_pTile4D coeffTileOut); + +/* Partial convolution */ +_XAI_API_ XAI_ERR_TYPE xaiPartialConvolvedVQ3D_S_MxN_S8S8IXCa2_MOD_DWH(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D accTile, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiPartialConvolvedVQ3D_S_MxN_S8S8IXCa2_noUnrollH_MOD_DWH(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D accTile, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiPartialConvolved3D_S_MxN_S8S8IXCa2_noUnrollH_MOD_DWH(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D accTile, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiPartialConvolvedVQ3D_S_MxN_U8S8IXCa2_MOD_DWH(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D accTile, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiPartialConvolvedVQ3D_S_MxN_U8S8IXCa2_noUnrollH_MOD_DWH(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D accTile, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiPartialConvolved3D_S_MxN_U8S8IXCa2_noUnrollH_MOD_DWH(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D accTile, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiPartialConvolvedVQ3D_S_MxN_S8S8IXCa2_MOD_DWH_QM32(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray 
outputScaleArray, + xai_pTile3D accTile, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +/* MOD_DWH S16S16 Partial Convolution VQ variant */ +_XAI_API_ XAI_ERR_TYPE xaiPartialConvolvedVQ3D_S_MxN_S16S16I16_MOD_DWH(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D accTile, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +/* MxN SO VQ variants */ + +_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_MxN_S8S8IX_SO_DWH(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolvedVQ3D_S_MxN_U8S8IX_SO_DWH(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + + +/* MxN Fully Connected VQ variants */ + +_XAI_API_ XAI_ERR_TYPE xaiFullyConnectedVQ3D(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiFullyConnectedVQ3D_S_S8S8IX(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiFullyConnectedVQ3D_S_U8S8IX(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiFullyConnectedVQ3D_S_S16S16I16(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE 
xaiPartialFullyConnectedVQ3D(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D accTile, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiPartialFullyConnectedVQ3D_S_S8S8IXCa2(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D accTile, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiPartialFullyConnectedVQ3D_S_U8S8IXCa2(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D accTile, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiPartialFullyConnectedVQ3D_S_S16S16I16Ca2(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D accTile, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiPartialFullyConnectedVQ3D_S_S8S8IXCa2_QM32(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D accTile, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiPartialFullyConnectedVQ3D_S_U8S8IXCa2_QM32(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + const xai_pArray outputScaleArray, + xai_pTile3D accTile, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiFullyConnectedVQ3DWithBatching(const xai_pTile4D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pArray accArray, + const xai_pArray outputScaleArray, + xai_pTile4D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiFullyConnectedVQ3DWithBatching_S_S8S8IXCa2(const xai_pTile4D inTile, + const 
xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pArray accArray, + const xai_pArray outputScaleArray, + xai_pTile4D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiFullyConnectedVQ3DWithBatching_S_U8S8IXCa2(const xai_pTile4D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pArray accArray, + const xai_pArray outputScaleArray, + xai_pTile4D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiFullyConnectedVQ3DWithBatching_S_S8U8IXCa2(const xai_pTile4D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pArray accArray, + const xai_pArray outputScaleArray, + xai_pTile4D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiFullyConnectedVQ3DWithBatching_S_U8U8IXCa2(const xai_pTile4D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pArray accArray, + const xai_pArray outputScaleArray, + xai_pTile4D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiFullyConnectedVQ3DWithBatching_S_U8S8IXCa2_NoBU(const xai_pTile4D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pArray accArray, + const xai_pArray outputScaleArray, + xai_pTile4D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiFullyConnectedVQ3DWithBatching_S_S16S16I16(const xai_pTile4D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pArray accArray, + const xai_pArray outputScaleArray, + xai_pTile4D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiFullyConnectedVQ3DWithBatching_S_U16S16I16(const xai_pTile4D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pArray accArray, + const xai_pArray outputScaleArray, + xai_pTile4D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiFullyConnectedVQ3DWithBatching_S_S16U16I16(const xai_pTile4D inTile, + const xai_pTile4D coeffTile, + const xai_pArray 
biasArray, + xai_pArray accArray, + const xai_pArray outputScaleArray, + xai_pTile4D outTile, + const xai_cnn_conv_params *param); + +/* Max Pool */ +_XAI_API_ XAI_ERR_TYPE xaiMaxPool3D(const xai_pTile3D inTile, + xai_pTile3D outTile, + const xai_cnn_pooling_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiMaxPool3D_MxNj1_S8_WHD(const xai_pTile3D inTile, + xai_pTile3D outTile, + const xai_cnn_pooling_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiMaxPool3D_MxNj1_U8_WHD(const xai_pTile3D inTile, + xai_pTile3D outTile, + const xai_cnn_pooling_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiMaxPool3D_MxNj1_S16_WHD(const xai_pTile3D inTile, + xai_pTile3D outTile, + const xai_cnn_pooling_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiMaxPool3D_MxNj2_S8_WHD(const xai_pTile3D inTile, + xai_pTile3D outTile, + const xai_cnn_pooling_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiMaxPool3D_MxNj2_U8_WHD(const xai_pTile3D inTile, + xai_pTile3D outTile, + const xai_cnn_pooling_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiMaxPool3D_MxNj2_S16_WHD(const xai_pTile3D inTile, + xai_pTile3D outTile, + const xai_cnn_pooling_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiMaxPool3D_MxN_S8_WHD(const xai_pTile3D inTile, + xai_pTile3D outTile, + const xai_cnn_pooling_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiMaxPool3D_MxN_U8_WHD(const xai_pTile3D inTile, + xai_pTile3D outTile, + const xai_cnn_pooling_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiMaxPool3D_MxN_S16_WHD(const xai_pTile3D inTile, + xai_pTile3D outTile, + const xai_cnn_pooling_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiMaxPool3D_MxN_S8_DWH(const xai_pTile3D inTile, + xai_pTile3D outTile, + const xai_cnn_pooling_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiMaxPool3D_MxN_U8_DWH(const xai_pTile3D inTile, + xai_pTile3D outTile, + const xai_cnn_pooling_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiMaxPool3D_MxN_S16_DWH(const xai_pTile3D inTile, + xai_pTile3D outTile, + const xai_cnn_pooling_params *param); + +/* MaxPoolWithIdx Variants */ + 
+_XAI_API_ XAI_ERR_TYPE xaiMaxPoolWithIdx3D(const xai_pTile3D inTile, + xai_pTile3D outTile, + xai_pTile3D idxTile, + const xai_cnn_pooling_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiMaxPoolWithIdx3D_MxNj1_S8_WHD(const xai_pTile3D inTile, + xai_pTile3D outTile, + xai_pTile3D idxTile, + const xai_cnn_pooling_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiMaxPoolWithIdx3D_MxNj2_S8_WHD(const xai_pTile3D inTile, + xai_pTile3D outTile, + xai_pTile3D idxTile, + const xai_cnn_pooling_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiMaxPoolWithIdx3D_MxN_S8_DWH(const xai_pTile3D inTile, + xai_pTile3D outTile, + xai_pTile3D idxTile, + const xai_cnn_pooling_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiMaxPoolWithIdx3D_MxNj1_U8_WHD(const xai_pTile3D inTile, + xai_pTile3D outTile, + xai_pTile3D idxTile, + const xai_cnn_pooling_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiMaxPoolWithIdx3D_MxNj2_U8_WHD(const xai_pTile3D inTile, + xai_pTile3D outTile, + xai_pTile3D idxTile, + const xai_cnn_pooling_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiMaxPoolWithIdx3D_MxN_U8_DWH(const xai_pTile3D inTile, + xai_pTile3D outTile, + xai_pTile3D idxTile, + const xai_cnn_pooling_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiMaxPoolWithIdx3D_MxNj1_S16_WHD(const xai_pTile3D inTile, + xai_pTile3D outTile, + xai_pTile3D idxTile, + const xai_cnn_pooling_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiMaxPoolWithIdx3D_MxNj2_S16_WHD(const xai_pTile3D inTile, + xai_pTile3D outTile, + xai_pTile3D idxTile, + const xai_cnn_pooling_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiMaxPoolWithIdx3D_MxN_S16_DWH(const xai_pTile3D inTile, + xai_pTile3D outTile, + xai_pTile3D idxTile, + const xai_cnn_pooling_params *param); + + +/* MaxUnPool Variants */ + + +_XAI_API_ XAI_ERR_TYPE xaiMaxUnPool3D(const xai_pTile3D inTile, + const xai_pTile3D idxTile, + xai_pTile3D outTile, + const xai_cnn_pooling_params *param); +/* + _XAI_API_ XAI_ERR_TYPE xaiMaxUnPool3D_MxNj1_S8_WHD(const xai_pTile3D inTile, + const xai_pTile3D idxTile, + 
xai_pTile3D outTile, + const xai_cnn_pooling_params *param); + */ +_XAI_API_ XAI_ERR_TYPE xaiMaxUnPool3D_MxNj2_S8_WHD(const xai_pTile3D inTile, + const xai_pTile3D idxTile, + xai_pTile3D outTile, + const xai_cnn_pooling_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiMaxUnPool3D_MxN_S8_DWH(const xai_pTile3D inTile, + const xai_pTile3D idxTile, + xai_pTile3D outTile, + const xai_cnn_pooling_params *param); +/* + _XAI_API_ XAI_ERR_TYPE xaiMaxUnPool3D_MxNj1_U8_WHD(const xai_pTile3D inTile, + const xai_pTile3D idxTile, + xai_pTile3D outTile, + const xai_cnn_pooling_params *param); + */ +_XAI_API_ XAI_ERR_TYPE xaiMaxUnPool3D_MxNj2_U8_WHD(const xai_pTile3D inTile, + const xai_pTile3D idxTile, + xai_pTile3D outTile, + const xai_cnn_pooling_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiMaxUnPool3D_MxN_U8_DWH(const xai_pTile3D inTile, + const xai_pTile3D idxTile, + xai_pTile3D outTile, + const xai_cnn_pooling_params *param); + +/* + _XAI_API_ XAI_ERR_TYPE xaiMaxUnPool3D_MxNj1_S16_WHD(const xai_pTile3D inTile, + const xai_pTile3D idxTile, + xai_pTile3D outTile, + const xai_cnn_pooling_params *param); + */ +_XAI_API_ XAI_ERR_TYPE xaiMaxUnPool3D_MxNj2_S16_WHD(const xai_pTile3D inTile, + const xai_pTile3D idxTile, + xai_pTile3D outTile, + const xai_cnn_pooling_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiMaxUnPool3D_MxNj2_F16_WHD(const xai_pTile3D inTile, + const xai_pTile3D idxTile, + xai_pTile3D outTile, + const xai_cnn_pooling_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiMaxUnPool3D_MxN_S16_DWH(const xai_pTile3D inTile, + const xai_pTile3D idxTile, + xai_pTile3D outTile, + const xai_cnn_pooling_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiMaxUnPool3D_MxN_F16_DWH(const xai_pTile3D inTile, + const xai_pTile3D idxTile, + xai_pTile3D outTile, + const xai_cnn_pooling_params *param); +/* RoI Max Pool Variants */ +_XAI_API_ XAI_ERR_TYPE xaiRoiMaxPool3D(const xai_pTile3D inTile, + const xai_pArray RoIParam, + xai_pTile4D outTile, + const xai_cnn_roi_pooling_params *param); + +_XAI_API_ 
XAI_ERR_TYPE xaiRoiMaxPool3D_U8_DWH(const xai_pTile3D inTile, + const xai_pArray RoIParam, + xai_pTile4D outTile, + const xai_cnn_roi_pooling_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiRoiMaxPool3D_S8_DWH(const xai_pTile3D inTile, + const xai_pArray RoIParam, + xai_pTile4D outTile, + const xai_cnn_roi_pooling_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiRoiMaxPool3D_S16_DWH(const xai_pTile3D inTile, + const xai_pArray RoIParam, + xai_pTile4D outTile, + const xai_cnn_roi_pooling_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiRoiMaxPoolWithIdx3D(const xai_pTile3D inTile, + const xai_pArray RoIParam, + xai_pTile4D outTile, + xai_pTile4D idxTile, + const xai_cnn_roi_pooling_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiRoiMaxPoolWithIdx3D_U8_DWH(const xai_pTile3D inTile, + const xai_pArray RoIParam, + xai_pTile4D outTile, + xai_pTile4D idxTile, + const xai_cnn_roi_pooling_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiRoiMaxPoolWithIdx3D_S8_DWH(const xai_pTile3D inTile, + const xai_pArray RoIParam, + xai_pTile4D outTile, + xai_pTile4D idxTile, + const xai_cnn_roi_pooling_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiRoiMaxPoolWithIdx3D_S16_DWH(const xai_pTile3D inTile, + const xai_pArray RoIParam, + xai_pTile4D outTile, + xai_pTile4D idxTile, + const xai_cnn_roi_pooling_params *param); + +/* Average Pool */ +_XAI_API_ XAI_ERR_TYPE xaiAvgPool3D(const xai_pTile3D inTile, + xai_pArray bufArray, + xai_pTile3D outTile, + const xai_cnn_pooling_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiAvgPool3D_MxNj1_S8_WHD(const xai_pTile3D inTile, + xai_pArray bufArray, + xai_pTile3D outTile, + const xai_cnn_pooling_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiAvgPool3D_MxNj1_U8_WHD(const xai_pTile3D inTile, + xai_pArray bufArray, + xai_pTile3D outTile, + const xai_cnn_pooling_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiAvgPool3D_MxNj1_S16_WHD(const xai_pTile3D inTile, + xai_pArray bufArray, + xai_pTile3D outTile, + const xai_cnn_pooling_params *param); + + +_XAI_API_ XAI_ERR_TYPE 
xaiAvgPool3D_MxNj1_S8U8_WHD(const xai_pTile3D inTile, + xai_pArray bufArray, + xai_pTile3D outTile, + const xai_cnn_pooling_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiAvgPool3D_MxNj1_S8S16_WHD(const xai_pTile3D inTile, + xai_pArray bufArray, + xai_pTile3D outTile, + const xai_cnn_pooling_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiAvgPool3D_MxNj1_U8S8_WHD(const xai_pTile3D inTile, + xai_pArray bufArray, + xai_pTile3D outTile, + const xai_cnn_pooling_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiAvgPool3D_MxNj1_U8S16_WHD(const xai_pTile3D inTile, + xai_pArray bufArray, + xai_pTile3D outTile, + const xai_cnn_pooling_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiAvgPool3D_MxN_S8_WHD(const xai_pTile3D inTile, + xai_pTile3D outTile, + const xai_cnn_pooling_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiAvgPool3D_MxN_U8_WHD(const xai_pTile3D inTile, + xai_pTile3D outTile, + const xai_cnn_pooling_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiAvgPool3D_MxN_S16_WHD(const xai_pTile3D inTile, + xai_pTile3D outTile, + const xai_cnn_pooling_params *param); + + +_XAI_API_ XAI_ERR_TYPE xaiAvgPool3D_MxN_S8U8_WHD(const xai_pTile3D inTile, + xai_pTile3D outTile, + const xai_cnn_pooling_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiAvgPool3D_MxN_S8S16_WHD(const xai_pTile3D inTile, + xai_pTile3D outTile, + const xai_cnn_pooling_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiAvgPool3D_MxN_U8S8_WHD(const xai_pTile3D inTile, + xai_pTile3D outTile, + const xai_cnn_pooling_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiAvgPool3D_MxN_U8S16_WHD(const xai_pTile3D inTile, + xai_pTile3D outTile, + const xai_cnn_pooling_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiAvgPool3D_MxNj2_S8_WHD(const xai_pTile3D inTile, + xai_pTile3D outTile, + const xai_cnn_pooling_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiAvgPool3D_MxNj2_U8_WHD(const xai_pTile3D inTile, + xai_pTile3D outTile, + const xai_cnn_pooling_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiAvgPool3D_MxNj2_S16_WHD(const xai_pTile3D inTile, + 
xai_pTile3D outTile, + const xai_cnn_pooling_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiAvgPool3D_MxNj2_S8U8_WHD(const xai_pTile3D inTile, + xai_pTile3D outTile, + const xai_cnn_pooling_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiAvgPool3D_MxNj2_S8S16_WHD(const xai_pTile3D inTile, + xai_pTile3D outTile, + const xai_cnn_pooling_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiAvgPool3D_MxNj2_U8S8_WHD(const xai_pTile3D inTile, + xai_pTile3D outTile, + const xai_cnn_pooling_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiAvgPool3D_MxNj2_U8S16_WHD(const xai_pTile3D inTile, + xai_pTile3D outTile, + const xai_cnn_pooling_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiAvgPool3D_MxN_S8_DWH(const xai_pTile3D inTile, + xai_pTile3D outTile, + const xai_cnn_pooling_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiAvgPool3D_MxN_U8_DWH(const xai_pTile3D inTile, + xai_pTile3D outTile, + const xai_cnn_pooling_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiAvgPool3D_MxN_S16_DWH(const xai_pTile3D inTile, + xai_pTile3D outTile, + const xai_cnn_pooling_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiAvgPool3D_MxN_S8U8_DWH(const xai_pTile3D inTile, + xai_pTile3D outTile, + const xai_cnn_pooling_params *param); + + +_XAI_API_ XAI_ERR_TYPE xaiAvgPool3D_MxN_S8S16_DWH(const xai_pTile3D inTile, + xai_pTile3D outTile, + const xai_cnn_pooling_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiAvgPool3D_MxN_U8S16_DWH(const xai_pTile3D inTile, + xai_pTile3D outTile, + const xai_cnn_pooling_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiAvgPool3D_MxN_U8S8_DWH(const xai_pTile3D inTile, + xai_pTile3D outTile, + const xai_cnn_pooling_params *param); + +/* Global Average Pool */ +_XAI_API_ XAI_ERR_TYPE xaiGlobalAvgPool3D(const xai_pTile3D inTile, + xai_pArray bufferArray, + xai_pTile3D outTile, + const xai_cnn_global_pooling_params* param); + +_XAI_API_ XAI_ERR_TYPE xaiGlobalAvgPool3D_S8_WHD(const xai_pTile3D inTile, + xai_pArray bufferArray, + xai_pTile3D outTile, + const xai_cnn_global_pooling_params* param); + 
+_XAI_API_ XAI_ERR_TYPE xaiGlobalAvgPool3D_U8_WHD(const xai_pTile3D inTile, + xai_pArray bufferArray, + xai_pTile3D outTile, + const xai_cnn_global_pooling_params* param); + +_XAI_API_ XAI_ERR_TYPE xaiGlobalAvgPool3D_S8U8_WHD(const xai_pTile3D inTile, + xai_pArray bufferArray, + xai_pTile3D outTile, + const xai_cnn_global_pooling_params* param); + +_XAI_API_ XAI_ERR_TYPE xaiGlobalAvgPool3D_S8S16_WHD(const xai_pTile3D inTile, + xai_pArray bufferArray, + xai_pTile3D outTile, + const xai_cnn_global_pooling_params* param); + +_XAI_API_ XAI_ERR_TYPE xaiGlobalAvgPool3D_U8S8_WHD(const xai_pTile3D inTile, + xai_pArray bufferArray, + xai_pTile3D outTile, + const xai_cnn_global_pooling_params* param); + +_XAI_API_ XAI_ERR_TYPE xaiGlobalAvgPool3D_U8S16_WHD(const xai_pTile3D inTile, + xai_pArray bufferArray, + xai_pTile3D outTile, + const xai_cnn_global_pooling_params* param); + +_XAI_API_ XAI_ERR_TYPE xaiGlobalAvgPool3D_S16_WHD(const xai_pTile3D inTile, + xai_pArray bufferArray, + xai_pTile3D outTile, + const xai_cnn_global_pooling_params* param); + +_XAI_API_ XAI_ERR_TYPE xaiGlobalAvgPool3D_S8_DWH(const xai_pTile3D inTile, + xai_pArray bufferArray, + xai_pTile3D outTile, + const xai_cnn_global_pooling_params* param); + +_XAI_API_ XAI_ERR_TYPE xaiGlobalAvgPool3D_U8_DWH(const xai_pTile3D inTile, + xai_pArray bufferArray, + xai_pTile3D outTile, + const xai_cnn_global_pooling_params* param); + +_XAI_API_ XAI_ERR_TYPE xaiGlobalAvgPool3D_S8U8_DWH(const xai_pTile3D inTile, + xai_pArray bufferArray, + xai_pTile3D outTile, + const xai_cnn_global_pooling_params* param); + +_XAI_API_ XAI_ERR_TYPE xaiGlobalAvgPool3D_S8S16_DWH(const xai_pTile3D inTile, + xai_pArray bufferArray, + xai_pTile3D outTile, + const xai_cnn_global_pooling_params* param); + +_XAI_API_ XAI_ERR_TYPE xaiGlobalAvgPool3D_U8S8_DWH(const xai_pTile3D inTile, + xai_pArray bufferArray, + xai_pTile3D outTile, + const xai_cnn_global_pooling_params* param); + +_XAI_API_ XAI_ERR_TYPE xaiGlobalAvgPool3D_U8S16_DWH(const 
xai_pTile3D inTile, + xai_pArray bufferArray, + xai_pTile3D outTile, + const xai_cnn_global_pooling_params* param); + +_XAI_API_ XAI_ERR_TYPE xaiGlobalAvgPool3D_S16_DWH(const xai_pTile3D inTile, + xai_pArray bufferArray, + xai_pTile3D outTile, + const xai_cnn_global_pooling_params* param); + +/* Average pooling CNNA Variants */ + +_XAI_API_ XAI_ERR_TYPE xaiAvgPoolA3D(const xai_pTile3D inTile, + xai_pArray bufArray, + xai_pTile3D outTile, + const xai_cnn_pooling_params *param, + const xai_size3D frame3DSize); + + +_XAI_API_ XAI_ERR_TYPE xaiAvgPoolA3D_MxN_U8_DWH(const xai_pTile3D inTile, + xai_pTile3D outTile, + const xai_cnn_pooling_params *param, + const xai_size3D frame3DSize); + +_XAI_API_ XAI_ERR_TYPE xaiAvgPoolA3D_MxN_S8_DWH(const xai_pTile3D inTile, + xai_pTile3D outTile, + const xai_cnn_pooling_params *param, + const xai_size3D frame3DSize); + +_XAI_API_ XAI_ERR_TYPE xaiAvgPoolA3D_MxN_S16_DWH(const xai_pTile3D inTile, + xai_pTile3D outTile, + const xai_cnn_pooling_params *param, + const xai_size3D frame3DSize); + +_XAI_API_ XAI_ERR_TYPE xaiAvgPoolA3D_MxN_U8S8_DWH(const xai_pTile3D inTile, + xai_pTile3D outTile, + const xai_cnn_pooling_params *param, + const xai_size3D frame3DSize); + +_XAI_API_ XAI_ERR_TYPE xaiAvgPoolA3D_MxN_U8S16_DWH(const xai_pTile3D inTile, + xai_pTile3D outTile, + const xai_cnn_pooling_params *param, + const xai_size3D frame3DSize); + +_XAI_API_ XAI_ERR_TYPE xaiAvgPoolA3D_MxN_S8U8_DWH(const xai_pTile3D inTile, + xai_pTile3D outTile, + const xai_cnn_pooling_params *param, + const xai_size3D frame3DSize); + +_XAI_API_ XAI_ERR_TYPE xaiAvgPoolA3D_MxN_S8S16_DWH(const xai_pTile3D inTile, + xai_pTile3D outTile, + const xai_cnn_pooling_params *param, + const xai_size3D frame3DSize); + +/*Adaptive Average Pool*/ +_XAI_API_ XAI_ERR_TYPE xaiAdaptiveAvgPool3D_S8_DWH(const xai_pTile3D inTile, + const xai_pArray inTileIndexArray, + xai_pTile3D outTile); + +_XAI_API_ XAI_ERR_TYPE xaiAdaptiveAvgPool3D_IX(const xai_pTile3D inTile, + const xai_pArray 
inTileIndexArray, + xai_pTile3D outTile); + +/*Adaptive MaxPool*/ +_XAI_API_ XAI_ERR_TYPE xaiAdaptiveMaxPool3D_S8_DWH(const xai_pTile3D inTile, + const xai_pArray inTileIndexArray, + xai_pTile3D outTile); + +_XAI_API_ XAI_ERR_TYPE xaiAdaptiveMaxPool3D_IX(const xai_pTile3D inTile, + const xai_pArray inTileIndexArray, + xai_pTile3D outTile); + +/* LRN */ +_XAI_API_ XAI_ERR_TYPE xaiLRNDepth3D(const xai_pTile3D inTile, + const xai_pArray lutArray, + xai_pTile3D outTile, + const xai_cnn_lrn_depth_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiLRNSpatial3D(const xai_pTile3D inTile, + const xai_pArray lutArray, + xai_pTile3D outTile, + const xai_cnn_lrn_spatial_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiLRNDepth3D_S_3_U8S8_WHD(const xai_pTile3D inTile, + const xai_pArray lutArray, + xai_pTile3D outTile, + const xai_cnn_lrn_depth_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiLRNDepth3D_S_5_U8S8_WHD(const xai_pTile3D inTile, + const xai_pArray lutArray, + xai_pTile3D outTile, + const xai_cnn_lrn_depth_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiLRNDepth3D_S_N_U8S8_WHD(const xai_pTile3D inTile, + const xai_pArray lutArray, + xai_pTile3D outTile, + const xai_cnn_lrn_depth_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiLRNDepth3D_S_3_U8S8_DWH(const xai_pTile3D inTile, + const xai_pArray lutArray, + xai_pTile3D outTile, + const xai_cnn_lrn_depth_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiLRNDepth3D_S_5_U8S8_DWH(const xai_pTile3D inTile, + const xai_pArray lutArray, + xai_pTile3D outTile, + const xai_cnn_lrn_depth_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiLRNDepth3D_S_N_U8S8_DWH(const xai_pTile3D inTile, + const xai_pArray lutArray, + xai_pTile3D outTile, + const xai_cnn_lrn_depth_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiLRNSpatial3D_S_3x3_U8S8_WHD(const xai_pTile3D inTile, + const xai_pArray lutArray, + xai_pTile3D outTile, + const xai_cnn_lrn_spatial_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiLRNSpatial3D_S_5x5_U8S8_WHD(const xai_pTile3D inTile, + const xai_pArray 
lutArray, + xai_pTile3D outTile, + const xai_cnn_lrn_spatial_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiLRNSpatial3D_S_MxN_U8S8_WHD(const xai_pTile3D inTile, + const xai_pArray lutArray, + xai_pTile3D outTile, + const xai_cnn_lrn_spatial_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiLRNSpatial3D_S_3x3_U8S8_DWH(const xai_pTile3D inTile, + const xai_pArray lutArray, + xai_pTile3D outTile, + const xai_cnn_lrn_spatial_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiLRNSpatial3D_S_5x5_U8S8_DWH(const xai_pTile3D inTile, + const xai_pArray lutArray, + xai_pTile3D outTile, + const xai_cnn_lrn_spatial_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiLRNSpatial3D_S_MxN_U8S8_DWH(const xai_pTile3D inTile, + const xai_pArray lutArray, + xai_pTile3D outTile, + const xai_cnn_lrn_spatial_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiLRNDepth3D_S_3_U8_WHD(const xai_pTile3D inTile, + const xai_pArray lutArray, + xai_pTile3D outTile, + const xai_cnn_lrn_depth_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiLRNDepth3D_S_5_U8_WHD(const xai_pTile3D inTile, + const xai_pArray lutArray, + xai_pTile3D outTile, + const xai_cnn_lrn_depth_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiLRNDepth3D_S_N_U8_WHD(const xai_pTile3D inTile, + const xai_pArray lutArray, + xai_pTile3D outTile, + const xai_cnn_lrn_depth_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiLRNDepth3D_S_3_U8_DWH(const xai_pTile3D inTile, + const xai_pArray lutArray, + xai_pTile3D outTile, + const xai_cnn_lrn_depth_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiLRNDepth3D_S_5_U8_DWH(const xai_pTile3D inTile, + const xai_pArray lutArray, + xai_pTile3D outTile, + const xai_cnn_lrn_depth_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiLRNDepth3D_S_N_U8_DWH(const xai_pTile3D inTile, + const xai_pArray lutArray, + xai_pTile3D outTile, + const xai_cnn_lrn_depth_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiLRNSpatial3D_S_3x3_U8_WHD(const xai_pTile3D inTile, + const xai_pArray lutArray, + xai_pTile3D outTile, + const xai_cnn_lrn_spatial_params *param); + 
+_XAI_API_ XAI_ERR_TYPE xaiLRNSpatial3D_S_5x5_U8_WHD(const xai_pTile3D inTile, + const xai_pArray lutArray, + xai_pTile3D outTile, + const xai_cnn_lrn_spatial_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiLRNSpatial3D_S_MxN_U8_WHD(const xai_pTile3D inTile, + const xai_pArray lutArray, + xai_pTile3D outTile, + const xai_cnn_lrn_spatial_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiLRNSpatial3D_S_3x3_U8_DWH(const xai_pTile3D inTile, + const xai_pArray lutArray, + xai_pTile3D outTile, + const xai_cnn_lrn_spatial_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiLRNSpatial3D_S_5x5_U8_DWH(const xai_pTile3D inTile, + const xai_pArray lutArray, + xai_pTile3D outTile, + const xai_cnn_lrn_spatial_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiLRNSpatial3D_S_MxN_U8_DWH(const xai_pTile3D inTile, + const xai_pArray lutArray, + xai_pTile3D outTile, + const xai_cnn_lrn_spatial_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiLRNDepth3D_S_3_S8_WHD(const xai_pTile3D inTile, + const xai_pArray lutArray, + xai_pTile3D outTile, + const xai_cnn_lrn_depth_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiLRNDepth3D_S_5_S8_WHD(const xai_pTile3D inTile, + const xai_pArray lutArray, + xai_pTile3D outTile, + const xai_cnn_lrn_depth_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiLRNDepth3D_S_N_S8_WHD(const xai_pTile3D inTile, + const xai_pArray lutArray, + xai_pTile3D outTile, + const xai_cnn_lrn_depth_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiLRNDepth3D_S_3_S8_DWH(const xai_pTile3D inTile, + const xai_pArray lutArray, + xai_pTile3D outTile, + const xai_cnn_lrn_depth_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiLRNDepth3D_S_5_S8_DWH(const xai_pTile3D inTile, + const xai_pArray lutArray, + xai_pTile3D outTile, + const xai_cnn_lrn_depth_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiLRNDepth3D_S_N_S8_DWH(const xai_pTile3D inTile, + const xai_pArray lutArray, + xai_pTile3D outTile, + const xai_cnn_lrn_depth_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiLRNSpatial3D_S_3x3_S8_WHD(const xai_pTile3D inTile, + const 
xai_pArray lutArray, + xai_pTile3D outTile, + const xai_cnn_lrn_spatial_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiLRNSpatial3D_S_5x5_S8_WHD(const xai_pTile3D inTile, + const xai_pArray lutArray, + xai_pTile3D outTile, + const xai_cnn_lrn_spatial_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiLRNSpatial3D_S_MxN_S8_WHD(const xai_pTile3D inTile, + const xai_pArray lutArray, + xai_pTile3D outTile, + const xai_cnn_lrn_spatial_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiLRNSpatial3D_S_3x3_S8_DWH(const xai_pTile3D inTile, + const xai_pArray lutArray, + xai_pTile3D outTile, + const xai_cnn_lrn_spatial_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiLRNSpatial3D_S_5x5_S8_DWH(const xai_pTile3D inTile, + const xai_pArray lutArray, + xai_pTile3D outTile, + const xai_cnn_lrn_spatial_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiLRNSpatial3D_S_MxN_S8_DWH(const xai_pTile3D inTile, + const xai_pArray lutArray, + xai_pTile3D outTile, + const xai_cnn_lrn_spatial_params *param); + +/* LUT APIs */ + +_XAI_API_ XAI_ERR_TYPE xaiLUT3D(const xai_pTile3D inTile, + const xai_pArray lutArray, + xai_pTile3D outTile, + const xai_cnn_lut_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiLUT3D_Oddsym_S8S8(const xai_pTile3D inTile, + const xai_pArray lutArray, + xai_pTile3D outTile, + const xai_cnn_lut_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiLUT3D_Evensym_S8I8(const xai_pTile3D inTile, + const xai_pArray lutArray, + xai_pTile3D outTile, + const xai_cnn_lut_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiLUT3D_Normal_S8I8(const xai_pTile3D inTile, + const xai_pArray lutArray, + xai_pTile3D outTile, + const xai_cnn_lut_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiLUT3D_Oddsym_S8S16(const xai_pTile3D inTile, + const xai_pArray lutArray, + xai_pTile3D outTile, + const xai_cnn_lut_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiLUT3D_Evensym_S8I16(const xai_pTile3D inTile, + const xai_pArray lutArray, + xai_pTile3D outTile, + const xai_cnn_lut_params *params); + +_XAI_API_ XAI_ERR_TYPE 
xaiLUT3D_Normal_S8I16(const xai_pTile3D inTile, + const xai_pArray lutArray, + xai_pTile3D outTile, + const xai_cnn_lut_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiLUT3D_Oddsym_S16S8(const xai_pTile3D inTile, + const xai_pArray lutArray, + xai_pTile3D outTile, + const xai_cnn_lut_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiLUT3D_Evensym_S16I8(const xai_pTile3D inTile, + const xai_pArray lutArray, + xai_pTile3D outTile, + const xai_cnn_lut_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiLUT3D_Normal_S16I8(const xai_pTile3D inTile, + const xai_pArray lutArray, + xai_pTile3D outTile, + const xai_cnn_lut_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiLUT3D_Oddsym_S16S16(const xai_pTile3D inTile, + const xai_pArray lutArray, + xai_pTile3D outTile, + const xai_cnn_lut_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiLUT3D_Evensym_S16I16(const xai_pTile3D inTile, + const xai_pArray lutArray, + xai_pTile3D outTile, + const xai_cnn_lut_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiLUT3D_Normal_S16I16(const xai_pTile3D inTile, + const xai_pArray lutArray, + xai_pTile3D outTile, + const xai_cnn_lut_params *params); + +/* Partial Dual LUT APIs */ + +_XAI_API_ XAI_ERR_TYPE xaiPartialDualLUT3D_S16I16(const xai_pTile3D inTile, + const xai_pArray lut1Array, + const xai_pArray lut2Array, + xai_pTile3D outTile, + const xai_cnn_lut_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiPartialDualLUT3D_Oddsym_S16S16(const xai_pTile3D inTile, + const xai_pArray lut1Array, + const xai_pArray lut2Array, + xai_pTile3D outTile, + const xai_cnn_lut_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiPartialDualLUT3D_Evensym_S16I16(const xai_pTile3D inTile, + const xai_pArray lut1Array, + const xai_pArray lut2Array, + xai_pTile3D outTile, + const xai_cnn_lut_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiPartialDualLUT3D_Normal_S16I16(const xai_pTile3D inTile, + const xai_pArray lut1Array, + const xai_pArray lut2Array, + xai_pTile3D outTile, + const xai_cnn_lut_params *params); + +/* FillTile */ +_XAI_API_ 
XAI_ERR_TYPE xaiFillTile3D(xai_pTile3D dstTile, + const int32_t value, + xai_bool fillEdgeExtension); + +_XAI_API_ XAI_ERR_TYPE xaiFillTile3D_I8(xai_pTile3D dstTile, + const int32_t value, + xai_bool fill_edge_extension); + +_XAI_API_ XAI_ERR_TYPE xaiFillTile3D_I16(xai_pTile3D dstTile, + const int32_t value, + xai_bool fill_edge_extension); + +/* Extend Edge */ +_XAI_API_ XAI_ERR_TYPE xaiExtendEdgesConst3D(xai_pTile3D dstTile, + const int32_t value, + xai_size3D frame3DSize); + +_XAI_API_ XAI_ERR_TYPE xaiExtendEdgesConst3D_I8(xai_pTile3D dstTile, + const int32_t value, + xai_size3D frame3DSize); + +_XAI_API_ XAI_ERR_TYPE xaiExtendEdgesConst3D_I16(xai_pTile3D dstTile, + const int32_t value, + xai_size3D frame3DSize); + +_XAI_API_ XAI_ERR_TYPE xaiExtendEdges3D(xai_pTile3D dstTile, + const xai_pArray pArray, + xai_size3D frame3DSize); + +_XAI_API_ XAI_ERR_TYPE xaiExtendEdges3D_I8(xai_pTile3D dstTile, + const xai_pArray pArray, + xai_size3D frame3DSize); + +_XAI_API_ XAI_ERR_TYPE xaiExtendEdges3D_I16(xai_pTile3D dstTile, + const xai_pArray pArray, + xai_size3D frame3DSize); + +/* Copy Tile */ +_XAI_API_ XAI_ERR_TYPE xaiCopyTile3D(const xai_pTile3D inTile, + xai_pTile3D outTile, + xai_bool copy_edge_extension); + +/* Transpose */ +_XAI_API_ XAI_ERR_TYPE xaiTranspose3D(const xai_pTile3D inTile, + xai_pTile3D outTile); + +_XAI_API_ XAI_ERR_TYPE xaiTranspose3D_I8_WHD_DWH(const xai_pTile3D inTile, + xai_pTile3D outTile); + +_XAI_API_ XAI_ERR_TYPE xaiTranspose3D_I8_WHD_DWH_Depth3(const xai_pTile3D inTile, + xai_pTile3D outTile); + +_XAI_API_ XAI_ERR_TYPE xaiTranspose3D_I8_DWH_WHD(const xai_pTile3D inTile, + xai_pTile3D outTile); + +_XAI_API_ XAI_ERR_TYPE xaiTranspose3D_I8_DWH_WHD_Depth3(const xai_pTile3D inTile, + xai_pTile3D outTile); + +_XAI_API_ XAI_ERR_TYPE xaiTranspose3D_I16_WHD_DWH(const xai_pTile3D inTile, + xai_pTile3D outTile); + +_XAI_API_ XAI_ERR_TYPE xaiTranspose3D_I16_DWH_WHD(const xai_pTile3D inTile, + xai_pTile3D outTile); + +_XAI_API_ XAI_ERR_TYPE 
xaiTranspose3D_I32_WHD_DWH(const xai_pTile3D inTile, + xai_pTile3D outTile); + +_XAI_API_ XAI_ERR_TYPE xaiTranspose3D_I32_DWH_WHD(const xai_pTile3D inTile, + xai_pTile3D outTile); +/* + _XAI_API_ XAI_ERR_TYPE xaiTranspose_I32(const xai_pArray srcArray, + xai_pArray dstArray); + */ +_XAI_API_ XAI_ERR_TYPE xaiTranspose3D2(const xai_pTile3D inTile, + xai_pArray bufArray, + xai_pTile3D outTile); + +_XAI_API_ XAI_ERR_TYPE xaiTranspose3D2_I8_DWH_WHD(const xai_pTile3D inTile, + xai_pArray bufArray, + xai_pTile3D outTile); + +_XAI_API_ XAI_ERR_TYPE xaiTranspose3D2_I16_DWH_WHD(const xai_pTile3D inTile, + xai_pArray bufArray, + xai_pTile3D outTile); + +_XAI_API_ XAI_ERR_TYPE xaiTranspose3D2_I32_DWH_WHD(const xai_pTile3D inTile, + xai_pArray bufArray, + xai_pTile3D outTile); +_XAI_API_ XAI_ERR_TYPE xaiTranspose3D2_I8_WHD_DWH(const xai_pTile3D inTile, + xai_pArray bufArray, + xai_pTile3D outTile); + +_XAI_API_ XAI_ERR_TYPE xaiTranspose3D2_I16_WHD_DWH(const xai_pTile3D inTile, + xai_pArray bufArray, + xai_pTile3D outTile); + +_XAI_API_ XAI_ERR_TYPE xaiTranspose3D2_I32_WHD_DWH(const xai_pTile3D inTile, + xai_pArray bufArray, + xai_pTile3D outTile); + +/* Unsigned to Signed */ +_XAI_API_ XAI_ERR_TYPE xaiUnsignedToSigned3D_U8S8(xai_pTile3D inTile, + xai_pTile3D outTile); + +/* Data Conversions */ +_XAI_API_ XAI_ERR_TYPE xaiDataConversion3D(const xai_pTile3D inTile, + xai_pTile3D outTile, + const uint16_t scale, + const uint8_t shift); + +_XAI_API_ XAI_ERR_TYPE xaiDataConversion3D_S32S8(const xai_pTile3D inTile, + xai_pTile3D outTile, + const uint16_t scale, + const uint8_t shift); + +_XAI_API_ XAI_ERR_TYPE xaiDataConversion3D_S32U8(const xai_pTile3D inTile, + xai_pTile3D outTile, + const uint16_t scale, + const uint8_t shift); + +_XAI_API_ XAI_ERR_TYPE xaiDataConversion3D_S32S16(const xai_pTile3D inTile, + xai_pTile3D outTile, + const uint16_t scale, + const uint8_t shift); + +_XAI_API_ XAI_ERR_TYPE xaiDataConversion3D_S32U16(const xai_pTile3D inTile, + xai_pTile3D outTile, + 
const uint16_t scale, + const uint8_t shift); + +_XAI_API_ XAI_ERR_TYPE xaiDataConversion3D_S8S16(const xai_pTile3D inTile, + xai_pTile3D outTile, + const uint16_t scale, + const uint8_t shift); + +_XAI_API_ XAI_ERR_TYPE xaiDataConversion3D_S8I32(const xai_pTile3D inTile, + xai_pTile3D outTile, + const uint16_t scale, + const uint8_t shift); + +_XAI_API_ XAI_ERR_TYPE xaiDataConversion3D_S16I32(const xai_pTile3D inTile, + xai_pTile3D outTile, + const uint16_t scale, + const uint8_t shift); + +_XAI_API_ XAI_ERR_TYPE xaiDataConversion3D_S16I64(const xai_pTile3D inTile, + xai_pTile3D outTile, + const uint16_t scale, + const uint8_t shift); + +_XAI_API_ XAI_ERR_TYPE xaiDataConversion3D_U8I32(const xai_pTile3D inTile, + xai_pTile3D outTile, + const uint16_t scale, + const uint8_t shift); + +_XAI_API_ XAI_ERR_TYPE xaiDataConversion3D_U16I32(const xai_pTile3D inTile, + xai_pTile3D outTile, + const uint16_t scale, + const uint8_t shift); + +_XAI_API_ XAI_ERR_TYPE xaiDataConversion3D_U16I64(const xai_pTile3D inTile, + xai_pTile3D outTile, + const uint16_t scale, + const uint8_t shift); + +_XAI_API_ XAI_ERR_TYPE xaiDataConversion3D_S8I64(const xai_pTile3D inTile, + xai_pTile3D outTile, + const uint16_t scale, + const uint8_t shift); + +_XAI_API_ XAI_ERR_TYPE xaiDataConversion3D_U8S16(const xai_pTile3D inTile, + xai_pTile3D outTile, + const uint16_t scale, + const uint8_t shift); + +_XAI_API_ XAI_ERR_TYPE xaiDataConversion3D_U8I64(const xai_pTile3D inTile, + xai_pTile3D outTile, + const uint16_t scale, + const uint8_t shift); + +_XAI_API_ XAI_ERR_TYPE xaiDataConversion3D_U8U16(const xai_pTile3D inTile, + xai_pTile3D outTile, + const uint16_t scale, + const uint8_t shift); + +_XAI_API_ XAI_ERR_TYPE xaiDataConversion3D_S16I8(const xai_pTile3D inTile, + xai_pTile3D outTile, + const uint16_t scale, + const uint8_t shift); + +_XAI_API_ XAI_ERR_TYPE xaiDataConversion3D_U16I8(const xai_pTile3D inTile, + xai_pTile3D outTile, + const uint16_t scale, + const uint8_t shift); + +_XAI_API_ 
XAI_ERR_TYPE xaiDataConversion3D_S16(const xai_pTile3D inTile, + xai_pTile3D outTile, + const uint16_t scale, + const uint8_t shift); + +_XAI_API_ XAI_ERR_TYPE xaiDataConversion3D_U16S16(const xai_pTile3D inTile, + xai_pTile3D outTile, + const uint16_t scale, + const uint8_t shift); + +_XAI_API_ XAI_ERR_TYPE xaiDataConversion3D_U8S8(const xai_pTile3D inTile, + xai_pTile3D outTile, + const uint16_t scale, + const uint8_t shift); + +_XAI_API_ XAI_ERR_TYPE xaiDataConversion3D_S16U16(const xai_pTile3D inTile, + xai_pTile3D outTile, + const uint16_t scale, + const uint8_t shift); + +_XAI_API_ XAI_ERR_TYPE xaiDataConversion3D_S8U8(const xai_pTile3D inTile, + xai_pTile3D outTile, + const uint16_t scale, + const uint8_t shift); + +_XAI_API_ XAI_ERR_TYPE xaiDataConversion3D_FLOATIX(const xai_pTile3D inTile, + xai_pTile3D outTile, + const float scale); + +_XAI_API_ XAI_ERR_TYPE xaiDataConversion3D_FLOATS8(const xai_pTile3D inTile, + xai_pTile3D outTile, + const float scale); + +_XAI_API_ XAI_ERR_TYPE xaiDataConversion3D_FLOATS16(const xai_pTile3D inTile, + xai_pTile3D outTile, + const float scale); + +_XAI_API_ XAI_ERR_TYPE xaiDataConversion3D_FLOATU16(const xai_pTile3D inTile, + xai_pTile3D outTile, + const float scale); + +_XAI_API_ XAI_ERR_TYPE xaiDataConversion3D_FLOATU8(const xai_pTile3D inTile, + xai_pTile3D outTile, + const float scale); + +_XAI_API_ XAI_ERR_TYPE xaiDataConversion3D_IXFLOAT(const xai_pTile3D inTile, + xai_pTile3D outTile, + const float scale); + +_XAI_API_ XAI_ERR_TYPE xaiDataConversion3D_S8FLOAT(const xai_pTile3D inTile, + xai_pTile3D outTile, + const float scale); + +_XAI_API_ XAI_ERR_TYPE xaiDataConversion3D_S16FLOAT(const xai_pTile3D inTile, + xai_pTile3D outTile, + const float scale); + +_XAI_API_ XAI_ERR_TYPE xaiDataConversion3D_U16FLOAT(const xai_pTile3D inTile, + xai_pTile3D outTile, + const float scale); + +_XAI_API_ XAI_ERR_TYPE xaiDataConversion3D_U8FLOAT(const xai_pTile3D inTile, + xai_pTile3D outTile, + const float scale); + +/* Data 
Conversions with Asymmetric Quantization */ +_XAI_API_ XAI_ERR_TYPE xaiDataConversion3D_AsymQ(const xai_pTile3D inTile, + xai_pTile3D outTile, + const int16_t zeroPoint, + const uint16_t scale, + const uint8_t shift); + +_XAI_API_ XAI_ERR_TYPE xaiDataConversion3D_AsymQ_S8S8(const xai_pTile3D inTile, + xai_pTile3D outTile, + const int16_t fixUp, + const uint16_t scale, + const uint8_t shift); + +_XAI_API_ XAI_ERR_TYPE xaiDataConversion3D_AsymQ_S8U8(const xai_pTile3D inTile, + xai_pTile3D outTile, + const int16_t fixUp, + const uint16_t scale, + const uint8_t shift); + +_XAI_API_ XAI_ERR_TYPE xaiDataConversion3D_AsymQ_S8S16(const xai_pTile3D inTile, + xai_pTile3D outTile, + const int16_t fixUp, + const uint16_t scale, + const uint8_t shift); + +_XAI_API_ XAI_ERR_TYPE xaiDataConversion3D_AsymQ_S8U16(const xai_pTile3D inTile, + xai_pTile3D outTile, + const int16_t fixUp, + const uint16_t scale, + const uint8_t shift); + +_XAI_API_ XAI_ERR_TYPE xaiDataConversion3D_AsymQ_S8I32(const xai_pTile3D inTile, + xai_pTile3D outTile, + const int16_t zeroIn, + const uint16_t scale, + const uint8_t shift); + +_XAI_API_ XAI_ERR_TYPE xaiDataConversion3D_AsymQ_S8I64(const xai_pTile3D inTile, + xai_pTile3D outTile, + const int16_t zeroIn, + const uint16_t scale, + const uint8_t shift); + +_XAI_API_ XAI_ERR_TYPE xaiDataConversion3D_AsymQ_U8S8(const xai_pTile3D inTile, + xai_pTile3D outTile, + const int16_t zeroOut, + const uint16_t scale, + const uint8_t shift); + +_XAI_API_ XAI_ERR_TYPE xaiDataConversion3D_AsymQ_S16S8(const xai_pTile3D inTile, + xai_pTile3D outTile, + const int16_t zeroOut, + const uint16_t scale, + const uint8_t shift); + +_XAI_API_ XAI_ERR_TYPE xaiDataConversion3D_AsymQ_U16S8(const xai_pTile3D inTile, + xai_pTile3D outTile, + const int16_t zeroOut, + const uint16_t scale, + const uint8_t shift); + +// Temporary prototype definition, to be removed later +_XAI_API_ XAI_ERR_TYPE xaiDataConversion3D_U16AS8(const xai_pTile3D inTile, + xai_pTile3D outTile, + const int16_t 
zeroOut, + const uint16_t scale, + const uint8_t shift); + +_XAI_API_ XAI_ERR_TYPE xaiDataConversion3D_AsymQ_S32S8(const xai_pTile3D inTile, + xai_pTile3D outTile, + const int16_t zeroOut, + const uint16_t scale, + const uint8_t shift); + +_XAI_API_ XAI_ERR_TYPE xaiDataConversion3D_AsymQ_S8FLOAT(const xai_pTile3D inTile, + xai_pTile3D outTile, + const float scale, + const int16_t zeroPoint); + +_XAI_API_ XAI_ERR_TYPE xaiDataConversion3D_AsymQ_FLOATS8(const xai_pTile3D inTile, + xai_pTile3D outTile, + const float scale, + const int16_t zeroPoint); +/* ReOrg */ +_XAI_API_ XAI_ERR_TYPE xaiReOrg3D(const xai_pTile3D inTile, + xai_pTile3D outTile, + const xai_cnn_reorg_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiReOrg3D_I8_WHD(const xai_pTile3D inTile, + xai_pTile3D outTile, + const xai_cnn_reorg_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiReOrg3D_I16_WHD(const xai_pTile3D inTile, + xai_pTile3D outTile, + const xai_cnn_reorg_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiReOrg3D_I8_DWH(const xai_pTile3D inTile, + xai_pTile3D outTile, + const xai_cnn_reorg_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiReOrg3D_I16_DWH(const xai_pTile3D inTile, + xai_pTile3D outTile, + const xai_cnn_reorg_params *params); + +/* ReOrg4D */ +_XAI_API_ XAI_ERR_TYPE xaiReOrg4DBatchSpace(const xai_pTile4D inTile, + xai_pTile4D outTile, + const xai_cnn_reorg4D_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiReOrg4DBatchSpace_I8_WHDN(const xai_pTile4D inTile, + xai_pTile4D outTile, + const xai_cnn_reorg4D_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiReOrg4DBatchSpace_I16_WHDN(const xai_pTile4D inTile, + xai_pTile4D outTile, + const xai_cnn_reorg4D_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiReOrg4DBatchSpace_I8_DWHN(const xai_pTile4D inTile, + xai_pTile4D outTile, + const xai_cnn_reorg4D_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiReOrg4DBatchSpace_I16_DWHN(const xai_pTile4D inTile, + xai_pTile4D outTile, + const xai_cnn_reorg4D_params *params); + +/* ReOrg Caffe*/ +/*_XAI_API_ XAI_ERR_TYPE 
xaiReOrgCaffe3D_I8_DWH(const xai_pTile3D inTile, + xai_pTile3D outTile, + const xai_cnn_reorg_params *params); + _XAI_API_ XAI_ERR_TYPE xaiReOrgCaffe3D_I16_DWH(const xai_pTile3D inTile, + xai_pTile3D outTile, + const xai_cnn_reorg_params *params); + */ +/* Renormalisation */ +_XAI_API_ XAI_ERR_TYPE xaiRenorm3D(const xai_pTile3D inTile, + xai_pTile3D outTile, + const uint16_t renormScale, + const uint8_t renormShift); + +_XAI_API_ XAI_ERR_TYPE xaiRenorm3D_S8(const xai_pTile3D inTile, + xai_pTile3D outTile, + const uint16_t renormScale, + const uint8_t renormShift); + +_XAI_API_ XAI_ERR_TYPE xaiRenorm3D_U8(const xai_pTile3D inTile, + xai_pTile3D outTile, + const uint16_t renormScale, + const uint8_t renormShift); + +_XAI_API_ XAI_ERR_TYPE xaiRenorm3D_S16(const xai_pTile3D inTile, + xai_pTile3D outTile, + const uint16_t renormScale, + const uint8_t renormShift); + +_XAI_API_ XAI_ERR_TYPE xaiRenormVQ3D_S16_WHD(const xai_pTile3D inTile, + const xai_pArray scaleArray, + xai_pTile3D outTile, + const uint8_t renormShift); + + +_XAI_API_ XAI_ERR_TYPE xaiRenormVQ3D_S16_DWH(const xai_pTile3D inTile, + const xai_pArray scaleArray, + xai_pTile3D outTile, + const uint8_t renormShift); +_XAI_API_ XAI_ERR_TYPE xaiRenorm3D_U16(const xai_pTile3D inTile, + xai_pTile3D outTile, + const uint16_t renormScale, + const uint8_t renormShift); + +_XAI_API_ XAI_ERR_TYPE xaiRenormVQ3D_U16_WHD(const xai_pTile3D inTile, + const xai_pArray scaleArray, + xai_pTile3D outTile, + const uint8_t renormShift); + + +_XAI_API_ XAI_ERR_TYPE xaiRenormVQ3D_U16_DWH(const xai_pTile3D inTile, + const xai_pArray scaleArray, + xai_pTile3D outTile, + const uint8_t renormShift); + +_XAI_API_ XAI_ERR_TYPE xaiRenorm3D2(const xai_pTile3D inTile, + xai_pTile3D outTile, + const xai_cnn_renorm_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiRenorm3D2_S8(const xai_pTile3D inTile, + xai_pTile3D outTile, + const xai_cnn_renorm_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiRenorm3D2_U8(const xai_pTile3D inTile, + xai_pTile3D 
outTile, + const xai_cnn_renorm_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiRenorm3D2_S16(const xai_pTile3D inTile, + xai_pTile3D outTile, + const xai_cnn_renorm_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiRenorm3D2_U16(const xai_pTile3D inTile, + xai_pTile3D outTile, + const xai_cnn_renorm_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiRenorm3D2_AsymQ_S8(const xai_pTile3D inTile, + xai_pTile3D outTile, + const xai_cnn_renorm_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiRenorm3D2_AsymQ_U8S8(const xai_pTile3D inTile, + xai_pTile3D outTile, + const xai_cnn_renorm_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiRenorm3D2_AsymQ_S8U8(const xai_pTile3D inTile, + xai_pTile3D outTile, + const xai_cnn_renorm_params *params); +/* Interp Variants */ + +_XAI_API_ XAI_ERR_TYPE xaiInterp3D(const xai_pTile3D inTile, + xai_pTile3D outTile, + const xai_cnn_interp3D_params *params); + + +_XAI_API_ XAI_ERR_TYPE xaiInterp3D_U8_WHD(const xai_pTile3D inTile, + xai_pTile3D outTile, + const xai_cnn_interp3D_params *params); + + +_XAI_API_ XAI_ERR_TYPE xaiInterp3D_S8_WHD(const xai_pTile3D inTile, + xai_pTile3D outTile, + const xai_cnn_interp3D_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiInterp3D_S16_WHD(const xai_pTile3D inTile, + xai_pTile3D outTile, + const xai_cnn_interp3D_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiInterp3D_U8S16_WHD(const xai_pTile3D inTile, + xai_pTile3D outTile, + const xai_cnn_interp3D_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiInterp3D_S8S16_WHD(const xai_pTile3D inTile, + xai_pTile3D outTile, + const xai_cnn_interp3D_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiInterp3D_U8_DWH(const xai_pTile3D inTile, + xai_pTile3D outTile, + const xai_cnn_interp3D_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiInterp3D_S8_DWH(const xai_pTile3D inTile, + xai_pTile3D outTile, + const xai_cnn_interp3D_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiInterp3D_S16_DWH(const xai_pTile3D inTile, + xai_pTile3D outTile, + const xai_cnn_interp3D_params *params); + +_XAI_API_ 
XAI_ERR_TYPE xaiInterp3D_U8S16_DWH(const xai_pTile3D inTile, + xai_pTile3D outTile, + const xai_cnn_interp3D_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiInterp3D_S8S16_DWH(const xai_pTile3D inTile, + xai_pTile3D outTile, + const xai_cnn_interp3D_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiInterp3D_SetTileParams(const xai_size3D *inFrame3DSize, + const xai_size3D *outFrame3DSize, + const xai_cnn_data_order dataOrder, + int32_t half_pixel_flag, + xai_cnn_interp3D_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiResizeNearest3D_SetTileParams(const xai_size3D *inFrame3DSize, + const xai_size3D *outFrame3DSize, + const xai_cnn_data_order dataOrder, + xai_cnn_resize_nearest3D_params *params); + +/* ResizeNearest variants */ + +_XAI_API_ XAI_ERR_TYPE xaiResizeNearest3D(const xai_pTile3D inTile, + xai_pTile3D outTile, + const xai_cnn_resize_nearest3D_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiResizeNearest3D_S8_WHD(const xai_pTile3D inTile, + xai_pTile3D outTile, + const xai_cnn_resize_nearest3D_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiResizeNearest3D_S16_WHD(const xai_pTile3D inTile, + xai_pTile3D outTile, + const xai_cnn_resize_nearest3D_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiResizeNearest3D_U8_WHD(const xai_pTile3D inTile, + xai_pTile3D outTile, + const xai_cnn_resize_nearest3D_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiResizeNearest3D_S8U8_WHD(const xai_pTile3D inTile, + xai_pTile3D outTile, + const xai_cnn_resize_nearest3D_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiResizeNearest3D_S8_DWH(const xai_pTile3D inTile, + xai_pTile3D outTile, + const xai_cnn_resize_nearest3D_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiResizeNearest3D_S16_DWH(const xai_pTile3D inTile, + xai_pTile3D outTile, + const xai_cnn_resize_nearest3D_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiResizeNearest3D_U8_DWH(const xai_pTile3D inTile, + xai_pTile3D outTile, + const xai_cnn_resize_nearest3D_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiResizeNearest3D_S8U8_DWH(const 
xai_pTile3D inTile, + xai_pTile3D outTile, + const xai_cnn_resize_nearest3D_params *params); + +/* RELU */ +_XAI_API_ XAI_ERR_TYPE xaiLeakyRELU(const xai_pTile3D inTile, + xai_pTile3D outTile, + const XAI_Q15 slope); + +_XAI_API_ XAI_ERR_TYPE xaiLeakyRELU_S8(const xai_pTile3D inTile, + xai_pTile3D outTile, + const XAI_Q15 slope); + +_XAI_API_ XAI_ERR_TYPE xaiLeakyRELU_S16(const xai_pTile3D inTile, + xai_pTile3D outTile, + const XAI_Q15 slope); + +_XAI_API_ XAI_ERR_TYPE xaiLeakyRELU_S16S8(const xai_pTile3D inTile, + xai_pTile3D outTile, + const XAI_Q15 slope); + + +_XAI_API_ XAI_ERR_TYPE xaiRELU(const xai_pTile3D inTile, + xai_pTile3D outTile, + const uint8_t minVal, + const uint8_t maxVal); + +_XAI_API_ XAI_ERR_TYPE xaiRELU_U8(const xai_pTile3D inTile, + xai_pTile3D outTile, + const uint8_t minVal, + const uint8_t maxVal); + +_XAI_API_ XAI_ERR_TYPE xaiRELU_S8U8(const xai_pTile3D inTile, + xai_pTile3D outTile, + const uint8_t minVal, + const uint8_t maxVal); + +/* PRELU*/ +_XAI_API_ XAI_ERR_TYPE xaiPRELU3D_S8_WHD(const xai_pTile3D inTile, + const xai_pTile3D slopeArray, + xai_pTile3D outTile, + const uint8_t outputShift); + +_XAI_API_ XAI_ERR_TYPE xaiPRELU3D_S16_WHD(const xai_pTile3D inTile, + const xai_pTile3D slopeArray, + xai_pTile3D outTile, + const uint8_t outputShift); + +_XAI_API_ XAI_ERR_TYPE xaiPRELU3D_S8_DWH(const xai_pTile3D inTile, + const xai_pTile3D slopeArray, + xai_pTile3D outTile, + const uint8_t outputShift); + +_XAI_API_ XAI_ERR_TYPE xaiPRELU3D_S16_DWH(const xai_pTile3D inTile, + const xai_pTile3D slopeArray, + xai_pTile3D outTile, + const uint8_t outputShift); + +_XAI_API_ XAI_ERR_TYPE xaiPRELU3D(const xai_pTile3D inTile, + const xai_pTile3D slopeArray, + xai_pTile3D outTile, + const uint8_t outputShift); + +_XAI_API_ XAI_ERR_TYPE xaiRELUScale(const xai_pTile3D inTile, + xai_pTile3D outTile, + const int16_t scale, + const uint8_t shift, + const int8_t offset, + const uint8_t minVal, + const uint8_t maxVal); + +_XAI_API_ XAI_ERR_TYPE 
xaiRELUScale_S8U8(const xai_pTile3D inTile, + xai_pTile3D outTile, + const int16_t scale, + const uint8_t shift, + const int8_t offset, + const uint8_t minVal, + const uint8_t maxVal); + +_XAI_API_ XAI_ERR_TYPE xaiRELU16(const xai_pTile3D inTile, + xai_pTile3D outTile, + const int32_t minVal, + const int32_t maxVal); + +_XAI_API_ XAI_ERR_TYPE xaiRELU16_S16I16(const xai_pTile3D inTile, + xai_pTile3D outTile, + const int32_t minVal, + const int32_t maxVal); + +_XAI_API_ XAI_ERR_TYPE xaiRELU16_U16I16(const xai_pTile3D inTile, + xai_pTile3D outTile, + const int32_t minVal, + const int32_t maxVal); + +/* Modified Relu for BN + Depthwise Clip operation */ +_XAI_API_ XAI_ERR_TYPE xaiChannelwiseClip(const xai_pTile3D inTile, + const xai_pArray thresholdMax, + const xai_pArray thresholdMin, + xai_pTile3D outTile, + const xai_cnn_relu_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiChannelwiseClip_S8_DWH(const xai_pTile3D inTile, + const xai_pArray thresholdMax, + const xai_pArray thresholdMin, + xai_pTile3D outTile, + const xai_cnn_relu_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiChannelwiseClip_S8_WHD(const xai_pTile3D inTile, + const xai_pArray thresholdMax, + const xai_pArray thresholdMin, + xai_pTile3D outTile, + const xai_cnn_relu_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiChannelwiseClip_S16_DWH(const xai_pTile3D inTile, + const xai_pArray thresholdMax, + const xai_pArray thresholdMin, + xai_pTile3D outTile, + const xai_cnn_relu_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiChannelwiseClip_S16_WHD(const xai_pTile3D inTile, + const xai_pArray thresholdMax, + const xai_pArray thresholdMin, + xai_pTile3D outTile, + const xai_cnn_relu_params *params); + +/* Batchnorm */ + +_XAI_API_ XAI_ERR_TYPE xaiBatchnorm3D_S8_WHD(const xai_pTile3D inTile, + const xai_pArray alphaArray, + const xai_pArray betaArray, + xai_pTile3D outTile, + const xai_cnn_batchnorm_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiBatchnorm3D_U8_WHD(const xai_pTile3D inTile, + const xai_pArray 
alphaArray, + const xai_pArray betaArray, + xai_pTile3D outTile, + const xai_cnn_batchnorm_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiBatchnorm3D_S8S16_WHD(const xai_pTile3D inTile, + const xai_pArray alphaArray, + const xai_pArray betaArray, + xai_pTile3D outTile, + const xai_cnn_batchnorm_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiBatchnorm3D_U8S8_WHD(const xai_pTile3D inTile, + const xai_pArray alphaArray, + const xai_pArray betaArray, + xai_pTile3D outTile, + const xai_cnn_batchnorm_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiBatchnorm3D_U8S16_WHD(const xai_pTile3D inTile, + const xai_pArray alphaArray, + const xai_pArray betaArray, + xai_pTile3D outTile, + const xai_cnn_batchnorm_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiBatchnorm3D_S16_WHD(const xai_pTile3D inTile, + const xai_pArray alphaArray, + const xai_pArray betaArray, + xai_pTile3D outTile, + const xai_cnn_batchnorm_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiBatchnorm3D_S8_DWH(const xai_pTile3D inTile, + const xai_pArray alphaArray, + const xai_pArray betaArray, + xai_pTile3D outTile, + const xai_cnn_batchnorm_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiBatchnorm3D_U8_DWH(const xai_pTile3D inTile, + const xai_pArray alphaArray, + const xai_pArray betaArray, + xai_pTile3D outTile, + const xai_cnn_batchnorm_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiBatchnorm3D_U8S8_DWH(const xai_pTile3D inTile, + const xai_pArray alphaArray, + const xai_pArray betaArray, + xai_pTile3D outTile, + const xai_cnn_batchnorm_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiBatchnorm3D_U8S16_DWH(const xai_pTile3D inTile, + const xai_pArray alphaArray, + const xai_pArray betaArray, + xai_pTile3D outTile, + const xai_cnn_batchnorm_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiBatchnorm3D_S8S16_DWH(const xai_pTile3D inTile, + const xai_pArray alphaArray, + const xai_pArray betaArray, + xai_pTile3D outTile, + const xai_cnn_batchnorm_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiBatchnorm3D_S16_DWH(const 
xai_pTile3D inTile, + const xai_pArray alphaArray, + const xai_pArray betaArray, + xai_pTile3D outTile, + const xai_cnn_batchnorm_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiBatchnorm3D(const xai_pTile3D inTile, + const xai_pArray alphaArray, + const xai_pArray betaArray, + xai_pTile3D outTile, + const xai_cnn_batchnorm_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiBatchnorm3D_S8_Dim2(const xai_pTile3D inTile, + const xai_pArray alphaArray, + const xai_pArray betaArray, + xai_pTile3D outTile, + const xai_cnn_batchnorm_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiBatchnorm3D_S16_Dim2(const xai_pTile3D inTile, + const xai_pArray alphaArray, + const xai_pArray betaArray, + xai_pTile3D outTile, + const xai_cnn_batchnorm_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiBatchnorm3D_S8U8_Dim2(const xai_pTile3D inTile, + const xai_pArray alphaArray, + const xai_pArray betaArray, + xai_pTile3D outTile, + const xai_cnn_batchnorm_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiBatchnorm3D_S8S16_Dim2(const xai_pTile3D inTile, + const xai_pArray alphaArray, + const xai_pArray betaArray, + xai_pTile3D outTile, + const xai_cnn_batchnorm_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiBatchnorm3D_U8_Dim2(const xai_pTile3D inTile, + const xai_pArray alphaArray, + const xai_pArray betaArray, + xai_pTile3D outTile, + const xai_cnn_batchnorm_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiBatchnorm3D_U8S8_Dim2(const xai_pTile3D inTile, + const xai_pArray alphaArray, + const xai_pArray betaArray, + xai_pTile3D outTile, + const xai_cnn_batchnorm_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiBatchnorm3D_U8S16_Dim2(const xai_pTile3D inTile, + const xai_pArray alphaArray, + const xai_pArray betaArray, + xai_pTile3D outTile, + const xai_cnn_batchnorm_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiBatchnorm3D_Dim2(const xai_pTile3D inTile, + const xai_pArray alphaArray, + const xai_pArray betaArray, + xai_pTile3D outTile, + const xai_cnn_batchnorm_params *params); + +/* ArgMax */ +_XAI_API_ 
XAI_ERR_TYPE xaiArgmax_S8(const xai_pTile3D inTile, + xai_pTile3D outTileIdx, + xai_pTile3D outTileVal, + xai_pTile2D extraValCnt, + xai_pArray sortedIdxArr, + xai_pArray sortedValArr, + const uint16_t numLargestVal); +_XAI_API_ XAI_ERR_TYPE xaiArgmin_S8(const xai_pTile3D inTile, + xai_pTile3D outTileIdx, + xai_pTile3D outTileVal, + xai_pTile2D extraValCnt, + xai_pArray sortedIdxArr, + xai_pArray sortedValArr, + const uint16_t numSmallestVal); + +_XAI_API_ XAI_ERR_TYPE xaiArgmax3D_dim1(const xai_pTile3D inTile, + xai_pArray bufArray, + xai_pTile3D outTileIdx, + xai_pTile3D outTileVal, + const uint16_t numLargestVal); +_XAI_API_ XAI_ERR_TYPE xaiArgmin3D_dim1(const xai_pTile3D inTile, + xai_pArray bufArray, + xai_pTile3D outTileIdx, + xai_pTile3D outTileVal, + const uint16_t numSmallestVal); +_XAI_API_ XAI_ERR_TYPE xaiArgmax3D_S8_dim1(const xai_pTile3D inTile, + xai_pArray bufArray, + xai_pTile3D outTileIdx, + xai_pTile3D outTileVal, + const uint16_t numLargestVal); +_XAI_API_ XAI_ERR_TYPE xaiArgmin3D_S8_dim1(const xai_pTile3D inTile, + xai_pArray bufArray, + xai_pTile3D outTileIdx, + xai_pTile3D outTileVal, + const uint16_t numSmallestVal); + +_XAI_API_ XAI_ERR_TYPE xaiArgmax3D_U8_dim1(const xai_pTile3D inTile, + xai_pArray bufArray, + xai_pTile3D outTileIdx, + xai_pTile3D outTileVal, + const uint16_t numLargestVal); +_XAI_API_ XAI_ERR_TYPE xaiArgmin3D_U8_dim1(const xai_pTile3D inTile, + xai_pArray bufArray, + xai_pTile3D outTileIdx, + xai_pTile3D outTileVal, + const uint16_t numSmallestVal); +_XAI_API_ XAI_ERR_TYPE xaiArgmax3D_S16_dim1(const xai_pTile3D inTile, + xai_pArray bufArray, + xai_pTile3D outTileIdx, + xai_pTile3D outTileVal, + const uint16_t numLargestVal); +_XAI_API_ XAI_ERR_TYPE xaiArgmin3D_S16_dim1(const xai_pTile3D inTile, + xai_pArray bufArray, + xai_pTile3D outTileIdx, + xai_pTile3D outTileVal, + const uint16_t numSmallestVal); +_XAI_API_ XAI_ERR_TYPE xaiArgmax3D_U16_dim1(const xai_pTile3D inTile, + xai_pArray bufArray, + xai_pTile3D outTileIdx, + 
xai_pTile3D outTileVal, + const uint16_t numLargestVal); +_XAI_API_ XAI_ERR_TYPE xaiArgmin3D_U16_dim1(const xai_pTile3D inTile, + xai_pArray bufArray, + xai_pTile3D outTileIdx, + xai_pTile3D outTileVal, + const uint16_t numSmallestVal); +_XAI_API_ XAI_ERR_TYPE xaiArgmax3D_dim2(const xai_pTile3D inTile, + xai_pArray bufArray, + xai_pTile3D outTileIdx, + xai_pTile3D outTileVal, + const uint16_t numLargestVal); +_XAI_API_ XAI_ERR_TYPE xaiArgmin3D_dim2(const xai_pTile3D inTile, + xai_pArray bufArray, + xai_pTile3D outTileIdx, + xai_pTile3D outTileVal, + const uint16_t numSmallestVal); +_XAI_API_ XAI_ERR_TYPE xaiArgmax3D_S8_dim2(const xai_pTile3D inTile, + xai_pArray bufArray, + xai_pTile3D outTileIdx, + xai_pTile3D outTileVal, + const uint16_t numLargestVal); +_XAI_API_ XAI_ERR_TYPE xaiArgmin3D_S8_dim2(const xai_pTile3D inTile, + xai_pArray bufArray, + xai_pTile3D outTileIdx, + xai_pTile3D outTileVal, + const uint16_t numSmallestVal); +_XAI_API_ XAI_ERR_TYPE xaiArgmax3D_U8_dim2(const xai_pTile3D inTile, + xai_pArray bufArray, + xai_pTile3D outTileIdx, + xai_pTile3D outTileVal, + const uint16_t numLargestVal); +_XAI_API_ XAI_ERR_TYPE xaiArgmin3D_U8_dim2(const xai_pTile3D inTile, + xai_pArray bufArray, + xai_pTile3D outTileIdx, + xai_pTile3D outTileVal, + const uint16_t numSmallestVal); + +_XAI_API_ XAI_ERR_TYPE xaiArgmax3D_S16_dim2(const xai_pTile3D inTile, + xai_pArray bufArray, + xai_pTile3D outTileIdx, + xai_pTile3D outTileVal, + const uint16_t numLargestVal); +_XAI_API_ XAI_ERR_TYPE xaiArgmin3D_S16_dim2(const xai_pTile3D inTile, + xai_pArray bufArray, + xai_pTile3D outTileIdx, + xai_pTile3D outTileVal, + const uint16_t numSmallestVal); +_XAI_API_ XAI_ERR_TYPE xaiArgmax3D_U16_dim2(const xai_pTile3D inTile, + xai_pArray bufArray, + xai_pTile3D outTileIdx, + xai_pTile3D outTileVal, + const uint16_t numLargestVal); +_XAI_API_ XAI_ERR_TYPE xaiArgmin3D_U16_dim2(const xai_pTile3D inTile, + xai_pArray bufArray, + xai_pTile3D outTileIdx, + xai_pTile3D outTileVal, + const 
uint16_t numSmallestVal); +_XAI_API_ XAI_ERR_TYPE xaiArgmax3D_dim3(const xai_pTile3D inTile, + xai_pArray bufArray, + xai_pTile3D outTileIdx, + xai_pTile3D outTileVal, + const uint16_t numLargestVal); +_XAI_API_ XAI_ERR_TYPE xaiArgmin3D_dim3(const xai_pTile3D inTile, + xai_pArray bufArray, + xai_pTile3D outTileIdx, + xai_pTile3D outTileVal, + const uint16_t numSmallestVal); +_XAI_API_ XAI_ERR_TYPE xaiArgmax3D_S8_dim3(const xai_pTile3D inTile, + xai_pArray bufArray, + xai_pTile3D outTileIdx, + xai_pTile3D outTileVal, + const uint16_t numLargestVal); +_XAI_API_ XAI_ERR_TYPE xaiArgmin3D_S8_dim3(const xai_pTile3D inTile, + xai_pArray bufArray, + xai_pTile3D outTileIdx, + xai_pTile3D outTileVal, + const uint16_t numSmallestVal); +_XAI_API_ XAI_ERR_TYPE xaiArgmax3D_U8_dim3(const xai_pTile3D inTile, + xai_pArray bufArray, + xai_pTile3D outTileIdx, + xai_pTile3D outTileVal, + const uint16_t numLargestVal); +_XAI_API_ XAI_ERR_TYPE xaiArgmin3D_U8_dim3(const xai_pTile3D inTile, + xai_pArray bufArray, + xai_pTile3D outTileIdx, + xai_pTile3D outTileVal, + const uint16_t numSmallestVal); +_XAI_API_ XAI_ERR_TYPE xaiArgmax3D_S16_dim3(const xai_pTile3D inTile, + xai_pArray bufArray, + xai_pTile3D outTileIdx, + xai_pTile3D outTileVal, + const uint16_t numLargestVal); +_XAI_API_ XAI_ERR_TYPE xaiArgmin3D_S16_dim3(const xai_pTile3D inTile, + xai_pArray bufArray, + xai_pTile3D outTileIdx, + xai_pTile3D outTileVal, + const uint16_t numSmallestVal); + +_XAI_API_ XAI_ERR_TYPE xaiArgmax3D_U16_dim3(const xai_pTile3D inTile, + xai_pArray bufArray, + xai_pTile3D outTileIdx, + xai_pTile3D outTileVal, + const uint16_t numLargestVal); +_XAI_API_ XAI_ERR_TYPE xaiArgmin3D_U16_dim3(const xai_pTile3D inTile, + xai_pArray bufArray, + xai_pTile3D outTileIdx, + xai_pTile3D outTileVal, + const uint16_t numSmallestVal); +/*argmax merger variants*/ +_XAI_API_ XAI_ERR_TYPE xaiMergeTopKArgmax3D_S8_dim1(const xai_pTile3D inTileIdx, + const xai_pTile3D inTileVal, + const xai_pArray inPtrOffsetArr, + xai_pArray 
bufArray, + xai_pTile3D outTileIdx, + xai_pTile3D outTileVal, + const uint16_t numLargestVal); +_XAI_API_ XAI_ERR_TYPE xaiMergeTopKArgmax3D_U8_dim1(const xai_pTile3D inTileIdx, + const xai_pTile3D inTileVal, + const xai_pArray inPtrOffsetArr, + xai_pArray bufArray, + xai_pTile3D outTileIdx, + xai_pTile3D outTileVal, + const uint16_t numLargestVal); +_XAI_API_ XAI_ERR_TYPE xaiMergeTopKArgmax3D_S16_dim1(const xai_pTile3D inTileIdx, + const xai_pTile3D inTileVal, + const xai_pArray inPtrOffsetArr, + xai_pArray bufArray, + xai_pTile3D outTileIdx, + xai_pTile3D outTileVal, + const uint16_t numLargestVal); +_XAI_API_ XAI_ERR_TYPE xaiMergeTopKArgmax3D_U16_dim1(const xai_pTile3D inTileIdx, + const xai_pTile3D inTileVal, + const xai_pArray inPtrOffsetArr, + xai_pArray bufArray, + xai_pTile3D outTileIdx, + xai_pTile3D outTileVal, + const uint16_t numLargestVal); +_XAI_API_ XAI_ERR_TYPE xaiMergeTopKArgmax3D_dim1(const xai_pTile3D inTileIdx, + const xai_pTile3D inTileVal, + const xai_pArray inPtrOffsetArr, + xai_pArray bufArray, + xai_pTile3D outTileIdx, + xai_pTile3D outTileVal, + const uint16_t numLargestVal); +/* SoftMax Variants */ + +/* 1D variant */ +_XAI_API_ XAI_ERR_TYPE xaiSoftmax_S16U16(const xai_pArray input, + const xai_pArray lutArray, + xai_pArray output, + const xai_cnn_softmax_params *params); + +/* 3D variant */ + +_XAI_API_ XAI_ERR_TYPE xaiCalcMaxval3D_S8(const xai_pTile3D inTile, + xai_pArray maxValArr, + xai_cnn_maxval_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiCalcMaxval3D_S16(const xai_pTile3D inTile, + xai_pArray maxValArr, + xai_cnn_maxval_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiSoftmax3D_Dim1(const xai_pTile3D input, + const xai_pArray lutArray, xai_pArray buffArray, xai_pTile3D output, + const xai_cnn_softmax_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiSoftmax3D_S8U8_Dim1(const xai_pTile3D inTile, + const xai_pArray lutArray, + xai_pArray buffArray, + xai_pTile3D outTile, + const xai_cnn_softmax_params *params); + +_XAI_API_ XAI_ERR_TYPE 
xaiSoftmax3D_S8AS8_Dim1(const xai_pTile3D inTile, + const xai_pArray lutArray, + xai_pArray buffArray, + xai_pTile3D outTile, + const xai_cnn_softmax_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiSoftmax3D_S8AS8_Dim2(const xai_pTile3D input, + const xai_pArray lutArray, + xai_pArray bufArray, + xai_pTile3D output, + const xai_cnn_softmax_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiSoftmax3D_S8AS8_Dim3(const xai_pTile3D inTile, + const xai_pArray lutArray, + xai_pArray bufArray, + xai_pTile3D outTile, + const xai_cnn_softmax_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiSoftmax3D_S8AS8(const xai_pTile3D inTile, + const xai_pArray lutArray, + xai_pArray buffArray, + xai_pTile3D outTile, + const xai_cnn_softmax_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiSoftmax3D_S16U16_Dim1(const xai_pTile3D inTile, + const xai_pArray lutArray, + xai_pTile3D outTile, + const xai_cnn_softmax_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiSoftmax3D_S16U16_Dim2(const xai_pTile3D inTile, + const xai_pArray lutArray, + xai_pTile3D outTile, + const xai_cnn_softmax_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiSoftmax3D_S8U8_Dim3(const xai_pTile3D inTile, + const xai_pArray lutArray, + xai_pArray bufArray, + xai_pTile3D outTile, + const xai_cnn_softmax_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiSoftmax3D_Dim3(const xai_pTile3D inTile, + const xai_pArray lutArray, + xai_pArray bufArray, + xai_pTile3D outTile, + const xai_cnn_softmax_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiSoftmax3D_S16U16_Dim3(const xai_pTile3D inTile, + const xai_pArray lutArray, + xai_pTile3D outTile, + const xai_cnn_softmax_params *params); + +/* Faster performing implementation of Softmax3D along Dim1*/ +_XAI_API_ XAI_ERR_TYPE xaiSoftmax3D_Dim1_fast(const xai_pTile3D inTile, + const xai_pArray lutArray, + xai_pArray buffArray, + xai_pTile3D outTile, + const xai_cnn_softmax_params *params); + +/* Faster performing implementation of Softmax3D along Dim2*/ +_XAI_API_ XAI_ERR_TYPE xaiSoftmax3D_Dim2_fast(const 
xai_pTile3D inTile, + const xai_pArray lutArray, + xai_pArray buffArray, + xai_pTile3D outTile, + const xai_cnn_softmax_params *params); + +/* Faster performing implementation of Softmax3D along Dim3*/ +_XAI_API_ XAI_ERR_TYPE xaiSoftmax3D_Dim3_fast(const xai_pTile3D inTile, + const xai_pArray lutArray, + xai_pArray buffArray, + xai_pTile3D outTile, + const xai_cnn_softmax_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiSoftmax3D_S8U8_Dim1_fast(const xai_pTile3D inTile, + const xai_pArray lutArray, + xai_pArray buffArray, + xai_pTile3D outTile, + const xai_cnn_softmax_params *params); + +/* Faster performing implementation of Softmax3D along Dim1*/ +_XAI_API_ XAI_ERR_TYPE xaiSoftmax3D_S16U16_Dim1_fast(const xai_pTile3D inTile, + const xai_pArray lutArray, + xai_pTile3D outTile, + const xai_cnn_softmax_params *params); + +/* Faster performing implementation of Softmax3D along Dim2*/ +_XAI_API_ XAI_ERR_TYPE xaiSoftmax3D_S16U16_Dim2_fast(const xai_pTile3D inTile, + const xai_pArray lutArray, + xai_pTile3D outTile, + const xai_cnn_softmax_params *params); + +/* Faster performing implementation of Softmax3D along Dim3*/ +_XAI_API_ XAI_ERR_TYPE xaiSoftmax3D_S16U16_Dim3_fast(const xai_pTile3D inTile, + const xai_pArray lutArray, + xai_pTile3D outTile, + const xai_cnn_softmax_params *params); + +/* Input 3D MxN */ +_XAI_API_ XAI_ERR_TYPE xaiSoftmax3D_Mclasses(const xai_pTile3D input, + const xai_pArray lutArray, + xai_pArray buffArray, + xai_pTile3D output, + const xai_cnn_softmax_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiSoftmax3D_Mclasses_S16U16(const xai_pTile3D input, + const xai_pArray lutArray, + xai_pTile3D output, + const xai_cnn_softmax_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiSoftmax3D_Mclasses_S8U8(const xai_pTile3D input, + const xai_pArray lutArray, + xai_pArray buffArray, + xai_pTile3D output, + const xai_cnn_softmax_params *params); + +/* Input 3D NxM */ +_XAI_API_ XAI_ERR_TYPE xaiSoftmax3D_Ndata(const xai_pTile3D input, + const xai_pArray 
lutArray, + xai_pArray bufArray, + xai_pTile3D output, + const xai_cnn_softmax_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiSoftmax3D_Ndata_S8U8(const xai_pTile3D input, + const xai_pArray lutArray, + xai_pArray bufArray, + xai_pTile3D output, + const xai_cnn_softmax_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiSoftmax3D_gMax_S8U8_Dim2(const xai_pTile3D input, + const xai_pArray lutArray, + xai_pArray bufArray, + xai_pTile3D output, + const xai_cnn_softmax_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiSoftmax3D_S8U8_Dim2(const xai_pTile3D input, + const xai_pArray lutArray, + xai_pArray bufArray, + xai_pTile3D output, + const xai_cnn_softmax_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiSoftmax3D_Dim2(const xai_pTile3D input, + const xai_pArray lutArray, + xai_pArray bufArray, + xai_pTile3D output, + const xai_cnn_softmax_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiSoftmax3D_S8U8_Dim2_fast(const xai_pTile3D input, + const xai_pArray lutArray, + xai_pArray bufArray, + xai_pTile3D output, + const xai_cnn_softmax_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiSoftmax3D_S8U8_Dim3_fast(const xai_pTile3D input, + const xai_pArray lutArray, + xai_pArray bufArray, + xai_pTile3D output, + const xai_cnn_softmax_params *params); + +/* Input 3D NxM */ + +_XAI_API_ XAI_ERR_TYPE xaiSoftmax3D_gMax_Dim1(const xai_pTile3D input, + const xai_pArray lutArray, xai_pArray buffArray, xai_pTile3D output, + const xai_cnn_softmax_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiSoftmax3D_gMax_Dim2(const xai_pTile3D input, + const xai_pArray lutArray, xai_pArray buffArray, xai_pTile3D output, + const xai_cnn_softmax_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiSoftmax3D_gMax_Dim3(const xai_pTile3D input, + const xai_pArray lutArray, xai_pArray buffArray, xai_pTile3D output, + const xai_cnn_softmax_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiSoftmax3D_Ndata_S16U16(const xai_pTile3D input, + const xai_pArray lutArray, + xai_pTile3D output, + const xai_cnn_softmax_params *params); + + 
+_XAI_API_ XAI_ERR_TYPE xaiSoftmax3D_gMax_S8U8_Dim1(const xai_pTile3D inTile, + const xai_pArray lutArray, + xai_pArray bufArray, + xai_pTile3D outTile, + const xai_cnn_softmax_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiSoftmax3D_gMax_S16U16_Dim1(const xai_pTile3D inTile, + const xai_pArray lutArray, + xai_pTile3D outTile, + const xai_cnn_softmax_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiSoftmax3D_gMax_S16U16_Dim2(const xai_pTile3D inTile, + const xai_pArray lutArray, + xai_pTile3D outTile, + const xai_cnn_softmax_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiSoftmax3D_gMax_S8U8_Dim3(const xai_pTile3D inTile, + const xai_pArray lutArray, + xai_pArray buffArray, + xai_pTile3D outTile, + const xai_cnn_softmax_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiSoftmax3D_gMax_S16U16_Dim3(const xai_pTile3D inTile, + const xai_pArray lutArray, + xai_pTile3D outTile, + const xai_cnn_softmax_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiSoftmax3D_gMax_Dim1_fast(const xai_pTile3D input, + const xai_pArray lutArray, xai_pArray buffArray, xai_pTile3D output, + const xai_cnn_softmax_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiSoftmax3D_gMax_Dim2_fast(const xai_pTile3D input, + const xai_pArray lutArray, xai_pArray buffArray, xai_pTile3D output, + const xai_cnn_softmax_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiSoftmax3D_gMax_Dim3_fast(const xai_pTile3D input, + const xai_pArray lutArray, xai_pArray buffArray, xai_pTile3D output, + const xai_cnn_softmax_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiSoftmax3D_gMax_S8U8_Dim1_fast(const xai_pTile3D inTile, + const xai_pArray lutArray, + xai_pArray buffArray, + xai_pTile3D outTile, + const xai_cnn_softmax_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiSoftmax3D_gMax_S16U16_Dim1_fast(const xai_pTile3D inTile, + const xai_pArray lutArray, + xai_pTile3D outTile, + const xai_cnn_softmax_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiSoftmax3D_gMax_S8U8_Dim2_fast(const xai_pTile3D inTile, + const xai_pArray lutArray, + xai_pArray 
buffArray, + xai_pTile3D outTile, + const xai_cnn_softmax_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiSoftmax3D_gMax_S16U16_Dim2_fast(const xai_pTile3D inTile, + const xai_pArray lutArray, + xai_pTile3D outTile, + const xai_cnn_softmax_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiSoftmax3D_gMax_S8U8_Dim3_fast(const xai_pTile3D inTile, + const xai_pArray lutArray, + xai_pArray buffArray, + xai_pTile3D outTile, + const xai_cnn_softmax_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiSoftmax3D_gMax_S16U16_Dim3_fast(const xai_pTile3D inTile, + const xai_pArray lutArray, + xai_pTile3D outTile, + const xai_cnn_softmax_params *params); + +/* Input 3D MxN */ + +_XAI_API_ XAI_ERR_TYPE xaiSoftmax3D_gMax_Mclasses_S16U16(const xai_pTile3D input, + const xai_pArray lutArray, + xai_pTile3D output, + const xai_cnn_softmax_params *params); + +/* Input 3D NxM */ + +_XAI_API_ XAI_ERR_TYPE xaiSoftmax3D_gMax_Ndata_S16U16(const xai_pTile3D input, + const xai_pArray lutArray, + xai_pTile3D output, + const xai_cnn_softmax_params *params); + +/* Softmax 8-bit variant */ +/* 1D variant */ +_XAI_API_ XAI_ERR_TYPE xaiSoftmax_S8U8(const xai_pArray input, + const xai_pArray lutArray, + xai_pArray buffArray, + xai_pArray output, + const xai_cnn_softmax_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiSoftmax(const xai_pArray input, + const xai_pArray lutArray, + xai_pArray buffArray, + xai_pArray output, + const xai_cnn_softmax_params *params); + + +_XAI_API_ XAI_ERR_TYPE xaiMaskedSoftmax3D_S8U8_Dim1(const xai_pTile3D inTile, + const xai_pTile3D maskTile, + const xai_pArray lutArray, + xai_pArray buffArray, + xai_pTile3D outTile, + const xai_cnn_softmax_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiMaskedSoftmax3D_S8AS8_Dim1(const xai_pTile3D inTile, + const xai_pTile3D maskTile, + const xai_pArray lutArray, + xai_pArray buffArray, + xai_pTile3D outTile, + const xai_cnn_softmax_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiMaskedSoftmax3D_S16U16_Dim1(const xai_pTile3D inTile, + const xai_pTile3D 
maskTile, + const xai_pArray lutArray, + xai_pTile3D outTile, + const xai_cnn_softmax_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiMaskedSoftmax3D_S8U8_Dim2(const xai_pTile3D inTile, + const xai_pTile3D maskTile, + const xai_pArray lutArray, + xai_pArray buffArray, + xai_pTile3D outTile, + const xai_cnn_softmax_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiMaskedSoftmax3D_S8AS8_Dim2(const xai_pTile3D inTile, + const xai_pTile3D maskTile, + const xai_pArray lutArray, + xai_pArray buffArray, + xai_pTile3D outTile, + const xai_cnn_softmax_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiMaskedSoftmax3D_S16U16_Dim2(const xai_pTile3D inTile, + const xai_pTile3D maskTile, + const xai_pArray lutArray, + xai_pTile3D outTile, + const xai_cnn_softmax_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiMaskedSoftmax3D_S8U8_Dim3(const xai_pTile3D inTile, + const xai_pTile3D maskTile, + const xai_pArray lutArray, + xai_pArray buffArray, + xai_pTile3D outTile, + const xai_cnn_softmax_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiMaskedSoftmax3D_S8AS8_Dim3(const xai_pTile3D inTile, + const xai_pTile3D maskTile, + const xai_pArray lutArray, + xai_pArray buffArray, + xai_pTile3D outTile, + const xai_cnn_softmax_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiMaskedSoftmax3D_S16U16_Dim3(const xai_pTile3D inTile, + const xai_pTile3D maskTile, + const xai_pArray lutArray, + xai_pTile3D outTile, + const xai_cnn_softmax_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiMaskedSoftmax3D(const xai_pTile3D inTile, + const xai_pTile3D maskTile, + const xai_pArray lutArray, + xai_pArray buffArray, + xai_pTile3D outTile, + const xai_cnn_softmax_params *params); + + +/*Sigmoid3D functions*/ + +_XAI_API_ XAI_ERR_TYPE xaiSigmoid3D(const xai_pTile3D inTile, + const xai_pArray lutArray, + xai_pTile3D outTile, + const int16_t shift, + const int16_t scale); + +_XAI_API_ XAI_ERR_TYPE xaiSigmoid3D_S8U8(const xai_pTile3D inTile, + const xai_pArray lutArray, + xai_pTile3D outTile, + const int16_t shift, + const 
int16_t scale); + +_XAI_API_ XAI_ERR_TYPE xaiSigmoid3D_S8AS8(const xai_pTile3D inTile, + const xai_pArray lutArray, + xai_pTile3D outTile, + const int16_t shift, + const int16_t scale); + +_XAI_API_ XAI_ERR_TYPE xaiSigmoid3D_S8(const xai_pTile3D inTile, + const xai_pArray lutArray, + xai_pTile3D outTile, + const int16_t shift, + const int16_t scale); + +_XAI_API_ XAI_ERR_TYPE xaiSigmoid3D_S16U8(const xai_pTile3D inTile, + const xai_pArray lutArray, + xai_pTile3D outTile, + const int16_t shift, + const int16_t scale); + +_XAI_API_ XAI_ERR_TYPE xaiSigmoid3D_S16S8(const xai_pTile3D inTile, + const xai_pArray lutArray, + xai_pTile3D outTile, + const int16_t shift, + const int16_t scale); + +_XAI_API_ XAI_ERR_TYPE xaiSigmoid3D_S16U16(const xai_pTile3D inTile, + const xai_pArray lutArray, + xai_pTile3D outTile, + const int16_t shift, + const int16_t scale); + +_XAI_API_ XAI_ERR_TYPE xaiSigmoid3D_S16(const xai_pTile3D inTile, + const xai_pArray lutArray, + xai_pTile3D outTile, + const int16_t shift, + const int16_t scale); + +/*Tanh3D hyperbolic functions*/ + + +_XAI_API_ XAI_ERR_TYPE xaiTanh3D(const xai_pTile3D inTile, + const xai_pArray lutArray, + xai_pTile3D outTile, + const int16_t shift, + const int16_t scale); + +_XAI_API_ XAI_ERR_TYPE xaiTanh3D_S8(const xai_pTile3D inTile, + const xai_pArray lutArray, + xai_pTile3D outTile, + const int16_t shift, + const int16_t scale); + +_XAI_API_ XAI_ERR_TYPE xaiTanh3D_S16S8(const xai_pTile3D inTile, + const xai_pArray lutArray, + xai_pTile3D outTile, + const int16_t shift, + const int16_t scale); + +_XAI_API_ XAI_ERR_TYPE xaiTanh3D_S16(const xai_pTile3D inTile, + const xai_pArray lutArray, + xai_pTile3D outTile, + const int16_t shift, + const int16_t scale); + + +/* Eltwise Add */ +_XAI_API_ XAI_ERR_TYPE xaiEltwiseAdd3D(const xai_pTile3D inTile1, + const xai_pTile3D inTile2, + xai_pTile3D outTile, + const xai_cnn_eltwise_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiEltwiseAdd3D_j1_S8I8(const xai_pTile3D inTile1, + const 
xai_pTile3D inTile2, + xai_pTile3D outTile, + const xai_cnn_eltwise_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiEltwiseAdd3D_j1_U8(const xai_pTile3D inTile1, + const xai_pTile3D inTile2, + xai_pTile3D outTile, + const xai_cnn_eltwise_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiEltwiseAdd3D_j1_S8U8I8(const xai_pTile3D inTile1, + const xai_pTile3D inTile2, + xai_pTile3D outTile, + const xai_cnn_eltwise_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiEltwiseAdd3D_j1_S16I16(const xai_pTile3D inTile1, + const xai_pTile3D inTile2, + xai_pTile3D outTile, + const xai_cnn_eltwise_params *param); + +/* Eltwise Add j2 variants */ + +_XAI_API_ XAI_ERR_TYPE xaiEltwiseAdd3D_j2_S8I8_DWH(const xai_pTile3D inTile1, + const xai_pTile3D inTile2, + xai_pTile3D outTile, + const xai_cnn_eltwise_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiEltwiseAdd3D_j2_U8_DWH(const xai_pTile3D inTile1, + const xai_pTile3D inTile2, + xai_pTile3D outTile, + const xai_cnn_eltwise_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiEltwiseAdd3D_j2_S16I16_DWH(const xai_pTile3D inTile1, + const xai_pTile3D inTile2, + xai_pTile3D outTile, + const xai_cnn_eltwise_params *param); + +/* Eltwise Add j1j2 variants */ + +_XAI_API_ XAI_ERR_TYPE xaiEltwiseAdd3D_j1j2_S8I8_DWH(const xai_pTile3D inTile1, + const xai_pTile3D inTile2, + xai_pTile3D outTile, + const xai_cnn_eltwise_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiEltwiseAdd3D_j1j2_U8_DWH(const xai_pTile3D inTile1, + const xai_pTile3D inTile2, + xai_pTile3D outTile, + const xai_cnn_eltwise_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiEltwiseAdd3D_j1j2_S16I16_DWH(const xai_pTile3D inTile1, + const xai_pTile3D inTile2, + xai_pTile3D outTile, + const xai_cnn_eltwise_params *param); + +/* Eltwise Subtraction */ + +_XAI_API_ XAI_ERR_TYPE xaiEltwiseSub3D(const xai_pTile3D inTile1, + const xai_pTile3D inTile2, + xai_pTile3D outTile, + const xai_cnn_eltwise_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiEltwiseSub3D_j1_S8I8(const xai_pTile3D inTile1, + const xai_pTile3D 
inTile2, + xai_pTile3D outTile, + const xai_cnn_eltwise_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiEltwiseSub3D_j1_S16I16(const xai_pTile3D inTile1, + const xai_pTile3D inTile2, + xai_pTile3D outTile, + const xai_cnn_eltwise_params *param); + +/* Eltwise Multiply */ +_XAI_API_ XAI_ERR_TYPE xaiEltwiseMul3D(const xai_pTile3D inTile1, + const xai_pTile3D inTile2, + xai_pTile3D outTile, + const xai_cnn_eltwiseMul_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiEltwiseMul3D_S8(const xai_pTile3D inTile1, + const xai_pTile3D inTile2, + xai_pTile3D outTile, + const xai_cnn_eltwiseMul_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiEltwiseMul3D_S8S16(const xai_pTile3D inTile1, + const xai_pTile3D inTile2, + xai_pTile3D outTile, + const xai_cnn_eltwiseMul_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiEltwiseMul3D_S8U8S8(const xai_pTile3D inTile1, + const xai_pTile3D inTile2, + xai_pTile3D outTile, + const xai_cnn_eltwiseMul_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiEltwiseMul3D_S8U8U8(const xai_pTile3D inTile1, + const xai_pTile3D inTile2, + xai_pTile3D outTile, + const xai_cnn_eltwiseMul_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiEltwiseMul3D_S8U8S16(const xai_pTile3D inTile1, + const xai_pTile3D inTile2, + xai_pTile3D outTile, + const xai_cnn_eltwiseMul_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiEltwiseMul3D_U8I8(const xai_pTile3D inTile1, + const xai_pTile3D inTile2, + xai_pTile3D outTile, + const xai_cnn_eltwiseMul_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiEltwiseMul3D_U8S16(const xai_pTile3D inTile1, + const xai_pTile3D inTile2, + xai_pTile3D outTile, + const xai_cnn_eltwiseMul_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiEltwiseMul3D_S16(const xai_pTile3D inTile1, + const xai_pTile3D inTile2, + xai_pTile3D outTile, + const xai_cnn_eltwiseMul_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiEltwiseMul3D_U16(const xai_pTile3D inTile1, + const xai_pTile3D inTile2, + xai_pTile3D outTile, + const xai_cnn_eltwiseMul_params *param); + +/* Eltwise Exponent */ + +_XAI_API_ 
XAI_ERR_TYPE xaiExp3D(const xai_pTile3D inTile, + const xai_pArray lutArray, + xai_pTile3D outTile, + const xai_cnn_exponent_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiExp3D_S16(const xai_pTile3D inTile, + const xai_pArray lutArray, + xai_pTile3D outTile, + const xai_cnn_exponent_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiExp3D_S16U16(const xai_pTile3D inTile, + const xai_pArray lutArray, + xai_pTile3D outTile, + const xai_cnn_exponent_params *params); + + +/* Maxout */ +_XAI_API_ XAI_ERR_TYPE xaiMaxout3D(const xai_pTile3D inTile, + xai_pTile3D outTile, + const uint16_t kSize); + +_XAI_API_ XAI_ERR_TYPE xaiMaxout3D_S8_WHD(const xai_pTile3D inTile, + xai_pTile3D outTile, + const uint16_t kSize); + +_XAI_API_ XAI_ERR_TYPE xaiMaxout3D_S8_DWH(const xai_pTile3D inTile, + xai_pTile3D outTile, + const uint16_t kSize); + +/* Mean subtraction */ +_XAI_API_ XAI_ERR_TYPE xaiMeanSubtraction3D_U8S8(const xai_pTile3D inTile, + xai_pTile3D outTile, + const uint8_t mean, + const uint16_t scale, + const uint8_t shift); +/* Generate LUT for LRN */ +_XAI_API_ XAI_ERR_TYPE xaiLRNSpatial3D_generateLut(xai_pArray lutArray, + xai_cnn_lrn_spatial_params *params, + float alpha, + float beta, + float kValue, + int32_t maxSumOfSquares, + float qIn, + float qOut); + +_XAI_API_ XAI_ERR_TYPE xaiLRNDepth3D_generateLut(xai_pArray lutArray, + xai_cnn_lrn_depth_params *params, + float alpha, + float beta, + float kValue, + int32_t maxSumOfSquares, + float qIn, + float qOut); + +/* Generate LUT*/ +_XAI_API_ XAI_ERR_TYPE xaiTanh_generateLut(xai_pArray lutArray, + const int32_t inpDataType, + const uint8_t lutQfactor, + const float qIn); + +_XAI_API_ XAI_ERR_TYPE xaiTanh3D_generateLut(const xai_pTile3D inTile, + xai_pArray lutArray, + const uint16_t tanh_cutoff); + +_XAI_API_ XAI_ERR_TYPE xaiSigmoid3D_generateLut(const xai_pTile3D inTile, + xai_pArray lutArray, + const uint16_t sigmoidCutoff); + +_XAI_API_ XAI_ERR_TYPE xaiSigmoid_generateLut(xai_pArray lutArray, + const int32_t inpDataType, + 
const uint8_t lutQfactor, + const float qIn); + +_XAI_API_ XAI_ERR_TYPE xaiSoftmax_generateLut_S16(xai_pArray lutArray, + xai_cnn_softmax_params *params, + const uint16_t qFactorLUT, + const float qIn); + +_XAI_API_ XAI_ERR_TYPE xaiSoftmax_generateLut_S8(xai_pArray lutArray, + xai_cnn_softmax_params *params, + const uint16_t qFactorLUT, + const float qIn + ); + +_XAI_API_ XAI_ERR_TYPE xaiSoftmax_generateLut(xai_pArray lutArray, + const xai_pTile3D input, xai_cnn_softmax_params *params, + const uint16_t qFactorLUT, + const float qIn + ); +_XAI_API_ XAI_ERR_TYPE xaiExp_generateLUT(float inScaleF, int inQPDepth, float outScaleF, + int outQPDepth, xai_pArray tables, + xai_cnn_exponent_params *params, const xai_pArray qXBits, + const xai_pArray qYBits); + +_XAI_API_ XAI_ERR_TYPE xaiStdDevRecip_generateLut(xai_pArray rSqrtTable, + const xai_dataType dataType); + +/* Wrapper Functions */ +_XAI_API_ XAI_ERR_TYPE xaiWrapper3D_TYPE_1(const xai_pTile3D inTile, + xai_pTile3D outTile, + void *function2DPtr); + +_XAI_API_ XAI_ERR_TYPE xaiWrapper3D_TYPE_2(const xai_pTile3D inTile, + xai_pTile3D outTile, + int32_t value, + void *function2DPtr); + +_XAI_API_ XAI_ERR_TYPE xaiWrapper3D_TYPE_3(const xai_pTile3D inTile, + xai_pTile3D outTile, + xai_pArray pArray, + void *function2DPtr); + +_XAI_API_ XAI_ERR_TYPE xaiWrapper3D_TYPE_4(const xai_pTile3D inTile0, + const xai_pTile3D inTile1, + xai_pTile3D outTile, + void *function2DPtr); + +_XAI_API_ XAI_ERR_TYPE xaiWrapper3D_TYPE_5(const xai_pTile3D inTile0, + const xai_pTile3D inTile1, + xai_pTile3D outTile, + int32_t value, + void *function2DPtr); + +_XAI_API_ XAI_ERR_TYPE xaiWrapper3D_TYPE_6(const xai_pTile3D inTile, + xai_pTile3D outTile, + xai_pTile2D tmpTile, + int32_t value, + void *function2DPtr); + +_XAI_API_ XAI_ERR_TYPE xaiWrapper3D_TYPE_7(const xai_pTile3D inTile, + int32_t *counter, + void *function2DPtr); + +_XAI_API_ XAI_ERR_TYPE xaiWrapper3D_TYPE_8(const xai_pTile3D inTile, + int32_t value, + int32_t *counter, + void
*function2DPtr); + +_XAI_API_ XAI_ERR_TYPE xaiWrapper3D_TYPE_9(const xai_pTile3D inTile, + xai_pTile3D outTile, + XAI_Q13_18 xscale, + XAI_Q13_18 yscale, + XAI_Q13_18 xshift, + XAI_Q13_18 yshift, + void *function2DPtr); + + +_XAI_API_ XAI_ERR_TYPE xaiDeConvGetDim4D_WHDN(const xai_pTile4D coeffTile, + xai_pTile4D subCoeffInfo[], + uint16_t *numSubKernels, + const uint8_t strideX, + const uint8_t strideY, + const uint8_t getNumKernelsFlag); + +_XAI_API_ XAI_ERR_TYPE xaiDeConvGetDim3D_WHD(const xai_pTile3D coeffTile, + xai_pTile3D subCoeffInfo[], + uint16_t *numSubKernels, + const uint8_t strideX, + const uint8_t strideY, + const uint8_t getNumKernelsFlag); + +_XAI_API_ XAI_ERR_TYPE xaiDeConvReOrder4D_I8_WHDN(const xai_pTile4D inTile, + xai_pTile4D subCoeffs[], + const xai_cnn_conv_params *param, + const uint8_t transposeCoeffsFlag); + +_XAI_API_ XAI_ERR_TYPE xaiDeConvReOrder3D_I8_WHD(const xai_pTile3D inTile, + xai_pTile3D subCoeffs[], + const xai_cnn_depthwiseDilatedConv_params *param, + const uint8_t transposeCoeffsFlag); + +_XAI_API_ XAI_ERR_TYPE xaiDeConvInterleave3D_I8_WHD(const xai_pTile3D inTile[], + xai_pTile3D outTile, + const xai_cnn_conv_params *convParams); + +_XAI_API_ XAI_ERR_TYPE xaiDepthwiseDeConvInterleave3D_I8_WHD(const xai_pTile3D inTile[], + xai_pTile3D outTile, + const xai_cnn_depthwiseDilatedConv_params *convParams); + +_XAI_API_ XAI_ERR_TYPE xaiDeConvInterleave3D_I16_WHD(const xai_pTile3D inTile[], + xai_pTile3D outTile, + const xai_cnn_conv_params *convParams); + +_XAI_API_ XAI_ERR_TYPE xaiBiasExtend_S32_MOD(const xai_pArray inBiasArray, + xai_pArray outBiasArray); + +_XAI_API_ XAI_ERR_TYPE xaiOutScaleExtend_U16_MOD(const xai_pArray outScaleArray, + xai_pArray extendedOutScaleArray); + +_XAI_API_ XAI_ERR_TYPE xaiDeConvGetDim4D_NDWH(const xai_pTile4D coeffTile, + xai_pTile4D subCoeffInfo[], + xai_pTile4D superCoeffInfo[], + uint16_t *numSubKernels, + uint16_t *numSuperKernels, + const uint8_t strideX, + const uint8_t strideY, + const uint8_t 
getNumKernelsFlag); + +_XAI_API_ XAI_ERR_TYPE xaiDeConvGetDim3D_DWH(const xai_pTile3D coeffTile, + xai_pTile3D subCoeffInfo[], + uint16_t *numSubKernels, + const uint8_t strideX, + const uint8_t strideY, + const uint8_t getNumKernelsFlag); + +_XAI_API_ XAI_ERR_TYPE xaiDeConvReOrder4D_I8_NDWH(const xai_pTile4D inTile, + xai_pTile4D subCoeffs[], + xai_pTile4D superCoeffs[], + const xai_cnn_conv_params *param, + const uint8_t transposeCoeffsFlag); + +_XAI_API_ XAI_ERR_TYPE xaiDeConvReOrder3D_I8_DWH(const xai_pTile3D inTile, + xai_pTile3D subCoeffs[], + const xai_cnn_depthwiseDilatedConv_params *param, + const uint8_t transposeCoeffsFlag); + +/*Permute Functions*/ + +_XAI_API_ XAI_ERR_TYPE xaiPermute4D(const xai_pTile4D inTile, + xai_pTile4D outTile, + const xai_cnn_permute4D_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiPermute4D_I8(const xai_pTile4D inTile, + xai_pTile4D outTile, + const xai_cnn_permute4D_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiPermute4D_I16(const xai_pTile4D inTile, + xai_pTile4D outTile, + const xai_cnn_permute4D_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiPermute4D_I32(const xai_pTile4D inTile, + xai_pTile4D outTile, + const xai_cnn_permute4D_params* params); + +_XAI_API_ XAI_ERR_TYPE xaiPermute4D2(const xai_pTile4D inTile, + xai_pArray bufArray, + xai_pTile4D outTile, + const xai_cnn_permute4D_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiPermute4D2_I8(const xai_pTile4D inTile, + xai_pArray bufArray, + xai_pTile4D outTile, + const xai_cnn_permute4D_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiPermute4D2_I16(const xai_pTile4D inTile, + xai_pArray bufArray, + xai_pTile4D outTile, + const xai_cnn_permute4D_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiPermute4D2_I32(const xai_pTile4D inTile, + xai_pArray bufArray, + xai_pTile4D outTile, + const xai_cnn_permute4D_params *params); + +/*Shuffle variants*/ + +_XAI_API_ XAI_ERR_TYPE xaiShuffle3D(const xai_pTile3D inTile, + xai_pTile3D outTile, + const xai_cnn_shuffle3D_params *shuffParams); + 
+_XAI_API_ XAI_ERR_TYPE xaiShuffle3D_I8_DWH(const xai_pTile3D inTile, + xai_pTile3D outTile, + const xai_cnn_shuffle3D_params *shuffParams); + +_XAI_API_ XAI_ERR_TYPE xaiShuffle3D_I16_DWH(const xai_pTile3D inTile, + xai_pTile3D outTile, + const xai_cnn_shuffle3D_params *shuffParams); + + +_XAI_API_ XAI_ERR_TYPE xaiShuffle3D_I8_WHD(const xai_pTile3D inTile, + xai_pTile3D outTile, + const xai_cnn_shuffle3D_params *shuffParams); + + +_XAI_API_ XAI_ERR_TYPE xaiShuffle3D_I16_WHD(const xai_pTile3D inTile, + xai_pTile3D outTile, + const xai_cnn_shuffle3D_params *shuffParams); + + +/* Calc Normalize Wrapper Function */ +_XAI_API_ XAI_ERR_TYPE xaiCalcNormalizeFactor3D_I8(const xai_pTile3D pInTile, + const xai_pArray rSqrtTable, + const xai_pArray recipTable, + xai_pArray buffArrSoS, + xai_pArray buffNSAShiftArray, + xai_pArray pNormScaleArr, + const xai_cnn_normalize3D_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiCalcNormalizeFactor3D_S16(const xai_pTile3D pInTile, + const xai_pArray rSqrtTable, + xai_pArray buffArrSoS, + xai_pArray buffNSAShiftArray, + xai_pArray pNormScaleArr, + const xai_cnn_normalize3D_params *params); + +/* Calc Normalize Variants*/ +_XAI_API_ XAI_ERR_TYPE xaiCalcNormalizeFactor3D_S8_WHD(const xai_pTile3D pInTile, + const xai_pArray rSqrtTable, + const xai_pArray recipTable, + xai_pArray buffArrSoS, + xai_pArray buffNSAShiftArray, + xai_pArray pNormScaleArr, + const xai_cnn_normalize3D_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiCalcNormalizeFactor3D_U8_WHD(const xai_pTile3D pInTile, + const xai_pArray rSqrtTable, + const xai_pArray recipTable, + xai_pArray buffArrSoS, + xai_pArray buffNSAShiftArray, + xai_pArray pNormScaleArr, + const xai_cnn_normalize3D_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiCalcNormalizeFactor3D_S16_WHD(const xai_pTile3D pInTile, + const xai_pArray rSqrtTable, + xai_pArray buffArrSoS, + xai_pArray buffNSAShiftArray, + xai_pArray pNormScaleArr, + const xai_cnn_normalize3D_params *params); + +_XAI_API_ XAI_ERR_TYPE 
xaiCalcNormalizeFactor3D_S8_DWH(const xai_pTile3D pInTile, + const xai_pArray rSqrtTable, + const xai_pArray recipTable, + xai_pArray buffArrSoS, + xai_pArray buffNSAShiftArray, + xai_pArray pNormScaleArr, + const xai_cnn_normalize3D_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiCalcNormalizeFactor3D_U8_DWH(const xai_pTile3D pInTile, + const xai_pArray rSqrtTable, + const xai_pArray recipTable, + xai_pArray buffArrSoS, + xai_pArray buffNSAShiftArray, + xai_pArray pNormScaleArr, + const xai_cnn_normalize3D_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiCalcNormalizeFactor3D_S16_DWH(const xai_pTile3D pInTile, + const xai_pArray rSqrtTable, + xai_pArray buffArrSoS, + xai_pArray buffNSAShiftArray, + xai_pArray pNormScaleArr, + const xai_cnn_normalize3D_params *params); + +/* Apply Scale Wrapper Function */ +_XAI_API_ XAI_ERR_TYPE xaiApplyScale3D_I8(const xai_pTile3D InTile, + const xai_pArray pNormScaleArr, + const xai_pArray pQuantScaleTable, + const xai_pArray buffNSAShiftArray, + xai_pTile3D pOutTile, + const xai_cnn_normalize3D_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiApplyScale3D_S16(const xai_pTile3D InTile, + const xai_pArray pNormScaleArr, + const xai_pArray pQuantScaleTable, + const xai_pArray buffNSAShiftArray, + xai_pTile3D pOutTile, + const xai_cnn_normalize3D_params *params); + +/* Apply Scale Variants */ +_XAI_API_ XAI_ERR_TYPE xaiApplyScale3D_S8_WHD(const xai_pTile3D inTile, + const xai_pArray pNormScaleArr, + const xai_pArray pQuantScaleTable, + const xai_pArray buffNSAShiftArray, + xai_pTile3D outTile, + const xai_cnn_normalize3D_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiApplyScale3D_U8_WHD(const xai_pTile3D inTile, + const xai_pArray pNormScaleArr, + const xai_pArray pQuantScaleTable, + const xai_pArray buffNSAShiftArray, + xai_pTile3D outTile, + const xai_cnn_normalize3D_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiApplyScale3D_S16_WHD(const xai_pTile3D inTile, + const xai_pArray pNormScaleArr, + const xai_pArray pQuantScaleTable, + const 
xai_pArray buffNSAShiftArray, + xai_pTile3D outTile, + const xai_cnn_normalize3D_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiApplyScale3D_S8_DWH(const xai_pTile3D inTile, + const xai_pArray pNormScaleArr, + const xai_pArray pQuantScaleTable, + const xai_pArray buffNSAShiftArray, + xai_pTile3D outTile, + const xai_cnn_normalize3D_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiApplyScale3D_U8_DWH(const xai_pTile3D inTile, + const xai_pArray pNormScaleArr, + const xai_pArray pQuantScaleTable, + const xai_pArray buffNSAShiftArray, + xai_pTile3D outTile, + const xai_cnn_normalize3D_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiApplyScale3D_S16_DWH(const xai_pTile3D inTile, + const xai_pArray pNormScaleArr, + const xai_pArray pQuantScaleTable, + const xai_pArray buffNSAShiftArray, + xai_pTile3D outTile, + const xai_cnn_normalize3D_params *params); + +/*Generate LUT for normalize variants*/ +_XAI_API_ XAI_ERR_TYPE xaiNormalize3D_generateLut(xai_pArray rSqrtTable, + xai_pArray recipTable, + const xai_cnn_normalize3D_params *params, + const xai_dataType dataType); + +_XAI_API_ XAI_ERR_TYPE xaiNormalize3D_generateLut_S16(xai_pArray rSqrtTable, + const xai_cnn_normalize3D_params *params, + const xai_dataType dataType); + +/* Instance Normalization API ref */ + +/* calcInstanceNorm APIs */ + +_XAI_API_ XAI_ERR_TYPE xaiCalcInstanceNormFactor3D(const xai_pTile3D inTile, + xai_pArray meanArr, + xai_pArray recipArr, + xai_pArray buffArr, + xai_pArray buffArrSoS, + const xai_pArray rSqrtTable, + const xai_cnn_instance_norm_param *params); + +_XAI_API_ XAI_ERR_TYPE xaiCalcInstanceNormFactor3D_S8_WHD(const xai_pTile3D inTile, + xai_pArray meanArr, + xai_pArray recipArr, + xai_pArray buffArr, + xai_pArray buffArrSoS, + const xai_pArray rSqrtTable, + const xai_cnn_instance_norm_param *params); + +_XAI_API_ XAI_ERR_TYPE xaiCalcInstanceNormFactor3D_U8_WHD(const xai_pTile3D inTile, + xai_pArray meanArr, + xai_pArray recipArr, + xai_pArray buffArr, + xai_pArray buffArrSoS, + const 
xai_pArray rSqrtTable, + const xai_cnn_instance_norm_param *params); + +_XAI_API_ XAI_ERR_TYPE xaiCalcInstanceNormFactor3D_S16_WHD(const xai_pTile3D inTile, + xai_pArray meanArr, + xai_pArray recipArr, + xai_pArray buffArr, + xai_pArray buffArrSoS, + const xai_pArray rSqrtTable, + const xai_cnn_instance_norm_param *params); + +_XAI_API_ XAI_ERR_TYPE xaiCalcInstanceNormFactor3D_S8_DWH(const xai_pTile3D inTile, + xai_pArray meanArr, + xai_pArray recipArr, + xai_pArray buffArr, + xai_pArray buffArrSoS, + const xai_pArray rSqrtTable, + const xai_cnn_instance_norm_param *params); + +_XAI_API_ XAI_ERR_TYPE xaiCalcInstanceNormFactor3D_U8_DWH(const xai_pTile3D inTile, + xai_pArray meanArr, + xai_pArray recipArr, + xai_pArray buffArr, + xai_pArray buffArrSoS, + const xai_pArray rSqrtTable, + const xai_cnn_instance_norm_param *params); + +_XAI_API_ XAI_ERR_TYPE xaiCalcInstanceNormFactor3D_S16_DWH(const xai_pTile3D inTile, + xai_pArray meanArr, + xai_pArray recipArr, + xai_pArray buffArr, + xai_pArray buffArrSoS, + const xai_pArray rSqrtTable, + const xai_cnn_instance_norm_param *params); + +_XAI_API_ XAI_ERR_TYPE xaiCalcInstanceNormFactor3D_S8_Dim1(const xai_pTile3D inTile, + xai_pArray meanArr, + xai_pArray recipArr, + xai_pArray buffArr, + xai_pArray buffArrSoS, + const xai_pArray rSqrtTable, + const xai_cnn_instance_norm_param *params); + +_XAI_API_ XAI_ERR_TYPE xaiCalcInstanceNormFactor3D_S8_Dim2(const xai_pTile3D inTile, + xai_pArray meanArr, + xai_pArray recipArr, + xai_pArray buffArr, + xai_pArray buffArrSoS, + const xai_pArray rSqrtTable, + const xai_cnn_instance_norm_param *params); + +_XAI_API_ XAI_ERR_TYPE xaiCalcInstanceNormFactor3D_S8_Dim3(const xai_pTile3D inTile, + xai_pArray meanArr, + xai_pArray recipArr, + xai_pArray buffArr, + xai_pArray buffArrSoS, + const xai_pArray rSqrtTable, + const xai_cnn_instance_norm_param *params); + +_XAI_API_ XAI_ERR_TYPE xaiCalcInstanceNormFactor3D_U8_Dim1(const xai_pTile3D inTile, + xai_pArray meanArr, + xai_pArray recipArr, 
+ xai_pArray buffArr, + xai_pArray buffArrSoS, + const xai_pArray rSqrtTable, + const xai_cnn_instance_norm_param *params); + +_XAI_API_ XAI_ERR_TYPE xaiCalcInstanceNormFactor3D_U8_Dim2(const xai_pTile3D inTile, + xai_pArray meanArr, + xai_pArray recipArr, + xai_pArray buffArr, + xai_pArray buffArrSoS, + const xai_pArray rSqrtTable, + const xai_cnn_instance_norm_param *params); + +_XAI_API_ XAI_ERR_TYPE xaiCalcInstanceNormFactor3D_U8_Dim3(const xai_pTile3D inTile, + xai_pArray meanArr, + xai_pArray recipArr, + xai_pArray buffArr, + xai_pArray buffArrSoS, + const xai_pArray rSqrtTable, + const xai_cnn_instance_norm_param *params); + + +_XAI_API_ XAI_ERR_TYPE xaiCalcInstanceNormFactor3D_S16_Dim1(const xai_pTile3D inTile, + xai_pArray meanArr, + xai_pArray recipArr, + xai_pArray buffArr, + xai_pArray buffArrSoS, + const xai_pArray rSqrtTable, + const xai_cnn_instance_norm_param *params); + +_XAI_API_ XAI_ERR_TYPE xaiCalcInstanceNormFactor3D_S16_Dim2(const xai_pTile3D inTile, + xai_pArray meanArr, + xai_pArray recipArr, + xai_pArray buffArr, + xai_pArray buffArrSoS, + const xai_pArray rSqrtTable, + const xai_cnn_instance_norm_param *params); + +_XAI_API_ XAI_ERR_TYPE xaiCalcInstanceNormFactor3D_S16_Dim3(const xai_pTile3D inTile, + xai_pArray meanArr, + xai_pArray recipArr, + xai_pArray buffArr, + xai_pArray buffArrSoS, + const xai_pArray rSqrtTable, + const xai_cnn_instance_norm_param *params); + +_XAI_API_ XAI_ERR_TYPE xaiCalcInstanceNormFactor3D_Dim1(const xai_pTile3D inTile, + xai_pArray meanArr, + xai_pArray recipArr, + xai_pArray buffArr, + xai_pArray buffArrSoS, + const xai_pArray rSqrtTable, + const xai_cnn_instance_norm_param *params); + +_XAI_API_ XAI_ERR_TYPE xaiCalcInstanceNormFactor3D_Dim2(const xai_pTile3D inTile, + xai_pArray meanArr, + xai_pArray recipArr, + xai_pArray buffArr, + xai_pArray buffArrSoS, + const xai_pArray rSqrtTable, + const xai_cnn_instance_norm_param *params); + +_XAI_API_ XAI_ERR_TYPE xaiCalcInstanceNormFactor3D_Dim3(const xai_pTile3D 
inTile, + xai_pArray meanArr, + xai_pArray recipArr, + xai_pArray buffArr, + xai_pArray buffArrSoS, + const xai_pArray rSqrtTable, + const xai_cnn_instance_norm_param *params); + +/* applyInstanceNorm APIs */ + +_XAI_API_ XAI_ERR_TYPE xaiApplyInstanceNorm3D(const xai_pTile3D inTile, + xai_pArray meanArr, + xai_pArray recipArr, + const xai_pArray alphaArr, + const xai_pArray betaArr, + xai_pTile3D outTile, + const xai_cnn_instance_norm_param *params); + +_XAI_API_ XAI_ERR_TYPE xaiApplyInstanceNorm3D_S8_WHD(const xai_pTile3D inTile, + xai_pArray meanArr, + xai_pArray recipArr, + const xai_pArray alphaArr, + const xai_pArray betaArr, + xai_pTile3D outTile, + const xai_cnn_instance_norm_param *params); + +_XAI_API_ XAI_ERR_TYPE xaiApplyInstanceNorm3D_U8_WHD(const xai_pTile3D inTile, + xai_pArray meanArr, + xai_pArray recipArr, + const xai_pArray alphaArr, + const xai_pArray betaArr, + xai_pTile3D outTile, + const xai_cnn_instance_norm_param *params); + +_XAI_API_ XAI_ERR_TYPE xaiApplyInstanceNorm3D_S8U8_WHD(const xai_pTile3D inTile, + xai_pArray meanArr, + xai_pArray recipArr, + const xai_pArray alphaArr, + const xai_pArray betaArr, + xai_pTile3D outTile, + const xai_cnn_instance_norm_param *params); + +_XAI_API_ XAI_ERR_TYPE xaiApplyInstanceNorm3D_U8S8_WHD(const xai_pTile3D inTile, + xai_pArray meanArr, + xai_pArray recipArr, + const xai_pArray alphaArr, + const xai_pArray betaArr, + xai_pTile3D outTile, + const xai_cnn_instance_norm_param *params); + +_XAI_API_ XAI_ERR_TYPE xaiApplyInstanceNorm3D_S8S16_WHD(const xai_pTile3D inTile, + xai_pArray meanArr, + xai_pArray recipArr, + const xai_pArray alphaArr, + const xai_pArray betaArr, + xai_pTile3D outTile, + const xai_cnn_instance_norm_param *params); + +_XAI_API_ XAI_ERR_TYPE xaiApplyInstanceNorm3D_U8S16_WHD(const xai_pTile3D inTile, + xai_pArray meanArr, + xai_pArray recipArr, + const xai_pArray alphaArr, + const xai_pArray betaArr, + xai_pTile3D outTile, + const xai_cnn_instance_norm_param *params); + +_XAI_API_ 
XAI_ERR_TYPE xaiApplyInstanceNorm3D_S16_WHD(const xai_pTile3D inTile, + xai_pArray meanArr, + xai_pArray recipArr, + const xai_pArray alphaArr, + const xai_pArray betaArr, + xai_pTile3D outTile, + const xai_cnn_instance_norm_param *params); + +_XAI_API_ XAI_ERR_TYPE xaiApplyInstanceNorm3D_S8_DWH(const xai_pTile3D inTile, + xai_pArray meanArr, + xai_pArray recipArr, + const xai_pArray alphaArr, + const xai_pArray betaArr, + xai_pTile3D outTile, + const xai_cnn_instance_norm_param *params); + +_XAI_API_ XAI_ERR_TYPE xaiApplyInstanceNorm3D_U8_DWH(const xai_pTile3D inTile, + xai_pArray meanArr, + xai_pArray recipArr, + const xai_pArray alphaArr, + const xai_pArray betaArr, + xai_pTile3D outTile, + const xai_cnn_instance_norm_param *params); + +_XAI_API_ XAI_ERR_TYPE xaiApplyInstanceNorm3D_S8U8_DWH(const xai_pTile3D inTile, + xai_pArray meanArr, + xai_pArray recipArr, + const xai_pArray alphaArr, + const xai_pArray betaArr, + xai_pTile3D outTile, + const xai_cnn_instance_norm_param *params); + +_XAI_API_ XAI_ERR_TYPE xaiApplyInstanceNorm3D_U8S8_DWH(const xai_pTile3D inTile, + xai_pArray meanArr, + xai_pArray recipArr, + const xai_pArray alphaArr, + const xai_pArray betaArr, + xai_pTile3D outTile, + const xai_cnn_instance_norm_param *params); + +_XAI_API_ XAI_ERR_TYPE xaiApplyInstanceNorm3D_S8S16_DWH(const xai_pTile3D inTile, + xai_pArray meanArr, + xai_pArray recipArr, + const xai_pArray alphaArr, + const xai_pArray betaArr, + xai_pTile3D outTile, + const xai_cnn_instance_norm_param *params); + +_XAI_API_ XAI_ERR_TYPE xaiApplyInstanceNorm3D_U8S16_DWH(const xai_pTile3D inTile, + xai_pArray meanArr, + xai_pArray recipArr, + const xai_pArray alphaArr, + const xai_pArray betaArr, + xai_pTile3D outTile, + const xai_cnn_instance_norm_param *params); + +_XAI_API_ XAI_ERR_TYPE xaiApplyInstanceNorm3D_S16_DWH(const xai_pTile3D inTile, + xai_pArray meanArr, + xai_pArray recipArr, + const xai_pArray alphaArr, + const xai_pArray betaArr, + xai_pTile3D outTile, + const 
xai_cnn_instance_norm_param *params); + +_XAI_API_ XAI_ERR_TYPE xaiApplyInstanceNorm3D_S8_Dim1(const xai_pTile3D inTile, + xai_pArray meanArr, + xai_pArray recipArr, + const xai_pArray alphaArr, + const xai_pArray betaArr, + xai_pTile3D outTile, + const xai_cnn_instance_norm_param * params); + +_XAI_API_ XAI_ERR_TYPE xaiApplyInstanceNorm3D_U8_Dim1(const xai_pTile3D inTile, + xai_pArray meanArr, + xai_pArray recipArr, + const xai_pArray alphaArr, + const xai_pArray betaArr, + xai_pTile3D outTile, + const xai_cnn_instance_norm_param * params); + +_XAI_API_ XAI_ERR_TYPE xaiApplyInstanceNorm3D_S8U8_Dim1(const xai_pTile3D inTile, + xai_pArray meanArr, + xai_pArray recipArr, + const xai_pArray alphaArr, + const xai_pArray betaArr, + xai_pTile3D outTile, + const xai_cnn_instance_norm_param * params); + +_XAI_API_ XAI_ERR_TYPE xaiApplyInstanceNorm3D_S8S16_Dim1(const xai_pTile3D inTile, + xai_pArray meanArr, + xai_pArray recipArr, + const xai_pArray alphaArr, + const xai_pArray betaArr, + xai_pTile3D outTile, + const xai_cnn_instance_norm_param * params); + +_XAI_API_ XAI_ERR_TYPE xaiApplyInstanceNorm3D_U8S8_Dim1(const xai_pTile3D inTile, + xai_pArray meanArr, + xai_pArray recipArr, + const xai_pArray alphaArr, + const xai_pArray betaArr, + xai_pTile3D outTile, + const xai_cnn_instance_norm_param * params); + +_XAI_API_ XAI_ERR_TYPE xaiApplyInstanceNorm3D_U8S16_Dim1(const xai_pTile3D inTile, + xai_pArray meanArr, + xai_pArray recipArr, + const xai_pArray alphaArr, + const xai_pArray betaArr, + xai_pTile3D outTile, + const xai_cnn_instance_norm_param * params); + +_XAI_API_ XAI_ERR_TYPE xaiApplyInstanceNorm3D_S16_Dim1(const xai_pTile3D inTile, + xai_pArray meanArr, + xai_pArray recipArr, + const xai_pArray alphaArr, + const xai_pArray betaArr, + xai_pTile3D outTile, + const xai_cnn_instance_norm_param * params); + +_XAI_API_ XAI_ERR_TYPE xaiApplyInstanceNorm3D_S8_Dim2(const xai_pTile3D inTile, + xai_pArray meanArr, + xai_pArray recipArr, + const xai_pArray alphaArr, + const 
xai_pArray betaArr, + xai_pTile3D outTile, + const xai_cnn_instance_norm_param * params); + +_XAI_API_ XAI_ERR_TYPE xaiApplyInstanceNorm3D_U8_Dim2(const xai_pTile3D inTile, + xai_pArray meanArr, + xai_pArray recipArr, + const xai_pArray alphaArr, + const xai_pArray betaArr, + xai_pTile3D outTile, + const xai_cnn_instance_norm_param * params); + +_XAI_API_ XAI_ERR_TYPE xaiApplyInstanceNorm3D_S8U8_Dim2(const xai_pTile3D inTile, + xai_pArray meanArr, + xai_pArray recipArr, + const xai_pArray alphaArr, + const xai_pArray betaArr, + xai_pTile3D outTile, + const xai_cnn_instance_norm_param * params); + +_XAI_API_ XAI_ERR_TYPE xaiApplyInstanceNorm3D_S8S16_Dim2(const xai_pTile3D inTile, + xai_pArray meanArr, + xai_pArray recipArr, + const xai_pArray alphaArr, + const xai_pArray betaArr, + xai_pTile3D outTile, + const xai_cnn_instance_norm_param * params); + +_XAI_API_ XAI_ERR_TYPE xaiApplyInstanceNorm3D_U8S8_Dim2(const xai_pTile3D inTile, + xai_pArray meanArr, + xai_pArray recipArr, + const xai_pArray alphaArr, + const xai_pArray betaArr, + xai_pTile3D outTile, + const xai_cnn_instance_norm_param * params); + +_XAI_API_ XAI_ERR_TYPE xaiApplyInstanceNorm3D_U8S16_Dim2(const xai_pTile3D inTile, + xai_pArray meanArr, + xai_pArray recipArr, + const xai_pArray alphaArr, + const xai_pArray betaArr, + xai_pTile3D outTile, + const xai_cnn_instance_norm_param * params); + +_XAI_API_ XAI_ERR_TYPE xaiApplyInstanceNorm3D_S16_Dim2(const xai_pTile3D inTile, + xai_pArray meanArr, + xai_pArray recipArr, + const xai_pArray alphaArr, + const xai_pArray betaArr, + xai_pTile3D outTile, + const xai_cnn_instance_norm_param * params); + + +_XAI_API_ XAI_ERR_TYPE xaiApplyInstanceNorm3D_S8_Dim3(const xai_pTile3D inTile, + xai_pArray meanArr, + xai_pArray recipArr, + const xai_pArray alphaArr, + const xai_pArray betaArr, + xai_pTile3D outTile, + const xai_cnn_instance_norm_param * params); + +_XAI_API_ XAI_ERR_TYPE xaiApplyInstanceNorm3D_U8_Dim3(const xai_pTile3D inTile, + xai_pArray meanArr, + 
xai_pArray recipArr, + const xai_pArray alphaArr, + const xai_pArray betaArr, + xai_pTile3D outTile, + const xai_cnn_instance_norm_param * params); + +_XAI_API_ XAI_ERR_TYPE xaiApplyInstanceNorm3D_S8U8_Dim3(const xai_pTile3D inTile, + xai_pArray meanArr, + xai_pArray recipArr, + const xai_pArray alphaArr, + const xai_pArray betaArr, + xai_pTile3D outTile, + const xai_cnn_instance_norm_param * params); + +_XAI_API_ XAI_ERR_TYPE xaiApplyInstanceNorm3D_S8S16_Dim3(const xai_pTile3D inTile, + xai_pArray meanArr, + xai_pArray recipArr, + const xai_pArray alphaArr, + const xai_pArray betaArr, + xai_pTile3D outTile, + const xai_cnn_instance_norm_param * params); + +_XAI_API_ XAI_ERR_TYPE xaiApplyInstanceNorm3D_U8S8_Dim3(const xai_pTile3D inTile, + xai_pArray meanArr, + xai_pArray recipArr, + const xai_pArray alphaArr, + const xai_pArray betaArr, + xai_pTile3D outTile, + const xai_cnn_instance_norm_param * params); + +_XAI_API_ XAI_ERR_TYPE xaiApplyInstanceNorm3D_U8S16_Dim3(const xai_pTile3D inTile, + xai_pArray meanArr, + xai_pArray recipArr, + const xai_pArray alphaArr, + const xai_pArray betaArr, + xai_pTile3D outTile, + const xai_cnn_instance_norm_param * params); + +_XAI_API_ XAI_ERR_TYPE xaiApplyInstanceNorm3D_S16_Dim3(const xai_pTile3D inTile, + xai_pArray meanArr, + xai_pArray recipArr, + const xai_pArray alphaArr, + const xai_pArray betaArr, + xai_pTile3D outTile, + const xai_cnn_instance_norm_param * params); + + +/*Wrapper function for xaiApplyInstanceNorm3D_Dim*/ + +_XAI_API_ XAI_ERR_TYPE xaiApplyInstanceNorm3D_Dim1(const xai_pTile3D inTile, + xai_pArray meanArr, + xai_pArray recipArr, + const xai_pArray alphaArr, + const xai_pArray betaArr, + xai_pTile3D outTile, + const xai_cnn_instance_norm_param * params); + +_XAI_API_ XAI_ERR_TYPE xaiApplyInstanceNorm3D_Dim2(const xai_pTile3D inTile, + xai_pArray meanArr, + xai_pArray recipArr, + const xai_pArray alphaArr, + const xai_pArray betaArr, + xai_pTile3D outTile, + const xai_cnn_instance_norm_param * params); + 
+_XAI_API_ XAI_ERR_TYPE xaiApplyInstanceNorm3D_Dim3(const xai_pTile3D inTile, + xai_pArray meanArr, + xai_pArray recipArr, + const xai_pArray alphaArr, + const xai_pArray betaArr, + xai_pTile3D outTile, + const xai_cnn_instance_norm_param * params); + +/*Channelwise Divide variants*/ +_XAI_API_ XAI_ERR_TYPE xaiDivide3D(const xai_pTile3D inTile, + const xai_pArray channelDivisor, + xai_pTile3D outTile, + const xai_cnn_divide3D_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiDivide3D_S8_WHD(const xai_pTile3D inTile, + const xai_pArray channelDivisor, + xai_pTile3D outTile, + const xai_cnn_divide3D_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiDivide3D_U8_WHD(const xai_pTile3D inTile, + const xai_pArray channelDivisor, + xai_pTile3D outTile, + const xai_cnn_divide3D_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiDivide3D_U8S8_WHD(const xai_pTile3D inTile, + const xai_pArray channelDivisor, + xai_pTile3D outTile, + const xai_cnn_divide3D_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiDivide3D_S16_WHD(const xai_pTile3D inTile, + const xai_pArray channelDivisor, + xai_pTile3D outTile, + const xai_cnn_divide3D_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiDivide3D_S16S8_WHD(const xai_pTile3D inTile, + const xai_pArray channelDivisor, + xai_pTile3D outTile, + const xai_cnn_divide3D_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiDivide3D_S8_DWH(const xai_pTile3D inTile, + const xai_pArray channelDivisor, + xai_pTile3D outTile, + const xai_cnn_divide3D_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiDivide3D_U8_DWH(const xai_pTile3D inTile, + const xai_pArray channelDivisor, + xai_pTile3D outTile, + const xai_cnn_divide3D_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiDivide3D_U8S8_DWH(const xai_pTile3D inTile, + const xai_pArray channelDivisor, + xai_pTile3D outTile, + const xai_cnn_divide3D_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiDivide3D_S16_DWH(const xai_pTile3D inTile, + const xai_pArray channelDivisor, + xai_pTile3D outTile, + const xai_cnn_divide3D_params *params); + 
+_XAI_API_ XAI_ERR_TYPE xaiDivide3D_S16S8_DWH(const xai_pTile3D inTile, + const xai_pArray channelDivisor, + xai_pTile3D outTile, + const xai_cnn_divide3D_params *params); + +/* Crop and Resize variants */ + +_XAI_API_ XAI_ERR_TYPE xaiCropResize3D(const xai_pTile3D inTile, + const xai_pArray ROIinfo, + xai_pTile4D outTile, + const xai_cnn_cropResize3D_params *params); + + +_XAI_API_ XAI_ERR_TYPE xaiCropResize3D_S8_DWH(const xai_pTile3D inTile, + const xai_pArray ROIinfo, + xai_pTile4D outTile, + const xai_cnn_cropResize3D_params *params); + + +_XAI_API_ XAI_ERR_TYPE xaiCropResize3D_U8_DWH(const xai_pTile3D inTile, + const xai_pArray ROIinfo, + xai_pTile4D outTile, + const xai_cnn_cropResize3D_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiCropResize3D_S16_DWH(const xai_pTile3D inTile, + const xai_pArray ROIinfo, + xai_pTile4D outTile, + const xai_cnn_cropResize3D_params *params); + +/* ReduceSum3D variants */ +// ----------------------------------------------------------------------------------- +_XAI_API_ XAI_ERR_TYPE xaiReduceSum3D(const xai_pTile3D inTile, + xai_pArray bufferArray, + xai_pTile3D outTile, + const xai_cnn_reduce_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiReduceSum3D_S8(const xai_pTile3D inTile, + xai_pArray bufferArray, + xai_pTile3D outTile, + const xai_cnn_reduce_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiReduceSum3D_S8U8(const xai_pTile3D inTile, + xai_pArray bufferArray, + xai_pTile3D outTile, + const xai_cnn_reduce_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiReduceSum3D_S8S16(const xai_pTile3D inTile, + xai_pArray bufferArray, + xai_pTile3D outTile, + const xai_cnn_reduce_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiReduceSum3D_U8(const xai_pTile3D inTile, + xai_pArray bufferArray, + xai_pTile3D outTile, + const xai_cnn_reduce_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiReduceSum3D_U8S8(const xai_pTile3D inTile, + xai_pArray bufferArray, + xai_pTile3D outTile, + const xai_cnn_reduce_params *params); + +_XAI_API_ XAI_ERR_TYPE 
xaiReduceSum3D_U8S16(const xai_pTile3D inTile, + xai_pArray bufferArray, + xai_pTile3D outTile, + const xai_cnn_reduce_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiReduceSum3D_S16(const xai_pTile3D inTile, + xai_pArray bufferArray, + xai_pTile3D outTile, + const xai_cnn_reduce_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiReduceSum3D_U16(const xai_pTile3D inTile, + xai_pArray bufferArray, + xai_pTile3D outTile, + const xai_cnn_reduce_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiReduceSum3D_S32(const xai_pTile3D inTile, + xai_pArray bufferArray, + xai_pTile3D outTile, + const xai_cnn_reduce_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiReduceSum3D_U32(const xai_pTile3D inTile, + xai_pArray bufferArray, + xai_pTile3D outTile, + const xai_cnn_reduce_params *params); +// ----------------------------------------------------------------------------------- +/* ReduceSum4D variants */ +// ----------------------------------------------------------------------------------- +#ifndef GLOW_BUILD +_XAI_API_ XAI_ERR_TYPE xaiReduceSum4D(const xai_pTile4D inTile, + xai_pArray bufferArray, + xai_pTile4D outTile, + const xai_cnn_reduce_params *params); +#endif +_XAI_API_ XAI_ERR_TYPE xaiReduceSum4D_S8(const xai_pTile4D inTile, + xai_pArray bufferArray, + xai_pTile4D outTile, + const xai_cnn_reduce_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiReduceSum4D_S8U8(const xai_pTile4D inTile, + xai_pArray bufferArray, + xai_pTile4D outTile, + const xai_cnn_reduce_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiReduceSum4D_S8S16(const xai_pTile4D inTile, + xai_pArray bufferArray, + xai_pTile4D outTile, + const xai_cnn_reduce_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiReduceSum4D_U8(const xai_pTile4D inTile, + xai_pArray bufferArray, + xai_pTile4D outTile, + const xai_cnn_reduce_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiReduceSum4D_U8S8(const xai_pTile4D inTile, + xai_pArray bufferArray, + xai_pTile4D outTile, + const xai_cnn_reduce_params *params); + +_XAI_API_ XAI_ERR_TYPE 
xaiReduceSum4D_U8S16(const xai_pTile4D inTile, + xai_pArray bufferArray, + xai_pTile4D outTile, + const xai_cnn_reduce_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiReduceSum4D_S16(const xai_pTile4D inTile, + xai_pArray bufferArray, + xai_pTile4D outTile, + const xai_cnn_reduce_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiReduceSum4D_U16(const xai_pTile4D inTile, + xai_pArray bufferArray, + xai_pTile4D outTile, + const xai_cnn_reduce_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiReduceSum4D_S32(const xai_pTile4D inTile, + xai_pArray bufferArray, + xai_pTile4D outTile, + const xai_cnn_reduce_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiReduceSum4D_U32(const xai_pTile4D inTile, + xai_pArray bufferArray, + xai_pTile4D outTile, + const xai_cnn_reduce_params *params); +// ----------------------------------------------------------------------------------- +_XAI_API_ XAI_ERR_TYPE xaiReduceMax3D(const xai_pTile3D inTile, + xai_pTile3D outTile, + const xai_cnn_reduce_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiReduceMax3D_U8(const xai_pTile3D inTile, + xai_pTile3D outTile, + const xai_cnn_reduce_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiReduceMax3D_S8(const xai_pTile3D inTile, + xai_pTile3D outTile, + const xai_cnn_reduce_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiReduceMin3D_U8(const xai_pTile3D inTile, + xai_pTile3D outTile, + const xai_cnn_reduce_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiReduceMin3D(const xai_pTile3D inTile, + xai_pTile3D outTile, + const xai_cnn_reduce_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiReduceMin3D_S8(const xai_pTile3D inTile, + xai_pTile3D outTile, + const xai_cnn_reduce_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiReduceMax3D_S16(const xai_pTile3D inTile, + xai_pTile3D outTile, + const xai_cnn_reduce_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiReduceMin3D_S16(const xai_pTile3D inTile, + xai_pTile3D outTile, + const xai_cnn_reduce_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiReduceMax3D_U16(const xai_pTile3D inTile, + 
xai_pTile3D outTile, + const xai_cnn_reduce_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiReduceMin3D_U16(const xai_pTile3D inTile, + xai_pTile3D outTile, + const xai_cnn_reduce_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiReduceMax3D_S32(const xai_pTile3D inTile, + xai_pTile3D outTile, + const xai_cnn_reduce_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiReduceMin3D_S32(const xai_pTile3D inTile, + xai_pTile3D outTile, + const xai_cnn_reduce_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiReduceMax3D_U32(const xai_pTile3D inTile, + xai_pTile3D outTile, + const xai_cnn_reduce_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiReduceMin3D_U32(const xai_pTile3D inTile, + xai_pTile3D outTile, + const xai_cnn_reduce_params *params); +// ----------------------------------------------------------------------------------- +#ifndef GLOW_BUILD +_XAI_API_ XAI_ERR_TYPE xaiReduceMax4D(const xai_pTile4D inTile, + xai_pTile4D outTile, + const xai_cnn_reduce_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiReduceMin4D(const xai_pTile4D inTile, + xai_pTile4D outTile, + const xai_cnn_reduce_params *params); +#endif +_XAI_API_ XAI_ERR_TYPE xaiReduceMax4D_U8(const xai_pTile4D inTile, + xai_pTile4D outTile, + const xai_cnn_reduce_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiReduceMax4D_S8(const xai_pTile4D inTile, + xai_pTile4D outTile, + const xai_cnn_reduce_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiReduceMin4D_U8(const xai_pTile4D inTile, + xai_pTile4D outTile, + const xai_cnn_reduce_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiReduceMin4D_S8(const xai_pTile4D inTile, + xai_pTile4D outTile, + const xai_cnn_reduce_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiReduceMax4D_S16(const xai_pTile4D inTile, + xai_pTile4D outTile, + const xai_cnn_reduce_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiReduceMin4D_S16(const xai_pTile4D inTile, + xai_pTile4D outTile, + const xai_cnn_reduce_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiReduceMax4D_U16(const xai_pTile4D inTile, + xai_pTile4D outTile, + 
const xai_cnn_reduce_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiReduceMin4D_U16(const xai_pTile4D inTile, + xai_pTile4D outTile, + const xai_cnn_reduce_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiReduceMax4D_S32(const xai_pTile4D inTile, + xai_pTile4D outTile, + const xai_cnn_reduce_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiReduceMin4D_S32(const xai_pTile4D inTile, + xai_pTile4D outTile, + const xai_cnn_reduce_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiReduceMax4D_U32(const xai_pTile4D inTile, + xai_pTile4D outTile, + const xai_cnn_reduce_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiReduceMin4D_U32(const xai_pTile4D inTile, + xai_pTile4D outTile, + const xai_cnn_reduce_params *params); +/* ReduceSAD3D variants */ +_XAI_API_ XAI_ERR_TYPE xaiReduceSAD3D(const xai_pTile3D inTile1, + const xai_pTile3D inTile2, + xai_pArray buffArr, + xai_pTile3D outTile, + const xai_cnn_reduce_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiReduceSAD3D_S16UX(const xai_pTile3D inTile1, + const xai_pTile3D inTile2, + xai_pArray buffArr, + xai_pTile3D outTile, + const xai_cnn_reduce_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiReduceSAD3D_S8U16(const xai_pTile3D inTile1, + const xai_pTile3D inTile2, + xai_pArray buffArr, + xai_pTile3D outTile, + const xai_cnn_reduce_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiReduceSAD3D_S8U8(const xai_pTile3D inTile1, + const xai_pTile3D inTile2, + xai_pArray buffArr, + xai_pTile3D outTile, + const xai_cnn_reduce_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiReduceSAD3D_U8U16(const xai_pTile3D inTile1, + const xai_pTile3D inTile2, + xai_pArray buffArr, + xai_pTile3D outTile, + const xai_cnn_reduce_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiReduceSAD3D_U8(const xai_pTile3D inTile1, + const xai_pTile3D inTile2, + xai_pArray buffArr, + xai_pTile3D outTile, + const xai_cnn_reduce_params *params); + +//SVDF function +_XAI_API_ XAI_ERR_TYPE svdf_S8I8(const xai_pTile3D inTile, xai_pTile3D stateTile, + const xai_pTile4D betaTile, const 
xai_pTile4D alphaTile, + xai_pTile3D scratchTile, const xai_pArray biasTile, + xai_pTile3D outTile, xai_cnn_svdf_params *svdfParams, + const xai_pArray outputScaleArray1, + const xai_pArray outputScaleArray2, + const xai_pArray fixUpBiasBuf); + +_XAI_API_ XAI_ERR_TYPE svdf_U8I8(const xai_pTile3D inTile, xai_pTile3D stateTile, + const xai_pTile4D betaTile, const xai_pTile4D alphaTile, + xai_pTile3D scratchTile, const xai_pArray biasTile, + xai_pTile3D outTile, xai_cnn_svdf_params *svdfParams, + const xai_pArray outputScaleArray1, + const xai_pArray outputScaleArray2, + const xai_pArray fixUpBiasBuf); + +_XAI_API_ XAI_ERR_TYPE xaiSvdf_VQ(const xai_pTile3D inTile, xai_pTile3D stateTile, + const xai_pTile4D betaTile, const xai_pTile4D alphaTile, + xai_pTile3D scratchTile, const xai_pArray biasTile, + xai_pTile3D outTile, xai_cnn_svdf_params *svdfParams, + const xai_pArray outputScaleArray1, + const xai_pArray outputScaleArray2, + const xai_pArray fixUpBiasBuf); + +//SVDF function +_XAI_API_ XAI_ERR_TYPE svdfAligned(const xai_pTile3D inTile, xai_pTile3D stateTile, + const xai_pTile4D betaTile, const xai_pTile4D alphaTile, + xai_pTile3D scratchTile, const xai_pArray biasTile, + xai_pTile3D outTile, xai_cnn_svdf_params *svdfParams, + const xai_pArray outputScaleArray1, + const xai_pArray outputScaleArray2, + const xai_pArray fixUpBiasBuf); + +//SVDF function +_XAI_API_ XAI_ERR_TYPE xaiSvdf_S8U8_VQ(const xai_pTile3D inTile, xai_pTile3D stateTile, + const xai_pTile4D betaTile, const xai_pTile4D alphaTile, + xai_pTile3D scratchTile, const xai_pArray biasArray, + xai_pTile3D outTile, xai_cnn_svdf_params *svdfParams, + const xai_pArray outputScaleArray1, + const xai_pArray outputScaleArray2, + const xai_pArray fixUpBiasBuf); + +_XAI_API_ XAI_ERR_TYPE xaiSvdf_U8U8_VQ(const xai_pTile3D inTile, xai_pTile3D stateTile, + const xai_pTile4D betaTile, const xai_pTile4D alphaTile, + xai_pTile3D scratchTile, const xai_pArray biasArray, + xai_pTile3D outTile, xai_cnn_svdf_params 
*svdfParams, + const xai_pArray outputScaleArray1, + const xai_pArray outputScaleArray2, + const xai_pArray fixUpBiasBuf); + +//SVDF function +_XAI_API_ XAI_ERR_TYPE xaiSvdf_AS8AS8_VQ(const xai_pTile3D inTile, xai_pTile3D stateTile, + const xai_pTile4D betaTile, const xai_pTile4D alphaTile, + xai_pTile3D scratchTile, const xai_pArray biasArray, + xai_pTile3D outTile, xai_cnn_svdf_params *svdfParams, + const xai_pArray outputScaleArray1, + const xai_pArray outputScaleArray2, + const xai_pArray fixUpBiasBuf); + +/*************************************************************************************************/ +/* Quantize3D/4D (FP32 to fixed point) is declared in Fixed Point routines declaration */ +/* as it can be used for the non AO or non FP32 support Hardwares via REF code inside the opt */ +/*************************************************************************************************/ + + +_XAI_API_ XAI_ERR_TYPE xaiQuantize3D_F32U8(const xai_pTile3D inTile, + xai_pTile3D outTile, + const xai_cnn_quantDequantA_params *pparams); + +_XAI_API_ XAI_ERR_TYPE xaiQuantize3D_F32S8(const xai_pTile3D inTile, + xai_pTile3D outTile, + const xai_cnn_quantDequantA_params *pparams); + +_XAI_API_ XAI_ERR_TYPE xaiQuantize3D_F32S16(const xai_pTile3D inTile, + xai_pTile3D outTile, + const xai_cnn_quantDequantA_params *pparams); + +_XAI_API_ XAI_ERR_TYPE xaiQuantize4D_F32U8(const xai_pTile4D inTile, + xai_pTile4D outTile, + const xai_cnn_quantDequantA_params *pparams); + +_XAI_API_ XAI_ERR_TYPE xaiQuantize4D_F32S8(const xai_pTile4D inTile, + xai_pTile4D outTile, + const xai_cnn_quantDequantA_params *pparams); + +_XAI_API_ XAI_ERR_TYPE xaiQuantize4D_F32S16(const xai_pTile4D inTile, + xai_pTile4D outTile, + const xai_cnn_quantDequantA_params *pparams); + +/*************************************************************************************************/ +/************************** END of Fixed Point routines declaration ***************************/ + +#if 
(XCHAL_HAVE_VISION_HP_VFPU == 1) +/***************************************************************************************************/ +/****************************** FP16 routines declaration ****************************************/ +/***************************************************************************************************/ + +_XAI_API_ XAI_ERR_TYPE xaiBroadcastAddA3D_F16(const xai_pTile3D inTile1, + const xai_pTile3D inTile2, + xai_pTile3D outTile, + const xai_cnn_eltwise_params * params); + +_XAI_API_ XAI_ERR_TYPE xaiBroadcastSub3D_F16(const xai_pTile3D inTile1, + const xai_pTile3D inTile2, + xai_pTile3D outTile, + const xai_cnn_eltwise_params * params); + +_XAI_API_ XAI_ERR_TYPE xaiBroadcastEltwiseEqualA3D_F16(const xai_pTile3D inTile1, + const xai_pTile3D inTile2, + xai_pTile3D outTile); + +_XAI_API_ XAI_ERR_TYPE xaiBroadcastEltwiseNotEqualA3D_F16(const xai_pTile3D inTile1, + const xai_pTile3D inTile2, + xai_pTile3D outTile); + +_XAI_API_ XAI_ERR_TYPE xaiBroadcastMulA3D_F16(const xai_pTile3D inTile1, + const xai_pTile3D inTile2, + xai_pTile3D outTile, + const xai_cnn_eltwiseMul_params * params); + +_XAI_API_ XAI_ERR_TYPE xaiFullyConnectedA3D_F16(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params * params); + +_XAI_API_ XAI_ERR_TYPE xaiFullyConnectedA3D2_F16(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params * params); + +_XAI_API_ XAI_ERR_TYPE xaiFullyConnectedA3D2_F16_FOLD8(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params * params); + +_XAI_API_ XAI_ERR_TYPE xaiFullyConnectedA3D2_F16_FOLD16(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params * params); + +_XAI_API_ XAI_ERR_TYPE 
xaiFullyConnected3DWithBatching_S_F16(const xai_pTile4D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pArray accArray, + xai_pTile4D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_MxNj1d1_F16_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_1x1j1d1_F16_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_2x2j1d1_F16_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_3x3j1d1_F16_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_S_3x3j2d1_F16_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_MXN_F16Ca2_MOD_DWH(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_1X1_F16Ca2_MOD_DWH(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiConvolved3D_1X1_F16Ca2_MOD_WHD_DWH(const xai_pTile3D inTile, + const xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiPartialConvolved3D_MXN_F16Ca2_MOD_DWH(const xai_pTile3D inTile, + const 
xai_pTile4D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolved2D_S_MxN_F16_MOD_DWH(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_depthwiseDilatedConv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolved2D_S_MxNj1d2_F16_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_depthwiseDilatedConv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiEltwiseLOGA3D_F16(const xai_pTile3D inTile, + xai_pTile3D outTile); + +_XAI_API_ XAI_ERR_TYPE xaiExp3D_F16(const xai_pTile3D inTile, + xai_pTile3D outTile); + +_XAI_API_ XAI_ERR_TYPE xaiReduceMaxA3D_F16(const xai_pTile3D inTile, + xai_pTile3D outTile, + const xai_cnn_reduce_params *params); +_XAI_API_ XAI_ERR_TYPE xaiReduceMinA3D_F16(const xai_pTile3D inTile, + xai_pTile3D outTile, + const xai_cnn_reduce_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiReduceMaxA4D_F16(const xai_pTile4D inTile, + xai_pTile4D outTile, + const xai_cnn_reduce_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiReduceMinA4D_F16(const xai_pTile4D inTile, + xai_pTile4D outTile, + const xai_cnn_reduce_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiReduceSumA3D_F16(const xai_pTile3D inTile, + xai_pTile3D outTile, + const xai_cnn_reduce_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiReduceSumA4D_F16(const xai_pTile4D inTile, + xai_pTile4D outTile, + const xai_cnn_reduce_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiReduceMeanA3D_F16(const xai_pTile3D inTile, + xai_pTile3D outTile, + const xai_cnn_reduce_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiReduceMeanA4D_F16(const xai_pTile4D inTile, + xai_pArray intermediateArray, + xai_pTile4D outTile, + const xai_cnn_reduce_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiReduceProdA3D_F16(const xai_pTile3D inTile, + xai_pArray intermediateArray, + xai_pTile3D 
outTile, + const xai_cnn_reduce_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiReduceProdA4D_F16(const xai_pTile4D inTile, + xai_pArray intermediateArray, + xai_pTile4D outTile, + const xai_cnn_reduce_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiQuantizeA3D_F16U8(const xai_pTile3D inTile, + xai_pTile3D outTile, + const xai_cnn_quantDequantA_params *pparams); + +_XAI_API_ XAI_ERR_TYPE xaiQuantizeA3D_F16S8(const xai_pTile3D inTile, + xai_pTile3D outTile, + const xai_cnn_quantDequantA_params *pparams); + +_XAI_API_ XAI_ERR_TYPE xaiQuantize3D_F16S16(const xai_pTile3D inTile, + xai_pTile3D outTile, + const xai_cnn_quantDequantA_params *pparams); + +_XAI_API_ XAI_ERR_TYPE xaiQuantizeA4D_F16U8(const xai_pTile4D inTile, + xai_pTile4D outTile, + const xai_cnn_quantDequantA_params *pparams); + +_XAI_API_ XAI_ERR_TYPE xaiQuantizeA4D_F16S8(const xai_pTile4D inTile, + xai_pTile4D outTile, + const xai_cnn_quantDequantA_params *pparams); + +_XAI_API_ XAI_ERR_TYPE xaiQuantize4D_F16S16(const xai_pTile4D inTile, + xai_pTile4D outTile, + const xai_cnn_quantDequantA_params *pparams); + +_XAI_API_ XAI_ERR_TYPE xaiDeQuantizeA3D_U8F16(const xai_pTile3D inTile, + xai_pTile3D outTile, + xai_pArray lut, + const xai_cnn_quantDequantA_params *pparams); + +_XAI_API_ XAI_ERR_TYPE xaiDeQuantizeA3D_S8F16(const xai_pTile3D inTile, + xai_pTile3D outTile, + xai_pArray lut, + const xai_cnn_quantDequantA_params *pparams); + +_XAI_API_ XAI_ERR_TYPE xaiDeQuantize3D_S16F16(const xai_pTile3D inTile, + xai_pTile3D outTile, + const xai_cnn_quantDequantA_params *pparams); + +_XAI_API_ XAI_ERR_TYPE xaiDeQuantizeA4D_U8F16(const xai_pTile4D inTile, + xai_pTile4D outTile, + xai_pArray lut, + const xai_cnn_quantDequantA_params *pparams); + +_XAI_API_ XAI_ERR_TYPE xaiDeQuantizeA4D_S8F16(const xai_pTile4D inTile, + xai_pTile4D outTile, + xai_pArray lut, + const xai_cnn_quantDequantA_params *pparams); + +_XAI_API_ XAI_ERR_TYPE xaiDeQuantize4D_S16F16(const xai_pTile4D inTile, + xai_pTile4D outTile, + const 
xai_cnn_quantDequantA_params *pparams); + +_XAI_API_ XAI_ERR_TYPE xaiDeQuantizeAVQ3D_S8F16(const xai_pTile3D inTile, + xai_pTile3D outTile, + xai_pArray outScaleArray, + const xai_cnn_quantDequantA_params *pparams); + +_XAI_API_ XAI_ERR_TYPE xaiDeQuantizeAVQ4D_S8F16(const xai_pTile4D inTile, + xai_pTile4D outTile, + xai_pArray outScaleArray, + const xai_cnn_quantDequantA_params *pparams); + +_XAI_API_ XAI_ERR_TYPE xaiSqrtA3D_F16(const xai_pTile3D inTile, + xai_pTile3D outTile); + +_XAI_API_ XAI_ERR_TYPE xaiRSqrtA3D_F16(const xai_pTile3D inTile, + xai_pTile3D outTile); + +_XAI_API_ XAI_ERR_TYPE xaiEltwisePOWA3D_F16(const xai_pTile3D baseTile, + const xai_pTile3D exponentTile, + xai_pTile3D outTile); + +_XAI_API_ XAI_ERR_TYPE xaiBroadcastEltwisePOWA3D_F16(const xai_pTile3D baseTile, + const xai_pTile3D exponentTile, + xai_pTile3D outTile); + +_XAI_API_ XAI_ERR_TYPE xaiEltwiseFLOORA3D_F16(const xai_pTile3D inTile, + xai_pTile3D outTile); + +_XAI_API_ XAI_ERR_TYPE xaiEltwiseCEILA3D_F16(const xai_pTile3D inTile, + xai_pTile3D outTile); + +_XAI_API_ XAI_ERR_TYPE xaiEltwiseROUNDA3D_F16(const xai_pTile3D inTile, + xai_pTile3D outTile); + +_XAI_API_ XAI_ERR_TYPE xaiDivA3D_F16(const xai_pTile3D numeratorTile, + const xai_pTile3D denominatorTile, + xai_pTile3D outTile, + const xai_cnn_eltwise_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiBroadcastDivA3D_F16(const xai_pTile3D numeratorTile, + const xai_pTile3D denominatorTile, + xai_pTile3D outTile, + const xai_cnn_eltwise_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiSoftMaxA3D_F16(const xai_pTile3D inTile, + xai_pTile3D outTile, + xai_cnn_softmaxA3D_F16_params * params); + +_XAI_API_ XAI_ERR_TYPE xaiSoftMaxA3D_dim1_F16(const xai_pTile3D inTile, + xai_pTile3D outTile, + xai_cnn_softmaxA3D_F16_params * params); + +_XAI_API_ XAI_ERR_TYPE xaiSoftMaxA3D_dim2_F16(const xai_pTile3D inTile, + xai_pTile3D outTile, + xai_cnn_softmaxA3D_F16_params * params); + +_XAI_API_ XAI_ERR_TYPE xaiSoftMaxA3D_dim3_F16(const xai_pTile3D inTile, + 
xai_pTile3D outTile, + xai_cnn_softmaxA3D_F16_params * params); + +_XAI_API_ XAI_ERR_TYPE xaiLogSoftMaxA3D_F16(const xai_pTile3D inTile, + xai_pTile3D outTile, + xai_cnn_softmaxA3D_F16_params * params); + +_XAI_API_ XAI_ERR_TYPE xaiSigmoidA3D_F16(const xai_pTile3D inTile, + xai_pTile3D outTile); + +_XAI_API_ XAI_ERR_TYPE xaiTanh3D_F16(const xai_pTile3D inTile, + xai_pTile3D outTile); + +_XAI_API_ XAI_ERR_TYPE xaiAvgPool3D_3x3_F16_DWH(const xai_pTile3D inTile, + xai_pTile3D outTile, + const xai_cnn_pooling_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiAvgPoolA3D_3x3_F16_DWH(const xai_pTile3D inTile, + xai_pTile3D outTile, + const xai_cnn_pooling_params *param, + const xai_size3D frame3DSize); + +_XAI_API_ XAI_ERR_TYPE xaiAvgPoolA3D_MxN_F16_DWH(const xai_pTile3D inTile, + xai_pTile3D outTile, + const xai_cnn_pooling_params *param, + const xai_size3D frame3DSize); + +_XAI_API_ XAI_ERR_TYPE xaiAvgPool3D_MxN_F16_DWH(const xai_pTile3D inTile, + xai_pTile3D outTile, + const xai_cnn_pooling_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiMaxPool3D_MxN_F16_DWH(const xai_pTile3D inTile, + xai_pTile3D outTile, + const xai_cnn_pooling_params * param); + +_XAI_API_ XAI_ERR_TYPE xaiMaxPoolWithIdx3D_MxN_F16_DWH(const xai_pTile3D inTile, + xai_pTile3D outTile, + xai_pTile3D idxTile, + const xai_cnn_pooling_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiExtendEdges3D_F16(xai_pTile3D dstTile, + const xai_pArray pArray, + xai_size3D frame3DSize); + +_XAI_API_ XAI_ERR_TYPE xaiExtendEdgesConst3D_F16(xai_pTile3D dstTile, + const xb_f16 value, + xai_size3D frame3DSize); + +_XAI_API_ XAI_ERR_TYPE xaiFillTile3D_F16(xai_pTile3D dstTile, + const xb_f16 value, + xai_bool fill_edge_extension); + +/*_XAI_API_ XAI_ERR_TYPE xaiBiasExtend_F16_MOD(const xai_pArray inBiasArray, + xai_pArray outBiasArray);*/ + +/*_XAI_API_ XAI_ERR_TYPE xaiOutScaleExtend_F16_MOD(const xai_pArray outScaleArray, + xai_pArray extendedOutScaleArray);*/ + +/*_XAI_API_ XAI_ERR_TYPE xaiDeConvReOrder4D_F16_NDWH(const xai_pTile4D 
inTile, + xai_pTile4D subCoeffs[], + xai_pTile4D superCoeffs[], + const xai_cnn_conv_params *param, + const uint8_t transposeCoeffsFlag);*/ + +_XAI_API_ XAI_ERR_TYPE xaiResize3D_SetTileParams(const xai_size3D *inFrame3DSize, + const xai_size3D *outFrame3DSize, + const xai_cnn_data_order dataOrder, + int32_t half_pixel_flag, + xai_cnn_resizeA3D_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiInterp3D_F16_SetTileParams(const xai_size3D *inFrame3DSize, + const xai_size3D *outFrame3DSize, + const xai_cnn_data_order dataOrder, + int32_t half_pixel_flag, + xai_cnn_interp3D_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiInterp3D_F16_DWH(const xai_pTile3D inTile, + xai_pTile3D outTile, + const xai_cnn_interp3D_params * pparams); + +_XAI_API_ XAI_ERR_TYPE xaiResizeNearest3D_F16_DWH(const xai_pTile3D inTile, + xai_pTile3D outTile, + const xai_cnn_resize_nearest3D_params * pparams); + +_XAI_API_ XAI_ERR_TYPE xaiResizeNearest3D_F16_WHD(const xai_pTile3D inTile, + xai_pTile3D outTile, + const xai_cnn_resize_nearest3D_params *params); + +/*hardSwish FP16*/ +_XAI_API_ XAI_ERR_TYPE xaiHardSwish_F16(const xai_pTile3D inTile, + xai_pTile3D outTile); +/*ArgMin ArgMax*/ +_XAI_API_ XAI_ERR_TYPE xaiArgmax3D_F16_dim1(const xai_pTile3D inTile, + xai_pArray bufArray, + xai_pTile3D outTileIdx, + xai_pTile3D outTileVal, + const uint16_t numLargestVal); + +_XAI_API_ XAI_ERR_TYPE xaiArgmax3D_F16_dim2(const xai_pTile3D inTile, + xai_pArray bufArray, + xai_pTile3D outTileIdx, + xai_pTile3D outTileVal, + const uint16_t numLargestVal); + +_XAI_API_ XAI_ERR_TYPE xaiArgmax3D_F16_dim3(const xai_pTile3D inTile, + xai_pArray bufArray, + xai_pTile3D outTileIdx, + xai_pTile3D outTileVal, + const uint16_t numLargestVal); + +_XAI_API_ XAI_ERR_TYPE xaiArgmin3D_F16_dim1(const xai_pTile3D inTile, + xai_pArray bufArray, + xai_pTile3D outTileIdx, + xai_pTile3D outTileVal, + const uint16_t numSmallestVal); + +_XAI_API_ XAI_ERR_TYPE xaiArgmin3D_F16_dim2(const xai_pTile3D inTile, + xai_pArray bufArray, + xai_pTile3D 
outTileIdx, + xai_pTile3D outTileVal, + const uint16_t numSmallestVal); + +_XAI_API_ XAI_ERR_TYPE xaiArgmin3D_F16_dim3(const xai_pTile3D inTile, + xai_pArray bufArray, + xai_pTile3D outTileIdx, + xai_pTile3D outTileVal, + const uint16_t numSmallestVal); + +_XAI_API_ XAI_ERR_TYPE xaiMergeTopKArgmax3D_F16_dim1(const xai_pTile3D inTileIdx, + const xai_pTile3D inTileVal, + const xai_pArray inPtrOffsetArr, + xai_pArray bufArray, + xai_pTile3D outTileIdx, + xai_pTile3D outTileVal, + const uint16_t numLargestVal); + +_XAI_API_ XAI_ERR_TYPE xaiMergeTopKArgmin3D_F16_dim1(const xai_pTile3D inTileIdx, + const xai_pTile3D inTileVal, + const xai_pArray inPtrOffsetArr, + xai_pArray bufArray, + xai_pTile3D outTileIdx, + xai_pTile3D outTileVal, + const uint16_t numSmallestVal); + +/*prelu FP16*/ +_XAI_API_ XAI_ERR_TYPE xaiPRELU3D_F16(const xai_pTile3D inTile, + const xai_pTile3D slopeArray, + xai_pTile3D outTile); + +_XAI_API_ XAI_ERR_TYPE xaiPRELU3D_F16_DWH(const xai_pTile3D inTile, + const xai_pTile3D slopeArray, + xai_pTile3D outTile); + +_XAI_API_ XAI_ERR_TYPE xaiPRELU3D_F16_WHD(const xai_pTile3D inTile, + const xai_pTile3D slopeArray, + xai_pTile3D outTile); + +/*Leaky relu F16*/ +_XAI_API_ XAI_ERR_TYPE xaiLeakyRELU_F16(const xai_pTile3D inTile, + xai_pTile3D outTile, + const xb_f16 slope); + +/* LUT APIs */ + +_XAI_API_ XAI_ERR_TYPE xaiLUT3D_Oddsym_F16(const xai_pTile3D inTile, + const xai_pArray lutArray, + xai_pTile3D outTile, + const xai_cnn_lut_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiLUT3D_Evensym_F16(const xai_pTile3D inTile, + const xai_pArray lutArray, + xai_pTile3D outTile, + const xai_cnn_lut_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiLUT3D_Normal_F16(const xai_pTile3D inTile, + const xai_pArray lutArray, + xai_pTile3D outTile, + const xai_cnn_lut_params *params); + +/*Depthwise Conv F16*/ +_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolve2D_S_MxN_F16_MOD_DWH(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + xai_pTile3D 
outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolve2D_S_MxNj1_F16_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolve2D_S_MxNj2_F16_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolve2D_S_3x3j1_F16_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolve2D_S_3x3j2_F16_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolve2D_S_5x5j1_F16_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolve2D_S_5x5j2_F16_MOW_WHD(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolve2D_S_3x3_F16Ca2_MOD_DWH(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +_XAI_API_ XAI_ERR_TYPE xaiDepthwiseConvolve2D_S_5x5_F16Ca2_MOD_DWH(const xai_pTile3D inTile, + const xai_pTile3D coeffTile, + const xai_pArray biasArray, + xai_pTile3D outTile, + const xai_cnn_conv_params *param); + +/*Batchnorm f16*/ +_XAI_API_ XAI_ERR_TYPE xaiBatchnorm3D_F16_DWH(const xai_pTile3D inTile, + const xai_pArray Alpha, + const xai_pArray Beta, + xai_pTile3D outTile, + const xai_cnn_batchnorm_params *params); +_XAI_API_ 
XAI_ERR_TYPE xaiBatchnorm3D_F16_WHD(const xai_pTile3D inTile, + const xai_pArray Alpha, + const xai_pArray Beta, + xai_pTile3D outTile, + const xai_cnn_batchnorm_params *params); +_XAI_API_ XAI_ERR_TYPE xaiBatchnorm3D_F16(const xai_pTile3D inTile, + const xai_pArray Alpha, + const xai_pArray Beta, + xai_pTile3D outTile, + const xai_cnn_batchnorm_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiCalcInstanceNormFactor3D_f16_WHD(const xai_pTile3D inTile, + xai_pArray meanArr, + xai_pArray recipArr, + xai_pArray buffArr, + xai_pArray buffArrSoS, + const xai_cnn_instance_norm_param * params); + +_XAI_API_ XAI_ERR_TYPE xaiCalcInstanceNormFactor3D_f16_DWH(const xai_pTile3D inTile, + xai_pArray meanArr, + xai_pArray recipArr, + xai_pArray buffArr, + xai_pArray buffArrSoS, + const xai_cnn_instance_norm_param * params); + +_XAI_API_ XAI_ERR_TYPE xaiCalcInstanceNormFactor3D_F16_Dim1(const xai_pTile3D inTile, + xai_pArray meanArr, + xai_pArray recipArr, + xai_pArray buffArr, + xai_pArray buffArrSoS, + const xai_cnn_instance_norm_param * params); + +_XAI_API_ XAI_ERR_TYPE xaiCalcInstanceNormFactor3D_F16_Dim2(const xai_pTile3D inTile, + xai_pArray meanArr, + xai_pArray recipArr, + xai_pArray buffArr, + xai_pArray buffArrSoS, + const xai_cnn_instance_norm_param * params); + +_XAI_API_ XAI_ERR_TYPE xaiCalcInstanceNormFactor3D_F16_Dim3(const xai_pTile3D inTile, + xai_pArray meanArr, + xai_pArray recipArr, + xai_pArray buffArr, + xai_pArray buffArrSoS, + const xai_cnn_instance_norm_param * params); + +_XAI_API_ XAI_ERR_TYPE xaiCalcInstanceNormFactor3D_F16(const xai_pTile3D inTile, + xai_pArray meanArr, + xai_pArray recipArr, + xai_pArray buffArr, + xai_pArray buffArrSoS, + const xai_cnn_instance_norm_param * params); + +_XAI_API_ XAI_ERR_TYPE xaiApplyInstanceNorm3D_F16_Dim1(xai_pTile3D inTile, + xai_pArray meanArr, + xai_pArray recipArr, + const xai_pArray alphaArr, + const xai_pArray betaArr, + xai_pTile3D outTile, + const xai_cnn_instance_norm_param *params); + +_XAI_API_ 
XAI_ERR_TYPE xaiApplyInstanceNorm3D_F16_Dim2(xai_pTile3D inTile, + xai_pArray meanArr, + xai_pArray recipArr, + const xai_pArray alphaArr, + const xai_pArray betaArr, + xai_pTile3D outTile, + const xai_cnn_instance_norm_param *params); + +_XAI_API_ XAI_ERR_TYPE xaiApplyInstanceNorm3D_F16_Dim3(xai_pTile3D inTile, + xai_pArray meanArr, + xai_pArray recipArr, + const xai_pArray alphaArr, + const xai_pArray betaArr, + xai_pTile3D outTile, + const xai_cnn_instance_norm_param *params); + +_XAI_API_ XAI_ERR_TYPE xaiApplyInstanceNorm3D_F16(xai_pTile3D inTile, + xai_pArray meanArr, + xai_pArray recipArr, + const xai_pArray alphaArr, + const xai_pArray betaArr, + xai_pTile3D outTile, + const xai_cnn_instance_norm_param *params); + +_XAI_API_ XAI_ERR_TYPE xaiCalcNormalizeFactor3D_F16_WHD(const xai_pTile3D inTile, + xai_pArray buffArrSoS, + xai_pArray pNormScaleArr, + const xai_cnn_normalize3D_params * params); + +_XAI_API_ XAI_ERR_TYPE xaiCalcNormalizeFactor3D_F16_DWH(const xai_pTile3D inTile, + xai_pArray buffArrSoS, + xai_pArray pNormScaleArr, + const xai_cnn_normalize3D_params * params); + +_XAI_API_ XAI_ERR_TYPE xaiCalcNormalizeFactor3D_F16(const xai_pTile3D inTile, + xai_pArray buffArrSoS, + xai_pArray pNormScaleArr, + const xai_cnn_normalize3D_params * params); + +_XAI_API_ XAI_ERR_TYPE xaiApplyScale3D_F16_WHD(xai_pTile3D inTile, + const xai_pArray pNormScaleArr, + xai_pTile3D pOutTile, + const xai_cnn_normalize3D_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiApplyScale3D_F16_DWH(xai_pTile3D inTile, + const xai_pArray pNormScaleArr, + xai_pTile3D pOutTile, + const xai_cnn_normalize3D_params *params); + +_XAI_API_ XAI_ERR_TYPE xaiApplyScale3D_F16(xai_pTile3D inTile, + const xai_pArray pNormScaleArr, + xai_pTile3D pOutTile, + const xai_cnn_normalize3D_params *params); + +/**************************** END of FP16 routines declaration ************************************/ +#endif // end of #if (XCHAL_HAVE_VISION_HP_VFPU == 1) + + +#if (XCHAL_HAVE_VISION_SP_VFPU == 1) && 
(XCHAL_HAVE_VISION_HP_VFPU == 1) +/***************************************************************************************************/ +/****************************** Mixed FP16/FP32 routines declaration *****************************/ +/***************************************************************************************************/ + +_XAI_API_ XAI_ERR_TYPE xaiQuantize3D_F32F16(const xai_pTile3D inTile, + xai_pTile3D outTile, + const xai_cnn_quantDequantA_params *pparams); + +_XAI_API_ XAI_ERR_TYPE xaiQuantize4D_F32F16(const xai_pTile4D inTile, + xai_pTile4D outTile, + const xai_cnn_quantDequantA_params *pparams); + +_XAI_API_ XAI_ERR_TYPE xaiDeQuantize3D_F16F32(const xai_pTile3D inTile, + xai_pTile3D outTile, + const xai_cnn_quantDequantA_params *pparams); + +_XAI_API_ XAI_ERR_TYPE xaiDeQuantize4D_F16F32(const xai_pTile4D inTile, + xai_pTile4D outTile, + const xai_cnn_quantDequantA_params *pparams); + +_XAI_API_ XAI_ERR_TYPE xaiDeQuantizeVQ3D_F16F32(const xai_pTile3D inTile, + xai_pTile3D outTile, + xai_pArray outScaleArray, + const xai_cnn_quantDequantA_params *pparams); + +_XAI_API_ XAI_ERR_TYPE xaiDeQuantizeVQ4D_F16F32(const xai_pTile4D inTile, + xai_pTile4D outTile, + xai_pArray outScaleArray, + const xai_cnn_quantDequantA_params *pparams); + +/**************************** END of Mixed FP16/FP32 routines declaration *************************/ +#endif //end of #if (XCHAL_HAVE_VISION_SP_VFPU == 1) && (XCHAL_HAVE_VISION_HP_VFPU == 1) + + +#if (XCHAL_HAVE_VISION_SP_VFPU == 1) +/***************************************************************************************************/ +/****************************** FP32 routines declaration ****************************************/ +/***************************************************************************************************/ +_XAI_API_ XAI_ERR_TYPE xaiDeQuantize3D_U8F32(const xai_pTile3D inTile, + xai_pTile3D outTile, + const xai_cnn_quantDequantA_params *pparams); + +_XAI_API_ XAI_ERR_TYPE 
xaiDeQuantize3D_S8F32(const xai_pTile3D inTile, + xai_pTile3D outTile, + const xai_cnn_quantDequantA_params *pparams); + +_XAI_API_ XAI_ERR_TYPE xaiDeQuantize3D_S16F32(const xai_pTile3D inTile, + xai_pTile3D outTile, + const xai_cnn_quantDequantA_params *pparams); + +_XAI_API_ XAI_ERR_TYPE xaiDeQuantize4D_U8F32(const xai_pTile4D inTile, + xai_pTile4D outTile, + const xai_cnn_quantDequantA_params *pparams); + +_XAI_API_ XAI_ERR_TYPE xaiDeQuantize4D_S8F32(const xai_pTile4D inTile, + xai_pTile4D outTile, + const xai_cnn_quantDequantA_params *pparams); + +_XAI_API_ XAI_ERR_TYPE xaiDeQuantize4D_S16F32(const xai_pTile4D inTile, + xai_pTile4D outTile, + const xai_cnn_quantDequantA_params *pparams); + +_XAI_API_ XAI_ERR_TYPE xaiDeQuantizeVQ3D_S8F32(const xai_pTile3D inTile, + xai_pTile3D outTile, + xai_pArray outScaleArray, + const xai_cnn_quantDequantA_params *pparams); + +_XAI_API_ XAI_ERR_TYPE xaiDeQuantizeVQ4D_S8F32(const xai_pTile4D inTile, + xai_pTile4D outTile, + xai_pArray outScaleArray, + const xai_cnn_quantDequantA_params *pparams); + + + +_XAI_API_ XAI_ERR_TYPE xaiExtendEdges3D_F32(xai_pTile3D dstTile, + const xai_pArray pArray, + xai_size3D frame3DSize); + +_XAI_API_ XAI_ERR_TYPE xaiExtendEdgesConst3D_F32(xai_pTile3D dstTile, + const float value, + xai_size3D frame3DSize); + +_XAI_API_ XAI_ERR_TYPE xaiFillTile3D_F32(xai_pTile3D dstTile, + const float value, + xai_bool fill_edge_extension); +/**************************** END of FP32 routines declaration ************************************/ +#endif //end of #if (XCHAL_HAVE_VISION_SP_VFPU == 1) +#endif //if ((XCHAL_VISION_TYPE >= 6)) +#endif // #ifndef __XAI_CNN_API_H__ diff --git a/backends/cadence/vision/third-party/libxai/include/xai_intrin.h b/backends/cadence/vision/third-party/libxai/include/xai_intrin.h new file mode 100644 index 00000000000..a2c2aa12328 --- /dev/null +++ b/backends/cadence/vision/third-party/libxai/include/xai_intrin.h @@ -0,0 +1,1077 @@ +/* + * Copyright (c) 2013-2018 Tensilica Inc. 
ALL RIGHTS RESERVED. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +#ifndef __XAI_INTRIN_H__ +#define __XAI_INTRIN_H__ + +#include +#include + +#ifndef XCHAL_HAVE_VISION +# define XCHAL_HAVE_VISION 0 +#endif + +#if ((XCHAL_HW_REL_LX8 == 1) && (XCHAL_VISION_SIMD16 == 32)) +#define IS_VISION_130 +#endif + +////////// CSTUBS workarounds + +#if defined(_MSC_VER) && !XCHAL_HAVE_VISION +# undef IVP_ABSSUBNX16 +# define IVP_ABSSUBNX16(a, b) IVP_MAXNX16(IVP_SUBNX16(b, a), IVP_SUBNX16(a, b)) +#endif + +#if !defined(__XCC__) && !XCHAL_HAVE_VISION +typedef vselN _xai_intrin_private_xb_vselN; +# undef IVP_SQZN +# define IVP_SQZN(a, b, c) do { _xai_intrin_private_xb_vselN _sqzntmp; CSTUB_(_TIE_xt_ivp32_IVP_SQZN) (_sqzntmp, b, c); a = _sqzntmp; } while (0) +# undef IVP_UNSQZN +# define IVP_UNSQZN(a, b, c) do { _xai_intrin_private_xb_vselN _sqzntmp; CSTUB_(_TIE_xt_ivp32_IVP_UNSQZN) (_sqzntmp, b, c); a = _sqzntmp; } while (0) +#endif + +#if !defined(__XCC__) && XCHAL_HAVE_VISION +#if 0 +#undef IVP_SCATTERNX8U +#undef IVP_SCATTERNX8UT +#define IVP_SCATTERNX8U(val__, ptr__, offs__) \ + { \ + vboolN mask = IVP_LTNX16(0, 1); \ + xb_vecNx16 mask16 = IVP_MOVNX16T(1, 0, mask); \ + xb_vecNx16U offs1 = (offs__); \ + xb_vecNx16 val1 = val__; \ + for (int i = 0; i < 32; i++) \ + { \ + int v = IVP_MOVAVU16(val1); \ + int o = IVP_MOVAVU16(offs1); \ + int m = IVP_MOVAVU16(mask16); \ + if (m) { *((uint8_t *) (ptr__) + o) = v; } \ + val1 = IVP_SELNX16I(0, val1, IVP_SELI_16B_ROTATE_RIGHT_1); \ + mask16 = IVP_SELNX16I(0, mask16, IVP_SELI_16B_ROTATE_RIGHT_1); \ + offs1 = IVP_SELNX16I(0, offs1, IVP_SELI_16B_ROTATE_RIGHT_1); \ + } \ + } + +#define IVP_SCATTERNX8UT(val__, ptr__, offs__, mask__) \ + { \ + xb_vecNx16 mask16 = IVP_MOVNX16T(1, 0, (mask__)); \ + xb_vecNx16 val = (val__); \ + xb_vecNx16 off = (offs__); \ + for (int i = 0; i < 32; i++) \ + { \ + int v = IVP_MOVAVU16(val); \ + int o = IVP_MOVAVU16(off); \ + int m = IVP_MOVAVU16(mask16); \ + if (m) { *(((uint8_t *) ptr__) + o) = v; } \ + val = IVP_SELNX16I(0, val, IVP_SELI_16B_ROTATE_RIGHT_1); \ + 
mask16 = IVP_SELNX16I(0, mask16, IVP_SELI_16B_ROTATE_RIGHT_1); \ + off = IVP_SELNX16I(0, off, IVP_SELI_16B_ROTATE_RIGHT_1); \ + } \ + } + +#undef IVP_SCATTERN_2X32 +#undef IVP_SCATTERN_2X32T +#define IVP_SCATTERN_2X32(val__, ptr__, offs__) \ + { \ + vboolN_2 mask = IVP_LTN_2X32(0, 1); \ + xb_vecN_2x32v mask32 = IVP_MOVN_2X32T(1, 0, mask); \ + xb_vecN_2x32v offs1 = IVP_SRLIN_2X32(offs__, 2); \ + xb_vecN_2x32v val1 = val__; \ + for (int i = 0; i < 16; i++) \ + { \ + int v = IVP_MOVAV32(val1); \ + int o = IVP_MOVAV32(offs1); \ + int m = IVP_MOVAV32(mask32); \ + if (m) { *((ptr__) + o) = v; } \ + val1 = IVP_MOVN_2X32_FROMNX16(IVP_SELNX16I(0, IVP_MOVNX16_FROMN_2X32(val1), IVP_SELI_32B_ROTATE_RIGHT_1)); \ + mask32 = IVP_MOVN_2X32_FROMNX16(IVP_SELNX16I(0, IVP_MOVNX16_FROMN_2X32(mask32), IVP_SELI_32B_ROTATE_RIGHT_1)); \ + offs1 = IVP_MOVN_2X32_FROMNX16(IVP_SELNX16I(0, IVP_MOVNX16_FROMN_2X32(offs1), IVP_SELI_32B_ROTATE_RIGHT_1)); \ + } \ + } + +#define IVP_SCATTERN_2X32T(val__, ptr__, offs__, mask__) \ + { \ + xb_vecN_2x32v mask32 = IVP_MOVN_2X32T(1, 0, mask__); \ + xb_vecN_2x32v offs1 = IVP_SRLIN_2X32(offs__, 2); \ + xb_vecN_2x32v val1 = val__; \ + for (int i = 0; i < 16; i++) \ + { \ + int v = IVP_MOVAV32(val1); \ + int o = IVP_MOVAV32(offs1); \ + int m = IVP_MOVAV32(mask32); \ + if (m) { *((ptr__) + o) = v; } \ + val1 = IVP_MOVN_2X32_FROMNX16(IVP_SELNX16I(0, IVP_MOVNX16_FROMN_2X32(val1), IVP_SELI_32B_ROTATE_RIGHT_1)); \ + mask32 = IVP_MOVN_2X32_FROMNX16(IVP_SELNX16I(0, IVP_MOVNX16_FROMN_2X32(mask32), IVP_SELI_32B_ROTATE_RIGHT_1)); \ + offs1 = IVP_MOVN_2X32_FROMNX16(IVP_SELNX16I(0, IVP_MOVNX16_FROMN_2X32(offs1), IVP_SELI_32B_ROTATE_RIGHT_1)); \ + } \ + } + +#undef IVP_SCATTER2NX8U_L +#undef IVP_SCATTER2NX8UT_L +#define IVP_SCATTER2NX8U_L(val__, ptr__, offs__) \ + { \ + vbool2N mask = IVP_LT2NX8(0, 1); \ + xb_vec2Nx8 mask8 = IVP_MOV2NX8T(1, 0, mask); \ + xb_vecNx16U offs1 = (offs__); \ + xb_vec2Nx8 val1 = val__; \ + for (int i = 0; i < 32; i++) \ + { \ + int v = 
IVP_MOVAVU8(val1); \ + int o = IVP_MOVAVU16(offs1); \ + int m = IVP_MOVAVU8(mask8); \ + if (m) { *((uint8_t *) (ptr__) + o) = v; } \ + val1 = IVP_SEL2NX8I(0, val1, IVP_SELI_8B_ROTATE_RIGHT_1); \ + mask8 = IVP_SEL2NX8I(0, mask8, IVP_SELI_8B_ROTATE_RIGHT_1); \ + offs1 = IVP_SELNX16I(0, offs1, IVP_SELI_16B_ROTATE_RIGHT_1); \ + } \ + } + +#define IVP_SCATTER2NX8UT_L(val__, ptr__, offs__, mask__) \ + { \ + vbool2N mask = mask__; \ + xb_vec2Nx8 mask8 = IVP_MOV2NX8T(1, 0, mask); \ + xb_vecNx16U offs1 = (offs__); \ + xb_vec2Nx8 val1 = val__; \ + for (int i = 0; i < 32; i++) \ + { \ + int v = IVP_MOVAVU8(val1); \ + int o = IVP_MOVAVU16(offs1); \ + int m = IVP_MOVAVU8(mask8); \ + if (m) { *((uint8_t *) (ptr__) + o) = v; } \ + val1 = IVP_SEL2NX8I(0, val1, IVP_SELI_8B_ROTATE_RIGHT_1); \ + mask8 = IVP_SEL2NX8I(0, mask8, IVP_SELI_8B_ROTATE_RIGHT_1); \ + offs1 = IVP_SELNX16I(0, offs1, IVP_SELI_16B_ROTATE_RIGHT_1); \ + } \ + } + +#undef IVP_SCATTER2NX8U_H +#undef IVP_SCATTER2NX8UT_H +#define IVP_SCATTER2NX8U_H(val__, ptr__, offs__) \ + { \ + vbool2N mask = IVP_LT2NX8(0, 1); \ + xb_vec2Nx8 mask8 = IVP_MOV2NX8T(1, 0, mask); \ + xb_vecNx16U offs1 = (offs__); \ + xb_vec2Nx8 val1 = val__; \ + \ + val1 = IVP_SEL2NX8I(0, val1, IVP_SELI_8B_ROTATE_RIGHT_32); \ + mask8 = IVP_SEL2NX8I(0, mask8, IVP_SELI_8B_ROTATE_RIGHT_32); \ + for (int i = 0; i < 32; i++) \ + { \ + int v = IVP_MOVAVU8(val1); \ + int o = IVP_MOVAVU16(offs1); \ + int m = IVP_MOVAVU8(mask8); \ + if (m) { *((uint8_t *) (ptr__) + o) = v; } \ + val1 = IVP_SEL2NX8I(0, val1, IVP_SELI_8B_ROTATE_RIGHT_1); \ + mask8 = IVP_SEL2NX8I(0, mask8, IVP_SELI_8B_ROTATE_RIGHT_1); \ + offs1 = IVP_SELNX16I(0, offs1, IVP_SELI_16B_ROTATE_RIGHT_1); \ + } \ + } + +#define IVP_SCATTER2NX8UT_H(val__, ptr__, offs__, mask__) \ + { \ + vbool2N mask = mask__; \ + xb_vec2Nx8 mask8 = IVP_MOV2NX8T(1, 0, mask); \ + xb_vecNx16U offs1 = (offs__); \ + xb_vec2Nx8 val1 = val__; \ + \ + val1 = IVP_SEL2NX8I(0, val1, IVP_SELI_8B_ROTATE_RIGHT_32); \ + mask8 = 
IVP_SEL2NX8I(0, mask8, IVP_SELI_8B_ROTATE_RIGHT_32); \
+    for (int i = 0; i < 32; i++) \
+    { \
+      int v = IVP_MOVAVU8(val1); \
+      int o = IVP_MOVAVU16(offs1); \
+      int m = IVP_MOVAVU8(mask8); \
+      if (m) { *((uint8_t *) (ptr__) + o) = v; } \
+      val1 = IVP_SEL2NX8I(0, val1, IVP_SELI_8B_ROTATE_RIGHT_1); \
+      mask8 = IVP_SEL2NX8I(0, mask8, IVP_SELI_8B_ROTATE_RIGHT_1); \
+      offs1 = IVP_SELNX16I(0, offs1, IVP_SELI_16B_ROTATE_RIGHT_1); \
+    } \
+  }
+
+#undef IVP_GATHERNX8UT_V
+#define IVP_GATHERNX8UT_V(pdst, offs, mask, dly) IVP_MOVNX16T(IVP_GATHERNX8U_V((pdst), (offs), (dly)), 0, mask)
+
+#undef IVP_GATHERNX16T_V
+#define IVP_GATHERNX16T_V(pdst, offs, mask, dly) IVP_MOVNX16T(IVP_GATHERNX16_V((pdst), (offs), (dly)), 0, mask)
+
+#undef IVP_GATHERN_2X32T_V
+#define IVP_GATHERN_2X32T_V(pdst, offs, mask, dly) IVP_MOVN_2X32T(IVP_GATHERN_2X32_V((pdst), (offs), (dly)), 0, mask)
+#endif // #if 0
+#endif //!defined(__XCC__) && XCHAL_HAVE_VISION
+
+#if XCHAL_VISION_QUAD_MAC_TYPE == 0 /* the fallbacks below are compiled only when this configuration macro is 0 */
+#ifndef IVP_MULQA2N8XR8 /* accumulating 4-input multiply: adds four products into _dacc_; skipped if the name is already defined */
+#define IVP_MULQA2N8XR8(_dacc_, _dvec3_, _dvec2_, _dvec1_, _dvec0_, _scalar32_) do { /* single statement: call as IVP_MULQA2N8XR8(...); */ \
+    xb_vec2Nx8 _coefVec_ = IVP_MOV2NX8_FROMNX16(IVP_MOVNX16_FROMN_2X32(IVP_MOVVA32(_scalar32_))); /* presumably bytes 0..3 of _scalar32_ are the four coefficients -- TODO confirm */ \
+    IVP_MULA2NX8(_dacc_, _dvec0_, IVP_REP2NX8(_coefVec_, 0)); /* _dvecK_ is paired with element K of _coefVec_ */ \
+    IVP_MULA2NX8(_dacc_, _dvec1_, IVP_REP2NX8(_coefVec_, 1)); \
+    IVP_MULA2NX8(_dacc_, _dvec2_, IVP_REP2NX8(_coefVec_, 2)); \
+    IVP_MULA2NX8(_dacc_, _dvec3_, IVP_REP2NX8(_coefVec_, 3)); \
+} while (0)
+#endif
+
+#ifndef IVP_MULUSQA2N8XR8 /* same shape as IVP_MULQA2N8XR8 but built on IVP_MULUSA2NX8 */
+#define IVP_MULUSQA2N8XR8(_dacc_, _dvec3_, _dvec2_, _dvec1_, _dvec0_, _scalar32_) do { /* single statement: call as IVP_MULUSQA2N8XR8(...); */ \
+    xb_vec2Nx8 _coefVec_ = IVP_MOV2NX8_FROMNX16(IVP_MOVNX16_FROMN_2X32(IVP_MOVVA32(_scalar32_))); \
+    IVP_MULUSA2NX8(_dacc_, _dvec0_, IVP_REP2NX8(_coefVec_, 0)); \
+    IVP_MULUSA2NX8(_dacc_, _dvec1_, IVP_REP2NX8(_coefVec_, 1)); \
+    IVP_MULUSA2NX8(_dacc_, _dvec2_, IVP_REP2NX8(_coefVec_, 2)); \
+    IVP_MULUSA2NX8(_dacc_, _dvec3_, IVP_REP2NX8(_coefVec_, 3)); \
+} while (0)
+#endif
+
+#if 0 // Currently disabled as there is no use case. Kept so that it can be used in future if required.
+#ifndef IVP_MULSUQ2N8XR8 /* inside the disabled #if 0 region: non-accumulating form, returns a fresh accumulator */
+static inline xb_vec2Nx24 IVP_MULSUQ2N8XR8(xb_vec2Nx8 _dvec3_, xb_vec2Nx8 _dvec2_, xb_vec2Nx8 _dvec1_, xb_vec2Nx8 _dvec0_, int32_t _scalar32_)
+{
+    xb_vec2Nx24 _dacc_;
+    xb_vec2Nx8 dvecS = IVP_MOV2NX8_FROMNX16(IVP_MOVNX16_FROMN_2X32(IVP_MOVVA32(_scalar32_)));
+    _dacc_ = IVP_MULUS2NX8(IVP_REP2NX8(dvecS, 0), _dvec0_); /* operands are swapped relative to the US variants: coefficient first, data second */
+    IVP_MULUSA2NX8(_dacc_, IVP_REP2NX8(dvecS, 1), _dvec1_);
+    IVP_MULUSA2NX8(_dacc_, IVP_REP2NX8(dvecS, 2), _dvec2_);
+    IVP_MULUSA2NX8(_dacc_, IVP_REP2NX8(dvecS, 3), _dvec3_);
+    return(_dacc_);
+}
+#endif
+
+#ifndef IVP_MULSUQA2N8XR8 /* disabled: accumulating counterpart of IVP_MULSUQ2N8XR8 */
+#define IVP_MULSUQA2N8XR8(_dacc_, _dvec3_, _dvec2_, _dvec1_, _dvec0_, _scalar32_) do { /* single statement: call with a trailing ';' */ \
+    xb_vec2Nx8 _coefVec_ = IVP_MOV2NX8_FROMNX16(IVP_MOVNX16_FROMN_2X32(IVP_MOVVA32(_scalar32_))); \
+    IVP_MULUSA2NX8(_dacc_, IVP_REP2NX8(_coefVec_, 0), _dvec0_); \
+    IVP_MULUSA2NX8(_dacc_, IVP_REP2NX8(_coefVec_, 1), _dvec1_); \
+    IVP_MULUSA2NX8(_dacc_, IVP_REP2NX8(_coefVec_, 2), _dvec2_); \
+    IVP_MULUSA2NX8(_dacc_, IVP_REP2NX8(_coefVec_, 3), _dvec3_); \
+} while (0)
+#endif
+
+#ifndef IVP_MULUUQA2N8XR8 /* disabled: accumulating form built on IVP_MULUUA2NX8 */
+#define IVP_MULUUQA2N8XR8(_dacc_, _dvec3_, _dvec2_, _dvec1_, _dvec0_, _scalar32_) do { /* single statement: call with a trailing ';' */ \
+    xb_vec2Nx8 _coefVec_ = IVP_MOV2NX8_FROMNX16(IVP_MOVNX16_FROMN_2X32(IVP_MOVVA32(_scalar32_))); \
+    IVP_MULUUA2NX8(_dacc_, _dvec0_, IVP_REP2NX8(_coefVec_, 0)); \
+    IVP_MULUUA2NX8(_dacc_, _dvec1_, IVP_REP2NX8(_coefVec_, 1)); \
+    IVP_MULUUA2NX8(_dacc_, _dvec2_, IVP_REP2NX8(_coefVec_, 2)); \
+    IVP_MULUUA2NX8(_dacc_, _dvec3_, IVP_REP2NX8(_coefVec_, 3)); \
+} while (0)
+#endif
+#endif
+
+#ifndef IVP_MUL4TA2N8XR8 /* accumulating 4-tap form: taps 1..3 use the (_dvec1_, _dvec0_) pair selected with ROTATE_RIGHT_1..3 */
+#define IVP_MUL4TA2N8XR8(_dacc_, _dvec1_, _dvec0_, _scalar32_) do { /* single statement: call as IVP_MUL4TA2N8XR8(...); */ \
+    xb_vec2Nx8 _coefVec_ = IVP_MOV2NX8_FROMNX16(IVP_MOVNX16_FROMN_2X32(IVP_MOVVA32(_scalar32_))); /* presumably bytes 0..3 of _scalar32_ are the tap coefficients -- TODO confirm */ \
+    IVP_MULA2NX8(_dacc_, _dvec0_, IVP_REP2NX8(_coefVec_, 0)); \
+    IVP_MULA2NX8(_dacc_, IVP_SEL2NX8I(_dvec1_, _dvec0_, IVP_SELI_8B_ROTATE_RIGHT_1), IVP_REP2NX8(_coefVec_, 1)); \
+    IVP_MULA2NX8(_dacc_, IVP_SEL2NX8I(_dvec1_, _dvec0_, IVP_SELI_8B_ROTATE_RIGHT_2), IVP_REP2NX8(_coefVec_, 2)); \
+    IVP_MULA2NX8(_dacc_, IVP_SEL2NX8I(_dvec1_, _dvec0_, IVP_SELI_8B_ROTATE_RIGHT_3), IVP_REP2NX8(_coefVec_, 3)); \
+} while (0)
+#endif
+
+#ifndef IVP_MULUS4TA2N8XR8 /* same shape as IVP_MUL4TA2N8XR8 but built on IVP_MULUSA2NX8 / IVP_SEL2NX8UI */
+#define IVP_MULUS4TA2N8XR8(_dacc_, _dvec1_, _dvec0_, _scalar32_) do { /* single statement: call as IVP_MULUS4TA2N8XR8(...); */ \
+    xb_vec2Nx8 _coefVec_ = IVP_MOV2NX8_FROMNX16(IVP_MOVNX16_FROMN_2X32(IVP_MOVVA32(_scalar32_))); \
+    IVP_MULUSA2NX8(_dacc_, _dvec0_, IVP_REP2NX8(_coefVec_, 0)); \
+    IVP_MULUSA2NX8(_dacc_, IVP_SEL2NX8UI(_dvec1_, _dvec0_, IVP_SELI_8B_ROTATE_RIGHT_1), IVP_REP2NX8(_coefVec_, 1)); \
+    IVP_MULUSA2NX8(_dacc_, IVP_SEL2NX8UI(_dvec1_, _dvec0_, IVP_SELI_8B_ROTATE_RIGHT_2), IVP_REP2NX8(_coefVec_, 2)); \
+    IVP_MULUSA2NX8(_dacc_, IVP_SEL2NX8UI(_dvec1_, _dvec0_, IVP_SELI_8B_ROTATE_RIGHT_3), IVP_REP2NX8(_coefVec_, 3)); \
+} while (0)
+#endif
+
+#ifndef IVP_MUL4T2N8XR8 /* non-accumulating 4-tap form: the first product initialises the returned accumulator */
+static inline xb_vec2Nx24 IVP_MUL4T2N8XR8(xb_vec2Nx8U _dvec1_, xb_vec2Nx8U _dvec0_, int _scalar32_)
+{ /* NOTE(review): parameters are xb_vec2Nx8U but the body uses IVP_MUL2NX8 / IVP_SEL2NX8I, while IVP_MULQ2N8XR8 takes xb_vec2Nx8 -- confirm this is intended */
+    xb_vec2Nx24 _dacc_;
+    xb_vec2Nx8 dvecS = IVP_MOV2NX8_FROMNX16(IVP_MOVNX16_FROMN_2X32(IVP_MOVVA32(_scalar32_)));
+    _dacc_ = IVP_MUL2NX8(_dvec0_, IVP_REP2NX8(dvecS, 0));
+    IVP_MULA2NX8(_dacc_, IVP_SEL2NX8I(_dvec1_, _dvec0_, IVP_SELI_8B_ROTATE_RIGHT_1), IVP_REP2NX8(dvecS, 1));
+    IVP_MULA2NX8(_dacc_, IVP_SEL2NX8I(_dvec1_, _dvec0_, IVP_SELI_8B_ROTATE_RIGHT_2), IVP_REP2NX8(dvecS, 2));
+    IVP_MULA2NX8(_dacc_, IVP_SEL2NX8I(_dvec1_, _dvec0_, IVP_SELI_8B_ROTATE_RIGHT_3), IVP_REP2NX8(dvecS, 3));
+    return(_dacc_);
+}
+#endif
+
+#ifndef IVP_MULUS4T2N8XR8 /* non-accumulating 4-tap form built on IVP_MULUS2NX8 / IVP_MULUSA2NX8 */
+static inline xb_vec2Nx24 IVP_MULUS4T2N8XR8(xb_vec2Nx8U _dvec1_, xb_vec2Nx8U _dvec0_, int _scalar32_)
+{
+    xb_vec2Nx24 _dacc_;
+    xb_vec2Nx8 dvecS = IVP_MOV2NX8_FROMNX16(IVP_MOVNX16_FROMN_2X32(IVP_MOVVA32(_scalar32_)));
+    _dacc_ = IVP_MULUS2NX8(_dvec0_, IVP_REP2NX8(dvecS, 0));
+    IVP_MULUSA2NX8(_dacc_, IVP_SEL2NX8UI(_dvec1_, _dvec0_, IVP_SELI_8B_ROTATE_RIGHT_1), IVP_REP2NX8(dvecS, 1));
+    IVP_MULUSA2NX8(_dacc_, IVP_SEL2NX8UI(_dvec1_, _dvec0_, IVP_SELI_8B_ROTATE_RIGHT_2), IVP_REP2NX8(dvecS, 2));
+    IVP_MULUSA2NX8(_dacc_, IVP_SEL2NX8UI(_dvec1_, _dvec0_, IVP_SELI_8B_ROTATE_RIGHT_3), IVP_REP2NX8(dvecS, 3));
+    return(_dacc_);
+}
+#endif
+
+#ifndef
IVP_MULQ2N8XR8
+static inline xb_vec2Nx24 IVP_MULQ2N8XR8(xb_vec2Nx8 _dvec3_, xb_vec2Nx8 _dvec2_, xb_vec2Nx8 _dvec1_, xb_vec2Nx8 _dvec0_, int32_t _scalar32_)
+{ /* non-accumulating 4-input multiply: the first product initialises the returned accumulator */
+    xb_vec2Nx24 _dacc_;
+    xb_vec2Nx8 dvecS = IVP_MOV2NX8_FROMNX16(IVP_MOVNX16_FROMN_2X32(IVP_MOVVA32(_scalar32_)));
+    _dacc_ = IVP_MUL2NX8(_dvec0_, IVP_REP2NX8(dvecS, 0));
+    IVP_MULA2NX8(_dacc_, _dvec1_, IVP_REP2NX8(dvecS, 1));
+    IVP_MULA2NX8(_dacc_, _dvec2_, IVP_REP2NX8(dvecS, 2));
+    IVP_MULA2NX8(_dacc_, _dvec3_, IVP_REP2NX8(dvecS, 3));
+    return(_dacc_);
+}
+#endif
+
+#ifndef IVP_MULUSQ2N8XR8 /* same shape as IVP_MULQ2N8XR8 but built on IVP_MULUS2NX8 / IVP_MULUSA2NX8 */
+static inline xb_vec2Nx24 IVP_MULUSQ2N8XR8(xb_vec2Nx8U _dvec3_, xb_vec2Nx8U _dvec2_, xb_vec2Nx8U _dvec1_, xb_vec2Nx8U _dvec0_, int32_t _scalar32_)
+{
+    xb_vec2Nx24 _dacc_;
+    xb_vec2Nx8 dvecS = IVP_MOV2NX8_FROMNX16(IVP_MOVNX16_FROMN_2X32(IVP_MOVVA32(_scalar32_)));
+    _dacc_ = IVP_MULUS2NX8(_dvec0_, IVP_REP2NX8(dvecS, 0));
+    IVP_MULUSA2NX8(_dacc_, _dvec1_, IVP_REP2NX8(dvecS, 1));
+    IVP_MULUSA2NX8(_dacc_, _dvec2_, IVP_REP2NX8(dvecS, 2));
+    IVP_MULUSA2NX8(_dacc_, _dvec3_, IVP_REP2NX8(dvecS, 3));
+    return(_dacc_);
+}
+#endif
+#endif //#if XCHAL_VISION_QUAD_MAC_TYPE == 0
+
+#if XCHAL_HAVE_SUPERGATHER == 0
+
+#ifdef IVP_GATHERANX8S /* only when the name already exists as a macro: drop it and substitute the loop below */
+#undef IVP_GATHERANX8S
+static inline xb_vecNx16 IVP_GATHERANX8S(const signed char * _base, xb_vecNx16U _offsets)
+{ /* 32-pass scalar loop: each pass takes one offset, loads one element at _base + offset, then rotates both vectors by one lane */
+    const signed char *_basePtr = _base;
+    xb_vecNx16U _offsetsVec = _offsets;
+    xb_vecNx16 _dataVec = (xb_vecNx16) 0;
+    int _i;
+    for (_i = 0; _i < 32; _i++)
+    {
+
+      unsigned short offset = IVP_MOVAVU16(_offsetsVec);
+      xb_int8 gdata = IVP_LS2NX8_X(_basePtr, offset);
+      _offsetsVec = IVP_SELNX16UI(_offsetsVec, _offsetsVec, IVP_SELI_16B_ROTATE_RIGHT_1);
+      _dataVec = IVP_SELNX16I(IVP_MOVNX16_FROM8(gdata), _dataVec, IVP_SELI_16B_ROTATE_RIGHT_1);
+    }
+    return(_dataVec);
+}
+#endif
+
+#ifdef IVP_GATHERANX8U
+#undef IVP_GATHERANX8U
+static inline xb_vecNx16U IVP_GATHERANX8U(const unsigned char * _base, xb_vecNx16U _offsets)
+{
+    const unsigned char *_basePtr = _base;
+
xb_vecNx16U _offsetsVec = _offsets;
+    xb_vecNx16U _dataVec = (xb_vecNx16U) 0;
+    int _i;
+    for (_i = 0; _i < 32; _i++)
+    {
+      unsigned short offset = IVP_MOVAVU16(_offsetsVec);
+      xb_int8U gdata = IVP_LS2NX8U_X(_basePtr, offset);
+      _offsetsVec = IVP_SELNX16UI(_offsetsVec, _offsetsVec, IVP_SELI_16B_ROTATE_RIGHT_1);
+      _dataVec = IVP_SELNX16UI(IVP_MOVNX16U_FROMNX16(IVP_MOVNX16_FROM8U(gdata)), _dataVec, IVP_SELI_16B_ROTATE_RIGHT_1);
+    }
+    return(_dataVec);
+}
+#endif
+
+#ifndef IVP_GATHERD2NX8_L /* expression macro: selects from the reinterpreted _gsr with EXTRACT_1_OF_2_OFF_0 */
+#define IVP_GATHERD2NX8_L(_gsr) IVP_SEL2NX8I((xb_vec2Nx8) 0, IVP_MOV2NX8_FROMNX16(_gsr), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0)
+#endif
+
+#ifndef IVP_GATHERD2NX8_H /* statement macro: updates _vec in place from _gsr; call with a trailing ';' */
+#define IVP_GATHERD2NX8_H(_vec, _gsr) do { xb_vec2Nx8 _gd_tmp_ = IVP_SEL2NX8I((xb_vec2Nx8) 0, IVP_MOV2NX8_FROMNX16(_gsr), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0); \
+    _vec = IVP_SEL2NX8I(_gd_tmp_, _vec, IVP_SELI_EXTRACT_LO_HALVES); \
+} while (0)
+#endif
+
+#ifndef IVP_GATHERD2NX8U_H /* unsigned counterpart of IVP_GATHERD2NX8_H */
+#define IVP_GATHERD2NX8U_H(_vec, _gsr) do { xb_vec2Nx8U _gd_tmp_ = IVP_SEL2NX8UI((xb_vec2Nx8U) 0, IVP_MOV2NX8U_FROMNX16(_gsr), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0); \
+    _vec = IVP_SEL2NX8UI(_gd_tmp_, _vec, IVP_SELI_EXTRACT_LO_HALVES); } while (0)
+#endif
+
+#ifndef IVP_GATHERD2NX8U_L /* unsigned counterpart of IVP_GATHERD2NX8_L; operand types match IVP_GATHERD2NX8U_H */
+#define IVP_GATHERD2NX8U_L(_gsr) IVP_SEL2NX8UI((xb_vec2Nx8U) 0, IVP_MOV2NX8U_FROMNX16(_gsr), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0)
+#endif
+
+#ifdef IVP_SCATTERNX8U /* only when the name already exists as a macro: replace it with a 32-pass scalar store loop */
+#undef IVP_SCATTERNX8U
+#define IVP_SCATTERNX8U(_dataIn, _base, _offsets) do { \
+    xb_vecNx16U _dataVec = _dataIn; \
+    xb_vecNx16U _offsetsVec = _offsets; \
+    unsigned char *_basePtr = _base; \
+    int _i; \
+    for (_i = 0; _i < 32; _i++) { \
+      unsigned short offset = IVP_MOVAVU16(_offsetsVec); \
+      IVP_SSNX8U_X(_dataVec, _basePtr, offset); \
+      _offsetsVec = IVP_SELNX16UI(_offsetsVec, _offsetsVec, IVP_SELI_16B_ROTATE_RIGHT_1); \
+      _dataVec = IVP_SELNX16UI(_dataVec, _dataVec, IVP_SELI_16B_ROTATE_RIGHT_1); \
+    } \
+} while (0)
+#endif
+
+#ifdef IVP_SCATTERNX8UT
+#undef IVP_SCATTERNX8UT
+#define IVP_SCATTERNX8UT(_dataIn, _base, _offsets,
_vbr) do { \ + xb_vecNx16U _dataVec = _dataIn; \ + xb_vecNx16U _offsetsVec = _offsets; \ + unsigned char *_basePtr = _base; \ + xb_vecNx16 _condsVec = IVP_MOVNX16T((xb_vecNx16) 1, (xb_vecNx16) 0, _vbr); \ + int _i; \ + for (_i = 0; _i < 32; _i++) { \ + short cond = IVP_MOVAV16(_condsVec); \ + unsigned short offset = IVP_MOVAVU16(_offsetsVec); \ + if (cond) { \ + IVP_SSNX8U_X(_dataVec, _basePtr, offset); } \ + _offsetsVec = IVP_SELNX16UI(_offsetsVec, _offsetsVec, IVP_SELI_16B_ROTATE_RIGHT_1); \ + _dataVec = IVP_SELNX16UI(_dataVec, _dataVec, IVP_SELI_16B_ROTATE_RIGHT_1); \ + _condsVec = IVP_SELNX16I(_condsVec, _condsVec, IVP_SELI_16B_ROTATE_RIGHT_1); \ + } \ +} while (0) +#endif + + +#ifdef IVP_SCATTER2NX8_L +#undef IVP_SCATTER2NX8_L +#define IVP_SCATTER2NX8_L(_dataIn, _base, _offsets) do { \ + xb_vec2Nx8 _dataVec = _dataIn; \ + xb_vecNx16U _offsetsVec = _offsets; \ + signed char *_basePtr = _base; \ + int _i; \ + for (_i = 0; _i < 32; _i++) { \ + unsigned short offset = IVP_MOVAVU16(_offsetsVec); \ + IVP_SS2NX8_X(_dataVec, _basePtr, offset); \ + _offsetsVec = IVP_SELNX16UI(_offsetsVec, _offsetsVec, IVP_SELI_16B_ROTATE_RIGHT_1); \ + _dataVec = IVP_SEL2NX8I(_dataVec, _dataVec, IVP_SELI_8B_ROTATE_RIGHT_1); \ + } \ +} while (0) +#endif + + +#ifdef IVP_SCATTER2NX8T_L +#undef IVP_SCATTER2NX8T_L +#define IVP_SCATTER2NX8T_L(_dataIn, _base, _offsets, _vbr) do { \ + xb_vec2Nx8 _dataVec = _dataIn; \ + xb_vecNx16U _offsetsVec = _offsets; \ + signed char *_basePtr = _base; \ + xb_vec2Nx8 _condsVec = IVP_MOV2NX8T((xb_vec2Nx8) 1, (xb_vec2Nx8) 0, _vbr); \ + int _i; \ + for (_i = 0; _i < 32; _i++) { \ + signed char cond = IVP_MOVAV8(_condsVec); \ + unsigned short offset = IVP_MOVAVU16(_offsetsVec); \ + if (cond) { \ + IVP_SS2NX8_X(_dataVec, _basePtr, offset); } \ + _offsetsVec = IVP_SELNX16UI(_offsetsVec, _offsetsVec, IVP_SELI_16B_ROTATE_RIGHT_1); \ + _dataVec = IVP_SEL2NX8I(_dataVec, _dataVec, IVP_SELI_8B_ROTATE_RIGHT_1); \ + _condsVec = IVP_SEL2NX8I(_condsVec, _condsVec, 
IVP_SELI_8B_ROTATE_RIGHT_1); \ + } \ +} while (0) +#endif + +#ifdef IVP_SCATTER2NX8U_L +#undef IVP_SCATTER2NX8U_L +#define IVP_SCATTER2NX8U_L(_dataIn, _base, _offsets) do { \ + xb_vec2Nx8U _dataVec = _dataIn; \ + xb_vecNx16U _offsetsVec = _offsets; \ + unsigned char *_basePtr = _base; \ + int _i; \ + for (_i = 0; _i < 32; _i++) { \ + unsigned short offset = IVP_MOVAVU16(_offsetsVec); \ + IVP_SS2NX8U_X(_dataVec, _basePtr, offset); \ + _offsetsVec = IVP_SELNX16UI(_offsetsVec, _offsetsVec, IVP_SELI_16B_ROTATE_RIGHT_1); \ + _dataVec = IVP_SEL2NX8UI(_dataVec, _dataVec, IVP_SELI_8B_ROTATE_RIGHT_1); \ + } \ +} while (0) +#endif + +#ifdef IVP_SCATTER2NX8UT_L +#undef IVP_SCATTER2NX8UT_L +#define IVP_SCATTER2NX8UT_L(_dataIn, _base, _offsets, _vbr) do { \ + xb_vec2Nx8U _dataVec = _dataIn; \ + xb_vecNx16U _offsetsVec = _offsets; \ + unsigned char *_basePtr = _base; \ + xb_vec2Nx8 _condsVec = IVP_MOV2NX8T((xb_vec2Nx8) 1, (xb_vec2Nx8) 0, _vbr); \ + int _i; \ + for (_i = 0; _i < 32; _i++) { \ + signed char cond = IVP_MOVAV8(_condsVec); \ + unsigned short offset = IVP_MOVAVU16(_offsetsVec); \ + if (cond) { \ + IVP_SS2NX8U_X(_dataVec, _basePtr, offset); } \ + _offsetsVec = IVP_SELNX16UI(_offsetsVec, _offsetsVec, IVP_SELI_16B_ROTATE_RIGHT_1); \ + _dataVec = IVP_SEL2NX8UI(_dataVec, _dataVec, IVP_SELI_8B_ROTATE_RIGHT_1); \ + _condsVec = IVP_SEL2NX8I(_condsVec, _condsVec, IVP_SELI_8B_ROTATE_RIGHT_1); \ + } \ +} while (0) +#endif + +#ifdef IVP_SCATTER2NX8_H +#undef IVP_SCATTER2NX8_H +#define IVP_SCATTER2NX8_H(_dataIn, _base, _offsets) do { \ + xb_vec2Nx8 _dataVec = _dataIn; \ + xb_vecNx16U _offsetsVec = _offsets; \ + signed char *_basePtr = _base; \ + _dataVec = IVP_SEL2NX8I(_dataVec, _dataVec, IVP_SELI_8B_EXTRACT_HI_HALVES); \ + int _i; \ + for (_i = 0; _i < 32; _i++) { \ + unsigned short offset = IVP_MOVAVU16(_offsetsVec); \ + IVP_SS2NX8_X(_dataVec, _basePtr, offset); \ + _offsetsVec = IVP_SELNX16UI(_offsetsVec, _offsetsVec, IVP_SELI_16B_ROTATE_RIGHT_1); \ + _dataVec = 
IVP_SEL2NX8I(_dataVec, _dataVec, IVP_SELI_8B_ROTATE_RIGHT_1); \ + } \ +} while (0) +#endif + +#ifdef IVP_SCATTER2NX8U_H +#undef IVP_SCATTER2NX8U_H +#define IVP_SCATTER2NX8U_H(_dataIn, _base, _offsets) do { \ + xb_vec2Nx8U _dataVec = _dataIn; \ + xb_vecNx16U _offsetsVec = _offsets; \ + unsigned char *_basePtr = _base; \ + _dataVec = IVP_SEL2NX8UI(_dataVec, _dataVec, IVP_SELI_8B_EXTRACT_HI_HALVES); \ + int _i; \ + for (_i = 0; _i < 32; _i++) { \ + unsigned short offset = IVP_MOVAVU16(_offsetsVec); \ + IVP_SS2NX8U_X(_dataVec, _basePtr, offset); \ + _offsetsVec = IVP_SELNX16UI(_offsetsVec, _offsetsVec, IVP_SELI_16B_ROTATE_RIGHT_1); \ + _dataVec = IVP_SEL2NX8UI(_dataVec, _dataVec, IVP_SELI_8B_ROTATE_RIGHT_1); \ + } \ +} while (0) +#endif + +#ifdef IVP_SCATTER2NX8T_H +#undef IVP_SCATTER2NX8T_H +#define IVP_SCATTER2NX8T_H(_dataIn, _base, _offsets, _vbr) do { \ + xb_vec2Nx8 _dataVec = _dataIn; \ + xb_vecNx16U _offsetsVec = _offsets; \ + signed char *_basePtr = _base; \ + xb_vec2Nx8 _condsVec = IVP_MOV2NX8T((xb_vec2Nx8) 1, (xb_vec2Nx8) 0, _vbr); \ + _dataVec = IVP_SEL2NX8I(_dataVec, _dataVec, IVP_SELI_8B_EXTRACT_HI_HALVES); \ + _condsVec = IVP_SEL2NX8I(_condsVec, _condsVec, IVP_SELI_8B_EXTRACT_HI_HALVES); \ + int _i; \ + for (_i = 0; _i < 32; _i++) { \ + signed char cond = IVP_MOVAV8(_condsVec); \ + unsigned short offset = IVP_MOVAVU16(_offsetsVec); \ + if (cond) { \ + IVP_SS2NX8_X(_dataVec, _basePtr, offset); } \ + _offsetsVec = IVP_SELNX16UI(_offsetsVec, _offsetsVec, IVP_SELI_16B_ROTATE_RIGHT_1); \ + _dataVec = IVP_SEL2NX8I(_dataVec, _dataVec, IVP_SELI_8B_ROTATE_RIGHT_1); \ + _condsVec = IVP_SEL2NX8I(_condsVec, _condsVec, IVP_SELI_8B_ROTATE_RIGHT_1); \ + } \ +} while (0) +#endif + +#ifdef IVP_SCATTER2NX8UT_H +#undef IVP_SCATTER2NX8UT_H +#define IVP_SCATTER2NX8UT_H(_dataIn, _base, _offsets, _vbr) do { \ + xb_vec2Nx8U _dataVec = _dataIn; \ + xb_vecNx16U _offsetsVec = _offsets; \ + unsigned char *_basePtr = _base; \ + xb_vec2Nx8 _condsVec = IVP_MOV2NX8T((xb_vec2Nx8) 1, 
(xb_vec2Nx8) 0, _vbr); \ + _dataVec = IVP_SEL2NX8UI(_dataVec, _dataVec, IVP_SELI_8B_EXTRACT_HI_HALVES); \ + _condsVec = IVP_SEL2NX8I(_condsVec, _condsVec, IVP_SELI_8B_EXTRACT_HI_HALVES); \ + int _i; \ + for (_i = 0; _i < 32; _i++) { \ + signed char cond = IVP_MOVAV8(_condsVec); \ + unsigned short offset = IVP_MOVAVU16(_offsetsVec); \ + if (cond) { \ + IVP_SS2NX8U_X(_dataVec, _basePtr, offset); } \ + _offsetsVec = IVP_SELNX16UI(_offsetsVec, _offsetsVec, IVP_SELI_16B_ROTATE_RIGHT_1); \ + _dataVec = IVP_SEL2NX8UI(_dataVec, _dataVec, IVP_SELI_8B_ROTATE_RIGHT_1); \ + _condsVec = IVP_SEL2NX8I(_condsVec, _condsVec, IVP_SELI_8B_ROTATE_RIGHT_1); \ + } \ +} while (0) +#endif + +#ifdef IVP_SCATTERNX16 +#undef IVP_SCATTERNX16 +#define IVP_SCATTERNX16(_dataIn, _base, _offsets) do { \ + xb_vecNx16 _dataVec = _dataIn; \ + xb_vecNx16U _offsetsVec = _offsets; \ + short *_basePtr = _base; \ + int _i; \ + for (_i = 0; _i < 32; _i++) { \ + unsigned short offset = IVP_MOVAVU16(_offsetsVec); \ + IVP_SSNX16_X(_dataVec, _basePtr, offset); \ + _offsetsVec = IVP_SELNX16UI(_offsetsVec, _offsetsVec, IVP_SELI_16B_ROTATE_RIGHT_1); \ + _dataVec = IVP_SELNX16I(_dataVec, _dataVec, IVP_SELI_16B_ROTATE_RIGHT_1); \ + } \ +} while (0) +#endif + +#ifdef IVP_SCATTERNX16U +#undef IVP_SCATTERNX16U +#define IVP_SCATTERNX16U(_dataIn, _base, _offsets) do { \ + xb_vecNx16U _dataVec = _dataIn; \ + xb_vecNx16U _offsetsVec = _offsets; \ + unsigned short *_basePtr = _base; \ + int _i; \ + for (_i = 0; _i < 32; _i++) { \ + unsigned short offset = IVP_MOVAVU16(_offsetsVec); \ + IVP_SSNX16U_X(_dataVec, _basePtr, offset); \ + _offsetsVec = IVP_SELNX16UI(_offsetsVec, _offsetsVec, IVP_SELI_16B_ROTATE_RIGHT_1); \ + _dataVec = IVP_SELNX16UI(_dataVec, _dataVec, IVP_SELI_16B_ROTATE_RIGHT_1); \ + } \ +} while (0) +#endif + +#ifdef IVP_SCATTERNX16T +#undef IVP_SCATTERNX16T +#define IVP_SCATTERNX16T(_dataIn, _base, _offsets, _vbr) do { \ + xb_vecNx16 _dataVec = _dataIn; \ + xb_vecNx16U _offsetsVec = _offsets; \ + short 
*_basePtr = (short *) _base; \ + xb_vecNx16 _condsVec = IVP_MOVNX16T((xb_vecNx16) 1, (xb_vecNx16) 0, _vbr); \ + int _i; \ + for (_i = 0; _i < 32; _i++) { \ + unsigned short offset = IVP_MOVAVU16(_offsetsVec); \ + short cond = IVP_MOVAV16(_condsVec); \ + if (cond) { \ + IVP_SSNX16_X(_dataVec, _basePtr, offset); } \ + _offsetsVec = IVP_SELNX16UI(_offsetsVec, _offsetsVec, IVP_SELI_16B_ROTATE_RIGHT_1); \ + _dataVec = IVP_SELNX16I(_dataVec, _dataVec, IVP_SELI_16B_ROTATE_RIGHT_1); \ + _condsVec = IVP_SELNX16I(_condsVec, _condsVec, IVP_SELI_16B_ROTATE_RIGHT_1); \ + } \ +} while (0) +#endif + +#ifdef IVP_SCATTERNX16UT +#undef IVP_SCATTERNX16UT +#define IVP_SCATTERNX16UT(_dataIn, _base, _offsets, _vbr) do { \ + xb_vecNx16U _dataVec = _dataIn; \ + xb_vecNx16U _offsetsVec = _offsets; \ + unsigned short *_basePtr = _base; \ + xb_vecNx16 _condsVec = IVP_MOVNX16T((xb_vecNx16) 1, (xb_vecNx16) 0, _vbr); \ + int _i; \ + for (_i = 0; _i < 32; _i++) { \ + unsigned short offset = IVP_MOVAVU16(_offsetsVec); \ + short cond = IVP_MOVAV16(_condsVec); \ + if (cond) { \ + IVP_SSNX16U_X(_dataVec, _basePtr, offset); } \ + _offsetsVec = IVP_SELNX16UI(_offsetsVec, _offsetsVec, IVP_SELI_16B_ROTATE_RIGHT_1); \ + _dataVec = IVP_SELNX16UI(_dataVec, _dataVec, IVP_SELI_16B_ROTATE_RIGHT_1); \ + _condsVec = IVP_SELNX16I(_condsVec, _condsVec, IVP_SELI_16B_ROTATE_RIGHT_1); \ + } \ +} while (0) +#endif + +#ifdef IVP_SCATTERN_2X32 +#undef IVP_SCATTERN_2X32 +#define IVP_SCATTERN_2X32(_dataIn, _base, _offsets) do { \ + xb_vecN_2x32v _dataVec = _dataIn; \ + xb_vecN_2x32Uv _offsetsVec = _offsets; \ + int *_basePtr = _base; \ + int _i; \ + for (_i = 0; _i < 16; _i++) { \ + unsigned int offset = IVP_MOVAV32(_offsetsVec); \ + IVP_SSN_2X32_X(_dataVec, _basePtr, offset); \ + _offsetsVec = IVP_SELN_2X32UI(_offsetsVec, _offsetsVec, IVP_SELI_32B_ROTATE_RIGHT_1); \ + _dataVec = IVP_SELN_2X32I(_dataVec, _dataVec, IVP_SELI_32B_ROTATE_RIGHT_1); \ + } \ +} while (0) +#endif + +#ifdef IVP_SCATTERN_2X32T +#undef 
IVP_SCATTERN_2X32T +#define IVP_SCATTERN_2X32T(_dataIn, _base, _offsets, _vbr) do { \ + xb_vecN_2x32v _dataVec = _dataIn; \ + xb_vecN_2x32Uv _offsetsVec = _offsets; \ + int *_basePtr = _base; \ + xb_vecN_2x32v _condsVec = IVP_MOVN_2X32T((xb_vecN_2x32v) 1, (xb_vecN_2x32v) 0, _vbr); \ + int _i; \ + for (_i = 0; _i < 16; _i++) { \ + int cond = IVP_MOVAV32(_condsVec); \ + unsigned int offset = IVP_MOVAV32(_offsetsVec); \ + if (cond) { \ + IVP_SSN_2X32_X(_dataVec, _basePtr, offset); } \ + _offsetsVec = IVP_SELN_2X32UI(_offsetsVec, _offsetsVec, IVP_SELI_32B_ROTATE_RIGHT_1); \ + _dataVec = IVP_SELN_2X32I(_dataVec, _dataVec, IVP_SELI_32B_ROTATE_RIGHT_1); \ + _condsVec = IVP_SELN_2X32I(_condsVec, _condsVec, IVP_SELI_32B_ROTATE_RIGHT_1); \ + } \ +} while (0) +#endif + +#ifdef IVP_SCATTERN_2X32U +#undef IVP_SCATTERN_2X32U +#define IVP_SCATTERN_2X32U(_dataIn, _base, _offsets) do { \ + xb_vecN_2x32Uv _dataVec = _dataIn; \ + xb_vecN_2x32Uv _offsetsVec = _offsets; \ + unsigned int *_basePtr = _base; \ + int _i; \ + for (_i = 0; _i < 16; _i++) { \ + unsigned int offset = IVP_MOVAV32(_offsetsVec); \ + IVP_SSN_2X32U_X(_dataVec, _basePtr, offset); \ + _offsetsVec = IVP_SELN_2X32UI(_offsetsVec, _offsetsVec, IVP_SELI_32B_ROTATE_RIGHT_1); \ + _dataVec = IVP_SELN_2X32UI(_dataVec, _dataVec, IVP_SELI_32B_ROTATE_RIGHT_1); \ + } \ +} while (0) +#endif + +#ifdef IVP_SCATTERN_2X32UT +#undef IVP_SCATTERN_2X32UT +#define IVP_SCATTERN_2X32UT(_dataIn, _base, _offsets, _vbr) do { \ + xb_vecN_2x32Uv _dataVec = _dataIn; \ + xb_vecN_2x32Uv _offsetsVec = _offsets; \ + unsigned int *_basePtr = _base; \ + xb_vecN_2x32v _condsVec = IVP_MOVN_2X32T((xb_vecN_2x32v) 1, (xb_vecN_2x32v) 0, _vbr); \ + int _i; \ + for (_i = 0; _i < 16; _i++) { \ + int cond = IVP_MOVAV32(_condsVec); \ + unsigned int offset = IVP_MOVAV32(_offsetsVec); \ + if (cond) { \ + IVP_SSN_2X32U_X(_dataVec, _basePtr, offset); } \ + _offsetsVec = IVP_SELN_2X32UI(_offsetsVec, _offsetsVec, IVP_SELI_32B_ROTATE_RIGHT_1); \ + _dataVec = 
IVP_SELN_2X32UI(_dataVec, _dataVec, IVP_SELI_32B_ROTATE_RIGHT_1); \ + _condsVec = IVP_SELN_2X32I(_condsVec, _condsVec, IVP_SELI_32B_ROTATE_RIGHT_1); \ + } \ +} while (0) +#endif + +#ifdef IVP_GATHERANX16U +#undef IVP_GATHERANX16U +static inline xb_vecNx16 IVP_GATHERANX16U(const uint16_t *_base, xb_vecNx16U _offsets) +{ + const unsigned short *_basePtr = _base; + xb_vecNx16U _offsetsVec = _offsets; + xb_vecNx16U _dataVec = (xb_vecNx16U) 0; + int _i; + for (_i = 0; _i < 32; _i++) + { + unsigned short offset = IVP_MOVAVU16(_offsetsVec); + xb_int16U gdata = IVP_LSNX16U_X(_basePtr, offset); + _offsetsVec = IVP_SELNX16UI(_offsetsVec, _offsetsVec, IVP_SELI_16B_ROTATE_RIGHT_1); + _dataVec = IVP_SELNX16UI(IVP_MOVNX16U_FROM16U(gdata), _dataVec, IVP_SELI_16B_ROTATE_RIGHT_1); + } + return(IVP_MOVNX16_FROMNX16U(_dataVec)); +} +#endif + +#ifdef IVP_GATHERANX16 +#undef IVP_GATHERANX16 +static inline xb_vecNx16 IVP_GATHERANX16(const int16_t *_base, xb_vecNx16U _offsets) +{ + const short *_basePtr = _base; + xb_vecNx16U _offsetsVec = _offsets; + xb_vecNx16 _dataVec = (xb_vecNx16) 0; + int _i; + for (_i = 0; _i < 32; _i++) + { + unsigned short offset = IVP_MOVAVU16(_offsetsVec); + xb_int16 gdata = IVP_LSNX16_X(_basePtr, offset); + _offsetsVec = IVP_SELNX16UI(_offsetsVec, _offsetsVec, IVP_SELI_16B_ROTATE_RIGHT_1); + _dataVec = IVP_SELNX16I(IVP_MOVNX16_FROM16(gdata), _dataVec, IVP_SELI_16B_ROTATE_RIGHT_1); + } + return(_dataVec); +} +#endif + +#ifdef IVP_GATHERANX16T +#undef IVP_GATHERANX16T +static inline xb_vecNx16 IVP_GATHERANX16T(const int16_t *_base, xb_vecNx16U _offsets, vboolN _vbr) +{ + const short *_basePtr = _base; + xb_vecNx16U _offsetsVec = _offsets; + vboolN _boolVec = _vbr; + xb_vecNx16 _dataVec = (xb_vecNx16) 0; + int _i; + for (_i = 0; _i < 32; _i++) + { + unsigned short offset = IVP_MOVAVU16(_offsetsVec); + xb_int16 gdata = IVP_LSNX16_X(_basePtr, offset); + _offsetsVec = IVP_SELNX16UI(_offsetsVec, _offsetsVec, IVP_SELI_16B_ROTATE_RIGHT_1); + _dataVec = 
IVP_SELNX16I(IVP_MOVNX16_FROM16(gdata), _dataVec, IVP_SELI_16B_ROTATE_RIGHT_1); + } + return(IVP_MOVNX16T(_dataVec, (xb_vecNx16) 0, _boolVec)); +} +#endif + +#ifdef IVP_GATHERANX16UT +#undef IVP_GATHERANX16UT +static inline xb_vecNx16 IVP_GATHERANX16UT(const uint16_t *_base, xb_vecNx16U _offsets, vboolN _vbr) +{ + const unsigned short *_basePtr = _base; + xb_vecNx16U _offsetsVec = _offsets; + vboolN _boolVec = _vbr; + xb_vecNx16U _dataVec = (xb_vecNx16U) 0; + int _i; + for (_i = 0; _i < 32; _i++) + { + unsigned short offset = IVP_MOVAVU16(_offsetsVec); + xb_int16U gdata = IVP_LSNX16U_X(_basePtr, offset); + _offsetsVec = IVP_SELNX16UI(_offsetsVec, _offsetsVec, IVP_SELI_16B_ROTATE_RIGHT_1); + _dataVec = IVP_SELNX16UI(IVP_MOVNX16U_FROM16U(gdata), _dataVec, IVP_SELI_16B_ROTATE_RIGHT_1); + } + return(IVP_MOVNX16_FROMNX16U(IVP_MOVNX16UT(_dataVec, (xb_vecNx16U) 0, _boolVec))); +} +#endif + +#ifdef IVP_GATHERAN_2X32 +#undef IVP_GATHERAN_2X32 +static inline xb_vecNx16 IVP_GATHERAN_2X32(const int32_t *_base, xb_vecN_2x32Uv _offsets) +{ + const int *_basePtr = _base; + xb_vecN_2x32Uv _offsetsVec = _offsets; + xb_vecN_2x32v _dataVec = (xb_vecN_2x32v) 0; + int _i; + for (_i = 0; _i < 16; _i++) + { + unsigned int offset = IVP_MOVAV32(_offsetsVec); + xb_int32v gdata = IVP_LSN_2X32_X(_basePtr, offset); + _offsetsVec = IVP_SELN_2X32UI(_offsetsVec, _offsetsVec, IVP_SELI_32B_ROTATE_RIGHT_1); + _dataVec = IVP_SELN_2X32I(IVP_MOVN_2X32_FROM32(gdata), _dataVec, IVP_SELI_32B_ROTATE_RIGHT_1); + } + return(IVP_MOVNX16_FROMN_2X32(_dataVec)); +} +#endif + +#ifdef IVP_GATHERAN_2X32T +#undef IVP_GATHERAN_2X32T +static inline xb_vecNx16 IVP_GATHERAN_2X32T(const int32_t *_base, xb_vecN_2x32Uv _offsets, vboolN_2 _vbr) +{ + const int *_basePtr = _base; + xb_vecN_2x32Uv _offsetsVec = _offsets; + vboolN_2 _boolVec = _vbr; + xb_vecN_2x32v _dataVec = (xb_vecN_2x32v) 0; + int _i; + for (_i = 0; _i < 16; _i++) + { + unsigned int offset = IVP_MOVAV32(_offsetsVec); + xb_int32v gdata = 
IVP_LSN_2X32_X(_basePtr, offset); + _offsetsVec = IVP_SELN_2X32UI(_offsetsVec, _offsetsVec, IVP_SELI_32B_ROTATE_RIGHT_1); + _dataVec = IVP_SELN_2X32I(IVP_MOVN_2X32_FROM32(gdata), _dataVec, IVP_SELI_32B_ROTATE_RIGHT_1); + } + return(IVP_MOVNX16_FROMN_2X32(IVP_MOVN_2X32T(_dataVec, (xb_vecN_2x32v) 0, _boolVec))); +} +#endif + +#ifdef IVP_GATHERNX8UT_V +#undef IVP_GATHERNX8UT_V +#define IVP_GATHERNX8UT_V(pdst, offs, mask, dly) IVP_MOVNX16T(IVP_GATHERNX8U_V((pdst), (offs), (dly)), 0, mask) +#endif +#endif // XCHAL_HAVE_SUPERGATHER == 0 + +////////// protos extension + +// 32-way wide vector (48-bit) element high 16-bits output to narrow (16-bit) output vector register +#ifndef IVP_PACKHNX48 +# define IVP_PACKHNX48(vec) IVP_PACKVRNR2NX24_1(IVP_MOV2NX24_FROMNX48(vec), 8) +#endif + +// reinterpret 64 8-bit elements as 16 32-bit elements +#ifndef IVP_MOVN_2X32_FROM2NX8 +# define IVP_MOVN_2X32_FROM2NX8(vec) IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(vec)) +#endif + +// reinterpret 16 32-bit elements as 64 8-bit elements +#ifndef IVP_MOV2NX8_FROMN_2X32 +# define IVP_MOV2NX8_FROMN_2X32(vec) IVP_MOV2NX8_FROMNX16(IVP_MOVNX16_FROMN_2X32(vec)) +#endif + +#ifndef IVP_SELN_2X32I +# define IVP_SELN_2X32I(a, b, i) IVP_MOVN_2X32_FROMNX16(IVP_SELNX16I(IVP_MOVNX16_FROMN_2X32(a), IVP_MOVNX16_FROMN_2X32(b), i)) +#endif + +// 0 to 63 sequence xb_vec2Nx8U vector +#ifndef IVP_SEQ2NX8U +# define IVP_SEQ2NX8U() IVP_MOV2NX8U_FROMNX16(IVP_ADDNX16U(256, IVP_MULNX16UPACKL(514, IVP_SEQNX16()))) +#endif + +// 64-way 8-bit zero +#ifndef IVP_ZERO2NX8U +# define IVP_ZERO2NX8U() IVP_MOV2NX8U_FROMNX16(IVP_ZERONX16()) +#endif + +// 16-way 32-bit zero +#ifndef IVP_ZERON_2X32U +# define IVP_ZERON_2X32U() IVP_MOVN_2X32U_FROMNX16(IVP_ZERONX16()) +#endif + +// 64-way 24-bit zero +#ifndef IVP_ZERO2NX24 +# define IVP_ZERO2NX24() IVP_MOV2NX24_FROMNX48(IVP_ZERONX48()) +#endif + +// 32-way 48-bit zero +#ifndef IVP_ZERONX48 +# if XCHAL_HAVE_VISION +# define IVP_ZERONX48() (IVP_CVT48UNX32L(IVP_ZERON_2X32U())) +# else 
+# define IVP_ZERONX48() (IVP_MOVWVL(IVP_ZERONX16())) +# endif +#endif + +////////// compatibility between IVPEP - VP5 +#if XCHAL_HAVE_VISION + +typedef xb_vecNx16 vsaN; + +# define IVP_MOVWVL(a) IVP_CVT48UNX32L(a) +# define IVP_MOVV2WHH(a) IVP_MOVNX16_FROMN_2X32(IVP_CVT32S2NX24HH(IVP_MOV2NX24_FROMNX48(a))) +# define IVP_MOVV2WHL(a) IVP_MOVNX16_FROMN_2X32(IVP_CVT32S2NX24HL(IVP_MOV2NX24_FROMNX48(a))) +# define IVP_MOVV2WLH(a) IVP_MOVNX16_FROMN_2X32(IVP_CVT32S2NX24LH(IVP_MOV2NX24_FROMNX48(a))) +# define IVP_MOVV2WLL(a) IVP_MOVNX16_FROMN_2X32(IVP_CVT32S2NX24LL(IVP_MOV2NX24_FROMNX48(a))) +# define IVP_MOVSVWH(a) IVP_MOVNX16_FROMN_2X32(IVP_CVT32SNX48H(a)) +# define IVP_MOVSVWL(a) IVP_MOVNX16_FROMN_2X32(IVP_CVT32SNX48L(a)) +# define IVP_MOVVWHH(a) IVP_MOVNX16_FROM2NX8(IVP_CVT64SNX48HH(a)) +# define IVP_MOVVWHL(a) IVP_MOVNX16_FROM2NX8(IVP_CVT64SNX48HL(a)) +# define IVP_MOVVWLH(a) IVP_MOVNX16_FROM2NX8(IVP_CVT64SNX48LH(a)) +# define IVP_MOVVWLL(a) IVP_MOVNX16_FROM2NX8(IVP_CVT64SNX48LL(a)) +# define IVP_MOVV2WL(a) IVP_CVT16U2NX24L(IVP_MOV2NX24_FROMNX48(a)) +# define IVP_MOVV2WH(a) IVP_CVT16U2NX24H(IVP_MOV2NX24_FROMNX48(a)) +# define IVP_MOVVWL(a) IVP_MOVNX16_FROMN_2X32(IVP_CVT32UNX48L(a)) +# define IVP_MOVVWH(a) IVP_MOVNX16_FROMN_2X32(IVP_CVT32UNX48H(a)) +# define IVP_MOVSV2WL(a) IVP_CVT16S2NX24L(IVP_MOV2NX24_FROMNX48(a)) +# define IVP_MOVSV2WH(a) IVP_CVT16S2NX24H(IVP_MOV2NX24_FROMNX48(a)) +# define IVP_MOV2W2VL(a, b) IVP_MOVNX48_FROM2NX24(IVP_CVT24UNX32L(IVP_MOVN_2X32_FROMNX16(a), IVP_MOVN_2X32_FROMNX16(b))) +# define IVP_MOVSWV(a, b) IVP_CVT48SNX32(IVP_MOVN_2X32_FROMNX16(a), IVP_MOVN_2X32_FROMNX16(b)) +# define IVP_MOVS2WV(a, b) IVP_MOVNX48_FROM2NX24(IVP_CVT24S2NX16(a, b)) +# define IVP_MOVWV(a, b) IVP_CVT48UNX32(IVP_MOVN_2X32_FROMNX16(a), IVP_MOVN_2X32_FROMNX16(b)) + +# define IVP_MOVVVS(a) (a) +# define IVP_MOVVSA32(a) IVP_MOVVA16(a) +# define IVP_MOVVSV(vr, sa) (vr) // sa is always zero in XI, if not zero -> use IVP_MOVVSELNX16 +# define IVP_MOVVSELNX16(vr, sa) 
IVP_SRLINX16(vr, sa) +# define IVP_MOVVSVADDNX16(a, b, c, d) { a = c; c = IVP_ADDNX16(c, b); } // d is always zero in XI +# define IVP_MOVPVSV(a, b, c, d) { xb_vec2Nx8 t = IVP_SRLI2NX8(c, d); a = IVP_UNPKS2NX8_1(t); b = IVP_UNPKS2NX8_0(t); } + +#undef IVP_LSNX8U_XP +#undef IVP_LSNX8U_IP +#undef IVP_LSNX8U_X +#undef IVP_LSNX8U_I +# define IVP_LSNX8U_XP(a, b, c) do { xb_int8U tmp; IVP_LS2NX8U_XP(tmp, b, c); a = IVP_MOVNX16_FROM8U(tmp); } while (0) +# define IVP_LSNX8U_IP(a, b, c) do { xb_int8U tmp; IVP_LS2NX8U_IP(tmp, b, c); a = IVP_MOVNX16_FROM8U(tmp); } while (0) +# define IVP_LSNX8U_X(b, c) IVP_MOVNX16_FROM8U(IVP_LS2NX8U_X(b, c)) +# define IVP_LSNX8U_I(b, c) IVP_MOVNX16_FROM8U(IVP_LS2NX8U_I(b, c)) + +# define IVP_PACKLNX48_L(a) IVP_CVT32UNX48L(a) +# define IVP_PACKLNX48_H(a) IVP_CVT32UNX48H(a) + +# define IVP_SA2NX8UPOS_FP IVP_SAPOS2NX8U_FP +# define IVP_SAN_2X32POS_FP IVP_SAPOSN_2X32_FP +# define IVP_SANX16POS_FP IVP_SAPOSNX16_FP +# define IVP_SANX16UPOS_FP IVP_SAPOSNX16U_FP +# define IVP_SANX8UPOS_FP IVP_SAPOSNX8U_FP +# define IVP_SAV2NX8POS_FP IVP_SAPOS2NX8_FP +# define IVP_SAV2NX8UPOS_FP IVP_SAPOS2NX8U_FP +# define IVP_SAVN_2X32POS_FP IVP_SAPOSN_2X32_FP +# define IVP_SAVNX16POS_FP IVP_SAPOSNX16_FP +# define IVP_SAVNX16UPOS_FP IVP_SAPOSNX16U_FP +# define IVP_SAVNX8UPOS_FP IVP_SAPOSNX8U_FP +# define IVP_LAVNX8U_PP IVP_LANX8U_PP +# define IVP_LAVNX16_PP IVP_LANX16_PP + +# define IVP_RADDURNX16(b) ((int) IVP_RADDUNX16(b)) +# define IVP_RADDRNX16(b) ((int) IVP_RADDNX16(b)) +# define IVP_ADDSNX16F(a, b, c, d) IVP_ADDSNX16T(a, b, c, IVP_NOTBN(d)) +# define IVP_ADDNX16F(a, b, c, d) IVP_ADDNX16T(a, b, c, IVP_NOTBN(d)) +# define IVP_SUBNX16F(a, b, c, d) IVP_SUBNX16T(a, b, c, IVP_NOTBN(d)) +# define IVP_NEGNX16F(a, b, c) IVP_NEGNX16T(a, b, IVP_NOTBN(c)) +# define IVP_NEGSNX16F(a, b, c) IVP_NEGSNX16T(a, b, IVP_NOTBN(c)) +# define IVP_RMINNX16F(b, c) IVP_RMINNX16T(b, IVP_NOTBN(c)) +# define IVP_MINUNX16F(a, b, c, d) IVP_MINUNX16T(a, b, c, IVP_NOTBN(d)) +# define 
IVP_SVNX8UF_XP(a, b, c, d) IVP_SVNX8UT_XP(a, b, c, IVP_NOTBN(d)) +# define IVP_SVNX8UF_I(a, b, c, d) IVP_SVNX8UT_I(a, b, c, IVP_NOTBN(d)) +# define IVP_SVNX16F_XP(a, b, c, d) IVP_SVNX16T_XP(a, b, c, IVP_NOTBN(d)) +# define IVP_SVNX16F_I(a, b, c, d) IVP_SVNX16T_I(a, b, c, IVP_NOTBN(d)) +#endif + +#if XCHAL_HAVE_VISION +# define IVP__LSNX16_XP(a, b, c) do { xb_int16 tmp; IVP_LSNX16_XP(tmp, b, c); a = IVP_MOVNX16_FROM16(tmp); } while (0) +#else +# define IVP__LSNX16_XP IVP_LSNX16_XP +#endif + +#if XCHAL_HAVE_VISION +# define IVP__LSNX16_IP(a, b, c) do { xb_int16 tmp; IVP_LSNX16_IP(tmp, b, c); a = IVP_MOVNX16_FROM16(tmp); } while (0) +#else +# define IVP__LSNX16_IP IVP_LSNX16_IP +#endif + +#if XCHAL_HAVE_VISION +# define IVP__DSELNX16_2X16(a, b, c, d, e, f) { \ + xb_vecNx16 _v0, _v1; \ + _v0 = d; \ + _v1 = c; \ + a = IVP_SELNX16(_v1, _v0, e); \ + b = IVP_SELNX16(_v1, _v0, f); \ +} +#else +# define IVP__DSELNX16_2X16 IVP_DSELNX16 +#endif + +#if XCHAL_HAVE_VISION +# define IVP__SEL2NX8_2X16(b, c, d, e) IVP_SEL2NX8(b, c, IVP_SEL2NX8I(IVP_MOV2NX8_FROMNX16(d), IVP_MOV2NX8_FROMNX16(e), IVP_SELI_8B_INTERLEAVE_1_EVEN)) +#else +# define IVP__SEL2NX8_2X16 IVP_SEL2NX8 +#endif + +////////// compatibility for RF-2014.0 IVP-EP cores + +#ifndef IVP_SVN_2X32_IP +#define IVP_SVN_2X32_IP(a, b, c) \ + do { \ + xb_vecNx16 *bb = (xb_vecNx16 *) b; \ + IVP_SVNX16_IP(IVP_MOVNX16_FROMN_2X32(a), bb, c); \ + b = (xb_vecN_2x32v *) bb; \ + } while (0) +#endif + +#ifndef IVP_SVN_2X32_XP +#define IVP_SVN_2X32_XP(a, b, c) \ + do { \ + xb_vecNx16 *bb = (xb_vecNx16 *) b; \ + IVP_SVNX16_XP(IVP_MOVNX16_FROMN_2X32(a), bb, c); \ + b = (xb_vecN_2x32v *) bb; \ + } while (0) +#endif + +#ifndef IVP_LVN_2X32_IP +#define IVP_LVN_2X32_IP(a, b, c) \ + do { \ + xb_vecNx16 *bb = (xb_vecNx16 *) b; \ + xb_vecNx16 aa; IVP_LVNX16_IP(aa, bb, c); \ + a = IVP_MOVN_2X32_FROMNX16(aa); \ + b = (xb_vecN_2x32v *) bb; \ + } while (0) +#endif + +#ifndef IVP_LVN_2X32_XP +#define IVP_LVN_2X32_XP(a, b, c) \ + do { \ + xb_vecNx16 *bb 
= (xb_vecNx16 *) b; \ + xb_vecNx16 aa; IVP_LVNX16_XP(aa, bb, c); \ + a = IVP_MOVN_2X32_FROMNX16(aa); \ + b = (xb_vecN_2x32v *) bb; \ + } while (0) +#endif + +////////// select/shuffle indexes +#if XCHAL_HAVE_VISION +#define XAI_DSEL_16B_ROTATE_LEFT(n) IVP_AVGU2NX8(IVP_SEQ2NX8(), IVP_MOV2NX8_FROMNX16((0x4000 - 2 * (((n) << 8) + (n))))) +#define XAI_DSEL_16B_ROTATE_RIGHT(n) IVP_AVGU2NX8(IVP_SEQ2NX8(), IVP_MOV2NX8_FROMNX16((0x3F00 + 2 * (((n) << 8) + (n))))) + +#define XAI_DSEL_16B_ROTATE_RIGHT_2_1 IVP_AVGU2NX8(IVP_SEQ2NX8(), IVP_MOV2NX8_FROMNX16(2 * (1 + ((1 + 1) << 8)))) +#define XAI_DSEL_16B_ROTATE_RIGHT_4_3 IVP_AVGU2NX8(IVP_SEQ2NX8(), IVP_MOV2NX8_FROMNX16(2 * (3 + ((3 + 1) << 8)))) +#define XAI_DSEL_32B_ROTATE_RIGHT_2_1 IVP_AVGU2NX8(IVP_SEQ2NX8(), IVP_MOV2NX8_FROMNX16(4 * (1 + ((1 + 1) << 8)))) +#define XAI_DSEL_32B_ROTATE_RIGHT_4_3 IVP_AVGU2NX8(IVP_SEQ2NX8(), IVP_MOV2NX8_FROMNX16(4 * (3 + ((3 + 1) << 8)))) +#endif + +#define OFFSET_PTR_NX8(ptr, nrows, stride, in_row_offset) ((xb_vecNx8 *) ((int8_t *) (ptr) + (in_row_offset) + (nrows) * (stride))) +#define OFFSET_PTR_NX8U(ptr, nrows, stride, in_row_offset) ((xb_vecNx8U *) ((uint8_t *) (ptr) + (in_row_offset) + (nrows) * (stride))) +#define OFFSET_PTR_2NX8(ptr, nrows, stride, in_row_offset) ((xb_vec2Nx8 *) ((int8_t *) (ptr) + (in_row_offset) + (nrows) * (stride))) +#define OFFSET_PTR_2NX8U(ptr, nrows, stride, in_row_offset) ((xb_vec2Nx8U *) ((uint8_t *) (ptr) + (in_row_offset) + (nrows) * (stride))) +#define OFFSET_PTR_NX16(ptr, nrows, stride, in_row_offset) ((xb_vecNx16 *) ((int16_t *) (ptr) + (in_row_offset) + (nrows) * (stride))) +#define OFFSET_PTR_NX16U(ptr, nrows, stride, in_row_offset) ((xb_vecNx16U *) ((uint16_t *) (ptr) + (in_row_offset) + (nrows) * (stride))) +#endif diff --git a/backends/cadence/vision/third-party/libxai_common/include/xai_cnn_api_common.h b/backends/cadence/vision/third-party/libxai_common/include/xai_cnn_api_common.h new file mode 100644 index 00000000000..afcf5b87786 --- /dev/null +++ 
b/backends/cadence/vision/third-party/libxai_common/include/xai_cnn_api_common.h @@ -0,0 +1,457 @@ +/* + * Copyright (c) 2021 by Cadence Design Systems, Inc. ALL RIGHTS RESERVED. + * These coded instructions, statements, and computer programs are the + * copyrighted works and confidential proprietary information of + * Cadence Design Systems Inc. They may be adapted and modified by bona fide + * purchasers for internal use, but neither the original nor any adapted + * or modified version may be disclosed or distributed to third parties + * in any manner, medium, or form, in whole or in part, without the prior + * written consent of Cadence Design Systems Inc. This software and its + * derivatives are to be executed solely on products incorporating a Cadence + * Design Systems processor. + */ + +#ifndef __XAI_CNN_API_COMMON_H__ +#define __XAI_CNN_API_COMMON_H__ + +#include "xai_cnn_api_params.h" +#include "xai_config_api.h" +#include "xai_core_api.h" +#include "xai_tile_manager.h" +#include +#include + + +// ElementWise APIs +_XAI_API_ XAI_ERR_TYPE xaiEltwiseAdd3D_AV(const xai_pTile3D inTile1, + const xai_pTile3D inTile2, + xai_pTile3D outTile); + + +_XAI_API_ XAI_ERR_TYPE xaiEltwiseAdd3D_S8_AV(const xai_pTile3D inTile1, + const xai_pTile3D inTile2, + xai_pTile3D outTile); + + +_XAI_API_ XAI_ERR_TYPE xaiEltwiseAdd3D_U8_AV(const xai_pTile3D inTile1, + const xai_pTile3D inTile2, + xai_pTile3D outTile); + + +_XAI_API_ XAI_ERR_TYPE xaiEltwiseAdd3D_S16_AV(const xai_pTile3D inTile1, + const xai_pTile3D inTile2, + xai_pTile3D outTile); + + +_XAI_API_ XAI_ERR_TYPE xaiEltwiseAdd3D_U16_AV(const xai_pTile3D inTile1, + const xai_pTile3D inTile2, + xai_pTile3D outTile); + + +_XAI_API_ XAI_ERR_TYPE xaiEltwiseAdd3D_S32_AV(const xai_pTile3D inTile1, + const xai_pTile3D inTile2, + xai_pTile3D outTile); + + +_XAI_API_ XAI_ERR_TYPE xaiEltwiseAdd3D_U32_AV(const xai_pTile3D inTile1, + const xai_pTile3D inTile2, + xai_pTile3D outTile); + + +_XAI_API_ XAI_ERR_TYPE xaiEltwiseSub3D_AV(const 
xai_pTile3D inTile1, + const xai_pTile3D inTile2, + xai_pTile3D outTile); + + +_XAI_API_ XAI_ERR_TYPE xaiEltwiseSub3D_S8_AV(const xai_pTile3D inTile1, + const xai_pTile3D inTile2, + xai_pTile3D outTile); + + +_XAI_API_ XAI_ERR_TYPE xaiEltwiseSub3D_U8_AV(const xai_pTile3D inTile1, + const xai_pTile3D inTile2, + xai_pTile3D outTile); + + +_XAI_API_ XAI_ERR_TYPE xaiEltwiseSub3D_S16_AV(const xai_pTile3D inTile1, + const xai_pTile3D inTile2, + xai_pTile3D outTile); + + +_XAI_API_ XAI_ERR_TYPE xaiEltwiseSub3D_U16_AV(const xai_pTile3D inTile1, + const xai_pTile3D inTile2, + xai_pTile3D outTile); + + +_XAI_API_ XAI_ERR_TYPE xaiEltwiseSub3D_S32_AV(const xai_pTile3D inTile1, + const xai_pTile3D inTile2, + xai_pTile3D outTile); + + +_XAI_API_ XAI_ERR_TYPE xaiEltwiseSub3D_U32_AV(const xai_pTile3D inTile1, + const xai_pTile3D inTile2, + xai_pTile3D outTile); + +_XAI_API_ XAI_ERR_TYPE xaiEltwiseMul3D_S32_AV(const xai_pTile3D inTile1, + const xai_pTile3D inTile2, + xai_pTile3D outTile); + +_XAI_API_ XAI_ERR_TYPE xaiEltwiseMax3D_AV(const xai_pTile3D inTile1, + const xai_pTile3D inTile2, + xai_pTile3D outTile); + + +_XAI_API_ XAI_ERR_TYPE xaiEltwiseMax3D_S8_AV(const xai_pTile3D inTile1, + const xai_pTile3D inTile2, + xai_pTile3D outTile); + + +_XAI_API_ XAI_ERR_TYPE xaiEltwiseMax3D_U8_AV(const xai_pTile3D inTile1, + const xai_pTile3D inTile2, + xai_pTile3D outTile); + + +_XAI_API_ XAI_ERR_TYPE xaiEltwiseMax3D_S16_AV(const xai_pTile3D inTile1, + const xai_pTile3D inTile2, + xai_pTile3D outTile); + + +_XAI_API_ XAI_ERR_TYPE xaiEltwiseMax3D_U16_AV(const xai_pTile3D inTile1, + const xai_pTile3D inTile2, + xai_pTile3D outTile); + + +_XAI_API_ XAI_ERR_TYPE xaiEltwiseMax3D_S32_AV(const xai_pTile3D inTile1, + const xai_pTile3D inTile2, + xai_pTile3D outTile); + + +_XAI_API_ XAI_ERR_TYPE xaiEltwiseMax3D_U32_AV(const xai_pTile3D inTile1, + const xai_pTile3D inTile2, + xai_pTile3D outTile); + + +_XAI_API_ XAI_ERR_TYPE xaiEltwiseMin3D_AV(const xai_pTile3D inTile1, + const xai_pTile3D inTile2, 
+ xai_pTile3D outTile); + + +_XAI_API_ XAI_ERR_TYPE xaiEltwiseMin3D_S8_AV(const xai_pTile3D inTile1, + const xai_pTile3D inTile2, + xai_pTile3D outTile); + + +_XAI_API_ XAI_ERR_TYPE xaiEltwiseMin3D_U8_AV(const xai_pTile3D inTile1, + const xai_pTile3D inTile2, + xai_pTile3D outTile); + + +_XAI_API_ XAI_ERR_TYPE xaiEltwiseMin3D_S16_AV(const xai_pTile3D inTile1, + const xai_pTile3D inTile2, + xai_pTile3D outTile); + + +_XAI_API_ XAI_ERR_TYPE xaiEltwiseMin3D_U16_AV(const xai_pTile3D inTile1, + const xai_pTile3D inTile2, + xai_pTile3D outTile); + + +_XAI_API_ XAI_ERR_TYPE xaiEltwiseMin3D_S32_AV(const xai_pTile3D inTile1, + const xai_pTile3D inTile2, + xai_pTile3D outTile); + + +_XAI_API_ XAI_ERR_TYPE xaiEltwiseMin3D_U32_AV(const xai_pTile3D inTile1, + const xai_pTile3D inTile2, + xai_pTile3D outTile); + + +_XAI_API_ XAI_ERR_TYPE xaiEltwiseOr3D_AV(const xai_pTile3D inTile1, + const xai_pTile3D inTile2, + xai_pTile3D outTile); + + +_XAI_API_ XAI_ERR_TYPE xaiEltwiseOr3D_S8_AV(const xai_pTile3D inTile1, + const xai_pTile3D inTile2, + xai_pTile3D outTile); + + +_XAI_API_ XAI_ERR_TYPE xaiEltwiseOr3D_U8_AV(const xai_pTile3D inTile1, + const xai_pTile3D inTile2, + xai_pTile3D outTile); + + +_XAI_API_ XAI_ERR_TYPE xaiEltwiseOr3D_S16_AV(const xai_pTile3D inTile1, + const xai_pTile3D inTile2, + xai_pTile3D outTile); + + +_XAI_API_ XAI_ERR_TYPE xaiEltwiseOr3D_U16_AV(const xai_pTile3D inTile1, + const xai_pTile3D inTile2, + xai_pTile3D outTile); + + +_XAI_API_ XAI_ERR_TYPE xaiEltwiseOr3D_S32_AV(const xai_pTile3D inTile1, + const xai_pTile3D inTile2, + xai_pTile3D outTile); + + +_XAI_API_ XAI_ERR_TYPE xaiEltwiseOr3D_U32_AV(const xai_pTile3D inTile1, + const xai_pTile3D inTile2, + xai_pTile3D outTile); + + + +_XAI_API_ XAI_ERR_TYPE xaiEltwiseAnd3D_AV(const xai_pTile3D inTile1, + const xai_pTile3D inTile2, + xai_pTile3D outTile); + + +_XAI_API_ XAI_ERR_TYPE xaiEltwiseAnd3D_S8_AV(const xai_pTile3D inTile1, + const xai_pTile3D inTile2, + xai_pTile3D outTile); + + +_XAI_API_ XAI_ERR_TYPE 
xaiEltwiseAnd3D_U8_AV(const xai_pTile3D inTile1, + const xai_pTile3D inTile2, + xai_pTile3D outTile); + + +_XAI_API_ XAI_ERR_TYPE xaiEltwiseAnd3D_S16_AV(const xai_pTile3D inTile1, + const xai_pTile3D inTile2, + xai_pTile3D outTile); + + +_XAI_API_ XAI_ERR_TYPE xaiEltwiseAnd3D_U16_AV(const xai_pTile3D inTile1, + const xai_pTile3D inTile2, + xai_pTile3D outTile); + + +_XAI_API_ XAI_ERR_TYPE xaiEltwiseAnd3D_S32_AV(const xai_pTile3D inTile1, + const xai_pTile3D inTile2, + xai_pTile3D outTile); + + +_XAI_API_ XAI_ERR_TYPE xaiEltwiseAnd3D_U32_AV(const xai_pTile3D inTile1, + const xai_pTile3D inTile2, + xai_pTile3D outTile); + + + +_XAI_API_ XAI_ERR_TYPE xaiEltwiseXor3D_AV(const xai_pTile3D inTile1, + const xai_pTile3D inTile2, + xai_pTile3D outTile); + + +_XAI_API_ XAI_ERR_TYPE xaiEltwiseXor3D_S8_AV(const xai_pTile3D inTile1, + const xai_pTile3D inTile2, + xai_pTile3D outTile); + + +_XAI_API_ XAI_ERR_TYPE xaiEltwiseXor3D_U8_AV(const xai_pTile3D inTile1, + const xai_pTile3D inTile2, + xai_pTile3D outTile); + + +_XAI_API_ XAI_ERR_TYPE xaiEltwiseXor3D_S16_AV(const xai_pTile3D inTile1, + const xai_pTile3D inTile2, + xai_pTile3D outTile); + + +_XAI_API_ XAI_ERR_TYPE xaiEltwiseXor3D_U16_AV(const xai_pTile3D inTile1, + const xai_pTile3D inTile2, + xai_pTile3D outTile); + + +_XAI_API_ XAI_ERR_TYPE xaiEltwiseXor3D_S32_AV(const xai_pTile3D inTile1, + const xai_pTile3D inTile2, + xai_pTile3D outTile); + + +_XAI_API_ XAI_ERR_TYPE xaiEltwiseXor3D_U32_AV(const xai_pTile3D inTile1, + const xai_pTile3D inTile2, + xai_pTile3D outTile); + + +_XAI_API_ XAI_ERR_TYPE xaiEltwiseEqual3D_AV(const xai_pTile3D inTile1, + const xai_pTile3D inTile2, + xai_pTile3D outTile); + + +_XAI_API_ XAI_ERR_TYPE xaiEltwiseEqual3D_S8_AV(const xai_pTile3D inTile1, + const xai_pTile3D inTile2, + xai_pTile3D outTile); + + +_XAI_API_ XAI_ERR_TYPE xaiEltwiseEqual3D_U8_AV(const xai_pTile3D inTile1, + const xai_pTile3D inTile2, + xai_pTile3D outTile); + + +_XAI_API_ XAI_ERR_TYPE xaiEltwiseEqual3D_S16_AV(const 
xai_pTile3D inTile1, + const xai_pTile3D inTile2, + xai_pTile3D outTile); + + +_XAI_API_ XAI_ERR_TYPE xaiEltwiseEqual3D_U16_AV(const xai_pTile3D inTile1, + const xai_pTile3D inTile2, + xai_pTile3D outTile); + + +_XAI_API_ XAI_ERR_TYPE xaiEltwiseEqual3D_S32_AV(const xai_pTile3D inTile1, + const xai_pTile3D inTile2, + xai_pTile3D outTile); + + +_XAI_API_ XAI_ERR_TYPE xaiEltwiseEqual3D_U32_AV(const xai_pTile3D inTile1, + const xai_pTile3D inTile2, + xai_pTile3D outTile); + + +_XAI_API_ XAI_ERR_TYPE xaiEltwiseGreaterThan3D_AV(const xai_pTile3D inTile1, + const xai_pTile3D inTile2, + xai_pTile3D outTile); + + +_XAI_API_ XAI_ERR_TYPE xaiEltwiseGreaterThan3D_S8_AV(const xai_pTile3D inTile1, + const xai_pTile3D inTile2, + xai_pTile3D outTile); + + +_XAI_API_ XAI_ERR_TYPE xaiEltwiseGreaterThan3D_U8_AV(const xai_pTile3D inTile1, + const xai_pTile3D inTile2, + xai_pTile3D outTile); + + +_XAI_API_ XAI_ERR_TYPE xaiEltwiseGreaterThan3D_S16_AV(const xai_pTile3D inTile1, + const xai_pTile3D inTile2, + xai_pTile3D outTile); + + +_XAI_API_ XAI_ERR_TYPE xaiEltwiseGreaterThan3D_U16_AV(const xai_pTile3D inTile1, + const xai_pTile3D inTile2, + xai_pTile3D outTile); + + +_XAI_API_ XAI_ERR_TYPE xaiEltwiseGreaterThan3D_S32_AV(const xai_pTile3D inTile1, + const xai_pTile3D inTile2, + xai_pTile3D outTile); + + +_XAI_API_ XAI_ERR_TYPE xaiEltwiseGreaterThan3D_U32_AV(const xai_pTile3D inTile1, + const xai_pTile3D inTile2, + xai_pTile3D outTile); + + +_XAI_API_ XAI_ERR_TYPE xaiEltwiseLessThan3D_AV(const xai_pTile3D inTile1, + const xai_pTile3D inTile2, + xai_pTile3D outTile); + + +_XAI_API_ XAI_ERR_TYPE xaiEltwiseLessThan3D_S8_AV(const xai_pTile3D inTile1, + const xai_pTile3D inTile2, + xai_pTile3D outTile); + + +_XAI_API_ XAI_ERR_TYPE xaiEltwiseLessThan3D_U8_AV(const xai_pTile3D inTile1, + const xai_pTile3D inTile2, + xai_pTile3D outTile); + + +_XAI_API_ XAI_ERR_TYPE xaiEltwiseLessThan3D_S16_AV(const xai_pTile3D inTile1, + const xai_pTile3D inTile2, + xai_pTile3D outTile); + + +_XAI_API_ 
XAI_ERR_TYPE xaiEltwiseLessThan3D_U16_AV(const xai_pTile3D inTile1, + const xai_pTile3D inTile2, + xai_pTile3D outTile); + + +_XAI_API_ XAI_ERR_TYPE xaiEltwiseLessThan3D_S32_AV(const xai_pTile3D inTile1, + const xai_pTile3D inTile2, + xai_pTile3D outTile); + + +_XAI_API_ XAI_ERR_TYPE xaiEltwiseLessThan3D_U32_AV(const xai_pTile3D inTile1, + const xai_pTile3D inTile2, + xai_pTile3D outTile); + + +#if XCHAL_HAVE_VISION_HP_VFPU == 1 +_XAI_API_ XAI_ERR_TYPE xaiEltwiseAdd3D_F16_AV(const xai_pTile3D inTile1, + const xai_pTile3D inTile2, + xai_pTile3D outTile); + + +_XAI_API_ XAI_ERR_TYPE xaiEltwiseSub3D_F16_AV(const xai_pTile3D inTile1, + const xai_pTile3D inTile2, + xai_pTile3D outTile); + + +_XAI_API_ XAI_ERR_TYPE xaiEltwiseMax3D_F16_AV(const xai_pTile3D inTile1, + const xai_pTile3D inTile2, + xai_pTile3D outTile); + + +_XAI_API_ XAI_ERR_TYPE xaiEltwiseMin3D_F16_AV(const xai_pTile3D inTile1, + const xai_pTile3D inTile2, + xai_pTile3D outTile); + + +_XAI_API_ XAI_ERR_TYPE xaiEltwiseEqual3D_F16_AV(const xai_pTile3D inTile1, + const xai_pTile3D inTile2, + xai_pTile3D outTile); + + +_XAI_API_ XAI_ERR_TYPE xaiEltwiseGreaterThan3D_F16_AV(const xai_pTile3D inTile1, + const xai_pTile3D inTile2, + xai_pTile3D outTile); + + +_XAI_API_ XAI_ERR_TYPE xaiEltwiseLessThan3D_F16_AV(const xai_pTile3D inTile1, + const xai_pTile3D inTile2, + xai_pTile3D outTile); +#endif //#if XCHAL_HAVE_VISION_HP_VFPU == 1 + + +#if XCHAL_HAVE_VISION_SP_VFPU == 1 + +_XAI_API_ XAI_ERR_TYPE xaiEltwiseAdd3D_F32_AV(const xai_pTile3D inTile1, + const xai_pTile3D inTile2, + xai_pTile3D outTile); + + +_XAI_API_ XAI_ERR_TYPE xaiEltwiseSub3D_F32_AV(const xai_pTile3D inTile1, + const xai_pTile3D inTile2, + xai_pTile3D outTile); + + +_XAI_API_ XAI_ERR_TYPE xaiEltwiseMax3D_F32_AV(const xai_pTile3D inTile1, + const xai_pTile3D inTile2, + xai_pTile3D outTile); + + +_XAI_API_ XAI_ERR_TYPE xaiEltwiseMin3D_F32_AV(const xai_pTile3D inTile1, + const xai_pTile3D inTile2, + xai_pTile3D outTile); + + +_XAI_API_ XAI_ERR_TYPE 
xaiEltwiseEqual3D_F32_AV(const xai_pTile3D inTile1, + const xai_pTile3D inTile2, + xai_pTile3D outTile); + + +_XAI_API_ XAI_ERR_TYPE xaiEltwiseGreaterThan3D_F32_AV(const xai_pTile3D inTile1, + const xai_pTile3D inTile2, + xai_pTile3D outTile); + + +_XAI_API_ XAI_ERR_TYPE xaiEltwiseLessThan3D_F32_AV(const xai_pTile3D inTile1, + const xai_pTile3D inTile2, + xai_pTile3D outTile); +#endif //#if XCHAL_HAVE_VISION_SP_VFPU == 1 + +_XAI_API_ XAI_ERR_TYPE xaiCast3D(const xai_pTile3D inTile, + xai_pTile3D outTile); +#endif //#ifndef __XAI_CNN_API_COMMON_H__ diff --git a/backends/cadence/vision/third-party/libxai_common/include/xai_cnn_api_params.h b/backends/cadence/vision/third-party/libxai_common/include/xai_cnn_api_params.h new file mode 100644 index 00000000000..51d4cb75358 --- /dev/null +++ b/backends/cadence/vision/third-party/libxai_common/include/xai_cnn_api_params.h @@ -0,0 +1,1886 @@ +/* + * Copyright (c) 2018 by Cadence Design Systems, Inc. ALL RIGHTS RESERVED. + * These coded instructions, statements, and computer programs are the + * copyrighted works and confidential proprietary information of + * Cadence Design Systems Inc. They may be adapted and modified by bona fide + * purchasers for internal use, but neither the original nor any adapted + * or modified version may be disclosed or distributed to third parties + * in any manner, medium, or form, in whole or in part, without the prior + * written consent of Cadence Design Systems Inc. This software and its + * derivatives are to be executed solely on products incorporating a Cadence + * Design Systems processor. 
+ */ + +#ifndef __XAI_CNN_API_PARAMS_H__ +#define __XAI_CNN_API_PARAMS_H__ + +#include "xai_config_api.h" +#include "xai_core_api.h" +#include "xai_tile_manager.h" +#include +#include + +#define TFL_QUANTIZATION_MODE_BIT_EXACT 1 +#define TFL_QUANTIZATION_MODE_APPROXIMATE 2 +#define XNNC_QUANTIZATION_MODE 3 +#define TFL_USE_ACT_TIE 4 + +#ifndef FLT_MIN +#define FLT_MIN (1.175494351e-38F) +#endif + +#ifndef FLT_MAX +#define FLT_MAX (3.402823466e+38F) +#endif + + +#if defined(__clang__) && (defined(GLOW_BUILD) || defined(GLOW_WITH_XTENSA)) + +#ifdef XCHAL_HAVE_VISION_HP_VFPU +#undef XCHAL_HAVE_VISION_HP_VFPU +#define XCHAL_HAVE_VISION_HP_VFPU 1 +#endif + +#ifdef XCHAL_IVPN_SIMD_WIDTH +#if (XCHAL_IVPN_SIMD_WIDTH == 64) +#define XCHAL_HAVE_CONNX_B_HP_VFPU 1 +#define XCHAL_HAVE_VISION_SP_VFPU 1 +#define XCHAL_HAVE_BBENEP_SP_VFPU 1 +#endif +#endif + +#include + +#if (XCHAL_HAVE_VISION_HP_VFPU == 1) +# undef ENABLE_F16_PRECISION +# define ENABLE_F16_PRECISION 1 +#endif + +#ifdef BIT_EXACT_FP16_REF +# undef BIT_EXACT_FP16_REF +#endif + +#ifdef BIT_EXACT_FP32_REF +# undef BIT_EXACT_FP32_REF +#endif + +#include "fp16.h" +#include +#include "shared/Common/Float16.h" +#undef xb_f16 +typedef shared::float16 xb_f16; + +#if ((XCHAL_HAVE_VISION_SP_VFPU == 1) || (XCHAL_HAVE_BBENEP_SP_VFPU == 1)) + +#include +#undef ENABLE_F32_PRECISION +#define ENABLE_F32_PRECISION 1 + +#ifdef BIT_EXACT_FP32_REF +# undef BIT_EXACT_FP32_REF +#endif +#endif // #if ((XCHAL_HAVE_VISION_SP_VFPU == 1) || (XCHAL_HAVE_BBENEP_SP_VFPU == 1)) + +// MLIR builds cannot use the contents of these include files, but they +// currently do not need the symbols defined in them. 
+#elif !defined(MLIR_BUILD)
+#ifndef XAI_REF_ONLY_COMPILATION
+#include
+#if (XCHAL_HAVE_HIFI1 || XCHAL_HAVE_HIFI3Z || XCHAL_HAVE_HIFI4 || XCHAL_HAVE_HIFI5)
+#include
+#else
+#include
+#endif
+#endif
+#if (defined(__clang__) && defined(XAI_REF_ONLY_COMPILATION) && !defined(GENERIC_XTENSA_BUILD))
+typedef _Float16 xb_f16;
+#elif defined(GENERIC_BUILD)
+typedef float xb_f16;
+#endif
+#elif defined(MLIR_BUILD) && defined(XAI_REF_ONLY_COMPILATION)
+typedef float xb_f16;
+#endif // #if defined(__clang__) && (defined(GLOW_BUILD) || defined(GLOW_WITH_XTENSA))
+
+#if defined (BIT_EXACT_FP16_REF)
+#undef XAI_F16_half
+#define XAI_F16_half  IVP_CVTF16F32(0.5f)
+#else
+#undef XAI_F16_half
+#define XAI_F16_half  (xb_f16) (0.5f)
+#endif
+
+#define XAI_F16_MIN_FLT  (float) (-65504.0f)
+#define XAI_F16_MAX_FLT  (float) (65504.0f)
+#define XAI_F32_MIN_FLT  (float) (-FLT_MAX)
+#define XAI_F32_MAX_FLT  (float) (FLT_MAX)
+
+#if ((XCHAL_HAVE_VISION_HP_VFPU == 1) || (XCHAL_HAVE_CONNX_B_HP_VFPU == 1) || (defined(__clang__) && defined(XAI_REF_ONLY_COMPILATION)))
+#define XAI_F16_MIN         (xb_f16) (-65504.0f)
+#define XAI_F16_MAX         (xb_f16) (65504.0f)
+#define XAI_F16_MIN_VECN    (xb_vecNxf16) (-65504.0f)
+#define XAI_F16_MAX_VECN    (xb_vecNxf16) (65504.0f)
+#define XAI_F16_MIN_VECN32  (xb_vecN_2xf32) (-65504.0f)
+#define XAI_F16_MAX_VECN32  (xb_vecN_2xf32) (65504.0f)
+#define XAI_F16_POS_MIN     (xb_f16) (6.10352e-5F)
+#endif
+
+/***************************************************************************************/
+/* log2() is not defined in Visual Studio 2012 but is available in later versions.     */
+/* _MSC_VER is checked to identify the Visual Studio version in use.                   */
+/* When _MSC_VER <= 1700 (Visual Studio 2012) the local log2() fallback is compiled.   */
+/* Visual Studio version information:                                                  */
+/* MSVC++ 14.0 _MSC_VER == 1900 (Visual Studio 2015)                                   */
+/* MSVC++ 12.0 _MSC_VER == 1800 (Visual Studio 2013)                                   */
+/* MSVC++ 11.0 _MSC_VER == 1700 (Visual Studio 2012)                                   */
+/* MSVC++ 10.0 _MSC_VER == 1600 (Visual Studio 2010)                                   */
+/* MSVC++ 9.0  _MSC_VER == 1500 (Visual Studio 2008)                                   */
+/* MSVC++ 8.0  _MSC_VER == 1400 (Visual Studio 2005)                                   */
+/***************************************************************************************/
+
+#if defined(_MSC_VER)
+#if _MSC_VER <= 1700
+#include "math.h"
+static _XAI_INLINE_ double log2(double number)
+{
+  /* Calculates log2 of number via change of base: ln(number) / ln(2). */
+  return(log(number) / log(2.0));
+}
+#endif
+#endif
+
+#define CNN_CONV_FLAG_RELU      (1 << 0)
+#define CNN_CONV_FLAG_LEFTEDGE  (1 << 1)
+#define CNN_CONV_FLAG_TOPEDGE   (1 << 2)
+#define CNN_CONV_FLAG_INPUT     (1 << 3)
+#define CNN_CONV_FLAG_OUTPUT    (1 << 4)
+
+#define CNN_POOLING_TOPEDGE_FLAG   (1 << 1)
+#define CNN_POOLING_LEFTEDGE_FLAG  (1 << 0)
+
+#define CNN_NORMALIZE_ALONG_WIDTH                       (1 << 0)
+#define CNN_NORMALIZE_ALONG_HEIGHT                      (1 << 1)
+#define CNN_NORMALIZE_ALONG_DEPTH                       (1 << 2)
+#define CNN_NORMALIZE_ALONG_BATCH                       (1 << 3)
+#define CNN_NORMALIZE_ALONG_WIDTH_AND_HEIGHT            (CNN_NORMALIZE_ALONG_WIDTH | CNN_NORMALIZE_ALONG_HEIGHT)
+#define CNN_NORMALIZE_ALONG_WIDTH_AND_HEIGHT_AND_DEPTH  (CNN_NORMALIZE_ALONG_WIDTH | CNN_NORMALIZE_ALONG_HEIGHT | CNN_NORMALIZE_ALONG_DEPTH)
+#define CNN_NORMALIZE_CHANNEL_SHARE_FLAG                (1 << 0)
+
+#define CNN_GLOBAL_POOL_INTERMEDIATE_TILE    0
+#define CNN_GLOBAL_POOL_FIRST_TILE           1
+#define CNN_GLOBAL_POOL_LAST_TILE            2
+#define CNN_GLOBAL_POOL_FIRST_AND_LAST_TILE  3
+
+#define CNN_NORMALIZE_INTERMEDIATE_TILE    0
+#define CNN_NORMALIZE_FIRST_TILE           1
+#define CNN_NORMALIZE_LAST_TILE            2
+#define CNN_NORMALIZE_FIRST_AND_LAST_TILE  3
+#define CNN_EXP_LUT_PARTITION              3
+
+typedef struct
+{
+  float   widthScale;
+  float   heightScale;
+  float   xshift;
+  float   yshift;
+  int8_t  alignCorners;
+  int8_t  halfPixelCenters;
+  int32_t zeroPtInput;
+  int32_t zeroPtOutput;
+  int32_t outMultiplier;
+  int32_t outShift;
+  int32_t widthFrame;
+  int32_t heightFrame;
+  int8_t  quantization_mode;
+} xai_cnn_resizeA3D_params;
+
+#define XAI_CNN_RESIZE3D_GET_WIDTHSCALE(x)                ((x)->widthScale)
+#define XAI_CNN_RESIZE3D_GET_HEIGHTSCALE(x)               ((x)->heightScale)
+#define XAI_CNN_RESIZE3D_GET_XSHIFT(x)                    ((x)->xshift)
+#define XAI_CNN_RESIZE3D_GET_YSHIFT(x)                    ((x)->yshift)
+#define XAI_CNN_RESIZE3D_GET_FLAG_ALIGN_CORNERS(x)        ((x)->alignCorners)
+#define XAI_CNN_RESIZE3D_GET_FLAG_HALF_PIXEL_CENTERS(x)   ((x)->halfPixelCenters)
+#define XAI_CNN_RESIZE3D_GET_ZERO_POINT_INPUT(x)          ((x)->zeroPtInput)
+#define XAI_CNN_RESIZE3D_GET_ZERO_POINT_OUTPUT(x)         ((x)->zeroPtOutput)
+#define XAI_CNN_RESIZE3D_GET_OUT_MULTIPLIER(x)            ((x)->outMultiplier)
+#define XAI_CNN_RESIZE3D_GET_OUT_SHIFT(x)                 ((x)->outShift)
+#define XAI_CNN_RESIZE3D_GET_WIDTHFRAME(x)                ((x)->widthFrame)
+#define XAI_CNN_RESIZE3D_GET_HEIGHTFRAME(x)               ((x)->heightFrame)
+
+#define XAI_CNN_RESIZE3D_SET_WIDTHSCALE(x, v)               ((x)->widthScale = (v))
+#define XAI_CNN_RESIZE3D_SET_HEIGHTSCALE(x, v)              ((x)->heightScale = (v))
+#define XAI_CNN_RESIZE3D_SET_XSHIFT(x, v)                   ((x)->xshift = (v))
+#define XAI_CNN_RESIZE3D_SET_YSHIFT(x, v)                   ((x)->yshift = (v))
+#define XAI_CNN_RESIZE3D_SET_FLAG_ALIGN_CORNERS(x, v)       ((x)->alignCorners = (v))      /* value parenthesized like the other setters */
+#define XAI_CNN_RESIZE3D_SET_FLAG_HALF_PIXEL_CENTERS(x, v)  ((x)->halfPixelCenters = (v))  /* value parenthesized like the other setters */
+#define XAI_CNN_RESIZE3D_SET_ZERO_POINT_INPUT(x, v)         ((x)->zeroPtInput = (v))
+#define XAI_CNN_RESIZE3D_SET_ZERO_POINT_OUTPUT(x, v)        ((x)->zeroPtOutput = (v))
+#define XAI_CNN_RESIZE3D_SET_OUT_MULTIPLIER(x, v)           ((x)->outMultiplier = (v))
+#define XAI_CNN_RESIZE3D_SET_OUT_SHIFT(x, v)                ((x)->outShift = (v))
+#define XAI_CNN_RESIZE3D_SET_WIDTHFRAME(x, v)               ((x)->widthFrame = (v))
+#define XAI_CNN_RESIZE3D_SET_HEIGHTFRAME(x, v)              ((x)->heightFrame = (v))
+
+#define XAI_CNN_RESIZE3D_GET_QUANTIZATION_MODE(x)     ((x)->quantization_mode)
+#define XAI_CNN_RESIZE3D_SET_QUANTIZATION_MODE(x, v)  ((x)->quantization_mode = (v))
+
+typedef struct
+{
+  uint8_t  strideX;      // Convolution StrideX
+  uint8_t  strideY;      // Convolution StrideY
+  uint8_t  accumShift;   // Accumulator Shift - Shift to convert accumulator data to 16 bit
+  uint16_t outputScale;  // Amount by which shifted data is scaled
+  uint8_t  outputShift;  // Shift amount to convert the scaled data to 16 bit
+  uint8_t  flags;
+  /*
+   * --------------------------------------------------------------------------
+   * |bit 7 - 5|    bit 4     |    bit 3    |   bit2    |    bit1    |  bit0    |
+   * | unused  |FC output flag|FC input flag|topEdgeFlag|leftEdgeFlag|Relu Flag |
+   * --------------------------------------------------------------------------
+   */
+  uint8_t  dilationX;    // dilation along kernel width
+  uint8_t  dilationY;    // dilation along kernel height
+  int32_t  reluMin;      // Minimum clamping limit when bit 0 of flags is set
+  int32_t  reluMax;      // Maximum clamping limit when bit 0 of flags is set
+  int8_t   quantization_mode;
+  int32_t  input_offset;
+  int32_t  output_offset;
+  int32_t  coeff_offset;
+  int32_t  outputScaleTFL;
+  int32_t  outputShiftTFL;
+#if ((XCHAL_HAVE_VISION_HP_VFPU == 1) || (XCHAL_HAVE_CONNX_B_HP_VFPU == 1) || (defined(__clang__) && defined(XAI_REF_ONLY_COMPILATION)))
+  xb_f16 reluMinFlt;
+  xb_f16 reluMaxFlt;
+#endif
+#if ((XCHAL_HAVE_VISION_SP_VFPU == 1) || (XCHAL_HAVE_BBENEP_SP_VFPU == 1) || defined(XAI_REF_ONLY_COMPILATION))
+  float reluMinFlt32;
+  float reluMaxFlt32;
+#endif
+} xai_cnn_conv_params;
+
+#define XAI_CNN_CONV_GET_STRIDE(x)              ((x)->strideX)
+#define XAI_CNN_CONV_SET_STRIDE(x, v)           ((x)->strideX = (v), (x)->strideY = (v));  /* single expression so an unbraced `if` guards both stores; trailing ';' kept for existing callers */
+#define XAI_CNN_CONV_GET_STRIDEX(x)             ((x)->strideX)
+#define XAI_CNN_CONV_GET_STRIDEY(x)             ((x)->strideY)
+#define XAI_CNN_CONV_SET_STRIDE_XY(x, v1, v2)   ((x)->strideX = (v1), (x)->strideY = (v2));  /* single expression so an unbraced `if` guards both stores; trailing ';' kept for existing callers */
+#define XAI_CNN_CONV_SET_STRIDEX(x, v)          (x)->strideX = (v);
+#define XAI_CNN_CONV_SET_STRIDEY(x, v)          (x)->strideY = (v);
+#define XAI_CNN_CONV_GET_ACCUM_SHIFT(x)         ((x)->accumShift)
+#define XAI_CNN_CONV_SET_ACCUM_SHIFT(x, v)      ((x)->accumShift = (v))
+#define XAI_CNN_CONV_GET_OUTPUT_SCALE(x)        ((x)->outputScale)
+#define XAI_CNN_CONV_SET_OUTPUT_SCALE(x, v)     ((x)->outputScale = (v))
+#define XAI_CNN_CONV_GET_OUTPUT_SHIFT(x)        ((x)->outputShift)
+#define XAI_CNN_CONV_SET_OUTPUT_SHIFT(x, v)     ((x)->outputShift = (v))
+#define XAI_CNN_CONV_GET_FLAGS(x) ((x)->flags) +#define XAI_CNN_CONV_SET_FLAGS(x, v) ((x)->flags = (v)) +#define XAI_CNN_CONV_GET_FLAG_RELU(x) ((x)->flags & CNN_CONV_FLAG_RELU) +#define XAI_CNN_CONV_SET_FLAG_RELU(x) ((x)->flags = ((x)->flags | CNN_CONV_FLAG_RELU)) +#define XAI_CNN_CONV_RESET_FLAG_RELU(x) ((x)->flags = ((x)->flags & ~CNN_CONV_FLAG_RELU)) +#define XAI_CNN_CONV_GET_FLAG_LEFTEDGE(x) ((x)->flags & CNN_CONV_FLAG_LEFTEDGE) +#define XAI_CNN_CONV_SET_FLAG_LEFTEDGE(x) ((x)->flags = ((x)->flags | CNN_CONV_FLAG_LEFTEDGE)) +#define XAI_CNN_CONV_RESET_FLAG_LEFTEDGE(x) ((x)->flags = ((x)->flags & ~CNN_CONV_FLAG_LEFTEDGE)) +#define XAI_CNN_CONV_GET_FLAG_TOPEDGE(x) ((x)->flags & CNN_CONV_FLAG_TOPEDGE) +#define XAI_CNN_CONV_SET_FLAG_TOPEDGE(x) ((x)->flags = ((x)->flags | CNN_CONV_FLAG_TOPEDGE)) +#define XAI_CNN_CONV_RESET_FLAG_TOPEDGE(x) ((x)->flags = ((x)->flags & ~CNN_CONV_FLAG_TOPEDGE)) +#define XAI_CNN_CONV_GET_FLAG_INPUT(x) ((x)->flags & CNN_CONV_FLAG_INPUT) +#define XAI_CNN_CONV_SET_FLAG_INPUT(x) ((x)->flags = ((x)->flags | CNN_CONV_FLAG_INPUT)) +#define XAI_CNN_CONV_RESET_FLAG_INPUT(x) ((x)->flags = ((x)->flags & ~CNN_CONV_FLAG_INPUT)) +#define XAI_CNN_CONV_GET_FLAG_OUTPUT(x) ((x)->flags & CNN_CONV_FLAG_OUTPUT) +#define XAI_CNN_CONV_SET_FLAG_OUTPUT(x) ((x)->flags = ((x)->flags | CNN_CONV_FLAG_OUTPUT)) +#define XAI_CNN_CONV_RESET_FLAG_OUTPUT(x) ((x)->flags = ((x)->flags & ~CNN_CONV_FLAG_OUTPUT)) +#define XAI_CNN_CONV_GET_DILATION(x) ((x)->dilationX) +#define XAI_CNN_CONV_SET_DILATION(x, v) (x)->dilationX = (v); (x)->dilationY = (v); +#define XAI_CNN_CONV_GET_DILATIONX(x) ((x)->dilationX) +#define XAI_CNN_CONV_SET_DILATIONX(x, v) ((x)->dilationX = (v)) +#define XAI_CNN_CONV_GET_DILATIONY(x) ((x)->dilationY) +#define XAI_CNN_CONV_SET_DILATIONY(x, v) ((x)->dilationY = (v)) +#define XAI_CNN_CONV_SET_DILATION_XY(x, v1, v2) (x)->dilationX = (v1); (x)->dilationY = (v2); +#define XAI_CNN_CONV_GET_RELU_MIN(x) ((x)->reluMin) +#define XAI_CNN_CONV_SET_RELU_MIN(x, v) 
((x)->reluMin = (v)) +#define XAI_CNN_CONV_GET_RELU_MAX(x) ((x)->reluMax) +#define XAI_CNN_CONV_SET_RELU_MAX(x, v) ((x)->reluMax = (v)) +#if ((XCHAL_HAVE_VISION_HP_VFPU == 1) || (XCHAL_HAVE_CONNX_B_HP_VFPU == 1) || (defined(__clang__) && defined(XAI_REF_ONLY_COMPILATION))) +#define XAI_CNN_CONV_GET_RELU_MIN_FLT(x) ((x)->reluMinFlt) +#define XAI_CNN_CONV_SET_RELU_MIN_FLT(x, v) ((x)->reluMinFlt = (v)) +#define XAI_CNN_CONV_GET_RELU_MAX_FLT(x) ((x)->reluMaxFlt) +#define XAI_CNN_CONV_SET_RELU_MAX_FLT(x, v) ((x)->reluMaxFlt = (v)) +#endif +#if ((XCHAL_HAVE_VISION_SP_VFPU == 1) || (XCHAL_HAVE_BBENEP_SP_VFPU == 1) || defined(XAI_REF_ONLY_COMPILATION)) +#define XAI_CNN_CONV_GET_RELU_MIN_FLT32(x) ((x)->reluMinFlt32) +#define XAI_CNN_CONV_SET_RELU_MIN_FLT32(x, v) ((x)->reluMinFlt32 = (v)) +#define XAI_CNN_CONV_GET_RELU_MAX_FLT32(x) ((x)->reluMaxFlt32) +#define XAI_CNN_CONV_SET_RELU_MAX_FLT32(x, v) ((x)->reluMaxFlt32 = (v)) +#endif +#define XAI_CNN_CONV_GET_QUANTIZATION_MODE(x) ((x)->quantization_mode) +#define XAI_CNN_CONV_SET_QUANTIZATION_MODE(x, v) ((x)->quantization_mode = (v)) +#define XAI_CNN_CONV_GET_INPUT_OFFSET(x) ((x)->input_offset) +#define XAI_CNN_CONV_SET_INPUT_OFFSET(x, v) ((x)->input_offset = (v)) +#define XAI_CNN_CONV_GET_OUTPUT_OFFSET(x) ((x)->output_offset) +#define XAI_CNN_CONV_SET_OUTPUT_OFFSET(x, v) ((x)->output_offset = (v)) +#define XAI_CNN_CONV_GET_COEFF_OFFSET(x) ((x)->coeff_offset) +#define XAI_CNN_CONV_SET_COEFF_OFFSET(x, v) ((x)->coeff_offset = (v)) +#define XAI_CNN_CONV_GET_OUTPUT_SCALE_TFL(x) ((x)->outputScaleTFL) +#define XAI_CNN_CONV_SET_OUTPUT_SCALE_TFL(x, v) ((x)->outputScaleTFL = (v)) +#define XAI_CNN_CONV_GET_OUTPUT_SHIFT_TFL(x) ((x)->outputShiftTFL) +#define XAI_CNN_CONV_SET_OUTPUT_SHIFT_TFL(x, v) ((x)->outputShiftTFL = (v)) + +typedef struct +{ + uint8_t strideX; // Convolution StrideX + uint8_t strideY; // Convolution StrideY + uint8_t accumShift; // Accumulator Shift - Shift to convert accumulator data to 16 bit + uint16_t outputScale; 
// Amount by which shifted data is scaled + uint8_t outputShift; // Shift amount to convert the scaled data to 16 bit + uint8_t flags; + /* + * -------------------------------------------------------------------------- + * |bit 7 - 5| bit 4 | bit 3 | bit2 | bit1 | bit0 | + * | unused |FC output flag|FC input flag|topEdgeFlag|leftEdgeFlag|Relu Flag | + * -------------------------------------------------------------------------- + */ + uint8_t dilationX; // dilation along kernel width + uint8_t dilationY; // dilation along kernel height + uint8_t depthMultiplier; // factor by which output depth size varies from input depth size + int32_t reluMin; // Minimum clamping limit when bit 0 of flags is set + int32_t reluMax; // Maximum clamping limit when bit 0 of flags is set +#if ((XCHAL_HAVE_VISION_HP_VFPU == 1) || (XCHAL_HAVE_CONNX_B_HP_VFPU == 1) || (defined(__clang__) && defined(XAI_REF_ONLY_COMPILATION))) + xb_f16 reluMinFlt; + xb_f16 reluMaxFlt; +#endif +#if ((XCHAL_HAVE_VISION_SP_VFPU == 1) || (XCHAL_HAVE_BBENEP_SP_VFPU == 1) || defined(XAI_REF_ONLY_COMPILATION)) + float reluMinFlt32; + float reluMaxFlt32; +#endif + int8_t quantization_mode; + int32_t input_offset; + int32_t output_offset; +} xai_cnn_depthwiseDilatedConv_params; + +#define XAI_CNN_DEPTHWISE_DILATED_CONV_GET_STRIDE(x) ((x)->strideX) +#define XAI_CNN_DEPTHWISE_DILATED_CONV_SET_STRIDE(x, v) (x)->strideX = (v); (x)->strideY = (v) +#define XAI_CNN_DEPTHWISE_DILATED_CONV_GET_STRIDEX(x) ((x)->strideX) +#define XAI_CNN_DEPTHWISE_DILATED_CONV_GET_STRIDEY(x) ((x)->strideY) +#define XAI_CNN_DEPTHWISE_DILATED_CONV_SET_STRIDE_XY(x, v1, v2) (x)->strideX = (v1); (x)->strideY = (v2) +#define XAI_CNN_DEPTHWISE_DILATED_CONV_SET_STRIDEX(x, v) (x)->strideX = (v); +#define XAI_CNN_DEPTHWISE_DILATED_CONV_SET_STRIDEY(x, v) (x)->strideY = (v); +#define XAI_CNN_DEPTHWISE_DILATED_CONV_GET_ACCUM_SHIFT(x) ((x)->accumShift) +#define XAI_CNN_DEPTHWISE_DILATED_CONV_SET_ACCUM_SHIFT(x, v) ((x)->accumShift = (v)) +#define 
XAI_CNN_DEPTHWISE_DILATED_CONV_GET_OUTPUT_SCALE(x) ((x)->outputScale) +#define XAI_CNN_DEPTHWISE_DILATED_CONV_SET_OUTPUT_SCALE(x, v) ((x)->outputScale = (v)) +#define XAI_CNN_DEPTHWISE_DILATED_CONV_GET_OUTPUT_SHIFT(x) ((x)->outputShift) +#define XAI_CNN_DEPTHWISE_DILATED_CONV_SET_OUTPUT_SHIFT(x, v) ((x)->outputShift = (v)) +#define XAI_CNN_DEPTHWISE_DILATED_CONV_GET_FLAGS(x) ((x)->flags) +#define XAI_CNN_DEPTHWISE_DILATED_CONV_SET_FLAGS(x, v) ((x)->flags = (v)) +#define XAI_CNN_DEPTHWISE_DILATED_CONV_GET_FLAG_RELU(x) ((x)->flags & CNN_CONV_FLAG_RELU) +#define XAI_CNN_DEPTHWISE_DILATED_CONV_SET_FLAG_RELU(x) ((x)->flags = ((x)->flags | CNN_CONV_FLAG_RELU)) +#define XAI_CNN_DEPTHWISE_DILATED_CONV_RESET_FLAG_RELU(x) ((x)->flags = ((x)->flags & ~CNN_CONV_FLAG_RELU)) +#define XAI_CNN_DEPTHWISE_DILATED_CONV_GET_FLAG_LEFTEDGE(x) ((x)->flags & CNN_CONV_FLAG_LEFTEDGE) +#define XAI_CNN_DEPTHWISE_DILATED_CONV_SET_FLAG_LEFTEDGE(x) ((x)->flags = ((x)->flags | CNN_CONV_FLAG_LEFTEDGE)) +#define XAI_CNN_DEPTHWISE_DILATED_CONV_RESET_FLAG_LEFTEDGE(x) ((x)->flags = ((x)->flags & ~CNN_CONV_FLAG_LEFTEDGE)) +#define XAI_CNN_DEPTHWISE_DILATED_CONV_GET_FLAG_TOPEDGE(x) ((x)->flags & CNN_CONV_FLAG_TOPEDGE) +#define XAI_CNN_DEPTHWISE_DILATED_CONV_SET_FLAG_TOPEDGE(x) ((x)->flags = ((x)->flags | CNN_CONV_FLAG_TOPEDGE)) +#define XAI_CNN_DEPTHWISE_DILATED_CONV_RESET_FLAG_TOPEDGE(x) ((x)->flags = ((x)->flags & ~CNN_CONV_FLAG_TOPEDGE)) +#define XAI_CNN_DEPTHWISE_DILATED_CONV_GET_FLAG_INPUT(x) ((x)->flags & CNN_CONV_FLAG_INPUT) +#define XAI_CNN_DEPTHWISE_DILATED_CONV_SET_FLAG_INPUT(x) ((x)->flags = ((x)->flags | CNN_CONV_FLAG_INPUT)) +#define XAI_CNN_DEPTHWISE_DILATED_CONV_RESET_FLAG_INPUT(x) ((x)->flags = ((x)->flags & ~CNN_CONV_FLAG_INPUT)) +#define XAI_CNN_DEPTHWISE_DILATED_CONV_GET_FLAG_OUTPUT(x) ((x)->flags & CNN_CONV_FLAG_OUTPUT) +#define XAI_CNN_DEPTHWISE_DILATED_CONV_SET_FLAG_OUTPUT(x) ((x)->flags = ((x)->flags | CNN_CONV_FLAG_OUTPUT)) +#define 
XAI_CNN_DEPTHWISE_DILATED_CONV_RESET_FLAG_OUTPUT(x) ((x)->flags = ((x)->flags & ~CNN_CONV_FLAG_OUTPUT)) +#define XAI_CNN_DEPTHWISE_DILATED_CONV_GET_DILATION(x) ((x)->dilationX) +#define XAI_CNN_DEPTHWISE_DILATED_CONV_SET_DILATION(x, v) (x)->dilationX = (v); (x)->dilationY = (v) +#define XAI_CNN_DEPTHWISE_DILATED_CONV_GET_DILATIONX(x) ((x)->dilationX) +#define XAI_CNN_DEPTHWISE_DILATED_CONV_SET_DILATIONX(x, v) ((x)->dilationX = (v)) +#define XAI_CNN_DEPTHWISE_DILATED_CONV_GET_DILATIONY(x) ((x)->dilationY) +#define XAI_CNN_DEPTHWISE_DILATED_CONV_SET_DILATIONY(x, v) ((x)->dilationY = (v)) +#define XAI_CNN_DEPTHWISE_DILATED_CONV_SET_DILATION_XY(x, v1, v2) (x)->dilationX = (v1); (x)->dilationY = (v2) +#define XAI_CNN_DEPTHWISE_DILATED_CONV_GET_DEPTH_MULTIPLIER(x) ((x)->depthMultiplier) +#define XAI_CNN_DEPTHWISE_DILATED_CONV_SET_DEPTH_MULTIPLIER(x, v) ((x)->depthMultiplier = (v)) +#define XAI_CNN_DEPTHWISE_DILATED_CONV_GET_RELU_MIN(x) ((x)->reluMin) +#define XAI_CNN_DEPTHWISE_DILATED_CONV_SET_RELU_MIN(x, v) ((x)->reluMin = (v)) +#define XAI_CNN_DEPTHWISE_DILATED_CONV_GET_RELU_MAX(x) ((x)->reluMax) +#define XAI_CNN_DEPTHWISE_DILATED_CONV_SET_RELU_MAX(x, v) ((x)->reluMax = (v)) +#if ((XCHAL_HAVE_VISION_HP_VFPU == 1) || (XCHAL_HAVE_CONNX_B_HP_VFPU == 1) || (defined(__clang__) && defined(XAI_REF_ONLY_COMPILATION))) +#define XAI_CNN_DEPTHWISE_DILATED_CONV_GET_RELU_MIN_FLT(x) ((x)->reluMinFlt) +#define XAI_CNN_DEPTHWISE_DILATED_CONV_SET_RELU_MIN_FLT(x, v) ((x)->reluMinFlt = (v)) +#define XAI_CNN_DEPTHWISE_DILATED_CONV_GET_RELU_MAX_FLT(x) ((x)->reluMaxFlt) +#define XAI_CNN_DEPTHWISE_DILATED_CONV_SET_RELU_MAX_FLT(x, v) ((x)->reluMaxFlt = (v)) +#endif +#if ((XCHAL_HAVE_VISION_SP_VFPU == 1) || (XCHAL_HAVE_BBENEP_SP_VFPU == 1) || defined(XAI_REF_ONLY_COMPILATION)) +#define XAI_CNN_DEPTHWISE_DILATED_CONV_GET_RELU_MIN_FLT32(x) ((x)->reluMinFlt32) +#define XAI_CNN_DEPTHWISE_DILATED_CONV_SET_RELU_MIN_FLT32(x, v) ((x)->reluMinFlt32 = (v)) +#define 
XAI_CNN_DEPTHWISE_DILATED_CONV_GET_RELU_MAX_FLT32(x) ((x)->reluMaxFlt32) +#define XAI_CNN_DEPTHWISE_DILATED_CONV_SET_RELU_MAX_FLT32(x, v) ((x)->reluMaxFlt32 = (v)) +#endif +#define XAI_CNN_DEPTHWISE_DILATED_CONV_GET_QUANTIZATION_MODE(x) ((x)->quantization_mode) +#define XAI_CNN_DEPTHWISE_DILATED_CONV_SET_QUANTIZATION_MODE(x, v) ((x)->quantization_mode = (v)) +#define XAI_CNN_DEPTHWISE_DILATED_CONV_GET_INPUT_OFFSET(x) ((x)->input_offset) +#define XAI_CNN_DEPTHWISE_DILATED_CONV_SET_INPUT_OFFSET(x, v) ((x)->input_offset = (v)) +#define XAI_CNN_DEPTHWISE_DILATED_CONV_GET_OUTPUT_OFFSET(x) ((x)->output_offset) +#define XAI_CNN_DEPTHWISE_DILATED_CONV_SET_OUTPUT_OFFSET(x, v) ((x)->output_offset = (v)) + +typedef struct +{ + uint8_t kernelWidth; // Normalization window width + uint8_t kernelHeight; // Normalization window height + int16_t sigmaScale; // Factor used to scale the sum of squares of data under the normalization window + uint8_t sigmaScaleShift; // Shift to map the scaled sum of squares to LUT index + uint8_t outputShift; // Output shift +} xai_cnn_lrn_spatial_params; + +typedef struct +{ + uint8_t kernelDepth; // Normalization window depth + int16_t sigmaScale; // Factor used to scale the sum of squares of data under the normalization window + uint8_t sigmaScaleShift; // Shift to map the scaled sum of squares to LUT index + uint8_t outputShift; // Output shift +} xai_cnn_lrn_depth_params; + +#define XAI_CNN_LRN_GET_KERNELWIDTH(x) ((x)->kernelWidth) +#define XAI_CNN_LRN_SET_KERNELWIDTH(x, v) ((x)->kernelWidth = (v)) +#define XAI_CNN_LRN_GET_KERNELHEIGHT(x) ((x)->kernelHeight) +#define XAI_CNN_LRN_SET_KERNELHEIGHT(x, v) ((x)->kernelHeight = (v)) +#define XAI_CNN_LRN_GET_KERNELDEPTH(x) ((x)->kernelDepth) +#define XAI_CNN_LRN_SET_KERNELDEPTH(x, v) ((x)->kernelDepth = (v)) +#define XAI_CNN_LRN_GET_SIGMASCALE(x) ((x)->sigmaScale) +#define XAI_CNN_LRN_SET_SIGMASCALE(x, v) ((x)->sigmaScale = (v)) +#define XAI_CNN_LRN_GET_SIGMASCALESHIFT(x) ((x)->sigmaScaleShift) 
+#define XAI_CNN_LRN_SET_SIGMASCALESHIFT(x, v) ((x)->sigmaScaleShift = (v)) +#define XAI_CNN_LRN_GET_OUTPUTSHIFT(x) ((x)->outputShift) +#define XAI_CNN_LRN_SET_OUTPUTSHIFT(x, v) ((x)->outputShift = (v)) + +typedef struct +{ + int16_t kernelWidth; + int16_t kernelHeight; + uint8_t strideX; // The number of points by which the pooling window + // is shifted along X direction. + uint8_t strideY; // The number of points by which the pooling window + // is shifted along Y direction. + uint8_t edgeFlag; // edgeFlag is applicable only for pooling with even kernel sizes. Least significant bit(LSB) + // of the flag represents whether minimum left edge size required for pooling should be + // greater than the minimum right edge size required. The bit adjacent to LSB decides whether + // minimum top edge size required should be greater than minimum bottom edge size. + int16_t outputScale; // Normalizer value to be multiplied with sum of elements under the pooling window + uint8_t outputShift; // Shift to be applied on the normalized sum to obtain the average + int32_t fixUpInit; // the fixUp term that is used to incorporte Zero Points + uint8_t enableRelu; +#if ((XCHAL_HAVE_VISION_HP_VFPU == 1) || (XCHAL_HAVE_CONNX_B_HP_VFPU == 1) || (defined(__clang__) && defined(XAI_REF_ONLY_COMPILATION))) + xb_f16 reluMinFlt; + xb_f16 reluMaxFlt; +#endif +#if ((XCHAL_HAVE_VISION_SP_VFPU == 1) || (XCHAL_HAVE_BBENEP_SP_VFPU == 1) || defined(XAI_REF_ONLY_COMPILATION)) + float reluMinFlt32; + float reluMaxFlt32; +#endif + int8_t quantization_mode; + int32_t reluMin; + int32_t reluMax; +} xai_cnn_pooling_params; + +#define XAI_CNN_POOLING_GET_KERNELWIDTH(x) ((x)->kernelWidth) +#define XAI_CNN_POOLING_SET_KERNELWIDTH(x, v) ((x)->kernelWidth = (v)) +#define XAI_CNN_POOLING_GET_KERNELHEIGHT(x) ((x)->kernelHeight) +#define XAI_CNN_POOLING_SET_KERNELHEIGHT(x, v) ((x)->kernelHeight = (v)) +#define XAI_CNN_POOLING_GET_STRIDE(x) ((x)->strideX) +#define XAI_CNN_POOLING_SET_STRIDE(x, v) (x)->strideX = 
(v); (x)->strideY = (v); +#define XAI_CNN_POOLING_GET_STRIDEX(x) ((x)->strideX) +#define XAI_CNN_POOLING_GET_STRIDEY(x) ((x)->strideY) +#define XAI_CNN_POOLING_SET_STRIDE_XY(x, v1, v2) (x)->strideX = (v1); (x)->strideY = (v2); +#define XAI_CNN_POOLING_SET_STRIDEX(x, v) (x)->strideX = (v); +#define XAI_CNN_POOLING_SET_STRIDEY(x, v) (x)->strideY = (v); +#define XAI_CNN_POOLING_GET_TOPEDGE_FLAG(x) ((x)->edgeFlag & CNN_POOLING_TOPEDGE_FLAG) +#define XAI_CNN_POOLING_SET_TOPEDGE_FLAG(x) ((x)->edgeFlag = ((x)->edgeFlag | CNN_POOLING_TOPEDGE_FLAG)) +#define XAI_CNN_POOLING_RESET_TOPEDGE_FLAG(x) ((x)->edgeFlag = ((x)->edgeFlag & ~CNN_POOLING_TOPEDGE_FLAG)) +#define XAI_CNN_POOLING_GET_LEFTEDGE_FLAG(x) ((x)->edgeFlag & CNN_POOLING_LEFTEDGE_FLAG) +#define XAI_CNN_POOLING_SET_LEFTEDGE_FLAG(x) ((x)->edgeFlag = ((x)->edgeFlag | CNN_POOLING_LEFTEDGE_FLAG)) +#define XAI_CNN_POOLING_RESET_LEFTEDGE_FLAG(x) ((x)->edgeFlag = ((x)->edgeFlag & ~CNN_POOLING_LEFTEDGE_FLAG)) +#define XAI_CNN_POOLING_GET_OUTPUTSCALE(x) ((x)->outputScale) +#define XAI_CNN_POOLING_SET_OUTPUTSCALE(x, v) ((x)->outputScale = (v)) +#define XAI_CNN_POOLING_GET_OUTPUTSHIFT(x) ((x)->outputShift) +#define XAI_CNN_POOLING_SET_OUTPUTSHIFT(x, v) ((x)->outputShift = (v)) +#define XAI_CNN_POOLING_GET_FIXUPINIT(x) ((x)->fixUpInit) +#define XAI_CNN_POOLING_SET_FIXUPINIT(x, v) ((x)->fixUpInit = (v)) +#define XAI_CNN_POOLING_GET_RELUFLAG(x) ((x)->enableRelu) +#define XAI_CNN_POOLING_SET_RELUFLAG(x, v) ((x)->enableRelu = (v)) + +#if ((XCHAL_HAVE_VISION_HP_VFPU == 1) || (XCHAL_HAVE_CONNX_B_HP_VFPU == 1) || (defined(__clang__) && defined(XAI_REF_ONLY_COMPILATION))) +#define XAI_CNN_POOLING_GET_RELUMINFLT(x) ((x)->reluMinFlt) +#define XAI_CNN_POOLING_SET_RELUMINFLT(x, v) ((x)->reluMinFlt = (v)) +#define XAI_CNN_POOLING_GET_RELUMAXFLT(x) ((x)->reluMaxFlt) +#define XAI_CNN_POOLING_SET_RELUMAXFLT(x, v) ((x)->reluMaxFlt = (v)) +#endif + +#if ((XCHAL_HAVE_VISION_SP_VFPU == 1) || (XCHAL_HAVE_BBENEP_SP_VFPU == 1) || 
defined(XAI_REF_ONLY_COMPILATION)) +#define XAI_CNN_POOLING_GET_RELU_MIN_FLT32(x) ((x)->reluMinFlt32) +#define XAI_CNN_POOLING_SET_RELU_MIN_FLT32(x, v) ((x)->reluMinFlt32 = (v)) +#define XAI_CNN_POOLING_GET_RELU_MAX_FLT32(x) ((x)->reluMaxFlt32) +#define XAI_CNN_POOLING_SET_RELU_MAX_FLT32(x, v) ((x)->reluMaxFlt32 = (v)) +#endif +#define XAI_CNN_POOLING_GET_QUANTIZATION_MODE(x) ((x)->quantization_mode) +#define XAI_CNN_POOLING_SET_QUANTIZATION_MODE(x, v) ((x)->quantization_mode = (v)) +#define XAI_CNN_POOLING_GET_RELUMIN(x) ((x)->reluMin) +#define XAI_CNN_POOLING_SET_RELUMIN(x, v) ((x)->reluMin = (v)) +#define XAI_CNN_POOLING_GET_RELUMAX(x) ((x)->reluMax) +#define XAI_CNN_POOLING_SET_RELUMAX(x, v) ((x)->reluMax = (v)) + +typedef struct +{ + int16_t outputScale; //Normalizer value to be multiplied with sum of elements under the pooling window + uint8_t tileFlag; // indicates whether the given tile is a first tile, last tile or neither of those + uint8_t outputShift; //Shift to be applied on the normalized sum to obtain the average + uint8_t accShift; //accumulator shift that is applied to bring the data to S32 range + int32_t fixUpInit; //the fixUp term that is used to incorporte Zero Points +} xai_cnn_global_pooling_params; + +#define XAI_CNN_GLOBAL_POOLING_GET_OUTPUTSCALE(x) ((x)->outputScale) +#define XAI_CNN_GLOBAL_POOLING_SET_OUTPUTSCALE(x, v) ((x)->outputScale = (v)) +#define XAI_CNN_GLOBAL_POOLING_GET_OUTPUTSHIFT(x) ((x)->outputShift) +#define XAI_CNN_GLOBAL_POOLING_SET_OUTPUTSHIFT(x, v) ((x)->outputShift = (v)) +#define XAI_CNN_GLOBAL_POOLING_GET_ACCSHIFT(x) ((x)->accShift) +#define XAI_CNN_GLOBAL_POOLING_SET_ACCSHIFT(x, v) ((x)->accShift = (v)) +#define XAI_CNN_GLOBAL_POOLING_GET_TILE_FLAG(x) ((x)->tileFlag) +#define XAI_CNN_GLOBAL_POOLING_SET_TILE_FLAG(x, v) ((x)->tileFlag = (v)) +#define XAI_CNN_GLOBAL_POOLING_GET_FIXUPINIT(x) ((x)->fixUpInit) +#define XAI_CNN_GLOBAL_POOLING_SET_FIXUPINIT(x, v) ((x)->fixUpInit = (v)) + +typedef struct +{ + uint16_t 
spatialScaleX; // Multiplicative spatial scale factor to translate ROI coords from their + // input scale to the scale used when pooling + //Spatial scale in the X direction + uint16_t spatialScaleY; //Spatial scale in the Y direction + uint16_t spatialScaleShiftX; //Shift value to apply for spatial scale operations in the X direction + uint16_t spatialScaleShiftY; //Shift value to apply for spatial scale operations in the Y direction + int32_t pooledHeight; //Total number of fixed output points along height dimension from ROI + int32_t pooledWidth; //Total number of fixed output points along width dimension from ROI + uint16_t oneByPooledHeightScale; //Reciprocal of pooledHeight represented in U15 range + uint16_t oneByPooledWidthScale; //Reciprocal of pooledWidth represented in U15 range + uint16_t oneByPooledHeightShift; //Shift value to normalize after operating with oneByPooledHeightScale variable + uint16_t oneByPooledWidthShift; //Shift value to normalize after operating with oneByPooledWidthScale variable +} xai_cnn_roi_pooling_params; + +#define XAI_CNN_ROI_POOLING_GET_SPATIAL_SCALEX(x) ((x)->spatialScaleX) +#define XAI_CNN_ROI_POOLING_SET_SPATIAL_SCALEX(x, v) ((x)->spatialScaleX = (v)) +#define XAI_CNN_ROI_POOLING_GET_SPATIAL_SCALEY(x) ((x)->spatialScaleY) +#define XAI_CNN_ROI_POOLING_SET_SPATIAL_SCALEY(x, v) ((x)->spatialScaleY = (v)) +#define XAI_CNN_ROI_POOLING_GET_SPATIAL_SCALE_SHIFTX(x) ((x)->spatialScaleShiftX) +#define XAI_CNN_ROI_POOLING_SET_SPATIAL_SCALE_SHIFTX(x, v) ((x)->spatialScaleShiftX = (v)) +#define XAI_CNN_ROI_POOLING_GET_SPATIAL_SCALE_SHIFTY(x) ((x)->spatialScaleShiftY) +#define XAI_CNN_ROI_POOLING_SET_SPATIAL_SCALE_SHIFTY(x, v) ((x)->spatialScaleShiftY = (v)) +#define XAI_CNN_ROI_POOLING_GET_POOLED_WIDTH(x) ((x)->pooledWidth) +#define XAI_CNN_ROI_POOLING_SET_POOLED_WIDTH(x, v) ((x)->pooledWidth = (v)) +#define XAI_CNN_ROI_POOLING_GET_POOLED_HEIGHT(x) ((x)->pooledHeight) +#define XAI_CNN_ROI_POOLING_SET_POOLED_HEIGHT(x, v) 
((x)->pooledHeight = (v)) +#define XAI_CNN_ROI_POOLING_GET_ONE_BY_POOLED_WIDTH_SCALE(x) ((x)->oneByPooledWidthScale) +#define XAI_CNN_ROI_POOLING_SET_ONE_BY_POOLED_WIDTH_SCALE(x, v) ((x)->oneByPooledWidthScale = (v)) +#define XAI_CNN_ROI_POOLING_GET_ONE_BY_POOLED_HEIGHT_SCALE(x) ((x)->oneByPooledHeightScale) +#define XAI_CNN_ROI_POOLING_SET_ONE_BY_POOLED_HEIGHT_SCALE(x, v) ((x)->oneByPooledHeightScale = (v)) +#define XAI_CNN_ROI_POOLING_GET_ONE_BY_POOLED_WIDTH_SHIFT(x) ((x)->oneByPooledWidthShift) +#define XAI_CNN_ROI_POOLING_SET_ONE_BY_POOLED_WIDTH_SHIFT(x, v) ((x)->oneByPooledWidthShift = (v)) +#define XAI_CNN_ROI_POOLING_GET_ONE_BY_POOLED_HEIGHT_SHIFT(x) ((x)->oneByPooledHeightShift) +#define XAI_CNN_ROI_POOLING_SET_ONE_BY_POOLED_HEIGHT_SHIFT(x, v) ((x)->oneByPooledHeightShift = (v)) + +typedef struct +{ + uint8_t outputShift; /* No. of output bits to be right shifted. */ + uint8_t qFactorOutput; /* No. of bits scaling applied to the reciprocal of the sum of exp(x)*/ + int16_t maxVal; /* global max value in the 3D tile */ + int8_t axis; /* dimension along which softmax is applied*/ + int8_t quantization_mode; + int32_t diff_min; //defines minimum difference with respect to the maximum value + int32_t inputScale; //significand of BetaScaleQ5.26 + int32_t inputShift; //exponent of BetaScaleQ5.26 +} xai_cnn_softmax_params; + +#define XAI_CNN_SOFTMAX_GET_OUTPUTSHIFT(x) ((x)->outputShift) +#define XAI_CNN_SOFTMAX_SET_OUTPUTSHIFT(x, v) ((x)->outputShift = (v)) +#define XAI_CNN_SOFTMAX_GET_QFACTOROUTPUT(x) ((x)->qFactorOutput) +#define XAI_CNN_SOFTMAX_SET_QFACTOROUTPUT(x, v) ((x)->qFactorOutput = (v)) +#define XAI_CNN_SOFTMAX_GET_MAXVAL(x) ((x)->maxVal) +#define XAI_CNN_SOFTMAX_SET_MAXVAL(x, v) ((x)->maxVal = (v)) +#define XAI_CNN_SOFTMAX_GET_AXIS(x) ((x)->axis) +#define XAI_CNN_SOFTMAX_SET_AXIS(x, v) ((x)->axis = (v)) +#define XAI_CNN_SOFTMAX_GET_QUANTIZATION_MODE(x) ((x)->quantization_mode) +#define XAI_CNN_SOFTMAX_SET_QUANTIZATION_MODE(x, v) ((x)->quantization_mode 
= (v)) +#define XAI_CNN_SOFTMAX_PARAMS_GET_DIFF_MIN(x) ((x)->diff_min) +#define XAI_CNN_SOFTMAX_PARAMS_SET_DIFF_MIN(x, v) ((x)->diff_min = (v)) +#define XAI_CNN_SOFTMAX_GET_INPUT_SCALE(x) ((x)->inputScale) +#define XAI_CNN_SOFTMAX_SET_INPUT_SCALE(x, v) ((x)->inputScale = (v)) +#define XAI_CNN_SOFTMAX_GET_INPUT_SHIFT(x) ((x)->inputShift) +#define XAI_CNN_SOFTMAX_SET_INPUT_SHIFT(x, v) ((x)->inputShift = (v)) + +typedef struct +{ + int8_t quantization_mode; + // tfl related parameters + int32_t inputZeroPoint; + int32_t outputZeroPoint; + int16_t reluishMultiplierFixedpointS16; + int32_t reluishMultiplierExponent; + int16_t outputMultiplierFixedpointS16; + int32_t outputMultiplierExponent; +} xai_cnn_tfl_hardSwish_params; + +#define XAI_CNN_HARDSWISH_GET_QUANTIZATION_MODE(x) ((x)->quantization_mode) +#define XAI_CNN_HARDSWISH_SET_QUANTIZATION_MODE(x, v) ((x)->quantization_mode = (v)) +#define XAI_CNN_HARDSWISH_GET_INPUT_ZERO_POINT(x) ((x)->inputZeroPoint) +#define XAI_CNN_HARDSWISH_SET_INPUT_ZERO_POINT(x, v) ((x)->inputZeroPoint = (v)) +#define XAI_CNN_HARDSWISH_GET_OUTPUT_ZERO_POINT(x) ((x)->outputZeroPoint) +#define XAI_CNN_HARDSWISH_SET_OUTPUT_ZERO_POINT(x, v) ((x)->outputZeroPoint = (v)) +#define XAI_CNN_HARDSWISH_GET_RELUISH_MULTIPLIER_FIXED_POINT_S16(x) ((x)->reluishMultiplierFixedpointS16) +#define XAI_CNN_HARDSWISH_SET_RELUISH_MULTIPLIER_FIXED_POINT_S16(x, v) ((x)->reluishMultiplierFixedpointS16 = (v)) +#define XAI_CNN_HARDSWISH_GET_RELUISH_MULTIPLIER_EXPONENT(x) ((x)->reluishMultiplierExponent) +#define XAI_CNN_HARDSWISH_SET_RELUISH_MULTIPLIER_EXPONENT(x, v) ((x)->reluishMultiplierExponent = (v)) +#define XAI_CNN_HARDSWISH_GET_OUTPUT_MULTIPLIER_FIXED_POINT_S16(x) ((x)->outputMultiplierFixedpointS16) +#define XAI_CNN_HARDSWISH_SET_OUTOUT_MULTIPLIER_FIXED_POINT_S16(x, v) ((x)->outputMultiplierFixedpointS16 = (v)) +#define XAI_CNN_HARDSWISH_GET_OUTPUT_MULTIPLIER_EXPONENT(x) ((x)->outputMultiplierExponent) +#define 
XAI_CNN_HARDSWISH_SET_OUTPUT_MULTIPLIER_EXPONENT(x, v) ((x)->outputMultiplierExponent = (v)) + +typedef struct +{ + int8_t quantization_mode; + // tfl related parameters + int32_t inputRangeRadius; + int32_t inputScale; + int32_t inputShift; + int32_t inputZeroPoint; +} xai_cnn_sigmoid_params; + +#define XAI_CNN_SIGMOID_GET_QUANTIZATION_MODE(x) ((x)->quantization_mode) +#define XAI_CNN_SIGMOID_SET_QUANTIZATION_MODE(x, v) ((x)->quantization_mode = (v)) +#define XAI_CNN_SIGMOID_GET_INPUT_RANGE_RADIUS(x) ((x)->inputRangeRadius) +#define XAI_CNN_SIGMOID_SET_INPUT_RANGE_RADIUS(x, v) ((x)->inputRangeRadius = (v)) +#define XAI_CNN_SIGMOID_GET_INPUT_SCALE(x) ((x)->inputScale) +#define XAI_CNN_SIGMOID_SET_INPUT_SCALE(x, v) ((x)->inputScale = (v)) +#define XAI_CNN_SIGMOID_GET_INPUT_SHIFT(x) ((x)->inputShift) +#define XAI_CNN_SIGMOID_SET_INPUT_SHIFT(x, v) ((x)->inputShift = (v)) +#define XAI_CNN_SIGMOID_GET_INPUT_ZERO_POINT(x) ((x)->inputZeroPoint) +#define XAI_CNN_SIGMOID_SET_INPUT_ZERO_POINT(x, v) ((x)->inputZeroPoint = (v)) + +typedef struct +{ + int8_t quantization_mode; + // tfl related parameters + int32_t inputRangeRadius; + int32_t inputScale; + int32_t inputShift; + int32_t inputZeroPoint; + int32_t outputZeroPoint; //Hack in Glow to keep tanh and sigmoid params different +} xai_cnn_tanh_params; + +#define XAI_CNN_TANH_GET_QUANTIZATION_MODE(x) ((x)->quantization_mode) +#define XAI_CNN_TANH_SET_QUANTIZATION_MODE(x, v) ((x)->quantization_mode = (v)) +#define XAI_CNN_TANH_GET_INPUT_RANGE_RADIUS(x) ((x)->inputRangeRadius) +#define XAI_CNN_TANH_SET_INPUT_RANGE_RADIUS(x, v) ((x)->inputRangeRadius = (v)) +#define XAI_CNN_TANH_GET_INPUT_SCALE(x) ((x)->inputScale) +#define XAI_CNN_TANH_SET_INPUT_SCALE(x, v) ((x)->inputScale = (v)) +#define XAI_CNN_TANH_GET_INPUT_SHIFT(x) ((x)->inputShift) +#define XAI_CNN_TANH_SET_INPUT_SHIFT(x, v) ((x)->inputShift = (v)) +#define XAI_CNN_TANH_GET_INPUT_ZERO_POINT(x) ((x)->inputZeroPoint) +#define XAI_CNN_TANH_SET_INPUT_ZERO_POINT(x, v) 
((x)->inputZeroPoint = (v)) + + +typedef struct +{ + int32_t outputScaleIdentity; + int32_t outputShiftIdentity; + int32_t outputScaleAlpha; + int32_t outputShiftAlpha; + int32_t inputOffset; + int32_t outputOffset; + int8_t quantization_mode; +} xai_cnn_tfl_leakyrelu_params; + +#define XAI_CNN_LEAKYRELU_GET_OUTPUT_SCALE_IDENTITY(x) ((x)->outputScaleIdentity) +#define XAI_CNN_LEAKYRELU_SET_OUTPUT_SCALE_IDENTITY(x, v) ((x)->outputScaleIdentity = (v)) +#define XAI_CNN_LEAKYRELU_GET_OUTPUT_SHIFT_IDENTITY(x) ((x)->outputShiftIdentity) +#define XAI_CNN_LEAKYRELU_SET_OUTPUT_SHIFT_IDENTITY(x, v) ((x)->outputShiftIdentity = (v)) +#define XAI_CNN_LEAKYRELU_GET_OUTPUT_SCALE_ALPHA(x) ((x)->outputScaleAlpha) +#define XAI_CNN_LEAKYRELU_SET_OUTPUT_SCALE_ALPHA(x, v) ((x)->outputScaleAlpha = (v)) +#define XAI_CNN_LEAKYRELU_GET_OUTPUT_SHIFT_ALPHA(x) ((x)->outputShiftAlpha) +#define XAI_CNN_LEAKYRELU_SET_OUTPUT_SHIFT_ALPHA(x, v) ((x)->outputShiftAlpha = (v)) +#define XAI_CNN_LEAKYRELU_GET_INPUT_OFFSET(x) ((x)->inputOffset) +#define XAI_CNN_LEAKYRELU_SET_INPUT_OFFSET(x, v) ((x)->inputOffset = (v)) +#define XAI_CNN_LEAKYRELU_GET_OUTPUT_OFFSET(x) ((x)->outputOffset) +#define XAI_CNN_LEAKYRELU_SET_OUTPUT_OFFSET(x, v) ((x)->outputOffset = (v)) +#define XAI_CNN_LEAKYRELU_GET_QUANTIZATION_MODE(x) ((x)->quantization_mode) +#define XAI_CNN_LEAKYRELU_SET_QUANTIZATION_MODE(x, v) ((x)->quantization_mode = (v)) + +typedef struct +{ + int32_t outputScalePositive; + int32_t outputScaleNegative; + int32_t outputShiftPositive; + int32_t outputShiftNegative; + int32_t inputOffset; + int32_t outputOffset; + int32_t alphaOffset; + int8_t quantization_mode; +} xai_cnn_tfl_prelu_params; + +#define XAI_CNN_PRELU_GET_OUTPUT_SCALE_POSITIVE(x) ((x)->outputScalePositive) +#define XAI_CNN_PRELU_SET_OUTPUT_SCALE_POSITIVE(x, v) ((x)->outputScalePositive = (v)) +#define XAI_CNN_PRELU_GET_OUTPUT_SHIFT_POSITIVE(x) ((x)->outputShiftPositive) +#define XAI_CNN_PRELU_SET_OUTPUT_SHIFT_POSITIVE(x, v) 
((x)->outputShiftPositive = (v)) +#define XAI_CNN_PRELU_GET_OUTPUT_SCALE_NEGATIVE(x) ((x)->outputScaleNegative) +#define XAI_CNN_PRELU_SET_OUTPUT_SCALE_NEGATIVE(x, v) ((x)->outputScaleNegative = (v)) +#define XAI_CNN_PRELU_GET_OUTPUT_SHIFT_NEGATIVE(x) ((x)->outputShiftNegative) +#define XAI_CNN_PRELU_SET_OUTPUT_SHIFT_NEGATIVE(x, v) ((x)->outputShiftNegative = (v)) +#define XAI_CNN_PRELU_GET_INPUT_OFFSET(x) ((x)->inputOffset) +#define XAI_CNN_PRELU_SET_INPUT_OFFSET(x, v) ((x)->inputOffset = (v)) +#define XAI_CNN_PRELU_GET_OUTPUT_OFFSET(x) ((x)->outputOffset) +#define XAI_CNN_PRELU_SET_OUTPUT_OFFSET(x, v) ((x)->outputOffset = (v)) +#define XAI_CNN_PRELU_GET_ALPHA_OFFSET(x) ((x)->alphaOffset) +#define XAI_CNN_PRELU_SET_ALPHA_OFFSET(x, v) ((x)->alphaOffset = (v)) +#define XAI_CNN_PRELU_GET_QUANTIZATION_MODE(x) ((x)->quantization_mode) +#define XAI_CNN_PRELU_SET_QUANTIZATION_MODE(x, v) ((x)->quantization_mode = (v)) + +#if ((XCHAL_HAVE_VISION_HP_VFPU == 1) || (XCHAL_HAVE_CONNX_B_HP_VFPU == 1) || (defined(__clang__) && defined(XAI_REF_ONLY_COMPILATION))) +typedef struct +{ + int32_t axis; // axis along which softmax is to be computed + xb_f16 beta; // multiplication factor +} xai_cnn_softmaxA3D_F16_params; + +#define XAI_CNN_SOFTMAXAF16_PARAMS_GET_AXIS(x) ((x)->axis) +#define XAI_CNN_SOFTMAXAF16_PARAMS_GET_BETA(x) ((x)->beta) +#define XAI_CNN_SOFTMAXAF16_PARAMS_SET_AXIS(x, v) ((x)->axis = (v)) +#define XAI_CNN_SOFTMAXAF16_PARAMS_SET_BETA(x, v) ((x)->beta = (v)) +#endif // #if ((XCHAL_HAVE_VISION_HP_VFPU == 1) || (XCHAL_HAVE_CONNX_B_HP_VFPU == 1) || (defined(__clang__) && defined(XAI_REF_ONLY_COMPILATION))) + +#if ((XCHAL_HAVE_VISION_SP_VFPU == 1) || (XCHAL_HAVE_BBENEP_SP_VFPU == 1) || defined(XAI_REF_ONLY_COMPILATION)) +typedef struct +{ + int32_t axis; // axis along which softmax is to be computed + float beta; // multiplication factor +} xai_cnn_softmaxA3D_F32_params; + +#define XAI_CNN_SOFTMAXAF32_PARAMS_GET_AXIS(x) ((x)->axis) +#define 
XAI_CNN_SOFTMAXAF32_PARAMS_GET_BETA(x) ((x)->beta) +#define XAI_CNN_SOFTMAXAF32_PARAMS_SET_AXIS(x, v) ((x)->axis = (v)) +#define XAI_CNN_SOFTMAXAF32_PARAMS_SET_BETA(x, v) ((x)->beta = (v)) +#endif // #if ((XCHAL_HAVE_VISION_SP_VFPU == 1) || (XCHAL_HAVE_BBENEP_SP_VFPU == 1) || defined(XAI_REF_ONLY_COMPILATION)) + +typedef struct +{ + int16_t maxVal; /* global max value of a 3D tile */ + uint8_t tileFlag; /* tileFlag can take values 0-3. + 0 : neither first nor last tile + 1 : first tile + 2 : last tile + 3 : first and last tile. */ +} xai_cnn_maxval_params; + +#define XAI_CNN_MAXVAL_GET_MAXVAL(x) ((x)->maxVal) +#define XAI_CNN_MAXVAL_SET_MAXVAL(x, v) ((x)->maxVal = (v)) +#define XAI_CNN_MAXVAL_GET_TILEFLAG(x) ((x)->tileFlag) +#define XAI_CNN_MAXVAL_SET_TILEFLAG(x, v) ((x)->tileFlag = (v)) + +typedef struct +{ + uint16_t input1Scale; /* Scaling factor for 1st input */ + uint16_t input2Scale; /* Scaling factor for 2nd input */ + uint8_t accumShift; /* Accumulator Shift to bring data to 16b after scaling and addition */ + uint16_t outputScale; /* Scaling factor for Output */ + uint8_t outputShift; /* Shift value to bring the final sum to 8b */ + uint8_t reluFlag; /* Enable/Disable Relu at the output */ + int32_t minVal; /* minimum Value for clamping if reluFlag is set to 1 */ + int32_t maxVal; /* maximum Value for clamping if reluFlag is set to 1 */ + uint8_t stride; /* Stride factor */ + int32_t fixUpInit; /* The fixUp term that is used to incorporate Zero Points*/ + uint8_t sat11; /* Dummy. Not used for xai_cnn_eltwise_params. Used only in xaicnne.
Added it for consistency */ +#if ((XCHAL_HAVE_VISION_HP_VFPU == 1) || (XCHAL_HAVE_CONNX_B_HP_VFPU == 1) || (defined(__clang__) && defined(XAI_REF_ONLY_COMPILATION))) + xb_f16 reluMinFlt; + xb_f16 reluMaxFlt; +#endif +#if ((XCHAL_HAVE_VISION_SP_VFPU == 1) || (XCHAL_HAVE_BBENEP_SP_VFPU == 1) || defined(XAI_REF_ONLY_COMPILATION)) + float reluMinFlt32; + float reluMaxFlt32; +#endif +} xai_cnn_eltwise_params; + +typedef struct +{ + int32_t input1Scale; /* Scaling factor for 1st input */ + int32_t input2Scale; /* Scaling factor for 2nd input */ + int32_t input1Shift; /* Shift for 1st input */ + int32_t input2Shift; /* Shift for 2nd input */ + int32_t leftShift; /* Left Shift for both inputs */ + int32_t outputScale; /* Scaling factor for Output */ + int32_t outputShift; /* Shift value to bring the final sum to 8b */ + int32_t input1Offset; + int32_t input2Offset; + int32_t outputOffset; + uint8_t reluFlag; /* Enable/Disable Relu at the output */ + int32_t minVal; /* minimum Value for clamping if reluFlag is set to 1 */ + int32_t maxVal; /* maximum Value for clamping if reluFlag is set to 1 */ + uint8_t stride; /* Stride factor */ + int8_t quantization_mode; +}xai_cnn_tfl_eltwise_params; + +typedef struct +{ + int16_t input1Scale; /* Scaling factor for 1st input */ + int16_t input2Scale; /* Scaling factor for 2nd input */ + uint8_t accumShift; /* Accumulator Shift to bring data to 16b after scaling and addition */ + uint16_t outputScale; /* Scaling factor for Output */ + uint8_t outputShift; /* Shift value to bring the final sum to 8b */ + uint8_t reluFlag; /* Enable/Disable Relu at the output */ + int32_t minVal; /* minimum Value for clamping if reluFlag is set to 1 */ + int32_t maxVal; /* maximum Value for clamping if reluFlag is set to 1 */ + uint8_t stride; /* Stride factor */ + int32_t fixUpInit; /* The fixUp term that is used to incorporate Zero Points*/ + uint8_t sat11; /* Quantization saturation: 0 - 10 bit; 1 - 11 bit; */ +} xnne_eltwise_params; + +#define
XAI_CNN_ELTWISE_GET_INPUT1SCALE(x) ((x)->input1Scale) +#define XAI_CNN_ELTWISE_SET_INPUT1SCALE(x, v) ((x)->input1Scale = (v)) +#define XAI_CNN_ELTWISE_GET_INPUT2SCALE(x) ((x)->input2Scale) +#define XAI_CNN_ELTWISE_SET_INPUT2SCALE(x, v) ((x)->input2Scale = (v)) +#define XAI_CNN_ELTWISE_GET_INPUT1SHIFT(x) ((x)->input1Shift) +#define XAI_CNN_ELTWISE_SET_INPUT1SHIFT(x, v) ((x)->input1Shift = (v)) +#define XAI_CNN_ELTWISE_GET_INPUT2SHIFT(x) ((x)->input2Shift) +#define XAI_CNN_ELTWISE_SET_INPUT2SHIFT(x, v) ((x)->input2Shift = (v)) +#define XAI_CNN_ELTWISE_GET_LEFTSHIFT(x) ((x)->leftShift) +#define XAI_CNN_ELTWISE_SET_LEFTSHIFT(x, v) ((x)->leftShift = (v)) +#define XAI_CNN_ELTWISE_GET_ACCUMSHIFT(x) ((x)->accumShift) +#define XAI_CNN_ELTWISE_SET_ACCUMSHIFT(x, v) ((x)->accumShift = (v)) +#define XAI_CNN_ELTWISE_GET_OUTPUTSCALE(x) ((x)->outputScale) +#define XAI_CNN_ELTWISE_SET_OUTPUTSCALE(x, v) ((x)->outputScale = (v)) +#define XAI_CNN_ELTWISE_GET_OUTPUTSHIFT(x) ((x)->outputShift) +#define XAI_CNN_ELTWISE_SET_OUTPUTSHIFT(x, v) ((x)->outputShift = (v)) +#define XAI_CNN_ELTWISE_GET_INPUT1_OFFSET(x) ((x)->input1Offset) +#define XAI_CNN_ELTWISE_SET_INPUT1_OFFSET(x, v) ((x)->input1Offset = (v)) +#define XAI_CNN_ELTWISE_GET_INPUT2_OFFSET(x) ((x)->input2Offset) +#define XAI_CNN_ELTWISE_SET_INPUT2_OFFSET(x, v) ((x)->input2Offset = (v)) +#define XAI_CNN_ELTWISE_GET_OUTPUT_OFFSET(x) ((x)->outputOffset) +#define XAI_CNN_ELTWISE_SET_OUTPUT_OFFSET(x, v) ((x)->outputOffset = (v)) +#define XAI_CNN_ELTWISE_GET_QUANTIZATION_MODE(x) ((x)->quantization_mode) +#define XAI_CNN_ELTWISE_SET_QUANTIZATION_MODE(x, v) ((x)->quantization_mode = (v)) +#define XAI_CNN_ELTWISE_GET_RELUFLAG(x) ((x)->reluFlag) +#define XAI_CNN_ELTWISE_SET_RELUFLAG(x, v) ((x)->reluFlag = (v)) +#define XAI_CNN_ELTWISE_GET_MIN_VAL(x) ((x)->minVal) +#define XAI_CNN_ELTWISE_SET_MIN_VAL(x, v) ((x)->minVal = (v)) +#define XAI_CNN_ELTWISE_GET_MAX_VAL(x) ((x)->maxVal) +#define XAI_CNN_ELTWISE_SET_MAX_VAL(x, v) ((x)->maxVal = (v)) 
+#define XAI_CNN_ELTWISE_GET_STRIDE(x) ((x)->stride) +#define XAI_CNN_ELTWISE_SET_STRIDE(x, v) ((x)->stride = (v)) +#define XAI_CNN_ELTWISE_GET_FIXUPINIT(x) ((x)->fixUpInit) +#define XAI_CNN_ELTWISE_SET_FIXUPINIT(x, v) ((x)->fixUpInit = (v)) +#define XAI_CNN_ELTWISE_GET_SAT11(x) ((x)->sat11) +#define XAI_CNN_ELTWISE_SET_SAT11(x, v) ((x)->sat11 = (v)) +#define XAI_CNN_ELTWISE_ADD_STRIDE_J1 (1) +#define XAI_CNN_ELTWISE_ADD_STRIDE_J2 (2) +#define XAI_CNN_ELTWISE_ADD_STRIDE_J1J2 (3) +#define XAI_CNN_ELTWISE_SUB_STRIDE_J1 (1) +#if ((XCHAL_HAVE_VISION_HP_VFPU == 1) || (XCHAL_HAVE_CONNX_B_HP_VFPU == 1) || (defined(__clang__) && defined(XAI_REF_ONLY_COMPILATION))) +#define XAI_CNN_ELTWISE_GET_RELU_MIN_FLT(x) ((x)->reluMinFlt) +#define XAI_CNN_ELTWISE_SET_RELU_MIN_FLT(x, v) ((x)->reluMinFlt = (v)) +#define XAI_CNN_ELTWISE_GET_RELU_MAX_FLT(x) ((x)->reluMaxFlt) +#define XAI_CNN_ELTWISE_SET_RELU_MAX_FLT(x, v) ((x)->reluMaxFlt = (v)) +#endif +#if ((XCHAL_HAVE_VISION_SP_VFPU == 1) || (XCHAL_HAVE_BBENEP_SP_VFPU == 1) || defined(XAI_REF_ONLY_COMPILATION)) +#define XAI_CNN_ELTWISE_GET_RELU_MIN_FLT32(x) ((x)->reluMinFlt32) +#define XAI_CNN_ELTWISE_SET_RELU_MIN_FLT32(x, v) ((x)->reluMinFlt32 = (v)) +#define XAI_CNN_ELTWISE_GET_RELU_MAX_FLT32(x) ((x)->reluMaxFlt32) +#define XAI_CNN_ELTWISE_SET_RELU_MAX_FLT32(x, v) ((x)->reluMaxFlt32 = (v)) +#endif + +typedef struct +{ + uint16_t inputScale; /* Scaling factor for Input */ + uint8_t inputShift; /* Input Shift to bring data to 16b after scaling */ + int32_t minIdx; /* Minimum value of input. Corresponds to first element of LUT array. */ + int32_t maxIdx; /* Maximum value of input. Corresponds to last element of LUT array. */ + uint8_t tableType; /* Value to describe the type of Table: 0/1/2 - Normal/Symmetric/Asymmetric */ + int32_t lut1Offset; /* Offset of the 0th entry of lut1Array in Full range LUT table(minIdx <= lut1Offset <= maxIdx). 
*/ + int32_t lut2Offset; /* Offset of the 0th entry of lut2Array in Full range LUT table(minIdx <= lut2Offset <= maxIdx). */ +} xai_cnn_lut_params; + +#define XAI_LUT_TYPE_NORMAL 0 +#define XAI_LUT_TYPE_EVENSYMMETRIC 1 +#define XAI_LUT_TYPE_ODDSYMMETRIC 2 + +#define XAI_CNN_LUT_GET_INPUTSCALE(x) ((x)->inputScale) +#define XAI_CNN_LUT_SET_INPUTSCALE(x, v) ((x)->inputScale = (v)) +#define XAI_CNN_LUT_GET_INPUTSHIFT(x) ((x)->inputShift) +#define XAI_CNN_LUT_SET_INPUTSHIFT(x, v) ((x)->inputShift = (v)) +#define XAI_CNN_LUT_GET_MIN_IDX(x) ((x)->minIdx) +#define XAI_CNN_LUT_SET_MIN_IDX(x, v) ((x)->minIdx = (v)) +#define XAI_CNN_LUT_GET_MAX_IDX(x) ((x)->maxIdx) +#define XAI_CNN_LUT_SET_MAX_IDX(x, v) ((x)->maxIdx = (v)) +#define XAI_CNN_LUT_GET_TABLE_TYPE(x) ((x)->tableType) +#define XAI_CNN_LUT_SET_TABLE_TYPE(x, v) ((x)->tableType = (v)) +#define XAI_CNN_LUT_GET_LUT1_OFFSET(x) ((x)->lut1Offset) +#define XAI_CNN_LUT_SET_LUT1_OFFSET(x, v) ((x)->lut1Offset = (v)) +#define XAI_CNN_LUT_GET_LUT2_OFFSET(x) ((x)->lut2Offset) +#define XAI_CNN_LUT_SET_LUT2_OFFSET(x, v) ((x)->lut2Offset = (v)) + +typedef struct +{ + int16_t outputScale; /* Scaling factor for Output */ + uint8_t outputShift; /* Shift value to bring the final product to output datatype */ + uint8_t reluFlag; /* Enable/Disable Relu at the output */ + int32_t minVal; /* minimum Value for clamping */ + int32_t maxVal; /* maximum Value for clamping */ + int32_t inZero1; + int32_t inZero2; + int32_t fixUpInit; +#if ((XCHAL_HAVE_VISION_HP_VFPU == 1) || (XCHAL_HAVE_CONNX_B_HP_VFPU == 1) || (defined(__clang__) && defined(XAI_REF_ONLY_COMPILATION))) + xb_f16 reluMinFlt; + xb_f16 reluMaxFlt; +#endif +#if ((XCHAL_HAVE_VISION_SP_VFPU == 1) || (XCHAL_HAVE_BBENEP_SP_VFPU == 1) || defined(XAI_REF_ONLY_COMPILATION)) + float reluMinFlt32; + float reluMaxFlt32; +#endif +} xai_cnn_eltwiseMul_params; + +#define XAI_CNN_ELTWISE_MUL_GET_OUTPUTSCALE(x) ((x)->outputScale) +#define XAI_CNN_ELTWISE_MUL_SET_OUTPUTSCALE(x, v) ((x)->outputScale =
(v)) +#define XAI_CNN_ELTWISE_MUL_GET_OUTPUTSHIFT(x) ((x)->outputShift) +#define XAI_CNN_ELTWISE_MUL_SET_OUTPUTSHIFT(x, v) ((x)->outputShift = (v)) +#define XAI_CNN_ELTWISE_MUL_GET_RELUFLAG(x) ((x)->reluFlag) +#define XAI_CNN_ELTWISE_MUL_SET_RELUFLAG(x, v) ((x)->reluFlag = (v)) +#define XAI_CNN_ELTWISE_MUL_GET_MIN_VAL(x) ((x)->minVal) +#define XAI_CNN_ELTWISE_MUL_SET_MIN_VAL(x, v) ((x)->minVal = (v)) +#define XAI_CNN_ELTWISE_MUL_GET_MAX_VAL(x) ((x)->maxVal) +#define XAI_CNN_ELTWISE_MUL_SET_MAX_VAL(x, v) ((x)->maxVal = (v)) +#define XAI_CNN_ELTWISE_MUL_GET_INZERO_1(x) ((x)->inZero1) +#define XAI_CNN_ELTWISE_MUL_SET_INZERO_1(x, v) ((x)->inZero1 = (v)) +#define XAI_CNN_ELTWISE_MUL_GET_INZERO_2(x) ((x)->inZero2) +#define XAI_CNN_ELTWISE_MUL_SET_INZERO_2(x, v) ((x)->inZero2 = (v)) +#define XAI_CNN_ELTWISE_MUL_GET_FIXUPINIT(x) ((x)->fixUpInit) +#define XAI_CNN_ELTWISE_MUL_SET_FIXUPINIT(x, v) ((x)->fixUpInit = (v)) +#if ((XCHAL_HAVE_VISION_HP_VFPU == 1) || (XCHAL_HAVE_CONNX_B_HP_VFPU == 1) || (defined(__clang__) && defined(XAI_REF_ONLY_COMPILATION))) +#define XAI_CNN_ELTWISE_MUL_GET_RELU_MIN_FLT(x) ((x)->reluMinFlt) +#define XAI_CNN_ELTWISE_MUL_SET_RELU_MIN_FLT(x, v) ((x)->reluMinFlt = (v)) +#define XAI_CNN_ELTWISE_MUL_GET_RELU_MAX_FLT(x) ((x)->reluMaxFlt) +#define XAI_CNN_ELTWISE_MUL_SET_RELU_MAX_FLT(x, v) ((x)->reluMaxFlt = (v)) +#endif +#if ((XCHAL_HAVE_VISION_SP_VFPU == 1) || (XCHAL_HAVE_BBENEP_SP_VFPU == 1) || defined(XAI_REF_ONLY_COMPILATION)) +#define XAI_CNN_ELTWISE_MUL_GET_RELU_MIN_FLT32(x) ((x)->reluMinFlt32) +#define XAI_CNN_ELTWISE_MUL_SET_RELU_MIN_FLT32(x, v) ((x)->reluMinFlt32 = (v)) +#define XAI_CNN_ELTWISE_MUL_GET_RELU_MAX_FLT32(x) ((x)->reluMaxFlt32) +#define XAI_CNN_ELTWISE_MUL_SET_RELU_MAX_FLT32(x, v) ((x)->reluMaxFlt32 = (v)) +#endif + +/*SVDF structure */ +typedef struct +{ + int32_t nInput; + int32_t nFilter; + int32_t nMemory; + int32_t nBatch; + int32_t nRank; + int32_t biasFlag; + uint8_t shift1; + uint8_t shift2; + uint8_t accShift1; + uint8_t 
accShift2; + int32_t preset; + int32_t minVal; + int32_t maxVal; + uint8_t reluFlag; +} xai_cnn_svdf_params; + +#define S24_MIN (-(((int32_t) 1) << 23)) +#define S24_MAX ((((int32_t) 1) << 23) - 1) +#define XCHAL_IVPN_SIMD_WIDTH_2 (XCHAL_IVPN_SIMD_WIDTH >> 1) +#define USE_24_BIT_ACCUMULATOR +#define MULQISA 1 + +#define XAI_CNN_SVDF_GET_NUMINPUT(x) ((x)->nInput) +#define XAI_CNN_SVDF_SET_NUMINPUT(x, v) ((x)->nInput = (v)) +#define XAI_CNN_SVDF_GET_MIN_VAL(x) ((x)->minVal) +#define XAI_CNN_SVDF_SET_MIN_VAL(x, v) ((x)->minVal = (v)) +#define XAI_CNN_SVDF_GET_MAX_VAL(x) ((x)->maxVal) +#define XAI_CNN_SVDF_SET_MAX_VAL(x, v) ((x)->maxVal = (v)) +#define XAI_CNN_SVDF_GET_RELUFLAG(x) ((x)->reluFlag) +#define XAI_CNN_SVDF_SET_RELUFLAG(x, v) ((x)->reluFlag = (v)) +#define XAI_CNN_SVDF_GET_NUMFILTER(x) ((x)->nFilter) +#define XAI_CNN_SVDF_SET_NUMFILTER(x, v) ((x)->nFilter = (v)) +#define XAI_CNN_SVDF_GET_NUMMEMORY(x) ((x)->nMemory) +#define XAI_CNN_SVDF_SET_NUMMEMORY(x, v) ((x)->nMemory = (v)) +#define XAI_CNN_SVDF_GET_NUMBATCH(x) ((x)->nBatch) +#define XAI_CNN_SVDF_SET_NUMBATCH(x, v) ((x)->nBatch = (v)) +#define XAI_CNN_SVDF_GET_BIASFLAG(x) ((x)->biasFlag) +#define XAI_CNN_SVDF_SET_BIASFLAG(x, v) ((x)->biasFlag = (v)) +#define XAI_CNN_SVDF_GET_RANK(x) ((x)->nRank) +#define XAI_CNN_SVDF_SET_RANK(x, v) ((x)->nRank = (v)) +#define XAI_CNN_SVDF_GET_NUNIT(x) ((x)->nUnit) /* NOTE(review): no nUnit member is visible in xai_cnn_svdf_params; confirm the intended field */ +#define XAI_CNN_SVDF_SET_NUNIT(x, v) ((x)->nUnit = (v)) +#define XAI_CNN_SVDF_GET_OUTPUTSHIFT1(x) ((x)->shift1) +#define XAI_CNN_SVDF_SET_OUTPUTSHIFT1(x, v) ((x)->shift1 = (v)) +#define XAI_CNN_SVDF_GET_OUTPUTSHIFT2(x) ((x)->shift2) +#define XAI_CNN_SVDF_SET_OUTPUTSHIFT2(x, v) ((x)->shift2 = (v)) +#define XAI_CNN_SVDF_GET_ACCSHIFT1(x) ((x)->accShift1) +#define XAI_CNN_SVDF_SET_ACCSHIFT1(x, v) ((x)->accShift1 = (v)) +#define XAI_CNN_SVDF_GET_ACCSHIFT2(x) ((x)->accShift2) +#define XAI_CNN_SVDF_SET_ACCSHIFT2(x, v) ((x)->accShift2 = (v)) +#define XAI_CNN_SVDF_GET_PRESET(x) ((x)->preset) +#define
XAI_CNN_SVDF_SET_PRESET(x, v) ((x)->preset = (v)) + +typedef struct +{ + uint16_t tableLength0; /* Minor table (Table 0) length */ + uint16_t tableLength1; /* Major table (Table 1) length */ + uint16_t inMask0; /* Mask applied on input while accessing minor table entry */ + uint16_t inMask1; /* Mask applied on input while accessing major table entry */ + uint8_t inShift0; /* Shift applied on input while accessing minor table entry */ + uint8_t inShift1; /* Shift applied on input while accessing major table entry */ + uint8_t outputShift; /* No. of output bits to be right shifted. */ +} xai_cnn_exponent_params; + +#define XAI_CNN_EXPONENT_GET_OUTPUTSHIFT(x) ((x)->outputShift) +#define XAI_CNN_EXPONENT_SET_OUTPUTSHIFT(x, v) ((x)->outputShift = (v)) +#define XAI_CNN_EXPONENT_GET_TABLELENGTH_0(x) ((x)->tableLength0) +#define XAI_CNN_EXPONENT_SET_TABLELENGTH_0(x, v) ((x)->tableLength0 = (v)) +#define XAI_CNN_EXPONENT_GET_TABLELENGTH_1(x) ((x)->tableLength1) +#define XAI_CNN_EXPONENT_SET_TABLELENGTH_1(x, v) ((x)->tableLength1 = (v)) +#define XAI_CNN_EXPONENT_GET_MASK_0(x) ((x)->inMask0) +#define XAI_CNN_EXPONENT_SET_MASK_0(x, v) ((x)->inMask0 = (v)) +#define XAI_CNN_EXPONENT_GET_MASK_1(x) ((x)->inMask1) +#define XAI_CNN_EXPONENT_SET_MASK_1(x, v) ((x)->inMask1 = (v)) +#define XAI_CNN_EXPONENT_GET_SHIFT_0(x) ((x)->inShift0) +#define XAI_CNN_EXPONENT_SET_SHIFT_0(x, v) ((x)->inShift0 = (v)) +#define XAI_CNN_EXPONENT_GET_SHIFT_1(x) ((x)->inShift1) +#define XAI_CNN_EXPONENT_SET_SHIFT_1(x, v) ((x)->inShift1 = (v)) + +typedef struct +{ + uint8_t stride; /* Stride factor */ + uint8_t reverse; /* Flag to indicate direction of reorg */ +} xai_cnn_reorg_params; + +#define XAI_CNN_REORG_GET_STRIDE(x) ((x)->stride) +#define XAI_CNN_REORG_SET_STRIDE(x, v) ((x)->stride = (v)) +#define XAI_CNN_REORG_GET_REVERSE(x) ((x)->reverse) +#define XAI_CNN_REORG_SET_REVERSE(x, v) ((x)->reverse = (v)) + +typedef struct +{ + uint8_t strideX; /* StrideX factor */ + uint8_t strideY; /* StrideY factor 
*/ + uint8_t reverse; /* Flag to indicate direction of reorg */ +} xai_cnn_reorg4D_params; + +#define XAI_CNN_REORG4D_GET_STRIDEX(x) ((x)->strideX) +#define XAI_CNN_REORG4D_SET_STRIDEX(x, v) ((x)->strideX = (v)) +#define XAI_CNN_REORG4D_GET_STRIDEY(x) ((x)->strideY) +#define XAI_CNN_REORG4D_SET_STRIDEY(x, v) ((x)->strideY = (v)) +#define XAI_CNN_REORG4D_GET_REVERSE(x) ((x)->reverse) +#define XAI_CNN_REORG4D_SET_REVERSE(x, v) ((x)->reverse = (v)) + +typedef struct +{ + uint8_t order1; /* inTile dimension which will be transposed into dimension 1 of outTile */ + uint8_t order2; /* inTile dimension which will be transposed into dimension 2 of outTile */ + uint8_t order3; /* inTile dimension which will be transposed into dimension 3 of outTile */ + uint8_t order4; /* inTile dimension which will be transposed into dimension 4 of outTile */ +}xai_cnn_permute4D_params; + +#define XAI_CNN_PERMUTE4D_GET_ORDER1(x) ((x)->order1) +#define XAI_CNN_PERMUTE4D_SET_ORDER1(x, v) ((x)->order1 = (v)) +#define XAI_CNN_PERMUTE4D_GET_ORDER2(x) ((x)->order2) +#define XAI_CNN_PERMUTE4D_SET_ORDER2(x, v) ((x)->order2 = (v)) +#define XAI_CNN_PERMUTE4D_GET_ORDER3(x) ((x)->order3) +#define XAI_CNN_PERMUTE4D_SET_ORDER3(x, v) ((x)->order3 = (v)) +#define XAI_CNN_PERMUTE4D_GET_ORDER4(x) ((x)->order4) +#define XAI_CNN_PERMUTE4D_SET_ORDER4(x, v) ((x)->order4 = (v)) + +typedef struct +{ + uint32_t groups; /* Input Groups */ +} xai_cnn_shuffle3D_params; + +#define XAI_CNN_SHUFFLE_GET_INTERLEAVEGROUPS(x) ((x)->groups) +#define XAI_CNN_SHUFFLE_SET_INTERLEAVEGROUPS(x, v) ((x)->groups = (v)) + +typedef struct +{ + int32_t xscale; //Q13.18 format in xaicnn and Q21.10 format in TFL + int32_t yscale; //Q13.18 format in xaicnn and Q21.10 format in TFL + int32_t xshift; //Q13.18 format in xaicnn and Q21.10 format in TFL + int32_t yshift; //Q13.18 format in xaicnn and Q21.10 format in TFL + uint8_t extrapolationFlag; + int32_t extrapolationValue; + int32_t inputFrameWidth; + int32_t inputFrameHeight; + int8_t 
alignCorners; + int8_t halfPixelCenters; + float xscaleFlt; + float yscaleFlt; + float xshiftFlt; + float yshiftFlt; + int8_t quantization_mode; +} xai_cnn_interp3D_params; + +#define XAI_CNN_INTERP3D_GET_XSCALE(x) ((x)->xscale) +#define XAI_CNN_INTERP3D_GET_YSCALE(x) ((x)->yscale) +#define XAI_CNN_INTERP3D_GET_XSHIFT(x) ((x)->xshift) +#define XAI_CNN_INTERP3D_GET_YSHIFT(x) ((x)->yshift) +#define XAI_CNN_INTERP3D_GET_EXTRAPOLATION_FLAG(x) ((x)->extrapolationFlag) +#define XAI_CNN_INTERP3D_GET_EXTRAPOLATION_VALUE(x) ((x)->extrapolationValue) +#define XAI_CNN_INTERP3D_GET_FRAME_WIDTH(x) ((x)->inputFrameWidth) +#define XAI_CNN_INTERP3D_GET_FRAME_HEIGHT(x) ((x)->inputFrameHeight) +#define XAI_CNN_INTERP3D_GET_FLAG_ALIGN_CORNERS(x) ((x)->alignCorners) +#define XAI_CNN_INTERP3D_GET_FLAG_HALF_PIXEL_CENTERS(x) ((x)->halfPixelCenters) +#define XAI_CNN_INTERP3D_GET_XSCALE_FLT(x) ((x)->xscaleFlt) +#define XAI_CNN_INTERP3D_GET_YSCALE_FLT(x) ((x)->yscaleFlt) +#define XAI_CNN_INTERP3D_GET_XSHIFT_FLT(x) ((x)->xshiftFlt) +#define XAI_CNN_INTERP3D_GET_YSHIFT_FLT(x) ((x)->yshiftFlt) + +#define XAI_CNN_INTERP3D_SET_XSCALE(x, v) ((x)->xscale = (v)) +#define XAI_CNN_INTERP3D_SET_YSCALE(x, v) ((x)->yscale = (v)) +#define XAI_CNN_INTERP3D_SET_XSHIFT(x, v) ((x)->xshift = (v)) +#define XAI_CNN_INTERP3D_SET_YSHIFT(x, v) ((x)->yshift = (v)) +#define XAI_CNN_INTERP3D_SET_EXTRAPOLATION_FLAG(x, v) ((x)->extrapolationFlag = (v)) +#define XAI_CNN_INTERP3D_SET_EXTRAPOLATION_VALUE(x, v) ((x)->extrapolationValue = (v)) +#define XAI_CNN_INTERP3D_SET_FRAME_WIDTH(x, v) ((x)->inputFrameWidth = (v)) +#define XAI_CNN_INTERP3D_SET_FRAME_HEIGHT(x, v) ((x)->inputFrameHeight = (v)) +#define XAI_CNN_INTERP3D_SET_FLAG_ALIGN_CORNERS(x, v) ((x)->alignCorners = (v)) +#define XAI_CNN_INTERP3D_SET_FLAG_HALF_PIXEL_CENTERS(x, v) ((x)->halfPixelCenters = (v)) +#define XAI_CNN_INTERP3D_SET_XSCALE_FLT(x, v) ((x)->xscaleFlt = (v)) +#define XAI_CNN_INTERP3D_SET_YSCALE_FLT(x, v) ((x)->yscaleFlt = (v)) +#define
XAI_CNN_INTERP3D_SET_XSHIFT_FLT(x, v) ((x)->xshiftFlt = (v)) +#define XAI_CNN_INTERP3D_SET_YSHIFT_FLT(x, v) ((x)->yshiftFlt = (v)) +#define XAI_CNN_INTERP3D_GET_QUANTIZATION_MODE(x) ((x)->quantization_mode) +#define XAI_CNN_INTERP3D_SET_QUANTIZATION_MODE(x, v) ((x)->quantization_mode = (v)) + +typedef struct +{ + int32_t xscale; //Q13.18 format + int32_t yscale; //Q13.18 format + int32_t xshift; //Q13.18 format + int32_t yshift; //Q13.18 format + int32_t inputFrameWidth; + int32_t inputFrameHeight; + int8_t alignCorners; + int8_t halfPixelCenters; + float xscaleFlt; + float yscaleFlt; + float xshiftFlt; + float yshiftFlt; +#if ((XCHAL_HAVE_VISION_HP_VFPU == 1) || (XCHAL_HAVE_CONNX_B_HP_VFPU == 1) || (defined(__clang__) && defined(XAI_REF_ONLY_COMPILATION))) + xb_f16 xscaleFlt16; + xb_f16 yscaleFlt16; +#endif + int8_t quantization_mode; +} xai_cnn_resize_nearest3D_params; + +#define XAI_CNN_RESIZENEAREST3D_GET_XSCALE(x) ((x)->xscale) +#define XAI_CNN_RESIZENEAREST3D_GET_YSCALE(x) ((x)->yscale) +#define XAI_CNN_RESIZENEAREST3D_GET_XSHIFT(x) ((x)->xshift) +#define XAI_CNN_RESIZENEAREST3D_GET_YSHIFT(x) ((x)->yshift) +#define XAI_CNN_RESIZENEAREST3D_GET_FLAG_ALIGN_CORNERS(x) ((x)->alignCorners) +#define XAI_CNN_RESIZENEAREST3D_GET_FLAG_HALF_PIXEL_CENTERS(x) ((x)->halfPixelCenters) +#define XAI_CNN_RESIZENEAREST3D_GET_FRAME_WIDTH(x) ((x)->inputFrameWidth) +#define XAI_CNN_RESIZENEAREST3D_GET_FRAME_HEIGHT(x) ((x)->inputFrameHeight) +#define XAI_CNN_RESIZENEAREST3D_GET_XSCALE_FLT(x) ((x)->xscaleFlt) +#define XAI_CNN_RESIZENEAREST3D_GET_YSCALE_FLT(x) ((x)->yscaleFlt) +#define XAI_CNN_RESIZENEAREST3D_GET_XSHIFT_FLT(x) ((x)->xshiftFlt) +#define XAI_CNN_RESIZENEAREST3D_GET_YSHIFT_FLT(x) ((x)->yshiftFlt) +#if ((XCHAL_HAVE_VISION_HP_VFPU == 1) || (XCHAL_HAVE_CONNX_B_HP_VFPU == 1) || (defined(__clang__) && defined(XAI_REF_ONLY_COMPILATION))) +#define XAI_CNN_RESIZENEAREST3D_GET_XSCALE_FLT16(x) ((x)->xscaleFlt16) +#define XAI_CNN_RESIZENEAREST3D_GET_YSCALE_FLT16(x) 
((x)->yscaleFlt16) +#endif + +#define XAI_CNN_RESIZENEAREST3D_SET_XSCALE(x, v) ((x)->xscale = (v)) +#define XAI_CNN_RESIZENEAREST3D_SET_YSCALE(x, v) ((x)->yscale = (v)) +#define XAI_CNN_RESIZENEAREST3D_SET_XSHIFT(x, v) ((x)->xshift = (v)) +#define XAI_CNN_RESIZENEAREST3D_SET_YSHIFT(x, v) ((x)->yshift = (v)) +#define XAI_CNN_RESIZENEAREST3D_SET_FLAG_ALIGN_CORNERS(x, v) ((x)->alignCorners = (v)) +#define XAI_CNN_RESIZENEAREST3D_SET_FLAG_HALF_PIXEL_CENTERS(x, v) ((x)->halfPixelCenters = (v)) +#define XAI_CNN_RESIZENEAREST3D_SET_FRAME_WIDTH(x, v) ((x)->inputFrameWidth = (v)) +#define XAI_CNN_RESIZENEAREST3D_SET_FRAME_HEIGHT(x, v) ((x)->inputFrameHeight = (v)) +#define XAI_CNN_RESIZENEAREST3D_SET_XSCALE_FLT(x, v) ((x)->xscaleFlt = (v)) +#define XAI_CNN_RESIZENEAREST3D_SET_YSCALE_FLT(x, v) ((x)->yscaleFlt = (v)) +#define XAI_CNN_RESIZENEAREST3D_SET_XSHIFT_FLT(x, v) ((x)->xshiftFlt = (v)) +#define XAI_CNN_RESIZENEAREST3D_SET_YSHIFT_FLT(x, v) ((x)->yshiftFlt = (v)) +#if ((XCHAL_HAVE_VISION_HP_VFPU == 1) || (XCHAL_HAVE_CONNX_B_HP_VFPU == 1) || (defined(__clang__) && defined(XAI_REF_ONLY_COMPILATION))) +#define XAI_CNN_RESIZENEAREST3D_SET_XSCALE_FLT16(x, v) ((x)->xscaleFlt16 = (v)) +#define XAI_CNN_RESIZENEAREST3D_SET_YSCALE_FLT16(x, v) ((x)->yscaleFlt16 = (v)) +#endif +#define XAI_CNN_RESIZENEAREST3D_GET_QUANTIZATION_MODE(x) ((x)->quantization_mode) +#define XAI_CNN_RESIZENEAREST3D_SET_QUANTIZATION_MODE(x, v) ((x)->quantization_mode = (v)) + + +typedef struct +{ + int16_t epsilon; // Always added or max val is considered based on tileFlag.
+ uint8_t normType; // (1= L1 Norm, 2 = L2 Norm) + uint8_t normAxis; // indicates the combination of axes along which to normalize + uint8_t channelShareFlag; // indicates whether we have a single scale value or an array equal to number of channels + uint8_t tileFlag; // indicates whether the given tile is a first tile, last tile or neither of those + uint8_t tensorFlowFlag; // describes the usage of epsilon + int8_t quantScaleTableShift; // shift value for scalar table + int8_t rSqrtTableShift; // shift value for recip square root table + int8_t recipTableShift; // shift value for recip table + int8_t rSqrtIndexShift; // shift value recip-square-root table index + int8_t sumSquareShift; // shift value for sum of squares + float epsilonFlt; // floating point epsilon to be added to avoid divide by zero + float sumSqScaleFlt; // floating point scale value to be multiplied to sum of squares, to account for divide by N factor + int8_t quantization_mode; +} xai_cnn_normalize3D_params; + +#define XAI_CNN_NORMALIZE3D_GET_EPSILON(x) ((x)->epsilon) +#define XAI_CNN_NORMALIZE3D_SET_EPSILON(x, v) ((x)->epsilon = (v)) +#define XAI_CNN_NORMALIZE3D_GET_NORM_TYPE(x) ((x)->normType) +#define XAI_CNN_NORMALIZE3D_SET_NORM_TYPE(x, v) ((x)->normType = (v)) +#define XAI_CNN_NORMALIZE3D_GET_NORMALIZE_ALONG_WIDTH(x) ((x)->normAxis & CNN_NORMALIZE_ALONG_WIDTH) +#define XAI_CNN_NORMALIZE3D_SET_NORMALIZE_ALONG_WIDTH(x) ((x)->normAxis = ((x)->normAxis | CNN_NORMALIZE_ALONG_WIDTH)) +#define XAI_CNN_NORMALIZE3D_RESET_NORMALIZE_ALONG_WIDTH(x) ((x)->normAxis = ((x)->normAxis & ~CNN_NORMALIZE_ALONG_WIDTH)) +#define XAI_CNN_NORMALIZE3D_GET_NORMALIZE_ALONG_HEIGHT(x) ((x)->normAxis & CNN_NORMALIZE_ALONG_HEIGHT) +#define XAI_CNN_NORMALIZE3D_SET_NORMALIZE_ALONG_HEIGHT(x) ((x)->normAxis = ((x)->normAxis | CNN_NORMALIZE_ALONG_HEIGHT)) +#define XAI_CNN_NORMALIZE3D_RESET_NORMALIZE_ALONG_HEIGHT(x) ((x)->normAxis = ((x)->normAxis & ~CNN_NORMALIZE_ALONG_HEIGHT)) +#define 
XAI_CNN_NORMALIZE3D_GET_NORMALIZE_ALONG_WIDTH_AND_HEIGHT(x) ((x)->normAxis & CNN_NORMALIZE_ALONG_WIDTH_AND_HEIGHT) +#define XAI_CNN_NORMALIZE3D_SET_NORMALIZE_ALONG_WIDTH_AND_HEIGHT(x) ((x)->normAxis = ((x)->normAxis | CNN_NORMALIZE_ALONG_WIDTH_AND_HEIGHT)) +#define XAI_CNN_NORMALIZE3D_RESET_NORMALIZE_ALONG_WIDTH_AND_HEIGHT(x) ((x)->normAxis = ((x)->normAxis & ~CNN_NORMALIZE_ALONG_WIDTH_AND_HEIGHT)) +#define XAI_CNN_NORMALIZE3D_GET_NORMALIZE_ALONG_DEPTH(x) ((x)->normAxis & CNN_NORMALIZE_ALONG_DEPTH) +#define XAI_CNN_NORMALIZE3D_SET_NORMALIZE_ALONG_DEPTH(x) ((x)->normAxis = ((x)->normAxis | CNN_NORMALIZE_ALONG_DEPTH)) +#define XAI_CNN_NORMALIZE3D_RESET_NORMALIZE_ALONG_DEPTH(x) ((x)->normAxis = ((x)->normAxis & ~CNN_NORMALIZE_ALONG_DEPTH)) +#define XAI_CNN_NORMALIZE3D_GET_NORMALIZE_ALONG_BATCH(x) ((x)->normAxis & CNN_NORMALIZE_ALONG_BATCH) +#define XAI_CNN_NORMALIZE3D_SET_NORMALIZE_ALONG_BATCH(x) ((x)->normAxis = ((x)->normAxis | CNN_NORMALIZE_ALONG_BATCH)) +#define XAI_CNN_NORMALIZE3D_RESET_NORMALIZE_ALONG_BATCH(x) ((x)->normAxis = ((x)->normAxis & ~CNN_NORMALIZE_ALONG_BATCH)) +#define XAI_CNN_NORMALIZE3D_GET_NORMALIZE_ALONG_WIDTH_HEIGHT_DEPTH(x) ((x)->normAxis & CNN_NORMALIZE_ALONG_WIDTH_AND_HEIGHT_AND_DEPTH) +#define XAI_CNN_NORMALIZE3D_SET_NORMALIZE_ALONG_WIDTH_HEIGHT_DEPTH(x) ((x)->normAxis = ((x)->normAxis | CNN_NORMALIZE_ALONG_WIDTH_AND_HEIGHT_AND_DEPTH)) +#define XAI_CNN_NORMALIZE3D_RESET_NORMALIZE_ALONG_WIDTH_HEIGHT_DEPTH(x) ((x)->normAxis = ((x)->normAxis & ~CNN_NORMALIZE_ALONG_WIDTH_AND_HEIGHT_AND_DEPTH)) +#define XAI_CNN_NORMALIZE3D_GET_CHANNEL_SHARE_FLAG(x) ((x)->channelShareFlag & CNN_NORMALIZE_CHANNEL_SHARE_FLAG) +#define XAI_CNN_NORMALIZE3D_SET_CHANNEL_SHARE_FLAG(x) ((x)->channelShareFlag = ((x)->channelShareFlag | CNN_NORMALIZE_CHANNEL_SHARE_FLAG)) +#define XAI_CNN_NORMALIZE3D_RESET_CHANNEL_SHARE_FLAG(x) ((x)->channelShareFlag = ((x)->channelShareFlag & ~CNN_NORMALIZE_CHANNEL_SHARE_FLAG)) +#define XAI_CNN_NORMALIZE3D_GET_TILE_FLAG(x) ((x)->tileFlag) 
+#define XAI_CNN_NORMALIZE3D_SET_TILE_FLAG(x, v) ((x)->tileFlag = (v)) +#define XAI_CNN_NORMALIZE3D_GET_TENSORFLOW_FLAG(x) ((x)->tensorFlowFlag) +#define XAI_CNN_NORMALIZE3D_SET_TENSORFLOW_FLAG(x, v) ((x)->tensorFlowFlag = (v)) +#define XAI_CNN_NORMALIZE3D_GET_RSQRT_TABLE_SHIFT(x) ((x)->rSqrtTableShift) +#define XAI_CNN_NORMALIZE3D_SET_RSQRT_TABLE_SHIFT(x, v) ((x)->rSqrtTableShift = (v)) +#define XAI_CNN_NORMALIZE3D_GET_RECIP_TABLE_SHIFT(x) ((x)->recipTableShift) +#define XAI_CNN_NORMALIZE3D_SET_RECIP_TABLE_SHIFT(x, v) ((x)->recipTableShift = (v)) +#define XAI_CNN_NORMALIZE3D_GET_RSQRT_INDEX_SHIFT(x) ((x)->rSqrtIndexShift) +#define XAI_CNN_NORMALIZE3D_SET_RSQRT_INDEX_SHIFT(x, v) ((x)->rSqrtIndexShift = (v)) +#define XAI_CNN_NORMALIZE3D_GET_SUM_SQUARE_SHIFT(x) ((x)->sumSquareShift) +#define XAI_CNN_NORMALIZE3D_SET_SUM_SQUARE_SHIFT(x, v) ((x)->sumSquareShift = (v)) +#define XAI_CNN_NORMALIZE3D_GET_QUANT_SCALE_TABLE_SHIFT(x) ((x)->quantScaleTableShift) +#define XAI_CNN_NORMALIZE3D_SET_QUANT_SCALE_TABLE_SHIFT(x, v) ((x)->quantScaleTableShift = (v)) +#define XAI_CNN_NORMALIZE3D_GET_EPSILON_FLT(x) ((x)->epsilonFlt) +#define XAI_CNN_NORMALIZE3D_SET_EPSILON_FLT(x, v) ((x)->epsilonFlt = (v)) +#define XAI_CNN_NORMALIZE3D_GET_SUM_SQ_SCALE_FLT(x) ((x)->sumSqScaleFlt) +#define XAI_CNN_NORMALIZE3D_SET_SUM_SQ_SCALE_FLT(x, v) ((x)->sumSqScaleFlt = (v)) +#define XAI_CNN_NORMALIZE3D_GET_QUANTIZATION_MODE(x) ((x)->quantization_mode) +#define XAI_CNN_NORMALIZE3D_SET_QUANTIZATION_MODE(x, v) ((x)->quantization_mode = (v)) + +typedef struct +{ + uint8_t outputShift; /* Shift value to bring the final value to 8b */ + uint8_t tileFlag; + uint8_t meanShift; /* set to a S to do the division */ + uint8_t sqAccShift; /* set to a shift value of accumulation of squares to 32 bits*/ + int32_t meanScale; /*Scale = (1<outputShift) +#define XAI_CNN_INSTANCE_NORM_SET_OUTPUTSHIFT(x, v) ((x)->outputShift = (v)) +#define XAI_CNN_INSTANCE_NORM_GET_TILEFLAG(x) ((x)->tileFlag) +#define 
XAI_CNN_INSTANCE_NORM_SET_TILEFLAG(x, v) ((x)->tileFlag = (v)) +#define XAI_CNN_INSTANCE_NORM_GET_MEANSCALE(x) ((x)->meanScale) +#define XAI_CNN_INSTANCE_NORM_SET_MEANSCALE(x, v) ((x)->meanScale = (v)) +#define XAI_CNN_INSTANCE_NORM_GET_MEANSHIFT(x) ((x)->meanShift) +#define XAI_CNN_INSTANCE_NORM_SET_MEANSHIFT(x, v) ((x)->meanShift = (v)) +#define XAI_CNN_INSTANCE_NORM_GET_RELUFLAG(x) ((x)->reluFlag) +#define XAI_CNN_INSTANCE_NORM_SET_RELUFLAG(x, v) ((x)->reluFlag = (v)) +#define XAI_CNN_INSTANCE_NORM_GET_MIN_VAL(x) ((x)->minVal) +#define XAI_CNN_INSTANCE_NORM_SET_MIN_VAL(x, v) ((x)->minVal = (v)) +#define XAI_CNN_INSTANCE_NORM_GET_MAX_VAL(x) ((x)->maxVal) +#define XAI_CNN_INSTANCE_NORM_SET_MAX_VAL(x, v) ((x)->maxVal = (v)) +#define XAI_CNN_INSTANCE_NORM_GET_SQACCSHIFT(x) ((x)->sqAccShift) +#define XAI_CNN_INSTANCE_NORM_SET_SQACCSHIFT(x, v) ((x)->sqAccShift = (v)) +#define XAI_CNN_INSTANCE_NORM_GET_AXIS(x) ((x)->axis) +#define XAI_CNN_INSTANCE_NORM_SET_AXIS(x, v) ((x)->axis = (v)) +#if ((XCHAL_HAVE_VISION_HP_VFPU == 1) || (XCHAL_HAVE_CONNX_B_HP_VFPU == 1) || (defined(__clang__) && defined(XAI_REF_ONLY_COMPILATION))) +#define XAI_CNN_INSTANCE_NORM_GET_RELU_MIN_FLT(x) ((x)->reluMinFlt) +#define XAI_CNN_INSTANCE_NORM_SET_RELU_MIN_FLT(x, v) ((x)->reluMinFlt = (v)) +#define XAI_CNN_INSTANCE_NORM_GET_RELU_MAX_FLT(x) ((x)->reluMaxFlt) +#define XAI_CNN_INSTANCE_NORM_SET_RELU_MAX_FLT(x, v) ((x)->reluMaxFlt = (v)) +#define XAI_CNN_INSTANCE_NORM_GET_EPSILON_FLT(x) ((x)->epsilon) +#define XAI_CNN_INSTANCE_NORM_SET_EPSILON_FLT(x, v) ((x)->epsilon = (v)) +#define XAI_CNN_INSTANCE_NORM_GET_MEANSCALE_FLT(x) ((x)->meanScaleFlt) +#define XAI_CNN_INSTANCE_NORM_SET_MEANSCALE_FLT(x, v) ((x)->meanScaleFlt = (v)) +#define XAI_CNN_INSTANCE_NORM_GET_MEANSCALE_FLT(x) ((x)->meanScaleFlt) +#define XAI_CNN_INSTANCE_NORM_SET_MEANSCALE_FLT(x, v) ((x)->meanScaleFlt = (v)) +#endif +#if ((XCHAL_HAVE_VISION_SP_VFPU == 1) || (XCHAL_HAVE_BBENEP_SP_VFPU == 1) || defined(XAI_REF_ONLY_COMPILATION)) 
+#define XAI_CNN_INSTANCE_NORM_GET_RELU_MIN_FLT32(x) ((x)->reluMinFlt32) +#define XAI_CNN_INSTANCE_NORM_SET_RELU_MIN_FLT32(x, v) ((x)->reluMinFlt32 = (v)) +#define XAI_CNN_INSTANCE_NORM_GET_RELU_MAX_FLT32(x) ((x)->reluMaxFlt32) +#define XAI_CNN_INSTANCE_NORM_SET_RELU_MAX_FLT32(x, v) ((x)->reluMaxFlt32 = (v)) +#define XAI_CNN_INSTANCE_NORM_GET_EPSILON_FLT32(x) ((x)->epsilonFlt32) +#define XAI_CNN_INSTANCE_NORM_SET_EPSILON_FLT32(x, v) ((x)->epsilonFlt32 = (v)) +#define XAI_CNN_INSTANCE_NORM_GET_MEANSCALE_FLT32(x) ((x)->meanScaleFlt32) +#define XAI_CNN_INSTANCE_NORM_SET_MEANSCALE_FLT32(x, v) ((x)->meanScaleFlt32 = (v)) +#define XAI_CNN_INSTANCE_NORM_GET_MEANSCALE_FLT32(x) ((x)->meanScaleFlt32) +#define XAI_CNN_INSTANCE_NORM_SET_MEANSCALE_FLT32(x, v) ((x)->meanScaleFlt32 = (v)) +#endif + +typedef struct +{ + uint32_t valueR; /* constant value which needs to be divided with divisor + for each channel , can take a maximum range of (2^15) - 1 + for I8 input and 2^31-1 for S16 input */ + uint8_t outShift; /* Shift value applied to scaled output */ +} xai_cnn_divide3D_params; + +#define XAI_CNN_CHANNELWISE_DIVIDE_GET_VALUE_R(x) ((x)->valueR) +#define XAI_CNN_CHANNELWISE_DIVIDE_SET_VALUE_R(x, v) ((x)->valueR = (v)) +#define XAI_CNN_CHANNELWISE_DIVIDE_GET_OUT_SHIFT(x) ((x)->outShift) +#define XAI_CNN_CHANNELWISE_DIVIDE_SET_OUT_SHIFT(x, v) ((x)->outShift = (v)) + +#if ((XCHAL_HAVE_VISION_HP_VFPU == 1) || (XCHAL_HAVE_CONNX_B_HP_VFPU == 1) || (defined(__clang__) && defined(XAI_REF_ONLY_COMPILATION))) + +#define CNNA_CONV_F16_FLAG_RELU 1 +#define CNNA_CONV_F16_FLAG_LEFTEDGE (1 << 1) +#define CNNA_CONV_F16_FLAG_TOPEDGE (1 << 2) +#endif // #if ((XCHAL_HAVE_VISION_HP_VFPU == 1) || (XCHAL_HAVE_CONNX_B_HP_VFPU == 1) || (defined(__clang__) && defined(XAI_REF_ONLY_COMPILATION))) +typedef struct +{ + uint16_t spatialScaleShiftX; /* Shift value to apply for spatial scale operations in the X direction */ + uint16_t spatialScaleShiftY; /* Shift value to apply for spatial scale operations in 
the Y direction */ + uint16_t outShift; /* Is either 7, 8, 15, 16, or 23 depending on the datatype of input */ + int32_t extrapolationValue; /* Extrapolate value to be used during extrapolation */ + int32_t roiStride; /* ROI coordinates' stride */ + uint8_t tensorFlowFlag; /* Flag to change box coordinates ordering from Caffe2 to TensorFlow */ +} xai_cnn_cropResize3D_params; + +#define XAI_CNN_CROP_RESIZE3D_GET_SPATIAL_SCALE_SHIFTX(x) ((x)->spatialScaleShiftX) +#define XAI_CNN_CROP_RESIZE3D_SET_SPATIAL_SCALE_SHIFTX(x, v) ((x)->spatialScaleShiftX = (v)) +#define XAI_CNN_CROP_RESIZE3D_GET_SPATIAL_SCALE_SHIFTY(x) ((x)->spatialScaleShiftY) +#define XAI_CNN_CROP_RESIZE3D_SET_SPATIAL_SCALE_SHIFTY(x, v) ((x)->spatialScaleShiftY = (v)) +#define XAI_CNN_CROP_RESIZE3D_GET_OUT_SHIFT(x) ((x)->outShift) +#define XAI_CNN_CROP_RESIZE3D_SET_OUT_SHIFT(x, v) ((x)->outShift = (v)) +#define XAI_CNN_CROP_RESIZE3D_GET_EXTRAPOLATION_VALUE(x) ((x)->extrapolationValue) +#define XAI_CNN_CROP_RESIZE3D_SET_EXTRAPOLATION_VALUE(x, v) ((x)->extrapolationValue = (v)) +#define XAI_CNN_CROP_RESIZE3D_GET_ROI_STRIDE(x) ((x)->roiStride) +#define XAI_CNN_CROP_RESIZE3D_SET_ROI_STRIDE(x, v) ((x)->roiStride = (v)) +#define XAI_CNN_CROP_RESIZE3D_GET_TENSORFLOW_FLAG(x) ((x)->tensorFlowFlag) +#define XAI_CNN_CROP_RESIZE3D_SET_TENSORFLOW_FLAG(x, v) ((x)->tensorFlowFlag = (v)) + +typedef struct +{ + uint8_t outputShift; /* Shift value to bring the final value to 8b */ + uint8_t reluFlag; /* Enable/Disable Relu at the output */ + int32_t minVal; /* minimum Value for clamping if reluFlag is set to 1 */ + int32_t maxVal; /* maximum Value for clamping if reluFlag is set to 1 */ +#if ((XCHAL_HAVE_VISION_HP_VFPU == 1) || (XCHAL_HAVE_CONNX_B_HP_VFPU == 1) || (defined(__clang__) && defined(XAI_REF_ONLY_COMPILATION))) + xb_f16 reluMinFlt; + xb_f16 reluMaxFlt; +#endif +#if ((XCHAL_HAVE_VISION_SP_VFPU == 1) || (XCHAL_HAVE_BBENEP_SP_VFPU == 1) || defined(XAI_REF_ONLY_COMPILATION)) + float reluMinFlt32; + float 
reluMaxFlt32; +#endif +} xai_cnn_batchnorm_params; + +#define XAI_CNN_BATCHNORM_GET_OUTPUTSHIFT(x) ((x)->outputShift) +#define XAI_CNN_BATCHNORM_SET_OUTPUTSHIFT(x, v) ((x)->outputShift = (v)) +#define XAI_CNN_BATCHNORM_GET_RELUFLAG(x) ((x)->reluFlag) +#define XAI_CNN_BATCHNORM_SET_RELUFLAG(x, v) ((x)->reluFlag = (v)) +#define XAI_CNN_BATCHNORM_GET_MIN_VAL(x) ((x)->minVal) +#define XAI_CNN_BATCHNORM_SET_MIN_VAL(x, v) ((x)->minVal = (v)) +#define XAI_CNN_BATCHNORM_GET_MAX_VAL(x) ((x)->maxVal) +#define XAI_CNN_BATCHNORM_SET_MAX_VAL(x, v) ((x)->maxVal = (v)) +#if ((XCHAL_HAVE_VISION_HP_VFPU == 1) || (XCHAL_HAVE_CONNX_B_HP_VFPU == 1) || (defined(__clang__) && defined(XAI_REF_ONLY_COMPILATION))) +#define XAI_CNN_BATCHNORM_GET_RELU_MIN_FLT(x) ((x)->reluMinFlt) +#define XAI_CNN_BATCHNORM_SET_RELU_MIN_FLT(x, v) ((x)->reluMinFlt = (v)) +#define XAI_CNN_BATCHNORM_GET_RELU_MAX_FLT(x) ((x)->reluMaxFlt) +#define XAI_CNN_BATCHNORM_SET_RELU_MAX_FLT(x, v) ((x)->reluMaxFlt = (v)) +#endif +#if ((XCHAL_HAVE_VISION_SP_VFPU == 1) || (XCHAL_HAVE_BBENEP_SP_VFPU == 1) || defined(XAI_REF_ONLY_COMPILATION)) +#define XAI_CNN_BATCHNORM_GET_RELU_MIN_FLT32(x) ((x)->reluMinFlt32) +#define XAI_CNN_BATCHNORM_SET_RELU_MIN_FLT32(x, v) ((x)->reluMinFlt32 = (v)) +#define XAI_CNN_BATCHNORM_GET_RELU_MAX_FLT32(x) ((x)->reluMaxFlt32) +#define XAI_CNN_BATCHNORM_SET_RELU_MAX_FLT32(x, v) ((x)->reluMaxFlt32 = (v)) +#endif + +typedef struct +{ +#if ((XCHAL_HAVE_VISION_HP_VFPU == 1) || (XCHAL_HAVE_CONNX_B_HP_VFPU == 1) || (defined(__clang__) && defined(XAI_REF_ONLY_COMPILATION))) + xb_f16 lambdaF16; + xb_f16 alphaF16; +#endif +#if ((XCHAL_HAVE_VISION_SP_VFPU == 1) || (XCHAL_HAVE_BBENEP_SP_VFPU == 1) || defined(XAI_REF_ONLY_COMPILATION)) + float lambdaF32; + float alphaF32; +#endif +} xai_cnn_selu_params; + +#if ((XCHAL_HAVE_VISION_HP_VFPU == 1) || (XCHAL_HAVE_CONNX_B_HP_VFPU == 1) || (defined(__clang__) && defined(XAI_REF_ONLY_COMPILATION))) +#define XAI_CNN_SELU_GET_LAMBDA16(x) ((x)->lambdaF16) +#define 
XAI_CNN_SELU_SET_LAMBDA16(x, v) ((x)->lambdaF16 = (v)) +#define XAI_CNN_SELU_GET_ALPHA16(x) ((x)->alphaF16) +#define XAI_CNN_SELU_SET_ALPHA16(x, v) ((x)->alphaF16 = (v)) +#endif +#if ((XCHAL_HAVE_VISION_SP_VFPU == 1) || (XCHAL_HAVE_BBENEP_SP_VFPU == 1) || defined(XAI_REF_ONLY_COMPILATION)) +#define XAI_CNN_SELU_GET_LAMBDA32(x) ((x)->lambdaF32) +#define XAI_CNN_SELU_SET_LAMBDA32(x, v) ((x)->lambdaF32 = (v)) +#define XAI_CNN_SELU_GET_ALPHA32(x) ((x)->alphaF32) +#define XAI_CNN_SELU_SET_ALPHA32(x, v) ((x)->alphaF32 = (v)) +#endif + +typedef struct +{ + int16_t ZeroIn; /* Zero Point value for Input Tile*/ + int16_t ZeroOut; /* Zero Point value for output*/ + uint16_t renormScale; /* Scale applied on (input - ZeroIn) */ + uint8_t renormShift; /* Shift applied to obtain S8 output */ +} xai_cnn_renorm_params; + +#define XAI_CNN_RENORM_GET_ZEROIN(x) ((x)->ZeroIn) +#define XAI_CNN_RENORM_SET_ZEROIN(x, v) ((x)->ZeroIn = (v)) +#define XAI_CNN_RENORM_GET_ZEROOUT(x) ((x)->ZeroOut) +#define XAI_CNN_RENORM_SET_ZEROOUT(x, v) ((x)->ZeroOut = (v)) +#define XAI_CNN_RENORM_GET_RENORMSCALE(x) ((x)->renormScale) +#define XAI_CNN_RENORM_SET_RENORMSCALE(x, v) ((x)->renormScale = (v)) +#define XAI_CNN_RENORM_GET_RENORMSHIFT(x) ((x)->renormShift) +#define XAI_CNN_RENORM_SET_RENORMSHIFT(x, v) ((x)->renormShift = (v)) + +#if (XCHAL_HAVE_HIFI1 || XCHAL_HAVE_HIFI3Z || XCHAL_HAVE_HIFI4 || XCHAL_HAVE_HIFI5) +typedef struct +{ + int32_t ZeroIn; + int32_t ZeroOut; + int32_t requantScale; + int32_t requantShift; + int8_t quantization_mode; +} xai_cnn_tfl_requantize_params; + +#define XAI_CNN_REQUANT_GET_ZEROIN(x) ((x)->ZeroIn) +#define XAI_CNN_REQUANT_SET_ZEROIN(x, v) ((x)->ZeroIn = (v)) +#define XAI_CNN_REQUANT_GET_ZEROOUT(x) ((x)->ZeroOut) +#define XAI_CNN_REQUANT_SET_ZEROOUT(x, v) ((x)->ZeroOut = (v)) +#define XAI_CNN_REQUANT_GET_REQUANTSCALE(x) ((x)->requantScale) +#define XAI_CNN_REQUANT_SET_REQUANTSCALE(x, v) ((x)->requantScale = (v)) +#define XAI_CNN_REQUANT_GET_REQUANTSHIFT(x) 
((x)->requantShift) +#define XAI_CNN_REQUANT_SET_REQUANTSHIFT(x, v) ((x)->requantShift = (v)) +#define XAI_CNN_REQUANT_GET_QUANTIZATION_MODE(x) ((x)->quantization_mode) +#define XAI_CNN_REQUANT_SET_QUANTIZATION_MODE(x, v) ((x)->quantization_mode = (v)) + +#else +typedef struct +{ + int16_t ZeroIn; + int16_t ZeroOut; + int32_t requantScale; + int32_t requantShift; + int8_t quantization_mode; +} xai_cnn_tfl_requantize_params; + +#define XAI_CNN_REQUANT_GET_ZEROIN(x) ((x)->ZeroIn) +#define XAI_CNN_REQUANT_SET_ZEROIN(x, v) ((x)->ZeroIn = (v)) +#define XAI_CNN_REQUANT_GET_ZEROOUT(x) ((x)->ZeroOut) +#define XAI_CNN_REQUANT_SET_ZEROOUT(x, v) ((x)->ZeroOut = (v)) +#define XAI_CNN_REQUANT_GET_REQUANTSCALE(x) ((x)->requantScale) +#define XAI_CNN_REQUANT_SET_REQUANTSCALE(x, v) ((x)->requantScale = (v)) +#define XAI_CNN_REQUANT_GET_REQUANTSHIFT(x) ((x)->requantShift) +#define XAI_CNN_REQUANT_SET_REQUANTSHIFT(x, v) ((x)->requantShift = (v)) +#define XAI_CNN_REQUANT_GET_QUANTIZATION_MODE(x) ((x)->quantization_mode) +#define XAI_CNN_REQUANT_SET_QUANTIZATION_MODE(x, v) ((x)->quantization_mode = (v)) +#endif + +typedef struct +{ + uint16_t outputScale; /* Scaling factor for Output */ + uint8_t outputShift; /* Shift value to bring the output to 16b */ +} xai_cnn_relu_params; + +#define XAI_CNN_RELU_GET_OUTPUTSCALE(x) ((x)->outputScale) +#define XAI_CNN_RELU_SET_OUTPUTSCALE(x, v) ((x)->outputScale = (v)) +#define XAI_CNN_RELU_GET_OUTPUTSHIFT(x) ((x)->outputShift) +#define XAI_CNN_RELU_SET_OUTPUTSHIFT(x, v) ((x)->outputShift = (v)) + +typedef struct +{ + uint8_t config; // Determines reduction across particular dimensions + uint8_t tileFlag; // Determines which tile is currently being processed + // 0 --> intermediate tile, 1 --> first tile, 2 --> last tile, 3 --> first and last tile + int32_t fixUpInit; // The fixUp term that is used to incorporate Zero Points + uint8_t accShiftU; // The value by which the accumulated value is right shifted + uint8_t outShiftU; // The value by which the
intermediate output value is right shifted + uint16_t outScale; // The value by which acc-shifted value is multiplied to give intermediate output value + uint8_t enableReLu; // Indicates if relu functionality needs to be enabled (1) or not (0) + int64_t reluMin; // The lower limit value which will be used for clamping the outputs + int64_t reluMax; // The upper limit value which will be used for clamping the outputs + bool take_abs; // Indicates if absolute value needs to be taken (true) or not (false) + int32_t redEleCount; // Total number of elements reduced in the output +} xai_cnn_reduce_params; + +#define XAI_CNN_REDUCE_GET_CONFIG(x) ((x)->config) +#define XAI_CNN_REDUCE_GET_TILEFLAG(x) ((x)->tileFlag) +#define XAI_CNN_REDUCE_GET_FIXUPINIT(x) ((x)->fixUpInit) +#define XAI_CNN_REDUCE_GET_ACCSHIFT(x) ((x)->accShiftU) +#define XAI_CNN_REDUCE_GET_OUTPUTSHIFT(x) ((x)->outShiftU) +#define XAI_CNN_REDUCE_GET_OUTPUTSCALE(x) ((x)->outScale) +#define XAI_CNN_REDUCE_GET_FLAG_RELU(x) ((x)->enableReLu) +#define XAI_CNN_REDUCE_GET_RELU_MIN(x) ((x)->reluMin) +#define XAI_CNN_REDUCE_GET_RELU_MAX(x) ((x)->reluMax) +#define XAI_CNN_REDUCE_GET_TAKEABS(x) ((x)->take_abs) +#define XAI_CNN_REDUCE_GET_REDUCED_ELEMENTS_COUNT(x) ((x)->redEleCount) +/* Setters: the value argument is parenthesized as (v), like the other parameter SET macros, so any expression can be passed safely */ +#define XAI_CNN_REDUCE_SET_CONFIG(x, v) ((x)->config = (v)) +#define XAI_CNN_REDUCE_SET_TILEFLAG(x, v) ((x)->tileFlag = (v)) +#define XAI_CNN_REDUCE_SET_FIXUPINIT(x, v) ((x)->fixUpInit = (v)) +#define XAI_CNN_REDUCE_SET_ACCSHIFT(x, v) ((x)->accShiftU = (v)) +#define XAI_CNN_REDUCE_SET_OUTPUTSHIFT(x, v) ((x)->outShiftU = (v)) +#define XAI_CNN_REDUCE_SET_OUTPUTSCALE(x, v) ((x)->outScale = (v)) +#define XAI_CNN_REDUCE_SET_FLAG_RELU(x, v) ((x)->enableReLu = (v)) +#define XAI_CNN_REDUCE_SET_RELU_MIN(x, v) ((x)->reluMin = (v)) +#define XAI_CNN_REDUCE_SET_RELU_MAX(x, v) ((x)->reluMax = (v)) +#define XAI_CNN_REDUCE_SET_TAKEABS(x, v) ((x)->take_abs = (v)) +#define XAI_CNN_REDUCE_SET_REDUCED_ELEMENTS_COUNT(x, v) ((x)->redEleCount = (v)) + +#define XAI_CNN_REDUCE_DIM1
(0x1) +#define XAI_CNN_REDUCE_DIM2 (0x2) +#define XAI_CNN_REDUCE_DIM3 (0x4) +#define XAI_CNN_REDUCE_DIM4 (0x8) +/* Combined masks: bitwise OR of the single-dimension flags above */ +#define XAI_CNN_REDUCE_DIM12 (XAI_CNN_REDUCE_DIM1 | XAI_CNN_REDUCE_DIM2) +#define XAI_CNN_REDUCE_DIM13 (XAI_CNN_REDUCE_DIM1 | XAI_CNN_REDUCE_DIM3) +#define XAI_CNN_REDUCE_DIM14 (XAI_CNN_REDUCE_DIM1 | XAI_CNN_REDUCE_DIM4) +#define XAI_CNN_REDUCE_DIM23 (XAI_CNN_REDUCE_DIM2 | XAI_CNN_REDUCE_DIM3) +#define XAI_CNN_REDUCE_DIM24 (XAI_CNN_REDUCE_DIM2 | XAI_CNN_REDUCE_DIM4) + +#define XAI_CNN_REDUCE_DIM34 (XAI_CNN_REDUCE_DIM3 | XAI_CNN_REDUCE_DIM4) +#define XAI_CNN_REDUCE_DIM123 (XAI_CNN_REDUCE_DIM1 | XAI_CNN_REDUCE_DIM2 | XAI_CNN_REDUCE_DIM3) +#define XAI_CNN_REDUCE_DIM124 (XAI_CNN_REDUCE_DIM1 | XAI_CNN_REDUCE_DIM2 | XAI_CNN_REDUCE_DIM4) +#define XAI_CNN_REDUCE_DIM134 (XAI_CNN_REDUCE_DIM1 | XAI_CNN_REDUCE_DIM3 | XAI_CNN_REDUCE_DIM4) + +#define XAI_CNN_REDUCE_DIM234 (XAI_CNN_REDUCE_DIM2 | XAI_CNN_REDUCE_DIM3 | XAI_CNN_REDUCE_DIM4) +#define XAI_CNN_REDUCE_DIM1234 (XAI_CNN_REDUCE_DIM1 | XAI_CNN_REDUCE_DIM2 | XAI_CNN_REDUCE_DIM3 | XAI_CNN_REDUCE_DIM4) +/* Tile position codes; values match the tileFlag encoding documented in xai_cnn_reduce_params */ +#define XAI_CNN_REDUCE_INTERMEDIATE_TILE 0 +#define XAI_CNN_REDUCE_FIRST_TILE 1 +#define XAI_CNN_REDUCE_LAST_TILE 2 +#define XAI_CNN_REDUCE_FIRST_LAST_TILE 3 + +/* Matrix Multiplication Params */ +typedef struct +{ + uint8_t accumShift; // Accumulator Shift - Shift to convert accumulator data to 16 bit + uint16_t outputScale; // Amount by which shifted data is scaled + uint8_t outputShift; // Shift amount to convert the scaled data to 16 bit + int8_t zeroPointIn1; // zero point for asymmetric input1 data + int8_t zeroPointIn2; // zero point for asymmetric input2 data +} xai_cnn_matmul_params; + +#define XAI_CNN_MATMUL_GET_ACCUM_SHIFT(x) ((x)->accumShift) +#define XAI_CNN_MATMUL_SET_ACCUM_SHIFT(x, v) ((x)->accumShift = (v)) +#define XAI_CNN_MATMUL_GET_OUTPUT_SCALE(x) ((x)->outputScale) +#define XAI_CNN_MATMUL_SET_OUTPUT_SCALE(x, v) ((x)->outputScale = (v)) +#define XAI_CNN_MATMUL_GET_OUTPUT_SHIFT(x)
((x)->outputShift) +#define XAI_CNN_MATMUL_SET_OUTPUT_SHIFT(x, v) ((x)->outputShift = (v)) +#define XAI_CNN_MATMUL_GET_ZERO_POINT1(x) ((x)->zeroPointIn1) +#define XAI_CNN_MATMUL_SET_ZERO_POINT1(x, v) ((x)->zeroPointIn1 = (v)) +#define XAI_CNN_MATMUL_GET_ZERO_POINT2(x) ((x)->zeroPointIn2) +#define XAI_CNN_MATMUL_SET_ZERO_POINT2(x, v) ((x)->zeroPointIn2 = (v)) + +/* Matrix Multiplication TFL Params */ +typedef struct +{ + int32_t outputScale; + int32_t outputShift; + int32_t lhsTranspose; // Can be 0 or 1 + int32_t rhsTranspose; // Can be 0 or 1 + int32_t lhsOffset; + int32_t rhsOffset; + int32_t outOffset; + int32_t lhsBatch0; + int32_t lhsBatch1; + int32_t lhsBatch2; + int32_t rhsBatch0; + int32_t rhsBatch1; + int32_t rhsBatch2; + int32_t outBatch0; + int32_t outBatch1; + int32_t outBatch2; + int8_t quantization_mode; +} xai_cnn_tfl_matmul_params; + +#define XAI_CNN_MATMUL_GET_OUTPUT_SCALE_TFL(x) ((x)->outputScale) +#define XAI_CNN_MATMUL_SET_OUTPUT_SCALE_TFL(x, v) ((x)->outputScale = (v)) +#define XAI_CNN_MATMUL_GET_OUTPUT_SHIFT_TFL(x) ((x)->outputShift) +#define XAI_CNN_MATMUL_SET_OUTPUT_SHIFT_TFL(x, v) ((x)->outputShift = (v)) +#define XAI_CNN_MATMUL_GET_LHS_TRANSPOSE_TFL(x) ((x)->lhsTranspose) +#define XAI_CNN_MATMUL_SET_LHS_TRANSPOSE_TFL(x, v) ((x)->lhsTranspose = (v)) +#define XAI_CNN_MATMUL_GET_RHS_TRANSPOSE_TFL(x) ((x)->rhsTranspose) +#define XAI_CNN_MATMUL_SET_RHS_TRANSPOSE_TFL(x, v) ((x)->rhsTranspose = (v)) +#define XAI_CNN_MATMUL_GET_LHS_OFFSET_TFL(x) ((x)->lhsOffset) +#define XAI_CNN_MATMUL_SET_LHS_OFFSET_TFL(x, v) ((x)->lhsOffset = (v)) +#define XAI_CNN_MATMUL_GET_RHS_OFFSET_TFL(x) ((x)->rhsOffset) +#define XAI_CNN_MATMUL_SET_RHS_OFFSET_TFL(x, v) ((x)->rhsOffset = (v)) +#define XAI_CNN_MATMUL_GET_OUT_OFFSET_TFL(x) ((x)->outOffset) +#define XAI_CNN_MATMUL_SET_OUT_OFFSET_TFL(x, v) ((x)->outOffset = (v)) +#define XAI_CNN_MATMUL_GET_LHS_BATCH0_TFL(x) ((x)->lhsBatch0) +#define XAI_CNN_MATMUL_SET_LHS_BATCH0_TFL(x, v) ((x)->lhsBatch0 = (v)) +#define 
XAI_CNN_MATMUL_GET_LHS_BATCH1_TFL(x) ((x)->lhsBatch1) +#define XAI_CNN_MATMUL_SET_LHS_BATCH1_TFL(x, v) ((x)->lhsBatch1 = (v)) +#define XAI_CNN_MATMUL_GET_LHS_BATCH2_TFL(x) ((x)->lhsBatch2) +#define XAI_CNN_MATMUL_SET_LHS_BATCH2_TFL(x, v) ((x)->lhsBatch2 = (v)) +#define XAI_CNN_MATMUL_GET_RHS_BATCH0_TFL(x) ((x)->rhsBatch0) +#define XAI_CNN_MATMUL_SET_RHS_BATCH0_TFL(x, v) ((x)->rhsBatch0 = (v)) +#define XAI_CNN_MATMUL_GET_RHS_BATCH1_TFL(x) ((x)->rhsBatch1) +#define XAI_CNN_MATMUL_SET_RHS_BATCH1_TFL(x, v) ((x)->rhsBatch1 = (v)) +#define XAI_CNN_MATMUL_GET_RHS_BATCH2_TFL(x) ((x)->rhsBatch2) +#define XAI_CNN_MATMUL_SET_RHS_BATCH2_TFL(x, v) ((x)->rhsBatch2 = (v)) +#define XAI_CNN_MATMUL_GET_OUT_BATCH0_TFL(x) ((x)->outBatch0) +#define XAI_CNN_MATMUL_SET_OUT_BATCH0_TFL(x, v) ((x)->outBatch0 = (v)) +#define XAI_CNN_MATMUL_GET_OUT_BATCH1_TFL(x) ((x)->outBatch1) +#define XAI_CNN_MATMUL_SET_OUT_BATCH1_TFL(x, v) ((x)->outBatch1 = (v)) +#define XAI_CNN_MATMUL_GET_OUT_BATCH2_TFL(x) ((x)->outBatch2) +#define XAI_CNN_MATMUL_SET_OUT_BATCH2_TFL(x, v) ((x)->outBatch2 = (v)) +#define XAI_CNN_MATMUL_GET_QUANTIZATION_MODE(x) ((x)->quantization_mode) +#define XAI_CNN_MATMUL_SET_QUANTIZATION_MODE(x, v) ((x)->quantization_mode = (v)) + +/* Crop3DWithStride Params. The GET macros below are plain expressions (no trailing ';') so they can be used inside larger expressions and argument lists. */ +typedef struct +{ + int32_t offsH; + int32_t offsW; + int32_t offsD; + int32_t strideH; + int32_t strideW; + int32_t strideD; +} xai_cnn_crop3DWithStride_params; + +#define XAI_CNN_CROP3DWITHSTRIDE_GET_OFFSD(x) ((x)->offsD) +#define XAI_CNN_CROP3DWITHSTRIDE_GET_OFFSW(x) ((x)->offsW) +#define XAI_CNN_CROP3DWITHSTRIDE_GET_OFFSH(x) ((x)->offsH) +#define XAI_CNN_CROP3DWITHSTRIDE_GET_STRIDED(x) ((x)->strideD) +#define XAI_CNN_CROP3DWITHSTRIDE_GET_STRIDEW(x) ((x)->strideW) +#define XAI_CNN_CROP3DWITHSTRIDE_GET_STRIDEH(x) ((x)->strideH) +#define XAI_CNN_CROP3DWITHSTRIDE_SET_OFFSD(x, v) ((x)->offsD = (v)) +#define XAI_CNN_CROP3DWITHSTRIDE_SET_OFFSW(x, v) ((x)->offsW = (v)) +#define XAI_CNN_CROP3DWITHSTRIDE_SET_OFFSH(x, v)
((x)->offsH = (v)) +#define XAI_CNN_CROP3DWITHSTRIDE_SET_STRIDED(x, v) ((x)->strideD = (v)) +#define XAI_CNN_CROP3DWITHSTRIDE_SET_STRIDEW(x, v) ((x)->strideW = (v)) +#define XAI_CNN_CROP3DWITHSTRIDE_SET_STRIDEH(x, v) ((x)->strideH = (v)) + +typedef struct +{ + float scale; + int32_t offset; + int32_t axis; +} xai_cnn_quantDequantA_params; +#define XAI_CNN_QUANT_DEQUANT_GET_SCALE(x) ((x)->scale) +#define XAI_CNN_QUANT_DEQUANT_SET_SCALE(x, v) ((x)->scale = (v)) +#define XAI_CNN_QUANT_DEQUANT_GET_OFFSET(x) ((x)->offset) +#define XAI_CNN_QUANT_DEQUANT_SET_OFFSET(x, v) ((x)->offset = (v)) +#define XAI_CNN_QUANT_DEQUANT_GET_AXIS(x) ((x)->axis) +#define XAI_CNN_QUANT_DEQUANT_SET_AXIS(x, v) ((x)->axis = (v)) + +typedef struct +{ + xai_cnn_conv_params fcInputParamIG; + xai_cnn_conv_params fcInputParamFG; + xai_cnn_conv_params fcInputParamOG; + xai_cnn_conv_params fcInputParamMI; + xai_cnn_conv_params fcHiddenParamIG; + xai_cnn_conv_params fcHiddenParamFG; + xai_cnn_conv_params fcHiddenParamOG; + xai_cnn_conv_params fcHiddenParamMI; + + xai_cnn_sigmoid_params sigmoidParamIG; + xai_cnn_sigmoid_params sigmoidParamFG; + xai_cnn_sigmoid_params sigmoidParamOG; + xai_cnn_tanh_params tanhParamMI; + + xai_cnn_tfl_eltwise_params eltMulParamHS1; + xai_cnn_tfl_eltwise_params eltMulParamHS2; + + xai_cnn_tanh_params tanhParamCS; + xai_cnn_tfl_eltwise_params eltMulParamCS; + + int16_t clipMin; + int16_t clipMax; + + int32_t timeMajorAxis; + int32_t direction; +} xai_lstm_tfl_params; + +#define XAI_CNN_LSTM_GET_FC_INPUT_IG_PARAM(x) ((x)->fcInputParamIG) +#define XAI_CNN_LSTM_SET_FC_INPUT_IG_PARAM(x, v) ((x)->fcInputParamIG = (v)) +#define XAI_CNN_LSTM_GET_FC_INPUT_FG_PARAM(x) ((x)->fcInputParamFG) +#define XAI_CNN_LSTM_SET_FC_INPUT_FG_PARAM(x, v) ((x)->fcInputParamFG = (v)) +#define XAI_CNN_LSTM_GET_FC_INPUT_OG_PARAM(x) ((x)->fcInputParamOG) +#define XAI_CNN_LSTM_SET_FC_INPUT_OG_PARAM(x, v) ((x)->fcInputParamOG = (v)) +#define XAI_CNN_LSTM_GET_FC_INPUT_MI_PARAM(x) ((x)->fcInputParamMI) 
+#define XAI_CNN_LSTM_SET_FC_INPUT_MI_PARAM(x, v) ((x)->fcInputParamMI = (v)) + +#define XAI_CNN_LSTM_GET_FC_HIDDEN_IG_PARAM(x) ((x)->fcHiddenParamIG) +#define XAI_CNN_LSTM_SET_FC_HIDDEN_IG_PARAM(x, v) ((x)->fcHiddenParamIG = (v)) +#define XAI_CNN_LSTM_GET_FC_HIDDEN_FG_PARAM(x) ((x)->fcHiddenParamFG) +#define XAI_CNN_LSTM_SET_FC_HIDDEN_FG_PARAM(x, v) ((x)->fcHiddenParamFG = (v)) +#define XAI_CNN_LSTM_GET_FC_HIDDEN_OG_PARAM(x) ((x)->fcHiddenParamOG) +#define XAI_CNN_LSTM_SET_FC_HIDDEN_OG_PARAM(x, v) ((x)->fcHiddenParamOG = (v)) +#define XAI_CNN_LSTM_GET_FC_HIDDEN_MI_PARAM(x) ((x)->fcHiddenParamMI) +#define XAI_CNN_LSTM_SET_FC_HIDDEN_MI_PARAM(x, v) ((x)->fcHiddenParamMI = (v)) + +#define XAI_CNN_LSTM_GET_SIGMOID_IG_PARAM(x) ((x)->sigmoidParamIG) +#define XAI_CNN_LSTM_SET_SIGMOID_IG_PARAM(x, v) ((x)->sigmoidParamIG = (v)) +#define XAI_CNN_LSTM_GET_SIGMOID_FG_PARAM(x) ((x)->sigmoidParamFG) +#define XAI_CNN_LSTM_SET_SIGMOID_FG_PARAM(x, v) ((x)->sigmoidParamFG = (v)) +#define XAI_CNN_LSTM_GET_SIGMOID_OG_PARAM(x) ((x)->sigmoidParamOG) +#define XAI_CNN_LSTM_SET_SIGMOID_OG_PARAM(x, v) ((x)->sigmoidParamOG = (v)) +#define XAI_CNN_LSTM_GET_TANH_MI_PARAM(x) ((x)->tanhParamMI) +#define XAI_CNN_LSTM_SET_TANH_MI_PARAM(x, v) ((x)->tanhParamMI = (v)) + +#define XAI_CNN_LSTM_GET_ELTWISE_MUL_HS1_PARAM(x) ((x)->eltMulParamHS1) +#define XAI_CNN_LSTM_SET_ELTWISE_MUL_HS1_PARAM(x, v) ((x)->eltMulParamHS1 = (v)) +#define XAI_CNN_LSTM_GET_ELTWISE_MUL_HS2_PARAM(x) ((x)->eltMulParamHS2) +#define XAI_CNN_LSTM_SET_ELTWISE_MUL_HS2_PARAM(x, v) ((x)->eltMulParamHS2 = (v)) + +#define XAI_CNN_LSTM_GET_TANH_CS_PARAM(x) ((x)->tanhParamCS) +#define XAI_CNN_LSTM_SET_TANH_CS_PARAM(x, v) ((x)->tanhParamCS = (v)) +#define XAI_CNN_LSTM_GET_ELTWISE_MUL_CS_PARAM(x) ((x)->eltMulParamCS) +#define XAI_CNN_LSTM_SET_ELTWISE_MUL_CS_PARAM(x, v) ((x)->eltMulParamCS = (v)) + +#define XAI_CNN_LSTM_GET_CLIP_MIN(x) ((x)->clipMin) +#define XAI_CNN_LSTM_SET_CLIP_MIN(x, v) ((x)->clipMin = (v)) +#define 
XAI_CNN_LSTM_GET_CLIP_MAX(x) ((x)->clipMax) +#define XAI_CNN_LSTM_SET_CLIP_MAX(x, v) ((x)->clipMax = (v)) + +#define XAI_CNN_LSTM_GET_TIME_MAJOR_AXIS(x) ((x)->timeMajorAxis) +#define XAI_CNN_LSTM_SET_TIME_MAJOR_AXIS(x, v) ((x)->timeMajorAxis = (v)) +#define XAI_CNN_LSTM_GET_DIRECTION(x) ((x)->direction) +#define XAI_CNN_LSTM_SET_DIRECTION(x, v) ((x)->direction = (v)) + +#if ((XCHAL_HAVE_VISION_HP_VFPU == 1) || (XCHAL_HAVE_CONNX_B_HP_VFPU == 1) || (defined(__clang__) && defined(XAI_REF_ONLY_COMPILATION))) +typedef struct +{ + xai_cnn_conv_params fcInputParamIG; + xai_cnn_conv_params fcInputParamFG; + xai_cnn_conv_params fcInputParamOG; + xai_cnn_conv_params fcInputParamMI; + xai_cnn_conv_params fcHiddenParamIG; + xai_cnn_conv_params fcHiddenParamFG; + xai_cnn_conv_params fcHiddenParamOG; + xai_cnn_conv_params fcHiddenParamMI; + + xai_cnn_eltwiseMul_params eltMulParamHS1; + xai_cnn_eltwiseMul_params eltMulParamHS2; + + xai_cnn_eltwiseMul_params eltMulParamCS; + + xb_f16 clipMinFP16; + xb_f16 clipMaxFP16; + + int32_t timeMajorAxis; + int32_t direction; +} xai_lstm_F16_params; + +#define XAI_CNN_LSTM_F16_GET_FC_INPUT_IG_PARAM(x) ((x)->fcInputParamIG) +#define XAI_CNN_LSTM_F16_SET_FC_INPUT_IG_PARAM(x, v) ((x)->fcInputParamIG = (v)) +#define XAI_CNN_LSTM_F16_GET_FC_INPUT_FG_PARAM(x) ((x)->fcInputParamFG) +#define XAI_CNN_LSTM_F16_SET_FC_INPUT_FG_PARAM(x, v) ((x)->fcInputParamFG = (v)) +#define XAI_CNN_LSTM_F16_GET_FC_INPUT_OG_PARAM(x) ((x)->fcInputParamOG) +#define XAI_CNN_LSTM_F16_SET_FC_INPUT_OG_PARAM(x, v) ((x)->fcInputParamOG = (v)) +#define XAI_CNN_LSTM_F16_GET_FC_INPUT_MI_PARAM(x) ((x)->fcInputParamMI) +#define XAI_CNN_LSTM_F16_SET_FC_INPUT_MI_PARAM(x, v) ((x)->fcInputParamMI = (v)) + +#define XAI_CNN_LSTM_F16_GET_FC_HIDDEN_IG_PARAM(x) ((x)->fcHiddenParamIG) +#define XAI_CNN_LSTM_F16_SET_FC_HIDDEN_IG_PARAM(x, v) ((x)->fcHiddenParamIG = (v)) +#define XAI_CNN_LSTM_F16_GET_FC_HIDDEN_FG_PARAM(x) ((x)->fcHiddenParamFG) +#define XAI_CNN_LSTM_F16_SET_FC_HIDDEN_FG_PARAM(x, 
v) ((x)->fcHiddenParamFG = (v)) +#define XAI_CNN_LSTM_F16_GET_FC_HIDDEN_OG_PARAM(x) ((x)->fcHiddenParamOG) +#define XAI_CNN_LSTM_F16_SET_FC_HIDDEN_OG_PARAM(x, v) ((x)->fcHiddenParamOG = (v)) +#define XAI_CNN_LSTM_F16_GET_FC_HIDDEN_MI_PARAM(x) ((x)->fcHiddenParamMI) +#define XAI_CNN_LSTM_F16_SET_FC_HIDDEN_MI_PARAM(x, v) ((x)->fcHiddenParamMI = (v)) + +#define XAI_CNN_LSTM_F16_GET_ELTWISE_MUL_HS1_PARAM(x) ((x)->eltMulParamHS1) +#define XAI_CNN_LSTM_F16_SET_ELTWISE_MUL_HS1_PARAM(x, v) ((x)->eltMulParamHS1 = (v)) +#define XAI_CNN_LSTM_F16_GET_ELTWISE_MUL_HS2_PARAM(x) ((x)->eltMulParamHS2) +#define XAI_CNN_LSTM_F16_SET_ELTWISE_MUL_HS2_PARAM(x, v) ((x)->eltMulParamHS2 = (v)) + +#define XAI_CNN_LSTM_F16_GET_ELTWISE_MUL_CS_PARAM(x) ((x)->eltMulParamCS) +#define XAI_CNN_LSTM_F16_SET_ELTWISE_MUL_CS_PARAM(x, v) ((x)->eltMulParamCS = (v)) + +#define XAI_CNN_LSTM_F16_GET_CLIP_MIN(x) ((x)->clipMinFP16) +#define XAI_CNN_LSTM_F16_SET_CLIP_MIN(x, v) ((x)->clipMinFP16 = (v)) +#define XAI_CNN_LSTM_F16_GET_CLIP_MAX(x) ((x)->clipMaxFP16) +#define XAI_CNN_LSTM_F16_SET_CLIP_MAX(x, v) ((x)->clipMaxFP16 = (v)) + +#define XAI_CNN_LSTM_F16_GET_TIME_MAJOR_AXIS(x) ((x)->timeMajorAxis) +#define XAI_CNN_LSTM_F16_SET_TIME_MAJOR_AXIS(x, v) ((x)->timeMajorAxis = (v)) +#define XAI_CNN_LSTM_F16_GET_DIRECTION(x) ((x)->direction) +#define XAI_CNN_LSTM_F16_SET_DIRECTION(x, v) ((x)->direction = (v)) +#endif + +typedef struct +{ + xai_cnn_conv_params fcInputParamRG; + xai_cnn_conv_params fcInputParamUG; + xai_cnn_conv_params fcInputParamMS; + xai_cnn_conv_params fcHiddenParamRG; + xai_cnn_conv_params fcHiddenParamUG; + xai_cnn_conv_params fcHiddenParamMS; + + xai_cnn_sigmoid_params sigmoidParamRG; + xai_cnn_sigmoid_params sigmoidParamUG; + xai_cnn_tfl_eltwise_params eltMulParamMS; + xai_cnn_tanh_params tanhParamMS; + + xai_cnn_tfl_eltwise_params eltMulParamHS1; + xai_cnn_tfl_eltwise_params eltMulParamHS2; + + int32_t eltAddOutOffsetHS; // NOTE: eltAddOutOffsetHS is not used in S16 variant. 
For S16 variant, set it to 0. + int32_t timeMajorAxis; + int32_t direction; +} xai_gru_tfl_params; + +#define XAI_CNN_GRU_GET_FC_INPUT_RG_PARAM(x) ((x)->fcInputParamRG) +#define XAI_CNN_GRU_SET_FC_INPUT_RG_PARAM(x, v) ((x)->fcInputParamRG = (v)) +#define XAI_CNN_GRU_GET_FC_INPUT_UG_PARAM(x) ((x)->fcInputParamUG) +#define XAI_CNN_GRU_SET_FC_INPUT_UG_PARAM(x, v) ((x)->fcInputParamUG = (v)) +#define XAI_CNN_GRU_GET_FC_INPUT_MS_PARAM(x) ((x)->fcInputParamMS) +#define XAI_CNN_GRU_SET_FC_INPUT_MS_PARAM(x, v) ((x)->fcInputParamMS = (v)) + +#define XAI_CNN_GRU_GET_FC_HIDDEN_RG_PARAM(x) ((x)->fcHiddenParamRG) +#define XAI_CNN_GRU_SET_FC_HIDDEN_RG_PARAM(x, v) ((x)->fcHiddenParamRG = (v)) +#define XAI_CNN_GRU_GET_FC_HIDDEN_UG_PARAM(x) ((x)->fcHiddenParamUG) +#define XAI_CNN_GRU_SET_FC_HIDDEN_UG_PARAM(x, v) ((x)->fcHiddenParamUG = (v)) +#define XAI_CNN_GRU_GET_FC_HIDDEN_MS_PARAM(x) ((x)->fcHiddenParamMS) +#define XAI_CNN_GRU_SET_FC_HIDDEN_MS_PARAM(x, v) ((x)->fcHiddenParamMS = (v)) + +#define XAI_CNN_GRU_GET_SIGMOID_RG_PARAM(x) ((x)->sigmoidParamRG) +#define XAI_CNN_GRU_SET_SIGMOID_RG_PARAM(x, v) ((x)->sigmoidParamRG = (v)) +#define XAI_CNN_GRU_GET_SIGMOID_UG_PARAM(x) ((x)->sigmoidParamUG) +#define XAI_CNN_GRU_SET_SIGMOID_UG_PARAM(x, v) ((x)->sigmoidParamUG = (v)) +#define XAI_CNN_GRU_GET_ELTWISE_MUL_MS_PARAM(x) ((x)->eltMulParamMS) +#define XAI_CNN_GRU_SET_ELTWISE_MUL_MS_PARAM(x, v) ((x)->eltMulParamMS = (v)) +#define XAI_CNN_GRU_GET_TANH_MS_PARAM(x) ((x)->tanhParamMS) +#define XAI_CNN_GRU_SET_TANH_MS_PARAM(x, v) ((x)->tanhParamMS = (v)) + +#define XAI_CNN_GRU_GET_ELTWISE_MUL_HS1_PARAM(x) ((x)->eltMulParamHS1) +#define XAI_CNN_GRU_SET_ELTWISE_MUL_HS1_PARAM(x, v) ((x)->eltMulParamHS1 = (v)) +#define XAI_CNN_GRU_GET_ELTWISE_MUL_HS2_PARAM(x) ((x)->eltMulParamHS2) +#define XAI_CNN_GRU_SET_ELTWISE_MUL_HS2_PARAM(x, v) ((x)->eltMulParamHS2 = (v)) + +#define XAI_CNN_GRU_GET_ELTWISE_ADD_HS_OUT_OFFSET(x) ((x)->eltAddOutOffsetHS) +#define XAI_CNN_GRU_SET_ELTWISE_ADD_HS_OUT_OFFSET(x, v) 
((x)->eltAddOutOffsetHS = (v)) +#define XAI_CNN_GRU_GET_TIME_MAJOR_AXIS(x) ((x)->timeMajorAxis) +#define XAI_CNN_GRU_SET_TIME_MAJOR_AXIS(x, v) ((x)->timeMajorAxis = (v)) +#define XAI_CNN_GRU_GET_DIRECTION(x) ((x)->direction) +#define XAI_CNN_GRU_SET_DIRECTION(x, v) ((x)->direction = (v)) +#endif // #ifndef __XAI_CNN_API_PARAMS_H__ diff --git a/backends/cadence/vision/third-party/libxai_common/include/xai_cnn_common.h b/backends/cadence/vision/third-party/libxai_common/include/xai_cnn_common.h new file mode 100644 index 00000000000..34b5aec2008 --- /dev/null +++ b/backends/cadence/vision/third-party/libxai_common/include/xai_cnn_common.h @@ -0,0 +1,4329 @@ +/* + * Copyright (c) 2021 by Cadence Design Systems, Inc. ALL RIGHTS RESERVED. + * These coded instructions, statements, and computer programs are the + * copyrighted works and confidential proprietary information of + * Cadence Design Systems Inc. They may be adapted and modified by bona fide + * purchasers for internal use, but neither the original nor any adapted + * or modified version may be disclosed or distributed to third parties + * in any manner, medium, or form, in whole or in part, without the prior + * written consent of Cadence Design Systems Inc. This software and its + * derivatives are to be executed solely on products incorporating a Cadence + * Design Systems processor. + */ + +#ifndef __XAI_CNN_COMMON_H__ +#define __XAI_CNN_COMMON_H__ + +#include "xai_tile_manager.h" +#include "xai_core.h" +#include "xai_cnn_api_common.h" +#include "limits.h" + +// frequently used macros for rounding and clamping +#ifndef MAX2 +#define MAX2(a, b) (((a) > (b)) ? (a) : (b)) +#endif + +#ifndef MIN2 +#define MIN2(a, b) (((a) > (b)) ? (b) : (a)) +#endif +#define CLAMP(v, min, max) ((v) < (min) ? (min) : (v) > (max) ? (max) : (v)) +#define ROUND(x, s) (((s) == 0) ? (x) : (((x) + (1 << ((s) - 1))) >> (s))) +#define ROUND_N_CLAMP(x, s, min, max) (((s) == 0) ? 
(CLAMP(x, min, max)) : (CLAMP(ROUND(x, s), min, max))) +#define ROUND64B(x, s) (((s) == 0) ? (x) : \ + (((x) + ((int64_t) 1 << ((s) - 1))) >> (s))) +#define ROUND_N_CLAMP64B(x, s, min, max) (((s) == 0) ? (CLAMP(x, min, max)) : \ + (CLAMP(ROUND64B(x, s), min, max))) +#define ROI_CEIL(x, s) (((s) == 0) ? (x) : (((x) + (1 << ((s)))) >> (s))) + +#ifndef XCHAL_IVPN_SIMD_WIDTH +#define XCHAL_IVPN_SIMD_WIDTH 32 +#endif + +/* Macros used for morphing various APIs */ +#define SIGNED8BIT 1 +#define UNSIGNED8BIT 2 +#define SIGNED16BIT 3 +#define UNSIGNED16BIT 4 +#define SIGNED32BIT 5 +#define INTEGER8BIT 6 +#define INTEGER16BIT 7 +#define FLOAT16BIT 8 +#define FLOAT32BIT 9 +#define SIGNED8BITUNSIGNED8BIT 10 +#define UNSIGNED8BITSIGNED8BIT 11 +#define SIGNED8BITSIGNED16BIT 12 +#define UNSIGNED8BITSIGNED16BIT 13 +#define SIGNED16BITSIGNED16BIT 14 +#define UNSIGNED32BIT 16 +#define INPUT16BITFLOAT 17 +#define INPUT8BIT 18 +#define INPUT16BIT 19 +#define INPUT32BIT 20 +#define SIGNED64BIT 21 +#define UNSIGNED64BIT 22 + +#define QP_DEPTH_U8 ((uint8_t) UCHAR_MAX) +#define QP_DEPTH_U16 ((uint16_t) USHRT_MAX) +#define QP_DEPTH_S16 ((int16_t) SHRT_MAX) +#define QP_DEPTH_S8 ((uint8_t) SCHAR_MAX) + +#define ADAPTIVE_AVG_POOL_Q_FORMAT 15 + +#define CALC_NSA_32(input, count) \ + { \ + count = 0; \ + int32_t mask = 0x80000000; \ + int32_t index = 31; \ + /*Determining the sign of the input*/ \ + int32_t sign = (input & mask) >> index & 0x00000001; \ + mask = 0x40000000; \ + index--; \ + /*Finding the count leading zeros incase of positive number \ + and count leading ones in case of negative number excluding \ + the sign bit*/ \ + while ((sign == ((input & mask) >> index)) && (mask != 0)) \ + { \ + count += 1; \ + mask = mask >> 1; \ + index--; \ + } \ + } + +#define CONVERT_FP16_TO_FP32(F16Data) ( \ + { \ + int signBit, scaleSign, storedExponent; \ + int trueExponent; \ + int significand, i; \ + float expVal, bitVal, temp, fractionFloat; \ + float implicitSignificand_val; \ + \ + 
trueExponent = 0; \ + implicitSignificand_val = 0; \ + float floatVal = 0; int isSpecialVal = 0; /* inf/nan/zero are flagged, not returned: a return here would exit the enclosing function */ \ + \ + unsigned short F16Data_U16 = (unsigned short) F16Data; \ + int hex_val_fp16 = (int) F16Data_U16; \ + \ + signBit = (hex_val_fp16 >> 15); \ + scaleSign = ((signBit == 0) ? (1) : (-1)); \ + storedExponent = ((hex_val_fp16 & 0x7fff) >> 10); \ + significand = (hex_val_fp16 & 0x03ff); \ + \ + if (storedExponent == 31) \ + { \ + if (scaleSign == 1) \ + { \ + if (significand == 0) \ + { \ + floatVal = +INFINITY; \ + isSpecialVal = 1; \ + } \ + else if (significand != 0) \ + { \ + floatVal = -NAN; /* +nan */ \ + isSpecialVal = 1; \ + } \ + } \ + else if (scaleSign == -1) \ + { \ + if (significand == 0) \ + { \ + floatVal = -INFINITY; \ + isSpecialVal = 1; \ + } \ + else if (significand != 0) \ + { \ + floatVal = NAN; /* -nan */ \ + isSpecialVal = 1; \ + } \ + } \ + } \ + else if (storedExponent == 0) \ + { \ + trueExponent = -14; \ + implicitSignificand_val = 0.0f; \ + \ + if (scaleSign == 1) \ + { \ + if (significand == 0) \ + { \ + floatVal = 0; \ + isSpecialVal = 1; \ + } \ + } \ + else if (scaleSign == -1) \ + { \ + if (significand == 0) \ + { \ + floatVal = -0.0f; /* float literal keeps the sign; integer -0 is plain 0 */ \ + isSpecialVal = 1; \ + } \ + } \ + } \ + else if ((storedExponent > 0) && (storedExponent < 31)) \ + { \ + trueExponent = storedExponent - 15; \ + implicitSignificand_val = 1.0f; \ + } \ + \ + expVal = powf(2, (float) trueExponent); \ + \ + fractionFloat = 0.0f; \ + for (i = 10; i > 0; i--) \ + { \ + bitVal = (float) (significand & 0x1); \ + temp = bitVal / (1 << i); \ + fractionFloat = fractionFloat + temp; \ + \ + significand = significand >> 1; \ + } \ + fractionFloat = fractionFloat + implicitSignificand_val; \ + \ + (isSpecialVal ? floatVal : (scaleSign * expVal * fractionFloat)); \ + }) + +#define XAI_CHECK_TILE3D_EDGE(tile, edge) \ + if (XAI_TILE3D_GET_DATA_ORDER(tile) == XAI_WHD) \ + { \ + XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM1_EDGE1(tile) >= edge && XAI_TILE3D_GET_DIM1_EDGE2(tile) >= edge && \ + XAI_TILE3D_GET_DIM2_EDGE1(tile) >= edge &&
XAI_TILE3D_GET_DIM2_EDGE2(tile) >= edge, \ + XAI_ERR_EDGE, "The (" #tile ") tile must have at least " #edge "-pixel edge extension"); \ + } \ + else if (XAI_TILE3D_GET_DATA_ORDER(tile) == XAI_DWH) \ + { \ + XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM2_EDGE1(tile) >= edge && XAI_TILE3D_GET_DIM2_EDGE2(tile) >= edge && \ + XAI_TILE3D_GET_DIM3_EDGE1(tile) >= edge && XAI_TILE3D_GET_DIM3_EDGE2(tile) >= edge, \ + XAI_ERR_EDGE, "The (" #tile ") tile must have at least " #edge "-pixel edge extension"); \ + } \ + +#define XAI_CHECK_TILE3D_EDGE2(tile, edge1, edge2) \ + if (XAI_TILE3D_GET_DATA_ORDER(tile) == XAI_WHD) \ + { \ + XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM1_EDGE1(tile) >= edge1 && XAI_TILE3D_GET_DIM1_EDGE2(tile) >= edge1 && \ + XAI_TILE3D_GET_DIM2_EDGE1(tile) >= edge2 && XAI_TILE3D_GET_DIM2_EDGE2(tile) >= edge2, \ + XAI_ERR_EDGE, "The (" #tile ") tile must have at least " #edge1 #edge2 "-pixel edge extension"); \ + } \ + else if (XAI_TILE3D_GET_DATA_ORDER(tile) == XAI_DWH) \ + { \ + XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM2_EDGE1(tile) >= edge1 && XAI_TILE3D_GET_DIM2_EDGE2(tile) >= edge1 && \ + XAI_TILE3D_GET_DIM3_EDGE1(tile) >= edge2 && XAI_TILE3D_GET_DIM3_EDGE2(tile) >= edge2, \ + XAI_ERR_EDGE, "The (" #tile ") tile must have at least " #edge1 #edge2 "-pixel edge extension"); \ + } + +#define XAI_CHECK_TILE3D_DATA_ORDER(tile, type) \ + XAI_CHECK_ERROR(XAI_TILE3D_GET_DATA_ORDER(tile) == type, XAI_ERR_BADARG, "The Data Order of (" #tile ") is not supported by this function") + +#define XAI_CHECK_TILE4D_DATA_ORDER(tile, type) \ + XAI_CHECK_ERROR(XAI_TILE4D_GET_DATA_ORDER(tile) == type, XAI_ERR_BADARG, "The Data Order of (" #tile ") is not supported by this function") + +#define XAI_CHECK_KERNEL_SIZE(coeffT, size) \ + if (XAI_TILE4D_GET_DATA_ORDER(coeffT) == XAI_WHDN) \ + { \ + XAI_CHECK_ERROR((XAI_TILE4D_GET_DIM1(coeffT) == size) && (XAI_TILE4D_GET_DIM2(coeffT) == size), \ + XAI_ERR_KSIZE, "The Coefficient Kernel Size is not supported"); \ + } \ + else if 
(XAI_TILE4D_GET_DATA_ORDER(coeffT) == XAI_NDWH) \ + { \ + XAI_CHECK_ERROR((XAI_TILE4D_GET_DIM3(coeffT) == size) && (XAI_TILE4D_GET_DIM4(coeffT) == size), \ + XAI_ERR_KSIZE, "The Coefficient Kernel Size is not supported"); \ + } + +#define XAI_CHECK_CONV_OUTPUT_TILE3D(outTile) \ + XAI_CHECK_TILE3D(outTile); \ + XAI_CHECK_ERROR((XAI_TILE3D_CHECK_TYPE(outTile, XAI_U8)) || (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8)) \ + || (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)), \ + XAI_ERR_DATATYPE, "The argument (" #outTile ") has wrong type"); + +#define XAI_CHECK_CONV_I16_OUTPUT_TILE3D(outTile) \ + XAI_CHECK_TILE3D(outTile); \ + XAI_CHECK_ERROR((XAI_TILE3D_CHECK_TYPE(outTile, XAI_U8)) || (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8)) \ + || (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) || (XAI_TILE3D_CHECK_TYPE(outTile, XAI_U16)), \ + XAI_ERR_DATATYPE, "The argument (" #outTile ") has wrong type"); +#define XAI_CHECK_CONV_OUTPUT_IX_TILE3D(outTile) \ + XAI_CHECK_TILE3D(outTile); \ + XAI_CHECK_ERROR((XAI_TILE3D_CHECK_TYPE(outTile, XAI_U8)) || (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8)) \ + || (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) || (XAI_TILE3D_CHECK_TYPE(outTile, XAI_U16)), \ + XAI_ERR_DATATYPE, "The argument (" #outTile ") has wrong type"); + +#define XAI_CHECK_CONV_OUTPUT_TILE4D(outTile) \ + XAI_CHECK_TILE4D(outTile); \ + XAI_CHECK_ERROR((XAI_TILE4D_CHECK_TYPE(outTile, XAI_U8)) || (XAI_TILE4D_CHECK_TYPE(outTile, XAI_S8)) \ + || (XAI_TILE4D_CHECK_TYPE(outTile, XAI_S16)), \ + XAI_ERR_DATATYPE, "The argument (" #outTile ") has wrong type"); + +#define XAI_CHECK_STRIDE(param, stride) \ + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_STRIDE(param) == stride, XAI_ERR_BADARG, "The stride amount provided is not supported."); + +#define XAI_CHECK_DILATION(param, dilation) \ + XAI_CHECK_ERROR(XAI_CNN_CONV_GET_DILATION(param) == dilation, XAI_ERR_BADARG, "The dilation value provided is not supported."); + + +#define XAI_CHECK_POOLING_STRIDE(param, stride) \ + XAI_CHECK_ERROR(XAI_CNN_POOLING_GET_STRIDE(param) == 
stride, XAI_ERR_BADARG, "The stride amount provided is not supported."); + +/* Consistency check for a convolution whose coefficient tile is read as DIM1 = kernels, DIM2 = input channels, DIM3 = kernel width, DIM4 = kernel height. Verifies the channel counts of inT/outT against coeffT, bounds the output width/height by the value derived from the dilated kernel size and the X/Y stride, and checks the bias array width and height. Expands to bare statements that declare dilatedKW_MOD and dilatedKH_MOD in the caller's scope, so it can be used once per scope. Every check reads only the macro arguments. */ +#define XAI_CHECK_CONSISTENCY_MOD_DWH(inT, coeffT, biasArr, outT, param) \ + uint16_t dilatedKW_MOD = (uint16_t) (XAI_CNN_CONV_GET_DILATIONX(param) * (XAI_TILE4D_GET_DIM3(coeffT) - 1) + 1); \ + uint16_t dilatedKH_MOD = (uint16_t) (XAI_CNN_CONV_GET_DILATIONY(param) * (XAI_TILE4D_GET_DIM4(coeffT) - 1) + 1); \ + XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM1(inT) == XAI_TILE4D_GET_DIM2(coeffT), XAI_ERR_DATASIZE, \ + "Number of Input Channels not equal to the number of channels in the Kernel."); \ + XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM1(outT) == XAI_TILE4D_GET_DIM1(coeffT), XAI_ERR_DATASIZE, \ + "Number of Output Channels not equal to the number of Kernels."); \ + if (dilatedKW_MOD % 2 != 0) \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2(outT) <= (((XAI_TILE3D_GET_DIM2(inT) + (dilatedKW_MOD >> 1) \ + + (dilatedKW_MOD >> 1) - dilatedKW_MOD) / (XAI_CNN_CONV_GET_STRIDEX(param))) + 1)), \ + XAI_ERR_DATASIZE, "Input and Output tile widths are inconsistent."); \ + } \ + else \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2(outT) <= (((XAI_TILE3D_GET_DIM2(inT) + (dilatedKW_MOD >> 1) \ + + ((dilatedKW_MOD >> 1) - 1) - dilatedKW_MOD) / (XAI_CNN_CONV_GET_STRIDEX(param))) + 1)), \ + XAI_ERR_DATASIZE, "Input and Output tile widths are inconsistent."); \ + } \ + if (dilatedKH_MOD % 2 != 0) \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM3(outT) <= (((XAI_TILE3D_GET_DIM3(inT) + (dilatedKH_MOD >> 1) \ + + (dilatedKH_MOD >> 1) - dilatedKH_MOD) / (XAI_CNN_CONV_GET_STRIDEY(param))) + 1)), \ + XAI_ERR_DATASIZE, "Input and Output tile heights are inconsistent.."); \ + } \ + else \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM3(outT) <= (((XAI_TILE3D_GET_DIM3(inT) + (dilatedKH_MOD >> 1) \ + + ((dilatedKH_MOD >> 1) - 1) - dilatedKH_MOD) / (XAI_CNN_CONV_GET_STRIDEY(param))) + 1)), \ + XAI_ERR_DATASIZE, "Input and Output tile heights are inconsistent.."); \ + } \ + XAI_CHECK_ERROR(XAI_ARRAY_GET_WIDTH(biasArr) >=
XAI_TILE4D_GET_DIM1(coeffT), XAI_ERR_DATASIZE, \ + "Width of Bias Array is less than number of Kernels."); \ + XAI_CHECK_ERROR(XAI_ARRAY_GET_HEIGHT(biasArr) > 0, XAI_ERR_DATASIZE, \ + "Height of Bias Array should be greater than zero."); + +#define XAI_CHECK_CONSISTENCY_MOD_DWH_IN16DWH(inT, offsetArr, coeffT, biasArr, outT, param) \ + uint16_t dilatedKW_MOD = (uint16_t) (XAI_CNN_CONV_GET_DILATIONX(param) * (XAI_TILE4D_GET_DIM2(coeffT) - 1) + 1); \ + uint16_t dilatedKH_MOD = (uint16_t) (XAI_CNN_CONV_GET_DILATIONY(param) * (XAI_TILE4D_GET_DIM3(coeffT) - 1) + 1); \ + XAI_CHECK_ERROR((XAI_ALIGN_VAL(XAI_TILE3D_GET_DIM1(inT), 2 * XCHAL_IVPN_SIMD_WIDTH)) == (XAI_TILE4D_GET_DIM1(coeffT) >> 4), XAI_ERR_DATASIZE, \ + "Number of Input Channels not equal to the number of channels in the Kernel."); \ + XAI_CHECK_ERROR((XAI_ALIGN_VAL(XAI_TILE3D_GET_DIM1(outT), 2 * XCHAL_IVPN_SIMD_WIDTH)) == (XAI_TILE4D_GET_DIM4(coeffT) << 4), XAI_ERR_DATASIZE, \ + "Number of Output Channels not equal to the number of Kernels."); \ + if (dilatedKW_MOD % 2 != 0) \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2(outT) <= (((XAI_TILE3D_GET_DIM2(inT) + (dilatedKW_MOD >> 1) \ + + (dilatedKW_MOD >> 1) - dilatedKW_MOD) / (XAI_CNN_CONV_GET_STRIDEX(param))) + 1)), \ + XAI_ERR_DATASIZE, "Input and Output tile widths are inconsistent."); \ + } \ + else \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2(outT) <= (((XAI_TILE3D_GET_DIM2(inT) + (dilatedKW_MOD >> 1) \ + + ((dilatedKW_MOD >> 1) - 1) - dilatedKW_MOD) / (XAI_CNN_CONV_GET_STRIDEX(param))) + 1)), \ + XAI_ERR_DATASIZE, "Input and Output tile widths are inconsistent."); \ + } \ + if (dilatedKH_MOD % 2 != 0) \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM3(outT) <= (((XAI_TILE3D_GET_DIM3(inT) + (dilatedKH_MOD >> 1) \ + + (dilatedKH_MOD >> 1) - dilatedKH_MOD) / (XAI_CNN_CONV_GET_STRIDEY(param))) + 1)), \ + XAI_ERR_DATASIZE, "Input and Output tile heights are inconsistent.."); \ + } \ + else \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM3(outT) <=
(((XAI_TILE3D_GET_DIM3(inT) + (dilatedKH_MOD >> 1) \ + + ((dilatedKH_MOD >> 1) - 1) - dilatedKH_MOD) / (XAI_CNN_CONV_GET_STRIDEY(param))) + 1)), \ + XAI_ERR_DATASIZE, "Input and Output tile heights are inconsistent.."); \ + } \ + XAI_CHECK_ERROR(XAI_ARRAY_GET_WIDTH(biasArr) >= (XAI_TILE3D_GET_DIM1(outT)), XAI_ERR_DATASIZE, \ + "Width of Bias Array is less than number of Kernels."); \ + XAI_CHECK_ERROR((XAI_ARRAY_GET_WIDTH(offsetArr) >= \ + (XAI_TILE4D_GET_DIM2(coeffT) * XAI_TILE4D_GET_DIM3(coeffT) * (XAI_ALIGN_VAL(XAI_TILE3D_GET_DIM1(inT), 2 * XCHAL_IVPN_SIMD_WIDTH) >> 4))), \ + XAI_ERR_DATASIZE, "Input offset Array size should be equal to kernelHeight * kernelWidth * (ALIGN(InputChannels,16)/16)."); \ + +#define XAI_CHECK_CONSISTENCY_MOD_WHD_DWH(inT, coeffT, biasArr, outT, param) \ + uint16_t dilatedKW_MOD = (uint16_t) (XAI_CNN_CONV_GET_DILATIONX(param) * (XAI_TILE4D_GET_DIM3(coeffT) - 1) + 1); \ + uint16_t dilatedKH_MOD = (uint16_t) (XAI_CNN_CONV_GET_DILATIONY(param) * (XAI_TILE4D_GET_DIM4(coeffT) - 1) + 1); \ + XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM3(inT) == XAI_TILE4D_GET_DIM2(coeffT), XAI_ERR_DATASIZE, \ + "Number of Input Channels not equal to the number of channels in the Kernel."); \ + XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM1(outT) == XAI_TILE4D_GET_DIM1(coeffT), XAI_ERR_DATASIZE, \ + "Number of Output Channels not equal to the number of Kernels."); \ + if (dilatedKW_MOD % 2 != 0) \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2(outT) <= (((XAI_TILE3D_GET_DIM1(inT) + (dilatedKW_MOD >> 1) \ + + (dilatedKW_MOD >> 1) - dilatedKW_MOD) / (XAI_CNN_CONV_GET_STRIDEX(param))) + 1)), \ + XAI_ERR_DATASIZE, "Input and Output tile widths are inconsistent."); \ + } \ + else \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2(outT) <= (((XAI_TILE3D_GET_DIM1(inT) + (dilatedKW_MOD >> 1) \ + + ((dilatedKW_MOD >> 1) - 1) - dilatedKW_MOD) / (XAI_CNN_CONV_GET_STRIDEX(param))) + 1)), \ + XAI_ERR_DATASIZE, "Input and Output tile widths are inconsistent."); \ + } \ + if (dilatedKH_MOD % 2 != 0) \ 
+ { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM3(outT) <= (((XAI_TILE3D_GET_DIM2(inT) + (dilatedKH_MOD >> 1) \ + + (dilatedKH_MOD >> 1) - dilatedKH_MOD) / (XAI_CNN_CONV_GET_STRIDEY(param))) + 1)), \ + XAI_ERR_DATASIZE, "Input and Output tile heights are inconsistent.."); \ + } \ + else \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM3(outT) <= (((XAI_TILE3D_GET_DIM2(inT) + (dilatedKH_MOD >> 1) \ + + ((dilatedKH_MOD >> 1) - 1) - dilatedKH_MOD) / (XAI_CNN_CONV_GET_STRIDEY(param))) + 1)), \ + XAI_ERR_DATASIZE, "Input and Output tile heights are inconsistent.."); \ + } \ + XAI_CHECK_ERROR(XAI_ARRAY_GET_WIDTH(biasArr) >= XAI_TILE4D_GET_DIM1(coeffT), XAI_ERR_DATASIZE, \ + "Width of Bias Array is less than number of Kernels."); \ + XAI_CHECK_ERROR(XAI_ARRAY_GET_HEIGHT(biasArr) > 0, XAI_ERR_DATASIZE, \ + "Height of Bias Array should be greater than zero."); + +/* Consistency check for a convolution whose tiles are read as DIM1 = width, DIM2 = height, DIM3 = channels and whose coefficient tile is read as DIM1 = kernel width, DIM2 = kernel height, DIM3 = input channels, DIM4 = kernels. Verifies channel counts, bounds the output width/height by the value derived from the dilated kernel size and the stride, and checks the bias array width and height. Expands to bare statements that declare dilatedKW_MOW and dilatedKH_MOW in the caller's scope. The kernel size is taken from the coeffT argument and the bias height from the biasArr argument. */ +#define XAI_CHECK_CONSISTENCY_MOW_WHD(inT, coeffT, biasArr, outT, param) \ + XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM3(inT) == XAI_TILE4D_GET_DIM3(coeffT), XAI_ERR_DATASIZE, \ + "Number of Input Channels not equal to the number of channels in the Kernel."); \ + XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM3(outT) == XAI_TILE4D_GET_DIM4(coeffT), XAI_ERR_DATASIZE, \ + "Number of Output Channels not equal to the number of Kernels."); \ + uint16_t dilatedKW_MOW = (uint16_t) (XAI_CNN_CONV_GET_DILATION(param) * (XAI_TILE4D_GET_DIM1(coeffT) - 1) + 1); \ + uint16_t dilatedKH_MOW = (uint16_t) (XAI_CNN_CONV_GET_DILATION(param) * (XAI_TILE4D_GET_DIM2(coeffT) - 1) + 1); \ + if (dilatedKW_MOW % 2 != 0) \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM1(outT) <= (((XAI_TILE3D_GET_DIM1(inT) + (dilatedKW_MOW >> 1) \ + + (dilatedKW_MOW >> 1) - dilatedKW_MOW) / (XAI_CNN_CONV_GET_STRIDE(param))) + 1)), \ + XAI_ERR_DATASIZE, "Input and Output tile widths are inconsistent."); \ + } \ + else \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM1(outT) <= (((XAI_TILE3D_GET_DIM1(inT) + (dilatedKW_MOW >> 1) \ + + ((dilatedKW_MOW >> 1) - 1) - dilatedKW_MOW) /
(XAI_CNN_CONV_GET_STRIDE(param))) + 1)), \ + XAI_ERR_DATASIZE, "Input and Output tile widths are inconsistent."); \ + } \ + if (dilatedKH_MOW % 2 != 0) \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2(outT) <= (((XAI_TILE3D_GET_DIM2(inT) + (dilatedKH_MOW >> 1) \ + + (dilatedKH_MOW >> 1) - dilatedKH_MOW) / (XAI_CNN_CONV_GET_STRIDE(param))) + 1)), \ + XAI_ERR_DATASIZE, "Input and Output tile heights are inconsistent.."); \ + } \ + else \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2(outT) <= (((XAI_TILE3D_GET_DIM2(inT) + (dilatedKH_MOW >> 1) \ + + ((dilatedKH_MOW >> 1) - 1) - dilatedKH_MOW) / (XAI_CNN_CONV_GET_STRIDE(param))) + 1)), \ + XAI_ERR_DATASIZE, "Input and Output tile heights are inconsistent.."); \ + } \ + XAI_CHECK_ERROR(XAI_ARRAY_GET_WIDTH(biasArr) >= XAI_TILE3D_GET_DIM3(outT), XAI_ERR_DATASIZE, \ + "Width of Bias Array is less than number of Kernels."); \ + XAI_CHECK_ERROR(XAI_ARRAY_GET_HEIGHT(biasArr) > 0, XAI_ERR_DATASIZE, \ + "Height of Bias Array should be greater than zero."); + +/* outT is assumed to be ID16WH */ +/* inT is assumed to be DWH */ +/* coeffT is assumed to be RMOD_DWH_ID16WH */ +#if (XCHAL_IVPN_SIMD_WIDTH == 64) +#define XAI_CHECK_CONSISTENCY_MOD_DWH_ID16WH(inT, coeffT, biasArr, outT, param) \ + { \ + uint16_t dilationX = XAI_CNN_CONV_GET_DILATIONX(param); \ + uint16_t dilationY = XAI_CNN_CONV_GET_DILATIONY(param); \ + int32_t dilatedkWidth = dilationX * (XAI_TILE4D_GET_DIM2(coeffT) - 1) + 1; \ + int32_t dilatedkHeight = dilationY * (XAI_TILE4D_GET_DIM3(coeffT) - 1) + 1; \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2(outT) << 4) == (XAI_TILE4D_GET_DIM4(coeffT) << 4), XAI_ERR_DATASIZE, \ + "Number of Output Channels not equal to the number of Kernels."); \ + if (dilatedkWidth % 2 != 0) \ + { \ + XAI_CHECK_ERROR(((XAI_TILE3D_GET_DIM1(outT) >> 4) <= (((XAI_TILE3D_GET_DIM2(inT) + (dilatedkWidth >> 1) + (dilatedkWidth >> 1) \ + - dilatedkWidth) / (XAI_CNN_CONV_GET_STRIDEX(param))) + 1)), \ + XAI_ERR_DATASIZE, "Input and Output tile widths are
inconsistent."); \ + } \ + else \ + { \ + XAI_CHECK_ERROR(((XAI_TILE3D_GET_DIM1(outT) >> 4) <= (((XAI_TILE3D_GET_DIM2(inT) + \ + (dilatedkWidth >> 1) + ((dilatedkWidth >> 1) - 1) - \ + dilatedkWidth) / (XAI_CNN_CONV_GET_STRIDEX(param))) + 1)), \ + XAI_ERR_DATASIZE, "Input and Output tile widths are inconsistent."); \ + } \ + if (dilatedkHeight % 2 != 0) \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM3(outT) <= (((XAI_TILE3D_GET_DIM3(inT) + (dilatedkHeight >> 1) + (dilatedkHeight >> 1) \ + - dilatedkHeight) / (XAI_CNN_CONV_GET_STRIDEY(param))) + 1)), \ + XAI_ERR_DATASIZE, "Input and Output tile heights are inconsistent."); \ + } \ + else \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM3(outT) <= (((XAI_TILE3D_GET_DIM3(inT) + (dilatedkHeight >> 1) + ((dilatedkHeight >> 1) - 1) \ + - dilatedkHeight) / (XAI_CNN_CONV_GET_STRIDEY(param))) + 1)), \ + XAI_ERR_DATASIZE, "Input and Output tile heights are inconsistent."); \ + } \ + if (XAI_TILE4D_GET_DATA_ORDER(coeffT) == XAI_RMOD_DWH_I16_ID16WH) \ + { \ + XAI_CHECK_ERROR(((XAI_TILE4D_GET_DIM1(coeffT) >> 4) == 4), XAI_ERR_DATASIZE, \ + "Number of Input Channels in the kernel after zero padding (if any) should be 4."); \ + } \ + else if (XAI_TILE4D_GET_DATA_ORDER(coeffT) == XAI_RMOD_DWH_ID16WH) \ + { \ + XAI_CHECK_ERROR(((XAI_TILE4D_GET_DIM1(coeffT) >> 5) == 4), XAI_ERR_DATASIZE, \ + "Number of Input Channels in the kernel after zero padding (if any) should be 4."); \ + } \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM1(inT) <= 4), XAI_ERR_DATASIZE, \ + "Number of Input Channels should be less than equal to 4."); \ + XAI_CHECK_ERROR((XAI_ALIGN_VAL(XAI_ARRAY_GET_WIDTH(biasArr), 16) == (XAI_TILE3D_GET_DIM2(outT) << 4)), XAI_ERR_DATASIZE, \ + "Width of Bias Array is less than or equal to the number of output channels."); \ + XAI_CHECK_ERROR(XAI_ARRAY_GET_HEIGHT(biasArr) > 0, XAI_ERR_DATASIZE, \ + "Height of Bias Array should be greater than zero."); \ + } + +#else +#define XAI_CHECK_CONSISTENCY_MOD_DWH_ID16WH(inT, coeffT, biasArr, outT,
param) \ + { \ + uint16_t dilationX = XAI_CNN_CONV_GET_DILATIONX(param); \ + uint16_t dilationY = XAI_CNN_CONV_GET_DILATIONY(param); \ + int32_t dilatedkWidth = dilationX * (XAI_TILE4D_GET_DIM2(coeffT) - 1) + 1; \ + int32_t dilatedkHeight = dilationY * (XAI_TILE4D_GET_DIM3(coeffT) - 1) + 1; \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2(outT) << 4) == (XAI_TILE4D_GET_DIM4(coeffT) << 4), XAI_ERR_DATASIZE, \ + "Number of Output Channels not equal to the number of Kernels."); \ + if (dilatedkWidth % 2 != 0) \ + { \ + XAI_CHECK_ERROR(((XAI_TILE3D_GET_DIM1(outT) >> 4) <= (((XAI_TILE3D_GET_DIM2(inT) + (dilatedkWidth >> 1) + (dilatedkWidth >> 1) \ + - dilatedkWidth) / (XAI_CNN_CONV_GET_STRIDEX(param))) + 1)), \ + XAI_ERR_DATASIZE, "Input and Output tile widths are inconsistent."); \ + } \ + else \ + { \ + XAI_CHECK_ERROR(((XAI_TILE3D_GET_DIM1(outT) >> 4) <= (((XAI_TILE3D_GET_DIM2(inT) + \ + (dilatedkWidth >> 1) + ((dilatedkWidth >> 1) - 1) - \ + dilatedkWidth) / (XAI_CNN_CONV_GET_STRIDEX(param))) + 1)), \ + XAI_ERR_DATASIZE, "Input and Output tile widths are inconsistent."); \ + } \ + if (dilatedkHeight % 2 != 0) \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM3(outT) <= (((XAI_TILE3D_GET_DIM3(inT) + (dilatedkHeight >> 1) + (dilatedkHeight >> 1) \ + - dilatedkHeight) / (XAI_CNN_CONV_GET_STRIDEY(param))) + 1)), \ + XAI_ERR_DATASIZE, "Input and Output tile heights are inconsistent."); \ + } \ + else \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM3(outT) <= (((XAI_TILE3D_GET_DIM3(inT) + (dilatedkHeight >> 1) + ((dilatedkHeight >> 1) - 1) \ + - dilatedkHeight) / (XAI_CNN_CONV_GET_STRIDEY(param))) + 1)), \ + XAI_ERR_DATASIZE, "Input and Output tile heights are inconsistent."); \ + } \ + XAI_CHECK_ERROR(((XAI_TILE4D_GET_DIM1(coeffT) >> 4) == 4), XAI_ERR_DATASIZE, \ + "Number of Input Channels in the kernel after zero padding (if any) should be 4."); \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM1(inT) <= 4), XAI_ERR_DATASIZE, \ + "Number of Input Channels should be less than equal to 4."); \ + 
XAI_CHECK_ERROR((XAI_ALIGN_VAL(XAI_ARRAY_GET_WIDTH(biasArr), 16) == (XAI_TILE3D_GET_DIM2(outT) << 4)), XAI_ERR_DATASIZE, \ + "Width of Bias Array is less than or equal to the number of output channels."); \ + XAI_CHECK_ERROR(XAI_ARRAY_GET_HEIGHT(biasArr) > 0, XAI_ERR_DATASIZE, \ + "Height of Bias Array should be greater than zero."); \ + } +#endif + +/* Consistency check for a depthwise dilated convolution on tiles read as DIM1 = width, DIM2 = height, DIM3 = channels; coeffT is a 3D tile read the same way. Verifies that input channels times the depth multiplier and the output channels both equal DIM3 of coeffT, bounds the output width/height by the value derived from the dilated kernel size and the stride, and checks the bias array width and height. Expands to bare statements that declare dilatedKW_MOW and dilatedKH_MOW in the caller's scope. The kernel size is taken from the coeffT argument and the bias height from the biasArr argument. */ +#define XAI_CHECK_CONSISTENCY_DEPTHWISE_DILATED_MOW_WHD(inT, coeffT, biasArr, outT, param) \ + XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM3(inT) * XAI_CNN_DEPTHWISE_DILATED_CONV_GET_DEPTH_MULTIPLIER(param) == \ + XAI_TILE3D_GET_DIM3(coeffT), XAI_ERR_DATASIZE, \ + "Number of Input Channels not equal to the number of channels in the Kernel."); \ + XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM3(outT) == XAI_TILE3D_GET_DIM3(coeffT), XAI_ERR_DATASIZE, \ + "Number of Output Channels not equal to the number of Kernels."); \ + uint16_t dilatedKW_MOW = (uint16_t) (XAI_CNN_DEPTHWISE_DILATED_CONV_GET_DILATION(param) * \ + (XAI_TILE3D_GET_DIM1(coeffT) - 1) + 1); \ + uint16_t dilatedKH_MOW = (uint16_t) (XAI_CNN_DEPTHWISE_DILATED_CONV_GET_DILATION(param) * \ + (XAI_TILE3D_GET_DIM2(coeffT) - 1) + 1); \ + if (dilatedKW_MOW % 2 != 0) \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM1(outT) <= (((XAI_TILE3D_GET_DIM1(inT) + (dilatedKW_MOW >> 1) \ + + (dilatedKW_MOW >> 1) - dilatedKW_MOW) / (XAI_CNN_DEPTHWISE_DILATED_CONV_GET_STRIDE(param))) + 1)), \ + XAI_ERR_DATASIZE, "Input and Output tile widths are inconsistent."); \ + } \ + else \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM1(outT) <= (((XAI_TILE3D_GET_DIM1(inT) + (dilatedKW_MOW >> 1) \ + + ((dilatedKW_MOW >> 1) - 1) - dilatedKW_MOW) / (XAI_CNN_DEPTHWISE_DILATED_CONV_GET_STRIDE(param))) \ + + 1)), XAI_ERR_DATASIZE, "Input and Output tile widths are inconsistent."); \ + } \ + if (dilatedKH_MOW % 2 != 0) \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2(outT) <= (((XAI_TILE3D_GET_DIM2(inT) + (dilatedKH_MOW >> 1) \ + + (dilatedKH_MOW >> 1) - dilatedKH_MOW) /
(XAI_CNN_DEPTHWISE_DILATED_CONV_GET_STRIDE(param))) + 1)), \ + XAI_ERR_DATASIZE, "Input and Output tile heights are inconsistent."); \ + } \ + else \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2(outT) <= (((XAI_TILE3D_GET_DIM2(inT) + (dilatedKH_MOW >> 1) \ + + ((dilatedKH_MOW >> 1) - 1) - dilatedKH_MOW) / (XAI_CNN_DEPTHWISE_DILATED_CONV_GET_STRIDE(param))) \ + + 1)), XAI_ERR_DATASIZE, "Input and Output tile heights are inconsistent."); \ + } \ + XAI_CHECK_ERROR(XAI_ARRAY_GET_WIDTH(biasArr) >= XAI_TILE3D_GET_DIM3(outT), XAI_ERR_DATASIZE, \ + "Width of Bias Array is less than number of Kernels."); \ + XAI_CHECK_ERROR(XAI_ARRAY_GET_HEIGHT(biasArr) > 0, XAI_ERR_DATASIZE, \ + "Height of Bias Array should be greater than zero."); + +/* Consistency check for a convolution whose tiles are read as DIM1 = channels, DIM2 = width, DIM3 = height and whose coefficient tile is read as DIM1 = input channels, DIM2 = kernel width, DIM3 = kernel height, DIM4 = kernels. Expands to bare statements that declare dilatedKW_SO and dilatedKH_SO in the caller's scope. The kernel size is taken from the coeffT argument and the bias height from the biasArr argument. */ +#define XAI_CHECK_CONSISTENCY_SO_DWH(inT, coeffT, biasArr, outT, param) \ + XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM1(inT) == XAI_TILE4D_GET_DIM1(coeffT), XAI_ERR_DATASIZE, \ + "Number of Input Channels not equal to the number of channels in the Kernel."); \ + XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM1(outT) == XAI_TILE4D_GET_DIM4(coeffT), XAI_ERR_DATASIZE, \ + "Number of Output Channels not equal to the number of Kernels."); \ + uint16_t dilatedKW_SO = (uint16_t) (XAI_CNN_CONV_GET_DILATION(param) * (XAI_TILE4D_GET_DIM2(coeffT) - 1) + 1); \ + uint16_t dilatedKH_SO = (uint16_t) (XAI_CNN_CONV_GET_DILATION(param) * (XAI_TILE4D_GET_DIM3(coeffT) - 1) + 1); \ + if (dilatedKW_SO % 2 != 0) \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2(outT) <= (((XAI_TILE3D_GET_DIM2(inT) + (dilatedKW_SO >> 1) \ + + (dilatedKW_SO >> 1) - dilatedKW_SO) / (XAI_CNN_CONV_GET_STRIDEX(param))) + 1)), \ + XAI_ERR_DATASIZE, "Input and Output tile widths are inconsistent."); \ + } \ + else \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2(outT) <= (((XAI_TILE3D_GET_DIM2(inT) + (dilatedKW_SO >> 1) \ + + ((dilatedKW_SO >> 1) - 1) - dilatedKW_SO) / (XAI_CNN_CONV_GET_STRIDEX(param))) + 1)), \ + XAI_ERR_DATASIZE, "Input and Output tile widths are inconsistent."); \ + } \ + if
(dilatedKH_SO % 2 != 0) \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM3(outT) <= (((XAI_TILE3D_GET_DIM3(inT) + (dilatedKH_SO >> 1) \ + + (dilatedKH_SO >> 1) - dilatedKH_SO) / (XAI_CNN_CONV_GET_STRIDEY(param))) + 1)), \ + XAI_ERR_DATASIZE, "Input and Output tile heights are inconsistent."); \ + } \ + else \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM3(outT) <= (((XAI_TILE3D_GET_DIM3(inT) + (dilatedKH_SO >> 1) \ + + ((dilatedKH_SO >> 1) - 1) - dilatedKH_SO) / (XAI_CNN_CONV_GET_STRIDEY(param))) + 1)), \ + XAI_ERR_DATASIZE, "Input and Output tile heights are inconsistent."); \ + } \ + XAI_CHECK_ERROR(XAI_ARRAY_GET_WIDTH(biasArr) >= XAI_TILE4D_GET_DIM4(coeffT), XAI_ERR_DATASIZE, \ + "Width of Bias Array is less than number of Kernels."); \ + XAI_CHECK_ERROR(XAI_ARRAY_GET_HEIGHT(biasArr) > 0, XAI_ERR_DATASIZE, \ + "Height of Bias Array should be greater than zero."); + +#define XAI_CHECK_COEFFTILE_CONTIGUOUS(coeffT, param) \ + XAI_CHECK_ERROR((XAI_TILE4D_GET_DIM1_PITCH(coeffT) == XAI_TILE4D_GET_DIM1(coeffT)) && \ + (XAI_TILE4D_GET_DIM2_PITCH(coeffT) == XAI_TILE4D_GET_DIM1(coeffT) * \ + XAI_TILE4D_GET_DIM2(coeffT)), XAI_ERR_BADARG, \ + "CoeffTile is not contiguous."); + +#define XAI_CHECK_CONSISTENCY_POOL_WHD(inT, outT, param) \ + XAI_CHECK_ERROR((XAI_CNN_POOLING_GET_STRIDEX(param) > 0) && (XAI_CNN_POOLING_GET_STRIDEY(param) > 0), \ + XAI_ERR_BADARG, "\nStrideX = %hhu, StrideY = %hhu\nStride along width and height must be greater than 0", \ + XAI_CNN_POOLING_GET_STRIDEX(param), XAI_CNN_POOLING_GET_STRIDEY(param)); \ + XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM3(inT) == XAI_TILE3D_GET_DIM3(outT), \ + XAI_ERR_CHANNEL_INVALID, "Number of input and output channels don't match"); \ + if (XAI_CNN_POOLING_GET_KERNELWIDTH(param) % 2 != 0) \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM1_EDGE1(inT) >= (XAI_CNN_POOLING_GET_KERNELWIDTH(param) / 2) && \ + XAI_TILE3D_GET_DIM1_EDGE2(inT) >= (XAI_CNN_POOLING_GET_KERNELWIDTH(param) / 2)), \ + XAI_ERR_EDGE, "Invalid edge for odd kernel size"); \ +
XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM1(outT) <= (((XAI_TILE3D_GET_DIM1(inT) + (XAI_CNN_POOLING_GET_KERNELWIDTH(param) >> 1) \ + + (XAI_CNN_POOLING_GET_KERNELWIDTH(param) >> 1) - XAI_CNN_POOLING_GET_KERNELWIDTH(param)) / (XAI_CNN_POOLING_GET_STRIDEX(param))) + 1)), \ + XAI_ERR_DATASIZE, "Input and Output tile widths are inconsistent."); \ + } \ + else \ + { \ + if (XAI_CNN_POOLING_GET_LEFTEDGE_FLAG(param)) \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM1_EDGE1(inT) >= (XAI_CNN_POOLING_GET_KERNELWIDTH(param) / 2) && \ + XAI_TILE3D_GET_DIM1_EDGE2(inT) >= ((XAI_CNN_POOLING_GET_KERNELWIDTH(param) / 2) - 1)), \ + XAI_ERR_EDGE, "Invalid edge for even kernel size with left edge flag set"); \ + } \ + else \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM1_EDGE1(inT) >= ((XAI_CNN_POOLING_GET_KERNELWIDTH(param) / 2) - 1) && \ + XAI_TILE3D_GET_DIM1_EDGE2(inT) >= (XAI_CNN_POOLING_GET_KERNELWIDTH(param) / 2)), \ + XAI_ERR_EDGE, "Invalid edge for even kernel size with left edge flag reset"); \ + } \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM1(outT) <= (((XAI_TILE3D_GET_DIM1(inT) + ((XAI_CNN_POOLING_GET_KERNELWIDTH(param) >> 1) - 1) \ + + (XAI_CNN_POOLING_GET_KERNELWIDTH(param) >> 1) - XAI_CNN_POOLING_GET_KERNELWIDTH(param)) / (XAI_CNN_POOLING_GET_STRIDEX(param))) + 1)), \ + XAI_ERR_DATASIZE, "Input and Output tile widths are inconsistent."); \ + } \ + if (XAI_CNN_POOLING_GET_KERNELHEIGHT(param) % 2 != 0) \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inT) >= (XAI_CNN_POOLING_GET_KERNELHEIGHT(param) / 2) && \ + XAI_TILE3D_GET_DIM2_EDGE2(inT) >= (XAI_CNN_POOLING_GET_KERNELHEIGHT(param) / 2)), \ + XAI_ERR_EDGE, "Invalid edge for odd kernel size"); \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2(outT) <= (((XAI_TILE3D_GET_DIM2(inT) + (XAI_CNN_POOLING_GET_KERNELHEIGHT(param) >> 1) \ + + (XAI_CNN_POOLING_GET_KERNELHEIGHT(param) >> 1) - XAI_CNN_POOLING_GET_KERNELHEIGHT(param)) / (XAI_CNN_POOLING_GET_STRIDEY(param))) + 1)), \ + XAI_ERR_DATASIZE, "Input and Output tile heights are inconsistent."); \ + 
} \ + else \ + { \ + if (XAI_CNN_POOLING_GET_TOPEDGE_FLAG(param)) \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inT) >= (XAI_CNN_POOLING_GET_KERNELHEIGHT(param) / 2) && \ + XAI_TILE3D_GET_DIM2_EDGE2(inT) >= ((XAI_CNN_POOLING_GET_KERNELHEIGHT(param) / 2) - 1)), \ + XAI_ERR_EDGE, "Invalid edge for even kernel size with top edge flag set"); \ + } \ + else \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inT) >= ((XAI_CNN_POOLING_GET_KERNELHEIGHT(param) / 2) - 1) && \ + XAI_TILE3D_GET_DIM2_EDGE2(inT) >= (XAI_CNN_POOLING_GET_KERNELHEIGHT(param) / 2)), \ + XAI_ERR_EDGE, "Invalid edge for even kernel size with top edge flag reset"); \ + } \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2(outT) <= (((XAI_TILE3D_GET_DIM2(inT) + ((XAI_CNN_POOLING_GET_KERNELHEIGHT(param) >> 1) - 1) \ + + (XAI_CNN_POOLING_GET_KERNELHEIGHT(param) >> 1) - XAI_CNN_POOLING_GET_KERNELHEIGHT(param)) / (XAI_CNN_POOLING_GET_STRIDEY(param))) + 1)), \ + XAI_ERR_DATASIZE, "Input and Output tile heights are inconsistent."); \ + } + +#define XAI_CHECK_CONSISTENCY_POOL_DWH(inT, outT, param) \ + XAI_CHECK_ERROR((XAI_CNN_POOLING_GET_STRIDEX(param) > 0) && (XAI_CNN_POOLING_GET_STRIDEY(param) > 0), \ + XAI_ERR_BADARG, "\nStrideX = %hhu, StrideY = %hhu\nStride along width and height must be greater than 0", \ + XAI_CNN_POOLING_GET_STRIDEX(param), XAI_CNN_POOLING_GET_STRIDEY(param)); \ + XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM1(inT) == XAI_TILE3D_GET_DIM1(outT), \ + XAI_ERR_CHANNEL_INVALID, "Number of input and output channels don't match"); \ + if ((XAI_CNN_POOLING_GET_KERNELWIDTH(param) % 2 != 0)) \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inT) >= (XAI_CNN_POOLING_GET_KERNELWIDTH(param) / 2) && \ + XAI_TILE3D_GET_DIM2_EDGE2(inT) >= (XAI_CNN_POOLING_GET_KERNELWIDTH(param) / 2)), \ + XAI_ERR_EDGE, "Invalid edge for odd kernel size"); \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2(outT) <= (((XAI_TILE3D_GET_DIM2(inT) + (XAI_CNN_POOLING_GET_KERNELWIDTH(param) >> 1) \ + + (XAI_CNN_POOLING_GET_KERNELWIDTH(param) >> 
1) - XAI_CNN_POOLING_GET_KERNELWIDTH(param)) / (XAI_CNN_POOLING_GET_STRIDEX(param))) + 1)), \ + XAI_ERR_DATASIZE, "Input and Output tile widths are inconsistent."); \ + } \ + else \ + { \ + if (XAI_CNN_POOLING_GET_LEFTEDGE_FLAG(param)) \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inT) >= (XAI_CNN_POOLING_GET_KERNELWIDTH(param) / 2) && \ + XAI_TILE3D_GET_DIM2_EDGE2(inT) >= ((XAI_CNN_POOLING_GET_KERNELWIDTH(param) / 2) - 1)), \ + XAI_ERR_EDGE, "Invalid edge for even kernel size with left edge flag set"); \ + } \ + else \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inT) >= ((XAI_CNN_POOLING_GET_KERNELWIDTH(param) / 2) - 1) && \ + XAI_TILE3D_GET_DIM2_EDGE2(inT) >= (XAI_CNN_POOLING_GET_KERNELWIDTH(param) / 2)), \ + XAI_ERR_EDGE, "Invalid edge for even kernel size with left edge flag reset"); \ + } \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2(outT) <= (((XAI_TILE3D_GET_DIM2(inT) + ((XAI_CNN_POOLING_GET_KERNELWIDTH(param) >> 1) - 1) \ + + (XAI_CNN_POOLING_GET_KERNELWIDTH(param) >> 1) - XAI_CNN_POOLING_GET_KERNELWIDTH(param)) / (XAI_CNN_POOLING_GET_STRIDEX(param))) + 1)), \ + XAI_ERR_DATASIZE, "Input and Output tile widths are inconsistent."); \ + } \ + if ((XAI_CNN_POOLING_GET_KERNELHEIGHT(param) % 2 != 0)) \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM3_EDGE1(inT) >= (XAI_CNN_POOLING_GET_KERNELHEIGHT(param) / 2) && \ + XAI_TILE3D_GET_DIM3_EDGE2(inT) >= (XAI_CNN_POOLING_GET_KERNELHEIGHT(param) / 2)), \ + XAI_ERR_EDGE, "Invalid edge for odd kernel size"); \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM3(outT) <= (((XAI_TILE3D_GET_DIM3(inT) + (XAI_CNN_POOLING_GET_KERNELHEIGHT(param) >> 1) \ + + (XAI_CNN_POOLING_GET_KERNELHEIGHT(param) >> 1) - XAI_CNN_POOLING_GET_KERNELHEIGHT(param)) / (XAI_CNN_POOLING_GET_STRIDEY(param))) + 1)), \ + XAI_ERR_DATASIZE, "Input and Output tile heights are inconsistent."); \ + } \ + else \ + { \ + if (XAI_CNN_POOLING_GET_TOPEDGE_FLAG(param)) \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM3_EDGE1(inT) >= (XAI_CNN_POOLING_GET_KERNELHEIGHT(param) / 
2) && \ + XAI_TILE3D_GET_DIM3_EDGE2(inT) >= ((XAI_CNN_POOLING_GET_KERNELHEIGHT(param) / 2) - 1)), \ + XAI_ERR_EDGE, "Invalid edge for even kernel size with top edge flag set"); \ + } \ + else \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM3_EDGE1(inT) >= ((XAI_CNN_POOLING_GET_KERNELHEIGHT(param) / 2) - 1) && \ + XAI_TILE3D_GET_DIM3_EDGE2(inT) >= (XAI_CNN_POOLING_GET_KERNELHEIGHT(param) / 2)), \ + XAI_ERR_EDGE, "Invalid edge for even kernel size with top edge flag reset"); \ + } \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM3(outT) <= (((XAI_TILE3D_GET_DIM3(inT) + ((XAI_CNN_POOLING_GET_KERNELHEIGHT(param) >> 1) - 1) \ + + (XAI_CNN_POOLING_GET_KERNELHEIGHT(param) >> 1) - XAI_CNN_POOLING_GET_KERNELHEIGHT(param)) / (XAI_CNN_POOLING_GET_STRIDEY(param))) + 1)), \ + XAI_ERR_DATASIZE, "Input and Output tile heights are inconsistent."); \ + } + +#define XAI_CHECK_CONSISTENCY_POOL_ID32WH(inT, outT, param) \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2(inT) << 5) == (XAI_TILE3D_GET_DIM2(outT) << 5), \ + XAI_ERR_CHANNEL_INVALID, "Number of input and output channels don't match"); \ + if ((XAI_CNN_POOLING_GET_KERNELWIDTH(param) % 2 != 0)) \ + { \ + XAI_CHECK_ERROR(((XAI_TILE3D_GET_DIM1_EDGE1(inT) >> 5) >= (XAI_CNN_POOLING_GET_KERNELWIDTH(param) / 2) && \ + (XAI_TILE3D_GET_DIM1_EDGE2(inT) >> 5) >= (XAI_CNN_POOLING_GET_KERNELWIDTH(param) / 2)), \ + XAI_ERR_EDGE, "Invalid edge for odd kernel size"); \ + XAI_CHECK_ERROR(((XAI_TILE3D_GET_DIM1(outT) >> 5) <= ((((XAI_TILE3D_GET_DIM1(inT) >> 5) + (XAI_CNN_POOLING_GET_KERNELWIDTH(param) >> 1) \ + + (XAI_CNN_POOLING_GET_KERNELWIDTH(param) >> 1) - XAI_CNN_POOLING_GET_KERNELWIDTH(param)) / (XAI_CNN_POOLING_GET_STRIDEX(param))) + 1)), \ + XAI_ERR_DATASIZE, "Input and Output tile widths are inconsistent."); \ + } \ + else \ + { \ + if (XAI_CNN_POOLING_GET_LEFTEDGE_FLAG(param)) \ + { \ + XAI_CHECK_ERROR(((XAI_TILE3D_GET_DIM1_EDGE1(inT) >> 5) >= (XAI_CNN_POOLING_GET_KERNELWIDTH(param) / 2) && \ + (XAI_TILE3D_GET_DIM1_EDGE2(inT) >> 5) >= 
((XAI_CNN_POOLING_GET_KERNELWIDTH(param) / 2) - 1)), \ + XAI_ERR_EDGE, "Invalid edge for even kernel size with left edge flag set"); \ + } \ + else \ + { \ + XAI_CHECK_ERROR(((XAI_TILE3D_GET_DIM1_EDGE1(inT) >> 5) >= ((XAI_CNN_POOLING_GET_KERNELWIDTH(param) / 2) - 1) && \ + (XAI_TILE3D_GET_DIM1_EDGE2(inT) >> 5) >= (XAI_CNN_POOLING_GET_KERNELWIDTH(param) / 2)), \ + XAI_ERR_EDGE, "Invalid edge for even kernel size with left edge flag reset"); \ + } \ + XAI_CHECK_ERROR(((XAI_TILE3D_GET_DIM1(outT) >> 5) <= ((((XAI_TILE3D_GET_DIM1(inT) >> 5) + ((XAI_CNN_POOLING_GET_KERNELWIDTH(param) >> 1) - 1) \ + + (XAI_CNN_POOLING_GET_KERNELWIDTH(param) >> 1) - XAI_CNN_POOLING_GET_KERNELWIDTH(param)) / (XAI_CNN_POOLING_GET_STRIDEX(param))) + 1)), \ + XAI_ERR_DATASIZE, "Input and Output tile widths are inconsistent."); \ + } \ + if ((XAI_CNN_POOLING_GET_KERNELHEIGHT(param) % 2 != 0)) \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM3_EDGE1(inT) >= (XAI_CNN_POOLING_GET_KERNELHEIGHT(param) / 2) && \ + XAI_TILE3D_GET_DIM3_EDGE2(inT) >= (XAI_CNN_POOLING_GET_KERNELHEIGHT(param) / 2)), \ + XAI_ERR_EDGE, "Invalid edge for odd kernel size"); \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM3(outT) <= (((XAI_TILE3D_GET_DIM3(inT) + (XAI_CNN_POOLING_GET_KERNELHEIGHT(param) >> 1) \ + + (XAI_CNN_POOLING_GET_KERNELHEIGHT(param) >> 1) - XAI_CNN_POOLING_GET_KERNELHEIGHT(param)) / (XAI_CNN_POOLING_GET_STRIDEY(param))) + 1)), \ + XAI_ERR_DATASIZE, "Input and Output tile heights are inconsistent."); \ + } \ + else \ + { \ + if (XAI_CNN_POOLING_GET_TOPEDGE_FLAG(param)) \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM3_EDGE1(inT) >= (XAI_CNN_POOLING_GET_KERNELHEIGHT(param) / 2) && \ + XAI_TILE3D_GET_DIM3_EDGE2(inT) >= ((XAI_CNN_POOLING_GET_KERNELHEIGHT(param) / 2) - 1)), \ + XAI_ERR_EDGE, "Invalid edge for even kernel size with top edge flag set"); \ + } \ + else \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM3_EDGE1(inT) >= ((XAI_CNN_POOLING_GET_KERNELHEIGHT(param) / 2) - 1) && \ + XAI_TILE3D_GET_DIM3_EDGE2(inT) >= 
(XAI_CNN_POOLING_GET_KERNELHEIGHT(param) / 2)), \ + XAI_ERR_EDGE, "Invalid edge for even kernel size with top edge flag reset"); \ + } \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM3(outT) <= (((XAI_TILE3D_GET_DIM3(inT) + ((XAI_CNN_POOLING_GET_KERNELHEIGHT(param) >> 1) - 1) \ + + (XAI_CNN_POOLING_GET_KERNELHEIGHT(param) >> 1) - XAI_CNN_POOLING_GET_KERNELHEIGHT(param)) / (XAI_CNN_POOLING_GET_STRIDEY(param))) + 1)), \ + XAI_ERR_DATASIZE, "Input and Output tile heights are inconsistent."); \ + } +#define XAI_CHECK_CONSISTENCY_POOL_ID16WH(inT, outT, param) \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2(inT) << 4) == (XAI_TILE3D_GET_DIM2(outT) << 4), \ + XAI_ERR_CHANNEL_INVALID, "Number of input and output channels don't match"); \ + if ((XAI_CNN_POOLING_GET_KERNELWIDTH(param) % 2 != 0)) \ + { \ + XAI_CHECK_ERROR(((XAI_TILE3D_GET_DIM1_EDGE1(inT) >> 4) >= (XAI_CNN_POOLING_GET_KERNELWIDTH(param) / 2) && \ + (XAI_TILE3D_GET_DIM1_EDGE2(inT) >> 4) >= (XAI_CNN_POOLING_GET_KERNELWIDTH(param) / 2)), \ + XAI_ERR_EDGE, "Invalid edge for odd kernel size"); \ + XAI_CHECK_ERROR(((XAI_TILE3D_GET_DIM1(outT) >> 4) <= ((((XAI_TILE3D_GET_DIM1(inT) >> 4) + (XAI_CNN_POOLING_GET_KERNELWIDTH(param) >> 1) \ + + (XAI_CNN_POOLING_GET_KERNELWIDTH(param) >> 1) - XAI_CNN_POOLING_GET_KERNELWIDTH(param)) / (XAI_CNN_POOLING_GET_STRIDEX(param))) + 1)), \ + XAI_ERR_DATASIZE, "Input and Output tile widths are inconsistent."); \ + } \ + else \ + { \ + if (XAI_CNN_POOLING_GET_LEFTEDGE_FLAG(param)) \ + { \ + XAI_CHECK_ERROR(((XAI_TILE3D_GET_DIM1_EDGE1(inT) >> 4) >= (XAI_CNN_POOLING_GET_KERNELWIDTH(param) / 2) && \ + (XAI_TILE3D_GET_DIM1_EDGE2(inT) >> 4) >= ((XAI_CNN_POOLING_GET_KERNELWIDTH(param) / 2) - 1)), \ + XAI_ERR_EDGE, "Invalid edge for even kernel size with left edge flag set"); \ + } \ + else \ + { \ + XAI_CHECK_ERROR(((XAI_TILE3D_GET_DIM1_EDGE1(inT) >> 4) >= ((XAI_CNN_POOLING_GET_KERNELWIDTH(param) / 2) - 1) && \ + (XAI_TILE3D_GET_DIM1_EDGE2(inT) >> 4) >= (XAI_CNN_POOLING_GET_KERNELWIDTH(param) / 2)), \ + 
XAI_ERR_EDGE, "Invalid edge for even kernel size with left edge flag reset"); \ + } \ + XAI_CHECK_ERROR(((XAI_TILE3D_GET_DIM1(outT) >> 4) <= ((((XAI_TILE3D_GET_DIM1(inT) >> 4) + ((XAI_CNN_POOLING_GET_KERNELWIDTH(param) >> 1) - 1) \ + + (XAI_CNN_POOLING_GET_KERNELWIDTH(param) >> 1) - XAI_CNN_POOLING_GET_KERNELWIDTH(param)) / (XAI_CNN_POOLING_GET_STRIDEX(param))) + 1)), \ + XAI_ERR_DATASIZE, "Input and Output tile widths are inconsistent."); \ + } \ + if ((XAI_CNN_POOLING_GET_KERNELHEIGHT(param) % 2 != 0)) \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM3_EDGE1(inT) >= (XAI_CNN_POOLING_GET_KERNELHEIGHT(param) / 2) && \ + XAI_TILE3D_GET_DIM3_EDGE2(inT) >= (XAI_CNN_POOLING_GET_KERNELHEIGHT(param) / 2)), \ + XAI_ERR_EDGE, "Invalid edge for odd kernel size"); \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM3(outT) <= (((XAI_TILE3D_GET_DIM3(inT) + (XAI_CNN_POOLING_GET_KERNELHEIGHT(param) >> 1) \ + + (XAI_CNN_POOLING_GET_KERNELHEIGHT(param) >> 1) - XAI_CNN_POOLING_GET_KERNELHEIGHT(param)) / (XAI_CNN_POOLING_GET_STRIDEY(param))) + 1)), \ + XAI_ERR_DATASIZE, "Input and Output tile heights are inconsistent."); \ + } \ + else \ + { \ + if (XAI_CNN_POOLING_GET_TOPEDGE_FLAG(param)) \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM3_EDGE1(inT) >= (XAI_CNN_POOLING_GET_KERNELHEIGHT(param) / 2) && \ + XAI_TILE3D_GET_DIM3_EDGE2(inT) >= ((XAI_CNN_POOLING_GET_KERNELHEIGHT(param) / 2) - 1)), \ + XAI_ERR_EDGE, "Invalid edge for even kernel size with top edge flag set"); \ + } \ + else \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM3_EDGE1(inT) >= ((XAI_CNN_POOLING_GET_KERNELHEIGHT(param) / 2) - 1) && \ + XAI_TILE3D_GET_DIM3_EDGE2(inT) >= (XAI_CNN_POOLING_GET_KERNELHEIGHT(param) / 2)), \ + XAI_ERR_EDGE, "Invalid edge for even kernel size with top edge flag reset"); \ + } \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM3(outT) <= (((XAI_TILE3D_GET_DIM3(inT) + ((XAI_CNN_POOLING_GET_KERNELHEIGHT(param) >> 1) - 1) \ + + (XAI_CNN_POOLING_GET_KERNELHEIGHT(param) >> 1) - XAI_CNN_POOLING_GET_KERNELHEIGHT(param)) / 
(XAI_CNN_POOLING_GET_STRIDEY(param))) + 1)), \ + XAI_ERR_DATASIZE, "Input and Output tile heights are inconsistent."); \ + } +#define XAI_CHECK_CONSISTENCY_UNPOOL_WHD(inT, outT, param) /* Unpooling tile consistency check for WHD tiles (DIM1 = width, DIM2 = height, DIM3 = depth, as used below). Note that the kernel edge requirements are checked on outT, not inT. */ \ + /* (Output width - 1) and (output height - 1) must be divisible by the stride */ \ + XAI_CHECK_ERROR((((XAI_TILE3D_GET_DIM1(outT) - 1) % XAI_CNN_POOLING_GET_STRIDEX(param)) == 0), \ + XAI_ERR_DATASIZE, "Number of output widths to be generated should be a multiple of strideX"); \ + XAI_CHECK_ERROR((((XAI_TILE3D_GET_DIM2(outT) - 1) % XAI_CNN_POOLING_GET_STRIDEY(param)) == 0), \ + XAI_ERR_DATASIZE, "Number of output heights to be generated should be a multiple of strideY"); \ + \ + /* Depth Should be same for in and out tiles */ \ + XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM3(inT) == XAI_TILE3D_GET_DIM3(outT), \ + XAI_ERR_CHANNEL_INVALID, "Number of input and output channels don't match"); \ + \ + /* Minimum required input width to compute requested output width */ \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM1(inT) >= ((XAI_TILE3D_GET_DIM1(outT) - 1) / \ + XAI_CNN_POOLING_GET_STRIDEX(param)) + 1), XAI_ERR_DATASIZE, \ + "Insufficient input width to generate requested output width"); \ + \ + /* Minimum required input height to compute requested output height */ \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2(inT) >= ((XAI_TILE3D_GET_DIM2(outT) - 1) / \ + XAI_CNN_POOLING_GET_STRIDEY(param)) + 1), XAI_ERR_DATASIZE, \ + "Insufficient input height to generate requested output height"); \ + \ + if (XAI_CNN_POOLING_GET_KERNELWIDTH(param) % 2 != 0) \ + { \ + /* Odd Width Kernel Edge Consistency */ \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM1_EDGE1(outT) >= (XAI_CNN_POOLING_GET_KERNELWIDTH(param) / 2) && \ + XAI_TILE3D_GET_DIM1_EDGE2(outT) >= (XAI_CNN_POOLING_GET_KERNELWIDTH(param) / 2)), \ + XAI_ERR_EDGE, "Invalid left/right edge for odd kernel width."); \ + } \ + else \ + { \ + /* Even Width Kernel Edge Consistency */ \ + if (XAI_CNN_POOLING_GET_LEFTEDGE_FLAG(param)) \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM1_EDGE1(outT) >=
(XAI_CNN_POOLING_GET_KERNELWIDTH(param) / 2) && \ + XAI_TILE3D_GET_DIM1_EDGE2(outT) >= ((XAI_CNN_POOLING_GET_KERNELWIDTH(param) / 2) - 1)), \ + XAI_ERR_EDGE, "Invalid left/right edge for even kernel width with leftedge flag set"); \ + } \ + else \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM1_EDGE1(outT) >= ((XAI_CNN_POOLING_GET_KERNELWIDTH(param) / 2) - 1) && \ + XAI_TILE3D_GET_DIM1_EDGE2(outT) >= (XAI_CNN_POOLING_GET_KERNELWIDTH(param) / 2)), \ + XAI_ERR_EDGE, "Invalid left/right edge for even kernel width with leftedge flag reset"); \ + } \ + } \ + if (XAI_CNN_POOLING_GET_KERNELHEIGHT(param) % 2 != 0) \ + { \ + /* Odd Height Kernel Edge Consistency */ \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(outT) >= (XAI_CNN_POOLING_GET_KERNELHEIGHT(param) / 2) && \ + XAI_TILE3D_GET_DIM2_EDGE2(outT) >= (XAI_CNN_POOLING_GET_KERNELHEIGHT(param) / 2)), \ + XAI_ERR_EDGE, "Invalid Top/Bottom edge for odd kernel height."); \ + } \ + else \ + { \ + /* Even Height Kernel Edge Consistency */ \ + if (XAI_CNN_POOLING_GET_TOPEDGE_FLAG(param)) \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(outT) >= (XAI_CNN_POOLING_GET_KERNELHEIGHT(param) / 2) && \ + XAI_TILE3D_GET_DIM2_EDGE2(outT) >= ((XAI_CNN_POOLING_GET_KERNELHEIGHT(param) / 2) - 1)), \ + XAI_ERR_EDGE, "Invalid top/bottom edge for even kernel height with topedge flag set"); \ + } \ + else \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(outT) >= ((XAI_CNN_POOLING_GET_KERNELHEIGHT(param) / 2) - 1) && \ + XAI_TILE3D_GET_DIM2_EDGE2(outT) >= (XAI_CNN_POOLING_GET_KERNELHEIGHT(param) / 2)), \ + XAI_ERR_EDGE, "Invalid top/bottom edge for even kernel height with topedge flag reset"); \ + } \ + } + +#define XAI_CHECK_CONSISTENCY_UNPOOL_DWH(inT, outT, param) /* Unpooling tile consistency check for DWH tiles (DIM1 = depth, DIM2 = width, DIM3 = height, as used below). Note that the kernel edge requirements are checked on outT, not inT. */ \ + /* (Output width - 1) and (output height - 1) must be divisible by the stride */ \ + XAI_CHECK_ERROR((((XAI_TILE3D_GET_DIM2(outT) - 1) % XAI_CNN_POOLING_GET_STRIDEX(param)) == 0), \ + XAI_ERR_DATASIZE, "Number of output widths to be generated should be a multiple of strideX"); \ +
XAI_CHECK_ERROR((((XAI_TILE3D_GET_DIM3(outT) - 1) % XAI_CNN_POOLING_GET_STRIDEY(param)) == 0), \ + XAI_ERR_DATASIZE, "Number of output heights to be generated should be a multiple of strideY"); \ + \ + /* Depth Should be same for in and out tiles */ \ + XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM1(inT) == XAI_TILE3D_GET_DIM1(outT), \ + XAI_ERR_CHANNEL_INVALID, "Number of input and output channels don't match"); \ + \ + /* Minimum required input width to compute requested output width */ \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2(inT) >= ((XAI_TILE3D_GET_DIM2(outT) - 1) / \ + XAI_CNN_POOLING_GET_STRIDEX(param)) + 1), XAI_ERR_DATASIZE, \ + "Insufficient input width to generate requested output width"); \ + \ + /* Minimum required input height to compute requested output height */ \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM3(inT) >= ((XAI_TILE3D_GET_DIM3(outT) - 1) / \ + XAI_CNN_POOLING_GET_STRIDEY(param)) + 1), XAI_ERR_DATASIZE, \ + "Insufficient input height to generate requested output height"); \ + \ + if (XAI_CNN_POOLING_GET_KERNELWIDTH(param) % 2 != 0) \ + { \ + /* Odd Width Kernel Edge Consistency */ \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(outT) >= (XAI_CNN_POOLING_GET_KERNELWIDTH(param) / 2) && \ + XAI_TILE3D_GET_DIM2_EDGE2(outT) >= (XAI_CNN_POOLING_GET_KERNELWIDTH(param) / 2)), \ + XAI_ERR_EDGE, "Invalid left/right edge for odd kernel width."); \ + } \ + else \ + { \ + /* Even Width Kernel Edge Consistency */ \ + if (XAI_CNN_POOLING_GET_LEFTEDGE_FLAG(param)) \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(outT) >= (XAI_CNN_POOLING_GET_KERNELWIDTH(param) / 2) && \ + XAI_TILE3D_GET_DIM2_EDGE2(outT) >= ((XAI_CNN_POOLING_GET_KERNELWIDTH(param) / 2) - 1)), \ + XAI_ERR_EDGE, "Invalid left/right edge for even kernel width with leftedge flag set"); \ + } \ + else \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(outT) >= ((XAI_CNN_POOLING_GET_KERNELWIDTH(param) / 2) - 1) && \ + XAI_TILE3D_GET_DIM2_EDGE2(outT) >= (XAI_CNN_POOLING_GET_KERNELWIDTH(param) / 2)), \ + 
XAI_ERR_EDGE, "Invalid left/right edge for even kernel width with leftedge flag reset"); \ + } \ + } \ + if (XAI_CNN_POOLING_GET_KERNELHEIGHT(param) % 2 != 0) \ + { \ + /* Odd Height Kernel Edge Consistency */ \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM3_EDGE1(outT) >= (XAI_CNN_POOLING_GET_KERNELHEIGHT(param) / 2) && \ + XAI_TILE3D_GET_DIM3_EDGE2(outT) >= (XAI_CNN_POOLING_GET_KERNELHEIGHT(param) / 2)), \ + XAI_ERR_EDGE, "Invalid Top/Bottom edge for odd kernel height."); \ + } \ + else \ + { \ + /* Even Height Kernel Edge Consistency */ \ + if (XAI_CNN_POOLING_GET_TOPEDGE_FLAG(param)) \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM3_EDGE1(outT) >= (XAI_CNN_POOLING_GET_KERNELHEIGHT(param) / 2) && \ + XAI_TILE3D_GET_DIM3_EDGE2(outT) >= ((XAI_CNN_POOLING_GET_KERNELHEIGHT(param) / 2) - 1)), \ + XAI_ERR_EDGE, "Invalid top/bottom edge for even kernel height with topedge flag set"); \ + } \ + else \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM3_EDGE1(outT) >= ((XAI_CNN_POOLING_GET_KERNELHEIGHT(param) / 2) - 1) && \ + XAI_TILE3D_GET_DIM3_EDGE2(outT) >= (XAI_CNN_POOLING_GET_KERNELHEIGHT(param) / 2)), \ + XAI_ERR_EDGE, "Invalid top/bottom edge for even kernel height with topedge flag reset"); \ + } \ + } + +#define XAI_CHECK_EDGES_MOW_WHD(inTile, coeffTile, param) /* Checks that inTile has enough DIM1 (width) and DIM2 (height) edge for the dilated kernel taken from coeffTile DIM1/DIM2; an even dilated size needs one less on the side opposite the left/top edge flag. NOTE: expands to unbraced declarations of dilatedKW and dilatedKH, so it can appear at most once per scope and those names must be unused in the caller. */ \ + uint16_t dilatedKW = (uint16_t) (XAI_CNN_CONV_GET_DILATION(param) * (XAI_TILE4D_GET_DIM1(coeffTile) - 1) + 1); \ + uint16_t dilatedKH = (uint16_t) (XAI_CNN_CONV_GET_DILATION(param) * (XAI_TILE4D_GET_DIM2(coeffTile) - 1) + 1); \ + if (dilatedKW % 2 != 0) \ + { \ + if (dilatedKH % 2 != 0) \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >= dilatedKW / 2) \ + && (XAI_TILE3D_GET_DIM1_EDGE2(inTile) >= dilatedKW / 2) \ + && (XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= dilatedKH / 2) \ + && (XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= dilatedKH / 2), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + else \ + { \ + if (XAI_CNN_CONV_GET_FLAG_TOPEDGE(param)) \ + { \ +
XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >= dilatedKW / 2) \ + && (XAI_TILE3D_GET_DIM1_EDGE2(inTile) >= dilatedKW / 2) \ + && (XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= dilatedKH / 2) \ + && (XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= ((dilatedKH / 2) - 1)), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + else \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >= dilatedKW / 2) \ + && (XAI_TILE3D_GET_DIM1_EDGE2(inTile) >= dilatedKW / 2) \ + && (XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= ((dilatedKH / 2) - 1)) \ + && (XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= dilatedKH / 2), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + } \ + } \ + else \ + { \ + if (dilatedKH % 2 != 0) \ + { \ + if (XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param)) \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >= dilatedKW / 2) \ + && (XAI_TILE3D_GET_DIM1_EDGE2(inTile) >= ((dilatedKW / 2) - 1)) \ + && (XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= dilatedKH / 2) \ + && (XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= dilatedKH / 2), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + else \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >= ((dilatedKW / 2) - 1)) \ + && (XAI_TILE3D_GET_DIM1_EDGE2(inTile) >= dilatedKW / 2) \ + && (XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= dilatedKH / 2) \ + && (XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= dilatedKH / 2), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + } \ + else \ + { \ + if (XAI_CNN_CONV_GET_FLAG_TOPEDGE(param)) \ + { \ + if (XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param)) \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >= (dilatedKW / 2) && \ + XAI_TILE3D_GET_DIM1_EDGE2(inTile) >= ((dilatedKW / 2) - 1) && \ + XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= (dilatedKH / 2) && \ + XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= ((dilatedKH / 2) - 1)), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + else \ + { \ + 
XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >= ((dilatedKW / 2) - 1) && \ + XAI_TILE3D_GET_DIM1_EDGE2(inTile) >= (dilatedKW / 2) && \ + XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= (dilatedKH / 2) && \ + XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= ((dilatedKH / 2) - 1)), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + } \ + else \ + { \ + if (XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param)) \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >= (dilatedKW / 2) && \ + XAI_TILE3D_GET_DIM1_EDGE2(inTile) >= (dilatedKW / 2 - 1) && \ + XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= ((dilatedKH / 2) - 1) && \ + XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= (dilatedKH / 2)), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + else \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >= ((dilatedKW / 2) - 1) && \ + XAI_TILE3D_GET_DIM1_EDGE2(inTile) >= (dilatedKW / 2) && \ + XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= ((dilatedKH / 2) - 1) && \ + XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= (dilatedKH / 2)), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + } \ + } \ + } + +/* outT is assumed to be ID4WH. Checks channel counts, bias array width/height and in/out tile sizes for a dilated convolution. The body is brace-wrapped and reaches the bias array only through the biasArr parameter. */ +#define XAI_CHECK_CONSISTENCY_MOD_ID4WH(inT, coeffT, biasArr, outT, param) \ + { \ + uint16_t dilationX = XAI_CNN_CONV_GET_DILATIONX(param); \ + uint16_t dilationY = XAI_CNN_CONV_GET_DILATIONY(param); \ + int32_t dilatedkWidth = dilationX * (XAI_TILE4D_GET_DIM2(coeffT) - 1) + 1; \ + int32_t dilatedkHeight = dilationY * (XAI_TILE4D_GET_DIM3(coeffT) - 1) + 1; \ + XAI_CHECK_ERROR((((XAI_TILE3D_GET_DIM2(outT) << 2) + 15) & (~15)) == (XAI_TILE4D_GET_DIM1(coeffT) >> 2), XAI_ERR_DATASIZE, \ + "Number of Output Channels not equal to the number of Kernels."); \ + XAI_CHECK_ERROR((XAI_ALIGN_VAL(XAI_ARRAY_GET_WIDTH(biasArr), 16) >= (XAI_TILE3D_GET_DIM2(outT) << 2)), XAI_ERR_DATASIZE, \ + "Width of Bias Array is less than number of output channels."); \ + if (dilatedkWidth % 2 != 0) \ + { \ +
XAI_CHECK_ERROR(((XAI_TILE3D_GET_DIM1(outT) >> 2) <= ((((XAI_TILE3D_GET_DIM1(inT) >> 2) + \ + (dilatedkWidth >> 1) + (dilatedkWidth >> 1) - \ + dilatedkWidth) / (XAI_CNN_CONV_GET_STRIDEX(param))) + 1)), \ + XAI_ERR_DATASIZE, "Input and Output tile widths are inconsistent."); \ + } \ + else \ + { \ + XAI_CHECK_ERROR(((XAI_TILE3D_GET_DIM1(outT) >> 2) <= ((((XAI_TILE3D_GET_DIM1(inT) >> 2) + \ + (dilatedkWidth >> 1) + ((dilatedkWidth >> 1) - 1) - \ + dilatedkWidth) / (XAI_CNN_CONV_GET_STRIDEX(param))) + 1)), \ + XAI_ERR_DATASIZE, "Input and Output tile widths are inconsistent."); \ + } \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2(inT) << 2) == (XAI_TILE4D_GET_DIM4(coeffT) << 2), XAI_ERR_DATASIZE, \ + "Number of Input Channels not equal to the number of channels in the Kernel."); \ + if (dilatedkHeight % 2 != 0) \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM3(outT) <= (((XAI_TILE3D_GET_DIM3(inT) + (dilatedkHeight >> 1) + (dilatedkHeight >> 1) \ + - dilatedkHeight) / (XAI_CNN_CONV_GET_STRIDEY(param))) + 1)), \ + XAI_ERR_DATASIZE, "Input and Output tile heights are inconsistent."); \ + } \ + else \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM3(outT) <= (((XAI_TILE3D_GET_DIM3(inT) + (dilatedkHeight >> 1) + ((dilatedkHeight >> 1) - 1) \ + - dilatedkHeight) / (XAI_CNN_CONV_GET_STRIDEY(param))) + 1)), \ + XAI_ERR_DATASIZE, "Input and Output tile heights are inconsistent."); \ + } \ + XAI_CHECK_ERROR(XAI_ARRAY_GET_HEIGHT(biasArr) > 0, XAI_ERR_DATASIZE, /* biasArr is the macro parameter; do not depend on a caller variable named biasArray */ \ + "Height of Bias Array should be greater than zero."); \ + } + +/* outT is assumed to be ID16WH */ +#define XAI_CHECK_CONSISTENCY_MOD_ID16WH(inT, coeffT, biasArr, outT, param) \ + { \ + uint16_t dilationX = XAI_CNN_CONV_GET_DILATIONX(param); \ + uint16_t dilationY = XAI_CNN_CONV_GET_DILATIONY(param); \ + int32_t dilatedkWidth = dilationX * (XAI_TILE4D_GET_DIM3(coeffT) - 1) + 1; \ + int32_t dilatedkHeight = dilationY * (XAI_TILE4D_GET_DIM2(coeffT) - 1) + 1; \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2(outT) << 4) ==
(XAI_TILE4D_GET_DIM4(coeffT) << 4), XAI_ERR_DATASIZE, \ + "Number of Output Channels not equal to the number of Kernels."); \ + XAI_CHECK_ERROR((XAI_ALIGN_VAL(XAI_ARRAY_GET_WIDTH(biasArr), 16) >= (XAI_TILE3D_GET_DIM2(outT) << 4)), XAI_ERR_DATASIZE, \ + "Width of Bias Array is less than number of output channels."); \ + if (dilatedkWidth % 2 != 0) \ + { \ + XAI_CHECK_ERROR(((XAI_TILE3D_GET_DIM1(outT) >> 4) <= ((((XAI_TILE3D_GET_DIM1(inT) >> 4) + \ + (dilatedkWidth >> 1) + (dilatedkWidth >> 1) - \ + dilatedkWidth) / (XAI_CNN_CONV_GET_STRIDEX(param))) + 1)), \ + XAI_ERR_DATASIZE, "Input and Output tile widths are inconsistent."); \ + } \ + else \ + { \ + XAI_CHECK_ERROR(((XAI_TILE3D_GET_DIM1(outT) >> 4) <= ((((XAI_TILE3D_GET_DIM1(inT) >> 4) + \ + (dilatedkWidth >> 1) + ((dilatedkWidth >> 1) - 1) - \ + dilatedkWidth) / (XAI_CNN_CONV_GET_STRIDEX(param))) + 1)), \ + XAI_ERR_DATASIZE, "Input and Output tile widths are inconsistent."); \ + } \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2(inT) << 4) == (XAI_TILE4D_GET_DIM1(coeffT) >> 4), XAI_ERR_DATASIZE, \ + "Number of Input Channels not equal to the number of channels in the Kernel."); \ + if (dilatedkHeight % 2 != 0) \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM3(outT) <= (((XAI_TILE3D_GET_DIM3(inT) + (dilatedkHeight >> 1) + (dilatedkHeight >> 1) \ + - dilatedkHeight) / (XAI_CNN_CONV_GET_STRIDEY(param))) + 1)), \ + XAI_ERR_DATASIZE, "Input and Output tile heights are inconsistent."); \ + } \ + else \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM3(outT) <= (((XAI_TILE3D_GET_DIM3(inT) + (dilatedkHeight >> 1) + ((dilatedkHeight >> 1) - 1) \ + - dilatedkHeight) / (XAI_CNN_CONV_GET_STRIDEY(param))) + 1)), \ + XAI_ERR_DATASIZE, "Input and Output tile heights are inconsistent."); \ + } \ + XAI_CHECK_ERROR(XAI_ARRAY_GET_HEIGHT(biasArr) > 0, XAI_ERR_DATASIZE, /* biasArr is the macro parameter; do not depend on a caller variable named biasArray */ \ + "Height of Bias Array should be greater than zero."); \ + } + +#define XAI_CHECK_CONSISTENCY_MOD_ID32WH(inT, coeffT, biasArr, outT, param) /* Dilated-convolution tile consistency check, ID32WH variant (per the macro name) */ \ + { \ + uint16_t dilationX =
XAI_CNN_CONV_GET_DILATIONX(param); \ + uint16_t dilationY = XAI_CNN_CONV_GET_DILATIONY(param); \ + int32_t dilatedkWidth = dilationX * (XAI_TILE4D_GET_DIM3(coeffT) - 1) + 1; \ + int32_t dilatedkHeight = dilationY * (XAI_TILE4D_GET_DIM2(coeffT) - 1) + 1; \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2(outT) << 5) == (XAI_TILE4D_GET_DIM4(coeffT) << 5), XAI_ERR_DATASIZE, \ + "Number of Output Channels not equal to the number of Kernels."); \ + XAI_CHECK_ERROR((XAI_ALIGN_VAL(XAI_ARRAY_GET_WIDTH(biasArr), 32) >= (XAI_TILE3D_GET_DIM2(outT) << 5)), XAI_ERR_DATASIZE, \ + "Width of Bias Array is less than number of output channels."); \ + if (dilatedkWidth % 2 != 0) \ + { \ + XAI_CHECK_ERROR(((XAI_TILE3D_GET_DIM1(outT) >> 5) <= ((((XAI_TILE3D_GET_DIM1(inT) >> 5) + \ + (dilatedkWidth >> 1) + (dilatedkWidth >> 1) - \ + dilatedkWidth) / (XAI_CNN_CONV_GET_STRIDEX(param))) + 1)), \ + XAI_ERR_DATASIZE, "Input and Output tile widths are inconsistent."); \ + } \ + else \ + { \ + XAI_CHECK_ERROR(((XAI_TILE3D_GET_DIM1(outT) >> 5) <= ((((XAI_TILE3D_GET_DIM1(inT) >> 5) + \ + (dilatedkWidth >> 1) + ((dilatedkWidth >> 1) - 1) - \ + dilatedkWidth) / (XAI_CNN_CONV_GET_STRIDEX(param))) + 1)), \ + XAI_ERR_DATASIZE, "Input and Output tile widths are inconsistent."); \ + } \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2(inT) << 5) == (XAI_TILE4D_GET_DIM1(coeffT) >> 5), XAI_ERR_DATASIZE, \ + "Number of Input Channels not equal to the number of channels in the Kernel."); \ + if (dilatedkHeight % 2 != 0) \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM3(outT) <= (((XAI_TILE3D_GET_DIM3(inT) + (dilatedkHeight >> 1) + (dilatedkHeight >> 1) \ + - dilatedkHeight) / (XAI_CNN_CONV_GET_STRIDEY(param))) + 1)), \ + XAI_ERR_DATASIZE, "Input and Output tile heights are inconsistent."); \ + } \ + else \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM3(outT) <= (((XAI_TILE3D_GET_DIM3(inT) + (dilatedkHeight >> 1) + ((dilatedkHeight >> 1) - 1) \ + - dilatedkHeight) / (XAI_CNN_CONV_GET_STRIDEY(param))) + 1)), \ + XAI_ERR_DATASIZE,
"Input and Output tile heights are inconsistent."); \ + } \ + XAI_CHECK_ERROR(XAI_ARRAY_GET_HEIGHT(biasArr) > 0, XAI_ERR_DATASIZE, /* biasArr is the macro parameter; do not depend on a caller variable named biasArray */ \ + "Height of Bias Array should be greater than zero."); \ + } +// Assuming that "inTile" is in DWH format +// Assuming that "coeffTile" is in RMOD_DWH_ID16WH format +#define XAI_CHECK_EDGES_MOD_DWH_ID16WH(inTile, coeffTile, param) /* Checks that inTile has enough DIM2 (width) and DIM3 (height) edge for the dilated kernel. NOTE: expands to unbraced declarations of dilationX, dilationY, dilatedkWidth and dilatedkHeight, so it can appear at most once per scope and those names must be unused in the caller. */ \ + uint16_t dilationX = XAI_CNN_CONV_GET_DILATIONX(param); \ + uint16_t dilationY = XAI_CNN_CONV_GET_DILATIONY(param); \ + int32_t dilatedkWidth = dilationX * (XAI_TILE4D_GET_DIM2(coeffTile) - 1) + 1; \ + int32_t dilatedkHeight = dilationY * (XAI_TILE4D_GET_DIM3(coeffTile) - 1) + 1; \ + if (dilatedkWidth % 2 != 0) \ + { \ + if (dilatedkHeight % 2 != 0) \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= dilatedkWidth / 2) \ + && (XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= dilatedkWidth / 2) \ + && (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= dilatedkHeight / 2) \ + && (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= dilatedkHeight / 2), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + else \ + { \ + if (XAI_CNN_CONV_GET_FLAG_TOPEDGE(param)) \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= dilatedkWidth / 2) \ + && (XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= dilatedkWidth / 2) \ + && (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= dilatedkHeight / 2) \ + && (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= ((dilatedkHeight / 2) - 1)), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + else \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= dilatedkWidth / 2) \ + && (XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= dilatedkWidth / 2) \ + && (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= ((dilatedkHeight / 2) - 1)) \ + && (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= dilatedkHeight / 2), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + } \ + } \ + else \ + { \ + if (dilatedkHeight % 2 != 0) \ + { \ + if (XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param)) \ + { \
+ XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= dilatedkWidth / 2) \ + && (XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= (dilatedkWidth / 2) - 1) \ + && (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= dilatedkHeight / 2) \ + && (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= dilatedkHeight / 2), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + else \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= ((dilatedkWidth / 2) - 1)) \ + && (XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= dilatedkWidth / 2) \ + && (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= dilatedkHeight / 2) \ + && (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= dilatedkHeight / 2), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + } \ + else \ + { \ + if (XAI_CNN_CONV_GET_FLAG_TOPEDGE(param)) \ + { \ + if (XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param)) \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= (dilatedkWidth / 2) && \ + XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= ((dilatedkWidth / 2) - 1) && \ + XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= (dilatedkHeight / 2) && \ + XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= ((dilatedkHeight / 2) - 1)), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + else \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= ((dilatedkWidth / 2) - 1) && \ + XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= (dilatedkWidth / 2) && \ + XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= (dilatedkHeight / 2) && \ + XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= ((dilatedkHeight / 2) - 1)), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + } \ + else \ + { \ + if (XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param)) \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= (dilatedkWidth / 2) && \ + XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= ((dilatedkWidth / 2) - 1) && \ + XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= ((dilatedkHeight / 2) - 1) && \ + XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= (dilatedkHeight / 2)), \ + XAI_ERR_EDGE, "The input Tile 
doesn't have the required Edge Data"); \ + } \ + else \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= ((dilatedkWidth / 2) - 1) && \ + XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= (dilatedkWidth / 2) && \ + XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= ((dilatedkHeight / 2) - 1) && \ + XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= (dilatedkHeight / 2)), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + } \ + } \ + } +#define XAI_CHECK_EDGES_DEPTHWISE_DILATED_MOW_WHD(inTile, coeffTile, param) /* Depthwise-dilated variant of the edge check: inTile needs enough DIM1 (width) and DIM2 (height) edge for the dilated kernel taken from the 3D coeffTile DIM1/DIM2. NOTE: expands to unbraced declarations of dilatedKW and dilatedKH, so it can appear at most once per scope and those names must be unused in the caller. */ \ + uint16_t dilatedKW = (uint16_t) (XAI_CNN_DEPTHWISE_DILATED_CONV_GET_DILATION(param) * \ + (XAI_TILE3D_GET_DIM1(coeffTile) - 1) + 1); \ + uint16_t dilatedKH = (uint16_t) (XAI_CNN_DEPTHWISE_DILATED_CONV_GET_DILATION(param) * \ + (XAI_TILE3D_GET_DIM2(coeffTile) - 1) + 1); \ + if (dilatedKW % 2 != 0) \ + { \ + if (dilatedKH % 2 != 0) \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >= dilatedKW / 2) \ + && (XAI_TILE3D_GET_DIM1_EDGE2(inTile) >= dilatedKW / 2) \ + && (XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= dilatedKH / 2) \ + && (XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= dilatedKH / 2), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + else \ + { \ + if (XAI_CNN_DEPTHWISE_DILATED_CONV_GET_FLAG_TOPEDGE(param)) \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >= dilatedKW / 2) \ + && (XAI_TILE3D_GET_DIM1_EDGE2(inTile) >= dilatedKW / 2) \ + && (XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= dilatedKH / 2) \ + && (XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= ((dilatedKH / 2) - 1)), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + else \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >= dilatedKW / 2) \ + && (XAI_TILE3D_GET_DIM1_EDGE2(inTile) >= dilatedKW / 2) \ + && (XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= ((dilatedKH / 2) - 1)) \ + && (XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= dilatedKH / 2), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + } \ + } \ + else \
+ { \ + if (dilatedKH % 2 != 0) \ + { \ + if (XAI_CNN_DEPTHWISE_DILATED_CONV_GET_FLAG_LEFTEDGE(param)) \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >= dilatedKW / 2) \ + && (XAI_TILE3D_GET_DIM1_EDGE2(inTile) >= ((dilatedKW / 2) - 1)) \ + && (XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= dilatedKH / 2) \ + && (XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= dilatedKH / 2), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + else \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >= ((dilatedKW / 2) - 1)) \ + && (XAI_TILE3D_GET_DIM1_EDGE2(inTile) >= dilatedKW / 2) \ + && (XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= dilatedKH / 2) \ + && (XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= dilatedKH / 2), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + } \ + else \ + { \ + if (XAI_CNN_DEPTHWISE_DILATED_CONV_GET_FLAG_TOPEDGE(param)) \ + { \ + if (XAI_CNN_DEPTHWISE_DILATED_CONV_GET_FLAG_LEFTEDGE(param)) \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >= (dilatedKW / 2) && \ + XAI_TILE3D_GET_DIM1_EDGE2(inTile) >= ((dilatedKW / 2) - 1) && \ + XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= (dilatedKH / 2) && \ + XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= ((dilatedKH / 2) - 1)), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + else \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >= ((dilatedKW / 2) - 1) && \ + XAI_TILE3D_GET_DIM1_EDGE2(inTile) >= (dilatedKW / 2) && \ + XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= (dilatedKH / 2) && \ + XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= ((dilatedKH / 2) - 1)), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + } \ + else \ + { \ + if (XAI_CNN_DEPTHWISE_DILATED_CONV_GET_FLAG_LEFTEDGE(param)) \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >= (dilatedKW / 2) && \ + XAI_TILE3D_GET_DIM1_EDGE2(inTile) >= (dilatedKW / 2 - 1) && \ + XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= ((dilatedKH / 2) - 1) && \ + 
XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= (dilatedKH / 2)), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + else \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >= ((dilatedKW / 2) - 1) && \ + XAI_TILE3D_GET_DIM1_EDGE2(inTile) >= (dilatedKW / 2) && \ + XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= ((dilatedKH / 2) - 1) && \ + XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= (dilatedKH / 2)), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + } \ + } \ + } + +#define XAI_CHECK_EDGES_MOD_WHD(inTile, coeffTile, param) /* Checks that inTile has enough DIM1 (width) and DIM2 (height) edge for the dilated kernel taken from coeffTile DIM3/DIM4 with separate X/Y dilation. NOTE: expands to unbraced declarations of dilatedKW and dilatedKH, so it can appear at most once per scope and those names must be unused in the caller. */ \ + uint16_t dilatedKW = (uint16_t) (XAI_CNN_CONV_GET_DILATIONX(param) * (XAI_TILE4D_GET_DIM3(coeffTile) - 1) + 1); \ + uint16_t dilatedKH = (uint16_t) (XAI_CNN_CONV_GET_DILATIONY(param) * (XAI_TILE4D_GET_DIM4(coeffTile) - 1) + 1); \ + if (dilatedKW % 2 != 0) \ + { \ + if (dilatedKH % 2 != 0) \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >= dilatedKW / 2) \ + && (XAI_TILE3D_GET_DIM1_EDGE2(inTile) >= dilatedKW / 2) \ + && (XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= dilatedKH / 2) \ + && (XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= dilatedKH / 2), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + else \ + { \ + if (XAI_CNN_CONV_GET_FLAG_TOPEDGE(param)) \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >= dilatedKW / 2) \ + && (XAI_TILE3D_GET_DIM1_EDGE2(inTile) >= dilatedKW / 2) \ + && (XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= dilatedKH / 2) \ + && (XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= ((dilatedKH / 2) - 1)), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + else \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >= dilatedKW / 2) \ + && (XAI_TILE3D_GET_DIM1_EDGE2(inTile) >= dilatedKW / 2) \ + && (XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= ((dilatedKH / 2) - 1)) \ + && (XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= dilatedKH / 2), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + } \ + } \ + else \ + {
\ + if (dilatedKH % 2 != 0) \ + { \ + if (XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param)) \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >= dilatedKW / 2) \ + && (XAI_TILE3D_GET_DIM1_EDGE2(inTile) >= (dilatedKW / 2) - 1) \ + && (XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= dilatedKH / 2) \ + && (XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= dilatedKH / 2), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + else \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >= ((dilatedKW / 2) - 1)) \ + && (XAI_TILE3D_GET_DIM1_EDGE2(inTile) >= dilatedKW / 2) \ + && (XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= dilatedKH / 2) \ + && (XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= dilatedKH / 2), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + } \ + else \ + { \ + if (XAI_CNN_CONV_GET_FLAG_TOPEDGE(param)) \ + { \ + if (XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param)) \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >= (dilatedKW / 2) && \ + XAI_TILE3D_GET_DIM1_EDGE2(inTile) >= ((dilatedKW / 2) - 1) && \ + XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= (dilatedKH / 2) && \ + XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= ((dilatedKH / 2) - 1)), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + else \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >= ((dilatedKW / 2) - 1) && \ + XAI_TILE3D_GET_DIM1_EDGE2(inTile) >= (dilatedKW / 2) && \ + XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= (dilatedKH / 2) && \ + XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= ((dilatedKH / 2) - 1)), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + } \ + else \ + { \ + if (XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param)) \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >= (dilatedKW / 2) && \ + XAI_TILE3D_GET_DIM1_EDGE2(inTile) >= ((dilatedKW / 2) - 1) && \ + XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= ((dilatedKH / 2) - 1) && \ + XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= (dilatedKH / 2)), \ + XAI_ERR_EDGE, "The input Tile 
doesn't have the required Edge Data"); \ + } \ + else \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >= ((dilatedKW / 2) - 1) && \ + XAI_TILE3D_GET_DIM1_EDGE2(inTile) >= (dilatedKW / 2) && \ + XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= ((dilatedKH / 2) - 1) && \ + XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= (dilatedKH / 2)), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + } \ + } \ + } + + +#define XAI_CHECK_EDGES_MOD_DWH(inTile, coeffTile, param) /* Checks that inTile has enough DIM2 (width) and DIM3 (height) edge for the dilated kernel taken from coeffTile DIM3/DIM4 with separate X/Y dilation. NOTE: expands to unbraced declarations of dilatedKW and dilatedKH, so it can appear at most once per scope and those names must be unused in the caller. */ \ + uint16_t dilatedKW = (uint16_t) (XAI_CNN_CONV_GET_DILATIONX(param) * (XAI_TILE4D_GET_DIM3(coeffTile) - 1) + 1); \ + uint16_t dilatedKH = (uint16_t) (XAI_CNN_CONV_GET_DILATIONY(param) * (XAI_TILE4D_GET_DIM4(coeffTile) - 1) + 1); \ + if (dilatedKW % 2 != 0) \ + { \ + if (dilatedKH % 2 != 0) \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= dilatedKW / 2) \ + && (XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= dilatedKW / 2) \ + && (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= dilatedKH / 2) \ + && (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= dilatedKH / 2), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + else \ + { \ + if (XAI_CNN_CONV_GET_FLAG_TOPEDGE(param)) \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= dilatedKW / 2) \ + && (XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= dilatedKW / 2) \ + && (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= dilatedKH / 2) \ + && (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= ((dilatedKH / 2) - 1)), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + else \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= dilatedKW / 2) \ + && (XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= dilatedKW / 2) \ + && (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= ((dilatedKH / 2) - 1)) \ + && (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= dilatedKH / 2), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + } \ + } \ + else \ + { \ + if (dilatedKH % 2 != 0) \ + { \ + if (XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param)) \ + {
\ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= dilatedKW / 2) \ + && (XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= (dilatedKW / 2) - 1) \ + && (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= dilatedKH / 2) \ + && (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= dilatedKH / 2), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + else \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= ((dilatedKW / 2) - 1)) \ + && (XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= dilatedKW / 2) \ + && (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= dilatedKH / 2) \ + && (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= dilatedKH / 2), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + } \ + else \ + { \ + if (XAI_CNN_CONV_GET_FLAG_TOPEDGE(param)) \ + { \ + if (XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param)) \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= (dilatedKW / 2) && \ + XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= ((dilatedKW / 2) - 1) && \ + XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= (dilatedKH / 2) && \ + XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= ((dilatedKH / 2) - 1)), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + else \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= ((dilatedKW / 2) - 1) && \ + XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= (dilatedKW / 2) && \ + XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= (dilatedKH / 2) && \ + XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= ((dilatedKH / 2) - 1)), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + } \ + else \ + { \ + if (XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param)) \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= (dilatedKW / 2) && \ + XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= ((dilatedKW / 2) - 1) && \ + XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= ((dilatedKH / 2) - 1) && \ + XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= (dilatedKH / 2)), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + else \ + { \ + 
XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= ((dilatedKW / 2) - 1) && \ + XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= (dilatedKW / 2) && \ + XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= ((dilatedKH / 2) - 1) && \ + XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= (dilatedKH / 2)), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + } \ + } \ + } + +#define XAI_CHECK_EDGES_MOD_WHD_DWH(inTile, coeffTile, param) \ + uint16_t dilatedKW = (uint16_t) (XAI_CNN_CONV_GET_DILATIONX(param) * (XAI_TILE4D_GET_DIM3(coeffTile) - 1) + 1); \ + uint16_t dilatedKH = (uint16_t) (XAI_CNN_CONV_GET_DILATIONY(param) * (XAI_TILE4D_GET_DIM4(coeffTile) - 1) + 1); \ + if (dilatedKW % 2 != 0) \ + { \ + if (dilatedKH % 2 != 0) \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >= dilatedKW / 2) \ + && (XAI_TILE3D_GET_DIM1_EDGE2(inTile) >= dilatedKW / 2) \ + && (XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= dilatedKH / 2) \ + && (XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= dilatedKH / 2), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + else \ + { \ + if (XAI_CNN_CONV_GET_FLAG_TOPEDGE(param)) \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >= dilatedKW / 2) \ + && (XAI_TILE3D_GET_DIM1_EDGE2(inTile) >= dilatedKW / 2) \ + && (XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= dilatedKH / 2) \ + && (XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= ((dilatedKH / 2) - 1)), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + else \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >= dilatedKW / 2) \ + && (XAI_TILE3D_GET_DIM1_EDGE2(inTile) >= dilatedKW / 2) \ + && (XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= ((dilatedKH / 2) - 1)) \ + && (XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= dilatedKH / 2), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + } \ + } \ + else \ + { \ + if (dilatedKH % 2 != 0) \ + { \ + if (XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param)) \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >= 
dilatedKW / 2) \ + && (XAI_TILE3D_GET_DIM1_EDGE2(inTile) >= (dilatedKW / 2) - 1) \ + && (XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= dilatedKH / 2) \ + && (XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= dilatedKH / 2), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + else \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >= ((dilatedKW / 2) - 1)) \ + && (XAI_TILE3D_GET_DIM1_EDGE2(inTile) >= dilatedKW / 2) \ + && (XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= dilatedKH / 2) \ + && (XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= dilatedKH / 2), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + } \ + else \ + { \ + if (XAI_CNN_CONV_GET_FLAG_TOPEDGE(param)) \ + { \ + if (XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param)) \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >= (dilatedKW / 2) && \ + XAI_TILE3D_GET_DIM1_EDGE2(inTile) >= ((dilatedKW / 2) - 1) && \ + XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= (dilatedKH / 2) && \ + XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= ((dilatedKH / 2) - 1)), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + else \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >= ((dilatedKW / 2) - 1) && \ + XAI_TILE3D_GET_DIM1_EDGE2(inTile) >= (dilatedKW / 2) && \ + XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= (dilatedKH / 2) && \ + XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= ((dilatedKH / 2) - 1)), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + } \ + else \ + { \ + if (XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param)) \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >= (dilatedKW / 2) && \ + XAI_TILE3D_GET_DIM1_EDGE2(inTile) >= ((dilatedKW / 2) - 1) && \ + XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= ((dilatedKH / 2) - 1) && \ + XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= (dilatedKH / 2)), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + else \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >= ((dilatedKW / 2) - 1) && \ + 
XAI_TILE3D_GET_DIM1_EDGE2(inTile) >= (dilatedKW / 2) && \ + XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= ((dilatedKH / 2) - 1) && \ + XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= (dilatedKH / 2)), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + } \ + } \ + } + +#define XAI_CHECK_TILES3D_CHECK_EDGES_QUANT(inTile, outTile) /* Enforced only when inTile and outTile share a data pointer: outTile edge extents are bounded by twice the inTile pitch slack (dims 1-2) or twice the inTile edges (dim 3), and the inTile buffer pointer must be <= the outTile buffer pointer. */ \ + { \ + if (XAI_TILE3D_GET_DATA_PTR(inTile) == XAI_TILE3D_GET_DATA_PTR(outTile)) \ + { \ + XAI_CHECK_ERROR(((XAI_TILE3D_GET_DIM1_EDGE1(outTile) + XAI_TILE3D_GET_DIM1_EDGE2(outTile)) <= \ + (2 * (XAI_TILE3D_GET_DIM1_PITCH(inTile) - XAI_TILE3D_GET_DIM1(inTile)))), XAI_ERR_BADARG, \ + "Output and Input tile edges constraints have not been met along dimension 1"); \ + XAI_CHECK_ERROR(((XAI_TILE3D_GET_DIM1_PITCH(outTile) * (XAI_TILE3D_GET_DIM2_EDGE1(outTile) + XAI_TILE3D_GET_DIM2_EDGE2(outTile))) <= \ + (2 * (XAI_TILE3D_GET_DIM2_PITCH(inTile) - (XAI_TILE3D_GET_DIM1_PITCH(inTile) * XAI_TILE3D_GET_DIM2(inTile))))), \ + XAI_ERR_BADARG, "Output and Input tile edges constraints have not been met along dimension 2"); \ + XAI_CHECK_ERROR(((XAI_TILE3D_GET_DIM3_EDGE1(outTile) + XAI_TILE3D_GET_DIM3_EDGE2(outTile)) <= \ + (2 * (XAI_TILE3D_GET_DIM3_EDGE1(inTile) + XAI_TILE3D_GET_DIM3_EDGE2(inTile)))), XAI_ERR_BADARG, \ + "Output and Input tile edges constraints have not been met along dimension 3"); \ + XAI_CHECK_ERROR(((size_t) (XAI_TILE3D_GET_BUFF_PTR(inTile)) <= ((size_t) (XAI_TILE3D_GET_BUFF_PTR(outTile)))), XAI_ERR_BADARG, \ + "Output tile buffer pointer should be greater than or equal to input tile buffer pointer"); \ + } \ + } +#define XAI_CHECK_TILES4D_CHECK_EDGES_QUANT(inTile, outTile) /* 4D form of the shared-data-pointer check: outTile edge extents in dims 1-3 are bounded by twice the matching inTile pitch slack, and the inTile buffer pointer must be <= the outTile buffer pointer. Only XAI_TILE4D_* accessors are applied to the 4D tiles. */ \ + { \ + if (XAI_TILE4D_GET_DATA_PTR(inTile) == XAI_TILE4D_GET_DATA_PTR(outTile)) \ + { \ + XAI_CHECK_ERROR(((XAI_TILE4D_GET_DIM1_EDGE1(outTile) + XAI_TILE4D_GET_DIM1_EDGE2(outTile)) <= \ + (2 * (XAI_TILE4D_GET_DIM1_PITCH(inTile) - XAI_TILE4D_GET_DIM1(inTile)))), XAI_ERR_BADARG, \ + "Output and Input tile edges constraints have not been met along
dimension 1"); \ + XAI_CHECK_ERROR(((XAI_TILE4D_GET_DIM1_PITCH(outTile) * (XAI_TILE4D_GET_DIM2_EDGE1(outTile) + XAI_TILE4D_GET_DIM2_EDGE2(outTile))) <= \ + (2 * (XAI_TILE4D_GET_DIM2_PITCH(inTile) - (XAI_TILE4D_GET_DIM1_PITCH(inTile) * XAI_TILE4D_GET_DIM2(inTile))))), \ + XAI_ERR_BADARG, "Output and Input tile edges constraints have not been met along dimension 2"); \ + XAI_CHECK_ERROR(((XAI_TILE4D_GET_DIM2_PITCH(outTile) * (XAI_TILE4D_GET_DIM3_EDGE1(outTile) + XAI_TILE4D_GET_DIM3_EDGE2(outTile))) <= \ + (2 * (XAI_TILE4D_GET_DIM3_PITCH(inTile) - (XAI_TILE4D_GET_DIM2_PITCH(inTile) * XAI_TILE4D_GET_DIM3(inTile))))), \ + XAI_ERR_BADARG, "Output and Input tile edges constraints have not been met along dimension 3"); \ + XAI_CHECK_ERROR(((size_t) (XAI_TILE4D_GET_BUFF_PTR(inTile)) <= ((size_t) (XAI_TILE4D_GET_BUFF_PTR(outTile)))), XAI_ERR_BADARG, \ + "Output tile buffer pointer should be greater than or equal to input tile buffer pointer"); \ + } \ + } +#define XAI_CHECK_TILES3D_CHECK_EDGES_DEQUANT(inTile, outTile) /* Shared-data-pointer check with the tile roles swapped relative to the QUANT form: inTile edge extents are bounded by twice the outTile pitch slack (dims 1-2) or twice the outTile edges (dim 3), and the outTile buffer pointer must be <= the inTile buffer pointer. */ \ + { \ + if (XAI_TILE3D_GET_DATA_PTR(inTile) == XAI_TILE3D_GET_DATA_PTR(outTile)) \ + { \ + XAI_CHECK_ERROR(((XAI_TILE3D_GET_DIM1_EDGE1(inTile) + XAI_TILE3D_GET_DIM1_EDGE2(inTile)) <= \ + (2 * (XAI_TILE3D_GET_DIM1_PITCH(outTile) - XAI_TILE3D_GET_DIM1(outTile)))), XAI_ERR_BADARG, \ + "Output and Input tile edges constraints have not been met along dimension 1"); \ + XAI_CHECK_ERROR(((XAI_TILE3D_GET_DIM1_PITCH(inTile) * (XAI_TILE3D_GET_DIM2_EDGE1(inTile) + XAI_TILE3D_GET_DIM2_EDGE2(inTile))) <= \ + (2 * (XAI_TILE3D_GET_DIM2_PITCH(outTile) - (XAI_TILE3D_GET_DIM1_PITCH(outTile) * XAI_TILE3D_GET_DIM2(outTile))))), \ + XAI_ERR_BADARG, "Output and Input tile edges constraints have not been met along dimension 2"); \ + XAI_CHECK_ERROR(((XAI_TILE3D_GET_DIM3_EDGE1(inTile) + XAI_TILE3D_GET_DIM3_EDGE2(inTile)) <= \ + (2 * (XAI_TILE3D_GET_DIM3_EDGE1(outTile) + XAI_TILE3D_GET_DIM3_EDGE2(outTile)))), XAI_ERR_BADARG, \ + "Output and Input tile edges constraints have not been
met along dimension 3"); \ + XAI_CHECK_ERROR(((size_t) (XAI_TILE3D_GET_BUFF_PTR(outTile)) <= ((size_t) (XAI_TILE3D_GET_BUFF_PTR(inTile)))), XAI_ERR_BADARG, \ + "Input tile buffer pointer should be greater than or equal to output tile buffer pointer"); \ + } \ + } +#define XAI_CHECK_TILES4D_CHECK_EDGES_DEQUANT(inTile, outTile) /* Enforced only when inTile and outTile share a data pointer: inTile edge extents in dims 1-3 are bounded by twice the matching outTile pitch slack, and the outTile buffer pointer must be <= the inTile buffer pointer. Only XAI_TILE4D_* accessors are applied to the 4D tiles. */ \ + { \ + if (XAI_TILE4D_GET_DATA_PTR(inTile) == XAI_TILE4D_GET_DATA_PTR(outTile)) \ + { \ + XAI_CHECK_ERROR(((XAI_TILE4D_GET_DIM1_EDGE1(inTile) + XAI_TILE4D_GET_DIM1_EDGE2(inTile)) <= \ + (2 * (XAI_TILE4D_GET_DIM1_PITCH(outTile) - XAI_TILE4D_GET_DIM1(outTile)))), XAI_ERR_BADARG, \ + "Output and Input tile edges constraints have not been met along dimension 1"); \ + XAI_CHECK_ERROR(((XAI_TILE4D_GET_DIM1_PITCH(inTile) * (XAI_TILE4D_GET_DIM2_EDGE1(inTile) + XAI_TILE4D_GET_DIM2_EDGE2(inTile))) <= \ + (2 * (XAI_TILE4D_GET_DIM2_PITCH(outTile) - (XAI_TILE4D_GET_DIM1_PITCH(outTile) * XAI_TILE4D_GET_DIM2(outTile))))), \ + XAI_ERR_BADARG, "Output and Input tile edges constraints have not been met along dimension 2"); \ + XAI_CHECK_ERROR(((XAI_TILE4D_GET_DIM2_PITCH(inTile) * (XAI_TILE4D_GET_DIM3_EDGE1(inTile) + XAI_TILE4D_GET_DIM3_EDGE2(inTile))) <= \ + (2 * (XAI_TILE4D_GET_DIM3_PITCH(outTile) - (XAI_TILE4D_GET_DIM2_PITCH(outTile) * XAI_TILE4D_GET_DIM3(outTile))))), \ + XAI_ERR_BADARG, "Output and Input tile edges constraints have not been met along dimension 3"); \ + XAI_CHECK_ERROR(((size_t) (XAI_TILE4D_GET_BUFF_PTR(outTile)) <= ((size_t) (XAI_TILE4D_GET_BUFF_PTR(inTile)))), XAI_ERR_BADARG, \ + "Input tile buffer pointer should be greater than or equal to output tile buffer pointer"); \ + } \ + } + +#define XAI_CHECK_EDGES_MOD_DWH_IN16DWH(inTile, coeffTile, param) \ + uint16_t dilatedKW = (uint16_t) (XAI_CNN_CONV_GET_DILATIONX(param) * (XAI_TILE4D_GET_DIM2(coeffTile) - 1) + 1); \ + uint16_t dilatedKH = (uint16_t) (XAI_CNN_CONV_GET_DILATIONY(param) * (XAI_TILE4D_GET_DIM3(coeffTile) - 1) + 1); \ + if (dilatedKW % 2 != 0) \ + { \ + if (dilatedKH % 2 != 
0) \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= dilatedKW / 2) \ + && (XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= dilatedKW / 2) \ + && (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= dilatedKH / 2) \ + && (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= dilatedKH / 2), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + else \ + { \ + if (XAI_CNN_CONV_GET_FLAG_TOPEDGE(param)) \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= dilatedKW / 2) \ + && (XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= dilatedKW / 2) \ + && (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= dilatedKH / 2) \ + && (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= ((dilatedKH / 2) - 1)), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + else \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= dilatedKW / 2) \ + && (XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= dilatedKW / 2) \ + && (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= ((dilatedKH / 2) - 1)) \ + && (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= dilatedKH / 2), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + } \ + } \ + else \ + { \ + if (dilatedKH % 2 != 0) \ + { \ + if (XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param)) \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= dilatedKW / 2) \ + && (XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= (dilatedKW / 2) - 1) \ + && (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= dilatedKH / 2) \ + && (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= dilatedKH / 2), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + else \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= ((dilatedKW / 2) - 1)) \ + && (XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= dilatedKW / 2) \ + && (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= dilatedKH / 2) \ + && (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= dilatedKH / 2), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + } \ + else \ + { \ + if (XAI_CNN_CONV_GET_FLAG_TOPEDGE(param)) \ + { \ + if 
(XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param)) \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= (dilatedKW / 2) && \ + XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= ((dilatedKW / 2) - 1) && \ + XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= (dilatedKH / 2) && \ + XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= ((dilatedKH / 2) - 1)), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + else \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= ((dilatedKW / 2) - 1) && \ + XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= (dilatedKW / 2) && \ + XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= (dilatedKH / 2) && \ + XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= ((dilatedKH / 2) - 1)), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + } \ + else \ + { \ + if (XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param)) \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= (dilatedKW / 2) && \ + XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= ((dilatedKW / 2) - 1) && \ + XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= ((dilatedKH / 2) - 1) && \ + XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= (dilatedKH / 2)), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + else \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= ((dilatedKW / 2) - 1) && \ + XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= (dilatedKW / 2) && \ + XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= ((dilatedKH / 2) - 1) && \ + XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= (dilatedKH / 2)), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + } \ + } \ + } + +#if ((XCHAL_HAVE_VISION_HP_VFPU == 1) || (XCHAL_HAVE_CONNX_B_HP_VFPU == 1) || (defined(__clang__) && defined(XAI_REF_ONLY_COMPILATION))) +#define XAI_CHECK_EDGES_F16_MOD_DWH(inTile, coeffTile, param) \ + uint16_t dilatedKW = (uint16_t) (XAI_CNN_CONV_GET_DILATIONX(param) * (XAI_TILE4D_GET_DIM3(coeffTile) - 1) + 1); \ + uint16_t dilatedKH = (uint16_t) (XAI_CNN_CONV_GET_DILATIONY(param) * (XAI_TILE4D_GET_DIM4(coeffTile) - 1) + 1); \ + if 
(dilatedKW % 2 != 0) \ + { \ + if (dilatedKH % 2 != 0) \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= dilatedKW / 2) \ + && (XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= dilatedKW / 2) \ + && (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= dilatedKH / 2) \ + && (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= dilatedKH / 2), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + else \ + { \ + if (XAI_CNN_CONV_GET_FLAG_TOPEDGE(param)) \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= dilatedKW / 2) \ + && (XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= dilatedKW / 2) \ + && (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= dilatedKH / 2) \ + && (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= ((dilatedKH / 2) - 1)), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + else \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= dilatedKW / 2) \ + && (XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= dilatedKW / 2) \ + && (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= ((dilatedKH / 2) - 1)) \ + && (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= dilatedKH / 2), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + } \ + } \ + else \ + { \ + if (dilatedKH % 2 != 0) \ + { \ + if (XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param)) \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= dilatedKW / 2) \ + && (XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= (dilatedKW / 2) - 1) \ + && (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= dilatedKH / 2) \ + && (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= dilatedKH / 2), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + else \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= ((dilatedKW / 2) - 1)) \ + && (XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= dilatedKW / 2) \ + && (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= dilatedKH / 2) \ + && (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= dilatedKH / 2), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + } \ + else \ + { \ + if 
(XAI_CNN_CONV_GET_FLAG_TOPEDGE(param)) \ + { \ + if (XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param)) \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= (dilatedKW / 2) && \ + XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= ((dilatedKW / 2) - 1) && \ + XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= (dilatedKH / 2) && \ + XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= ((dilatedKH / 2) - 1)), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + else \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= ((dilatedKW / 2) - 1) && \ + XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= (dilatedKW / 2) && \ + XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= (dilatedKH / 2) && \ + XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= ((dilatedKH / 2) - 1)), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + } \ + else \ + { \ + if (XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param)) \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= (dilatedKW / 2) && \ + XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= ((dilatedKW / 2) - 1) && \ + XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= ((dilatedKH / 2) - 1) && \ + XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= (dilatedKH / 2)), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + else \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= ((dilatedKW / 2) - 1) && \ + XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= (dilatedKW / 2) && \ + XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= ((dilatedKH / 2) - 1) && \ + XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= (dilatedKH / 2)), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + } \ + } \ + } + +#define XAI_CHECK_EDGES_DEPTHWISE_F16_MOD_DWH(inTile, coeffTile, param) \ + int32_t kW = XAI_TILE3D_GET_DIM2(coeffTile); \ + int32_t kH = XAI_TILE3D_GET_DIM3(coeffTile); \ + if (kW % 2 != 0) \ + { \ + if (kH % 2 != 0) \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= kW / 2) \ + && (XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= kW / 2) \ + && (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= 
kH / 2) \ + && (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= kH / 2), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + else \ + { \ + if (XAI_CNN_CONV_GET_FLAG_TOPEDGE(param)) \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= kW / 2) \ + && (XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= kW / 2) \ + && (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= kH / 2) \ + && (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= ((kH / 2) - 1)), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + else \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= kW / 2) \ + && (XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= kW / 2) \ + && (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= ((kH / 2) - 1)) \ + && (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= kH / 2), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + } \ + } \ + else \ + { \ + if (kH % 2 != 0) \ + { \ + if (XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param)) \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= kW / 2) \ + && (XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= (kW / 2) - 1) \ + && (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= kH / 2) \ + && (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= kH / 2), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + else \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= ((kW / 2) - 1)) \ + && (XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= kW / 2) \ + && (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= kH / 2) \ + && (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= kH / 2), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + } \ + else \ + { \ + if (XAI_CNN_CONV_GET_FLAG_TOPEDGE(param)) \ + { \ + if (XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param)) \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= (kW / 2) && \ + XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= ((kW / 2) - 1) && \ + XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= (kH / 2) && \ + XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= ((kH / 2) - 1)), \ + XAI_ERR_EDGE, "The input Tile 
doesn't have the required Edge Data"); \ + } \ + else \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= ((kW / 2) - 1) && \ + XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= (kW / 2) && \ + XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= (kH / 2) && \ + XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= ((kH / 2) - 1)), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + } \ + else \ + { \ + if (XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param)) \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= (kW / 2) && \ + XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= ((kW / 2) - 1) && \ + XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= ((kH / 2) - 1) && \ + XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= (kH / 2)), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + else \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= ((kW / 2) - 1) && \ + XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= (kW / 2) && \ + XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= ((kH / 2) - 1) && \ + XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= (kH / 2)), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + } \ + } \ + } +#define XAI_CHECK_CONSISTENCY_DEPTHWISE_F16_MOD_DWH(inT, coeffT, biasArr, outT, param) \ + int32_t KW_MOD = XAI_TILE3D_GET_DIM2(coeffT); \ + int32_t KH_MOD = XAI_TILE3D_GET_DIM3(coeffT); \ + XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM1(inT) == XAI_TILE3D_GET_DIM1(coeffT), XAI_ERR_DATASIZE, \ + "Number of Input Channels not equal to the number of channels in the Kernel."); \ + XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM1(outT) == XAI_TILE3D_GET_DIM1(coeffT), XAI_ERR_DATASIZE, \ + "Number of Output Channels not equal to the number of channels in the Kernel."); \ + if (KW_MOD % 2 != 0) \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2(outT) <= (((XAI_TILE3D_GET_DIM2(inT) + (KW_MOD >> 1) \ + + (KW_MOD >> 1) - KW_MOD) / (XAI_CNN_CONV_GET_STRIDEX(param))) + 1)), \ + XAI_ERR_DATASIZE, "Input and Output tile widths are inconsistent."); \ + } \ + else \ + { \ + 
XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2(outT) <= (((XAI_TILE3D_GET_DIM2(inT) + (KW_MOD >> 1) \ + + ((KW_MOD >> 1) - 1) - KW_MOD) / (XAI_CNN_CONV_GET_STRIDEX(param))) + 1)), \ + XAI_ERR_DATASIZE, "Input and Output tile widths are inconsistent."); \ + } \ + if (KH_MOD % 2 != 0) \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM3(outT) <= (((XAI_TILE3D_GET_DIM3(inT) + (KH_MOD >> 1) \ + + (KH_MOD >> 1) - KH_MOD) / (XAI_CNN_CONV_GET_STRIDEY(param))) + 1)), \ + XAI_ERR_DATASIZE, "Input and Output tile heights are inconsistent."); \ + } \ + else \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM3(outT) <= (((XAI_TILE3D_GET_DIM3(inT) + (KH_MOD >> 1) \ + + ((KH_MOD >> 1) - 1) - KH_MOD) / (XAI_CNN_CONV_GET_STRIDEY(param))) + 1)), \ + XAI_ERR_DATASIZE, "Input and Output tile heights are inconsistent."); \ + } \ + XAI_CHECK_ERROR(XAI_ARRAY_GET_WIDTH(biasArr) >= XAI_TILE3D_GET_DIM1(coeffT), XAI_ERR_DATASIZE, \ + "Width of Bias Array is less than number of channels in the Kernel."); \ + XAI_CHECK_ERROR(XAI_ARRAY_GET_HEIGHT(biasArr) > 0, XAI_ERR_DATASIZE, \ + "Height of Bias Array should be greater than zero."); +#define XAI_CHECK_CONV_RELU_LIMITS_F16(param, outTile) { \ + if (XAI_CNN_CONV_GET_FLAG_RELU(param)) \ + { \ + XAI_CHECK_ERROR((XAI_CNN_CONV_GET_RELU_MIN_FLT(param) <= XAI_CNN_CONV_GET_RELU_MAX_FLT(param)), XAI_ERR_BADARG, \ + "\nMinimum Value of RELU = %f,\nMaximum Value of RELU = %f , Min Limit should not be greater than Max Limit", \ + CONVERT_FP16_TO_FP32(XAI_CNN_CONV_GET_RELU_MIN_FLT(param)), CONVERT_FP16_TO_FP32(XAI_CNN_CONV_GET_RELU_MAX_FLT(param))); \ + XAI_CHECK_ERROR((XAI_CNN_CONV_GET_RELU_MIN_FLT(param) >= XAI_F16_MIN && \ + XAI_CNN_CONV_GET_RELU_MAX_FLT(param) <= XAI_F16_MAX), XAI_ERR_BADARG, \ + "\nMinimum Value of RELU = %f, value should be greater than or equal to XAI_F16_MIN \nMaximum Value of RELU = %f, value should be less than or equal to XAI_F16_MAX", \ + CONVERT_FP16_TO_FP32(XAI_CNN_CONV_GET_RELU_MIN_FLT(param)), 
CONVERT_FP16_TO_FP32(XAI_CNN_CONV_GET_RELU_MAX_FLT(param))); \ + } \ +} +#endif //if ((XCHAL_HAVE_VISION_HP_VFPU == 1) || (XCHAL_HAVE_CONNX_B_HP_VFPU == 1) || (defined(__clang__) && defined(XAI_REF_ONLY_COMPILATION))) + +#if (XCHAL_HAVE_VISION_SP_VFPU == 1 || XCHAL_HAVE_BBENEP_SP_VFPU == 1 || defined(XAI_REF_ONLY_COMPILATION)) +#define XAI_CHECK_EDGES_F32_MOD_DWH(inTile, coeffTile, param) \ + uint16_t dilatedKW = (uint16_t) (XAI_CNN_CONV_GET_DILATIONX(param) * (XAI_TILE4D_GET_DIM3(coeffTile) - 1) + 1); \ + uint16_t dilatedKH = (uint16_t) (XAI_CNN_CONV_GET_DILATIONY(param) * (XAI_TILE4D_GET_DIM4(coeffTile) - 1) + 1); \ + if (dilatedKW % 2 != 0) \ + { \ + if (dilatedKH % 2 != 0) \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= dilatedKW / 2) \ + && (XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= dilatedKW / 2) \ + && (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= dilatedKH / 2) \ + && (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= dilatedKH / 2), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + else \ + { \ + if (XAI_CNN_CONV_GET_FLAG_TOPEDGE(param)) \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= dilatedKW / 2) \ + && (XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= dilatedKW / 2) \ + && (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= dilatedKH / 2) \ + && (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= ((dilatedKH / 2) - 1)), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + else \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= dilatedKW / 2) \ + && (XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= dilatedKW / 2) \ + && (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= ((dilatedKH / 2) - 1)) \ + && (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= dilatedKH / 2), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + } \ + } \ + else \ + { \ + if (dilatedKH % 2 != 0) \ + { \ + if (XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param)) \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= dilatedKW / 2) \ + && 
(XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= (dilatedKW / 2) - 1) \ + && (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= dilatedKH / 2) \ + && (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= dilatedKH / 2), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + else \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= ((dilatedKW / 2) - 1)) \ + && (XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= dilatedKW / 2) \ + && (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= dilatedKH / 2) \ + && (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= dilatedKH / 2), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + } \ + else \ + { \ + if (XAI_CNN_CONV_GET_FLAG_TOPEDGE(param)) \ + { \ + if (XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param)) \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= (dilatedKW / 2) && \ + XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= ((dilatedKW / 2) - 1) && \ + XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= (dilatedKH / 2) && \ + XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= ((dilatedKH / 2) - 1)), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + else \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= ((dilatedKW / 2) - 1) && \ + XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= (dilatedKW / 2) && \ + XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= (dilatedKH / 2) && \ + XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= ((dilatedKH / 2) - 1)), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + } \ + else \ + { \ + if (XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param)) \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= (dilatedKW / 2) && \ + XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= ((dilatedKW / 2) - 1) && \ + XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= ((dilatedKH / 2) - 1) && \ + XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= (dilatedKH / 2)), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + else \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= ((dilatedKW / 2) - 1) && \ + 
XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= (dilatedKW / 2) && \ + XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= ((dilatedKH / 2) - 1) && \ + XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= (dilatedKH / 2)), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + } \ + } \ + } +#define XAI_CHECK_EDGES_DEPTHWISE_F32_MOD_DWH(inTile, coeffTile, param) \ + int32_t kW = XAI_TILE3D_GET_DIM2(coeffTile); \ + int32_t kH = XAI_TILE3D_GET_DIM3(coeffTile); \ + if (kW % 2 != 0) \ + { \ + if (kH % 2 != 0) \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= kW / 2) \ + && (XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= kW / 2) \ + && (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= kH / 2) \ + && (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= kH / 2), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + else \ + { \ + if (XAI_CNN_CONV_GET_FLAG_TOPEDGE(param)) \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= kW / 2) \ + && (XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= kW / 2) \ + && (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= kH / 2) \ + && (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= ((kH / 2) - 1)), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + else \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= kW / 2) \ + && (XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= kW / 2) \ + && (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= ((kH / 2) - 1)) \ + && (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= kH / 2), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + } \ + } \ + else \ + { \ + if (kH % 2 != 0) \ + { \ + if (XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param)) \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= kW / 2) \ + && (XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= (kW / 2) - 1) \ + && (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= kH / 2) \ + && (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= kH / 2), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + else \ + { \ + 
XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= ((kW / 2) - 1)) \ + && (XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= kW / 2) \ + && (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= kH / 2) \ + && (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= kH / 2), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + } \ + else \ + { \ + if (XAI_CNN_CONV_GET_FLAG_TOPEDGE(param)) \ + { \ + if (XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param)) \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= (kW / 2) && \ + XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= ((kW / 2) - 1) && \ + XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= (kH / 2) && \ + XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= ((kH / 2) - 1)), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + else \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= ((kW / 2) - 1) && \ + XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= (kW / 2) && \ + XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= (kH / 2) && \ + XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= ((kH / 2) - 1)), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + } \ + else \ + { \ + if (XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param)) \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= (kW / 2) && \ + XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= ((kW / 2) - 1) && \ + XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= ((kH / 2) - 1) && \ + XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= (kH / 2)), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + else \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= ((kW / 2) - 1) && \ + XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= (kW / 2) && \ + XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= ((kH / 2) - 1) && \ + XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= (kH / 2)), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + } \ + } \ + } +#define XAI_CHECK_CONSISTENCY_DEPTHWISE_F32_MOD_DWH(inT, coeffT, biasArr, outT, param) \ + int32_t KW_MOD = XAI_TILE3D_GET_DIM2(coeffT); \ + int32_t KH_MOD = 
XAI_TILE3D_GET_DIM3(coeffT); \ + XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM1(inT) == XAI_TILE3D_GET_DIM1(coeffT), XAI_ERR_DATASIZE, \ + "Number of Input Channels not equal to the number of channels in the Kernel."); \ + XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM1(outT) == XAI_TILE3D_GET_DIM1(coeffT), XAI_ERR_DATASIZE, \ + "Number of Output Channels not equal to the number of channels in the Kernel."); \ + if (KW_MOD % 2 != 0) \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2(outT) <= (((XAI_TILE3D_GET_DIM2(inT) + (KW_MOD >> 1) \ + + (KW_MOD >> 1) - KW_MOD) / (XAI_CNN_CONV_GET_STRIDEX(param))) + 1)), \ + XAI_ERR_DATASIZE, "Input and Output tile widths are inconsistent."); \ + } \ + else \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2(outT) <= (((XAI_TILE3D_GET_DIM2(inT) + (KW_MOD >> 1) \ + + ((KW_MOD >> 1) - 1) - KW_MOD) / (XAI_CNN_CONV_GET_STRIDEX(param))) + 1)), \ + XAI_ERR_DATASIZE, "Input and Output tile widths are inconsistent."); \ + } \ + if (KH_MOD % 2 != 0) \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM3(outT) <= (((XAI_TILE3D_GET_DIM3(inT) + (KH_MOD >> 1) \ + + (KH_MOD >> 1) - KH_MOD) / (XAI_CNN_CONV_GET_STRIDEY(param))) + 1)), \ + XAI_ERR_DATASIZE, "Input and Output tile heights are inconsistent."); \ + } \ + else \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM3(outT) <= (((XAI_TILE3D_GET_DIM3(inT) + (KH_MOD >> 1) \ + + ((KH_MOD >> 1) - 1) - KH_MOD) / (XAI_CNN_CONV_GET_STRIDEY(param))) + 1)), \ + XAI_ERR_DATASIZE, "Input and Output tile heights are inconsistent."); \ + } \ + XAI_CHECK_ERROR(XAI_ARRAY_GET_WIDTH(biasArr) >= XAI_TILE3D_GET_DIM1(coeffT), XAI_ERR_DATASIZE, \ + "Width of Bias Array is less than number of channels in the Kernel."); \ + XAI_CHECK_ERROR(XAI_ARRAY_GET_HEIGHT(biasArr) > 0, XAI_ERR_DATASIZE, \ + "Height of Bias Array should be greater than zero."); +#define XAI_CHECK_CONV_RELU_LIMITS_F32(param, outTile) { \ + if (XAI_CNN_CONV_GET_FLAG_RELU(param)) \ + { \ + XAI_CHECK_ERROR((XAI_CNN_CONV_GET_RELU_MIN_FLT32(param) <= XAI_CNN_CONV_GET_RELU_MAX_FLT32(param)), 
XAI_ERR_BADARG, \ + "\nMinimum Value of RELU = %f,\nMaximum Value of RELU = %f , Min Limit should not be greater than Max Limit", \ + XAI_CNN_CONV_GET_RELU_MIN_FLT32(param), XAI_CNN_CONV_GET_RELU_MAX_FLT32(param)); \ + XAI_CHECK_ERROR((XAI_CNN_CONV_GET_RELU_MIN_FLT32(param) >= XAI_F32_MIN_FLT && \ + XAI_CNN_CONV_GET_RELU_MAX_FLT32(param) <= XAI_F32_MAX_FLT), XAI_ERR_BADARG, \ + "\nMinimum Value of RELU = %f, value should be greater than or equal to XAI_F32_MIN_FLT \nMaximum Value of RELU = %f, value should be less than or equal to XAI_F32_MAX_FLT", \ + XAI_CNN_CONV_GET_RELU_MIN_FLT32(param), XAI_CNN_CONV_GET_RELU_MAX_FLT32(param)); \ + } \ +} +#endif //if (XCHAL_HAVE_VISION_SP_VFPU == 1 || XCHAL_HAVE_BBENEP_SP_VFPU == 1 || defined(XAI_REF_ONLY_COMPILATION)) + +#define XAI_CHECK_EDGES_SO(inTile, coeffTile, param) /* Edge-size validation against the dilated kernel extent. dilatedKW and dilatedKH are computed as dilation * (kernel dim - 1) + 1 from DIM2 and DIM3 of the 4D coeffTile and narrowed to uint16_t. Width is checked against the DIM2 edges of inTile and height against its DIM3 edges, independently of each other. For an even extent the LEFTEDGE (width) or TOPEDGE (height) flag selects which side may be one element smaller: flag set means EDGE1 needs extent / 2 and EDGE2 needs extent / 2 - 1; flag clear mirrors that. NOTE: the expansion is not wrapped in braces, so dilatedKW and dilatedKH are declared in the invoking scope. */ \ + uint16_t dilatedKW = (uint16_t) (XAI_CNN_CONV_GET_DILATIONX(param) * (XAI_TILE4D_GET_DIM2(coeffTile) - 1) + 1); \ + uint16_t dilatedKH = (uint16_t) (XAI_CNN_CONV_GET_DILATIONY(param) * (XAI_TILE4D_GET_DIM3(coeffTile) - 1) + 1); \ + if (dilatedKW % 2 != 0) \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= (dilatedKW / 2)) && \ + (XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= (dilatedKW / 2)), \ + XAI_ERR_EDGE, "Invalid edge for odd kernel size"); \ + } \ + else \ + { \ + if (XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param)) \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= (dilatedKW / 2)) && \ + (XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= (dilatedKW / 2 - 1)), \ + XAI_ERR_EDGE, "Invalid edge for even kernel size with left edge flag set"); \ + } \ + else \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= (dilatedKW / 2 - 1)) && \ + (XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= (dilatedKW / 2)), \ + XAI_ERR_EDGE, "Invalid edge for even kernel size with left edge flag reset"); \ + } \ + } \ + if (dilatedKH % 2 != 0) \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= dilatedKH / 2) && \ + (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= dilatedKH / 
2), \ + XAI_ERR_EDGE, "Invalid edge for odd kernel size"); \ + } \ + else \ + { \ + if (XAI_CNN_CONV_GET_FLAG_TOPEDGE(param)) \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= (dilatedKH / 2)) && \ + (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= (dilatedKH / 2 - 1)), \ + XAI_ERR_EDGE, "Invalid edge for even kernel size with top edge flag set"); \ + } \ + else \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= (dilatedKH / 2 - 1)) && \ + (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= (dilatedKH / 2)), \ + XAI_ERR_EDGE, "Invalid edge for even kernel size with top edge flag reset"); \ + } \ + } \ + +#define XAI_CHECK_EDGES_MOD_ID16WH(inTile, coeffT, param) /* Edge-size validation in which the DIM1 edges of inTile are shifted right by 4 before being compared with the dilated kernel width, while DIM3 edges are compared with the dilated kernel height. NOTE(review): kWidthMOD is derived from DIM3 of coeffT and kHeightMOD from DIM2, the opposite of XAI_CHECK_EDGES_DEPTHWISE_MOD_ID16WH - confirm this matches the coefficient tile layout. NOTE: kWidthMOD, kHeightMOD, dilationX and dilationY are declared in the invoking scope; the expansion does not open a brace before them. */ \ + int32_t kWidthMOD, kHeightMOD; \ + uint16_t dilationX = XAI_CNN_CONV_GET_DILATIONX(param); \ + uint16_t dilationY = XAI_CNN_CONV_GET_DILATIONY(param); \ + kWidthMOD = dilationX * (XAI_TILE4D_GET_DIM3(coeffT) - 1) + 1; \ + kHeightMOD = dilationY * (XAI_TILE4D_GET_DIM2(coeffT) - 1) + 1; \ + if (kWidthMOD % 2 != 0) \ + { \ + if (kHeightMOD % 2 != 0) \ + { \ + XAI_CHECK_ERROR(((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >> 4) >= kWidthMOD / 2) \ + && ((XAI_TILE3D_GET_DIM1_EDGE2(inTile) >> 4) >= kWidthMOD / 2) \ + && (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= kHeightMOD / 2) \ + && (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= kHeightMOD / 2), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + else \ + { \ + if (XAI_CNN_CONV_GET_FLAG_TOPEDGE(param)) \ + { \ + XAI_CHECK_ERROR(((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >> 4) >= kWidthMOD / 2) \ + && ((XAI_TILE3D_GET_DIM1_EDGE2(inTile) >> 4) >= kWidthMOD / 2) \ + && (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= kHeightMOD / 2) \ + && (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= ((kHeightMOD / 2) - 1)), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + else \ + { \ + XAI_CHECK_ERROR(((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >> 4) >= kWidthMOD / 2) \ + && ((XAI_TILE3D_GET_DIM1_EDGE2(inTile) >> 4) >= kWidthMOD / 2) \ + && 
(XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= ((kHeightMOD / 2) - 1)) \ + && (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= kHeightMOD / 2), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + } \ + } \ + else \ + { \ + if (kHeightMOD % 2 != 0) \ + { \ + if (XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param)) \ + { \ + XAI_CHECK_ERROR(((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >> 4) >= kWidthMOD / 2) \ + && ((XAI_TILE3D_GET_DIM1_EDGE2(inTile) >> 4) >= (kWidthMOD / 2) - 1) \ + && (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= kHeightMOD / 2) \ + && (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= kHeightMOD / 2), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + else \ + { \ + XAI_CHECK_ERROR(((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >> 4) >= ((kWidthMOD / 2) - 1)) \ + && ((XAI_TILE3D_GET_DIM1_EDGE2(inTile) >> 4) >= kWidthMOD / 2) \ + && (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= kHeightMOD / 2) \ + && (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= kHeightMOD / 2), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + } \ + else \ + { \ + if (XAI_CNN_CONV_GET_FLAG_TOPEDGE(param)) \ + { \ + if (XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param)) \ + { \ + XAI_CHECK_ERROR((((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >> 4) >= (kWidthMOD / 2)) && \ + ((XAI_TILE3D_GET_DIM1_EDGE2(inTile) >> 4) >= ((kWidthMOD / 2) - 1)) && \ + (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= (kHeightMOD / 2)) && \ + (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= ((kHeightMOD / 2) - 1))), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + else \ + { \ + XAI_CHECK_ERROR((((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >> 4) >= ((kWidthMOD / 2) - 1)) && \ + ((XAI_TILE3D_GET_DIM1_EDGE2(inTile) >> 4) >= (kWidthMOD / 2)) && \ + (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= (kHeightMOD / 2)) && \ + (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= ((kHeightMOD / 2) - 1))), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + } \ + else \ + { \ + if (XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param)) \ + 
{ \ + XAI_CHECK_ERROR((((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >> 4) >= (kWidthMOD / 2)) && \ + ((XAI_TILE3D_GET_DIM1_EDGE2(inTile) >> 4) >= ((kWidthMOD / 2) - 1)) && \ + (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= ((kHeightMOD / 2) - 1)) && \ + (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= (kHeightMOD / 2))), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + else \ + { \ + XAI_CHECK_ERROR((((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >> 4) >= ((kWidthMOD / 2) - 1)) && \ + ((XAI_TILE3D_GET_DIM1_EDGE2(inTile) >> 4) >= (kWidthMOD / 2)) && \ + (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= ((kHeightMOD / 2) - 1)) && \ + (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= (kHeightMOD / 2))), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + } \ + } \ + } + +#define XAI_CHECK_EDGES_MOD_ID32WH(inTile, coeffT, param) \ + int32_t kWidthMOD, kHeightMOD; \ + uint16_t dilationX = XAI_CNN_CONV_GET_DILATIONX(param); \ + uint16_t dilationY = XAI_CNN_CONV_GET_DILATIONY(param); \ + kWidthMOD = dilationX * (XAI_TILE4D_GET_DIM3(coeffT) - 1) + 1; \ + kHeightMOD = dilationY * (XAI_TILE4D_GET_DIM2(coeffT) - 1) + 1; \ + if (kWidthMOD % 2 != 0) \ + { \ + if (kHeightMOD % 2 != 0) \ + { \ + XAI_CHECK_ERROR(((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >> 5) >= kWidthMOD / 2) \ + && ((XAI_TILE3D_GET_DIM1_EDGE2(inTile) >> 5) >= kWidthMOD / 2) \ + && (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= kHeightMOD / 2) \ + && (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= kHeightMOD / 2), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + else \ + { \ + if (XAI_CNN_CONV_GET_FLAG_TOPEDGE(param)) \ + { \ + XAI_CHECK_ERROR(((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >> 5) >= kWidthMOD / 2) \ + && ((XAI_TILE3D_GET_DIM1_EDGE2(inTile) >> 5) >= kWidthMOD / 2) \ + && (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= kHeightMOD / 2) \ + && (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= ((kHeightMOD / 2) - 1)), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + else \ + { \ + 
XAI_CHECK_ERROR(((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >> 5) >= kWidthMOD / 2) \ + && ((XAI_TILE3D_GET_DIM1_EDGE2(inTile) >> 5) >= kWidthMOD / 2) \ + && (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= ((kHeightMOD / 2) - 1)) \ + && (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= kHeightMOD / 2), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + } \ + } \ + else \ + { \ + if (kHeightMOD % 2 != 0) \ + { \ + if (XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param)) \ + { \ + XAI_CHECK_ERROR(((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >> 5) >= kWidthMOD / 2) \ + && ((XAI_TILE3D_GET_DIM1_EDGE2(inTile) >> 5) >= (kWidthMOD / 2) - 1) \ + && (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= kHeightMOD / 2) \ + && (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= kHeightMOD / 2), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + else \ + { \ + XAI_CHECK_ERROR(((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >> 5) >= ((kWidthMOD / 2) - 1)) \ + && ((XAI_TILE3D_GET_DIM1_EDGE2(inTile) >> 5) >= kWidthMOD / 2) \ + && (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= kHeightMOD / 2) \ + && (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= kHeightMOD / 2), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + } \ + else \ + { \ + if (XAI_CNN_CONV_GET_FLAG_TOPEDGE(param)) \ + { \ + if (XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param)) \ + { \ + XAI_CHECK_ERROR((((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >> 5) >= (kWidthMOD / 2)) && \ + ((XAI_TILE3D_GET_DIM1_EDGE2(inTile) >> 5) >= ((kWidthMOD / 2) - 1)) && \ + (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= (kHeightMOD / 2)) && \ + (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= ((kHeightMOD / 2) - 1))), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + else \ + { \ + XAI_CHECK_ERROR((((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >> 5) >= ((kWidthMOD / 2) - 1)) && \ + ((XAI_TILE3D_GET_DIM1_EDGE2(inTile) >> 5) >= (kWidthMOD / 2)) && \ + (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= (kHeightMOD / 2)) && \ + (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= ((kHeightMOD / 2) - 1))), 
\ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + } \ + else \ + { \ + if (XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param)) \ + { \ + XAI_CHECK_ERROR((((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >> 5) >= (kWidthMOD / 2)) && \ + ((XAI_TILE3D_GET_DIM1_EDGE2(inTile) >> 5) >= ((kWidthMOD / 2) - 1)) && \ + (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= ((kHeightMOD / 2) - 1)) && \ + (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= (kHeightMOD / 2))), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + else \ + { \ + XAI_CHECK_ERROR((((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >> 5) >= ((kWidthMOD / 2) - 1)) && \ + ((XAI_TILE3D_GET_DIM1_EDGE2(inTile) >> 5) >= (kWidthMOD / 2)) && \ + (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= ((kHeightMOD / 2) - 1)) && \ + (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= (kHeightMOD / 2))), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + } \ + } \ + } +#define XAI_CHECK_EDGES_DEPTHWISE_MOD_ID16WH(inTile, coeffT, param) \ + int32_t kWidthMOD, kHeightMOD; \ + uint16_t dilationX = XAI_CNN_CONV_GET_DILATIONX(param); \ + uint16_t dilationY = XAI_CNN_CONV_GET_DILATIONY(param); \ + kWidthMOD = dilationX * (XAI_TILE3D_GET_DIM2(coeffT) - 1) + 1; \ + kHeightMOD = dilationY * (XAI_TILE3D_GET_DIM3(coeffT) - 1) + 1; \ + if (kWidthMOD % 2 != 0) \ + { \ + if (kHeightMOD % 2 != 0) \ + { \ + XAI_CHECK_ERROR(((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >> 4) >= kWidthMOD / 2) \ + && ((XAI_TILE3D_GET_DIM1_EDGE2(inTile) >> 4) >= kWidthMOD / 2) \ + && (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= kHeightMOD / 2) \ + && (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= kHeightMOD / 2), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + else \ + { \ + if (XAI_CNN_CONV_GET_FLAG_TOPEDGE(param)) \ + { \ + XAI_CHECK_ERROR(((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >> 4) >= kWidthMOD / 2) \ + && ((XAI_TILE3D_GET_DIM1_EDGE2(inTile) >> 4) >= kWidthMOD / 2) \ + && (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= kHeightMOD / 2) \ + && 
(XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= ((kHeightMOD / 2) - 1)), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + else \ + { \ + XAI_CHECK_ERROR(((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >> 4) >= kWidthMOD / 2) \ + && ((XAI_TILE3D_GET_DIM1_EDGE2(inTile) >> 4) >= kWidthMOD / 2) \ + && (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= ((kHeightMOD / 2) - 1)) \ + && (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= kHeightMOD / 2), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + } \ + } \ + else \ + { \ + if (kHeightMOD % 2 != 0) \ + { \ + if (XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param)) \ + { \ + XAI_CHECK_ERROR(((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >> 4) >= kWidthMOD / 2) \ + && ((XAI_TILE3D_GET_DIM1_EDGE2(inTile) >> 4) >= (kWidthMOD / 2) - 1) \ + && (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= kHeightMOD / 2) \ + && (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= kHeightMOD / 2), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + else \ + { \ + XAI_CHECK_ERROR(((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >> 4) >= ((kWidthMOD / 2) - 1)) \ + && ((XAI_TILE3D_GET_DIM1_EDGE2(inTile) >> 4) >= kWidthMOD / 2) \ + && (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= kHeightMOD / 2) \ + && (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= kHeightMOD / 2), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + } \ + else \ + { \ + if (XAI_CNN_CONV_GET_FLAG_TOPEDGE(param)) \ + { \ + if (XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param)) \ + { \ + XAI_CHECK_ERROR((((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >> 4) >= (kWidthMOD / 2)) && \ + ((XAI_TILE3D_GET_DIM1_EDGE2(inTile) >> 4) >= ((kWidthMOD / 2) - 1)) && \ + (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= (kHeightMOD / 2)) && \ + (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= ((kHeightMOD / 2) - 1))), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + else \ + { \ + XAI_CHECK_ERROR((((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >> 4) >= ((kWidthMOD / 2) - 1)) && \ + ((XAI_TILE3D_GET_DIM1_EDGE2(inTile) 
>> 4) >= (kWidthMOD / 2)) && \ + (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= (kHeightMOD / 2)) && \ + (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= ((kHeightMOD / 2) - 1))), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + } \ + else \ + { \ + if (XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param)) \ + { \ + XAI_CHECK_ERROR((((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >> 4) >= (kWidthMOD / 2)) && \ + ((XAI_TILE3D_GET_DIM1_EDGE2(inTile) >> 4) >= ((kWidthMOD / 2) - 1)) && \ + (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= ((kHeightMOD / 2) - 1)) && \ + (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= (kHeightMOD / 2))), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + else \ + { \ + XAI_CHECK_ERROR((((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >> 4) >= ((kWidthMOD / 2) - 1)) && \ + ((XAI_TILE3D_GET_DIM1_EDGE2(inTile) >> 4) >= (kWidthMOD / 2)) && \ + (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= ((kHeightMOD / 2) - 1)) && \ + (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= (kHeightMOD / 2))), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + } \ + } \ + } + +#define XAI_CHECK_CONSISTENCY_DEPTHWISE_MOD_ID16WH(inTile, coeffT, outTile, param) \ + { \ + uint16_t dilationX = XAI_CNN_CONV_GET_DILATIONX(param); \ + uint16_t dilationY = XAI_CNN_CONV_GET_DILATIONY(param); \ + int32_t dilatedkWidth = dilationX * (XAI_TILE3D_GET_DIM2(coeffT) - 1) + 1; \ + int32_t dilatedkHeight = dilationY * (XAI_TILE3D_GET_DIM3(coeffT) - 1) + 1; \ + XAI_CHECK_ERROR(((XAI_TILE3D_GET_DIM2(inTile) << 4) == (XAI_TILE3D_GET_DIM2(outTile) << 4)), \ + XAI_ERR_DATASIZE, "Number of input and output channel should be equal."); \ + if (dilatedkWidth % 2 != 0) \ + { \ + XAI_CHECK_ERROR(((XAI_TILE3D_GET_DIM1(outTile) >> 4) <= ((((XAI_TILE3D_GET_DIM1(inTile) >> 4) + \ + (dilatedkWidth >> 1) + (dilatedkWidth >> 1) - dilatedkWidth) / (XAI_CNN_CONV_GET_STRIDEX(param))) + 1)), \ + XAI_ERR_DATASIZE, "Input and Output tile widths are inconsistent."); \ + } \ + else \ + { \ + 
XAI_CHECK_ERROR(((XAI_TILE3D_GET_DIM1(outTile) >> 4) <= ((((XAI_TILE3D_GET_DIM1(inTile) >> 4) + \ + (dilatedkWidth >> 1) + ((dilatedkWidth >> 1) - 1) - \ + dilatedkWidth) / (XAI_CNN_CONV_GET_STRIDEX(param))) + 1)), \ + XAI_ERR_DATASIZE, "Input and Output tile widths are inconsistent."); \ + } \ + if (dilatedkHeight % 2 != 0) \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM3(outTile) <= (((XAI_TILE3D_GET_DIM3(inTile) + (dilatedkHeight >> 1) + \ + (dilatedkHeight >> 1) - dilatedkHeight) / (XAI_CNN_CONV_GET_STRIDEY(param))) + 1)), \ + XAI_ERR_DATASIZE, "Input and Output tile heights are inconsistent."); \ + } \ + else \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM3(outTile) <= (((XAI_TILE3D_GET_DIM3(inTile) + (dilatedkHeight >> 1) + \ + ((dilatedkHeight >> 1) - 1) - dilatedkHeight) / (XAI_CNN_CONV_GET_STRIDEY(param))) + 1)), \ + XAI_ERR_DATASIZE, "Input and Output tile heights are inconsistent."); \ + } \ + } + +#define XAI_CHECK_EDGES_DEPTHWISE_MOD_ID32WH(inTile, coeffT, param) { \ + int32_t kWidthMOD, kHeightMOD; \ + uint16_t dilationX = XAI_CNN_CONV_GET_DILATIONX(param); \ + uint16_t dilationY = XAI_CNN_CONV_GET_DILATIONY(param); \ + kWidthMOD = dilationX * (XAI_TILE3D_GET_DIM2(coeffT) - 1) + 1; \ + kHeightMOD = dilationY * (XAI_TILE3D_GET_DIM3(coeffT) - 1) + 1; \ + if (kWidthMOD % 2 != 0) \ + { \ + if (kHeightMOD % 2 != 0) \ + { \ + XAI_CHECK_ERROR(((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >> 5) >= kWidthMOD / 2) \ + && ((XAI_TILE3D_GET_DIM1_EDGE2(inTile) >> 5) >= kWidthMOD / 2) \ + && (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= kHeightMOD / 2) \ + && (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= kHeightMOD / 2), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + else \ + { \ + if (XAI_CNN_CONV_GET_FLAG_TOPEDGE(param)) \ + { \ + XAI_CHECK_ERROR(((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >> 5) >= kWidthMOD / 2) \ + && ((XAI_TILE3D_GET_DIM1_EDGE2(inTile) >> 5) >= kWidthMOD / 2) \ + && (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= kHeightMOD / 2) \ + && 
(XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= ((kHeightMOD / 2) - 1)), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + else \ + { \ + XAI_CHECK_ERROR(((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >> 5) >= kWidthMOD / 2) \ + && ((XAI_TILE3D_GET_DIM1_EDGE2(inTile) >> 5) >= kWidthMOD / 2) \ + && (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= ((kHeightMOD / 2) - 1)) \ + && (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= kHeightMOD / 2), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + } \ + } \ + else \ + { \ + if (kHeightMOD % 2 != 0) \ + { \ + if (XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param)) \ + { \ + XAI_CHECK_ERROR(((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >> 5) >= kWidthMOD / 2) \ + && ((XAI_TILE3D_GET_DIM1_EDGE2(inTile) >> 5) >= (kWidthMOD / 2) - 1) \ + && (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= kHeightMOD / 2) \ + && (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= kHeightMOD / 2), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + else \ + { \ + XAI_CHECK_ERROR(((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >> 5) >= ((kWidthMOD / 2) - 1)) \ + && ((XAI_TILE3D_GET_DIM1_EDGE2(inTile) >> 5) >= kWidthMOD / 2) \ + && (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= kHeightMOD / 2) \ + && (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= kHeightMOD / 2), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + } \ + else \ + { \ + if (XAI_CNN_CONV_GET_FLAG_TOPEDGE(param)) \ + { \ + if (XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param)) \ + { \ + XAI_CHECK_ERROR((((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >> 5) >= (kWidthMOD / 2)) && \ + ((XAI_TILE3D_GET_DIM1_EDGE2(inTile) >> 5) >= ((kWidthMOD / 2) - 1)) && \ + (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= (kHeightMOD / 2)) && \ + (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= ((kHeightMOD / 2) - 1))), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + else \ + { \ + XAI_CHECK_ERROR((((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >> 5) >= ((kWidthMOD / 2) - 1)) && \ + ((XAI_TILE3D_GET_DIM1_EDGE2(inTile) 
>> 5) >= (kWidthMOD / 2)) && \ + (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= (kHeightMOD / 2)) && \ + (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= ((kHeightMOD / 2) - 1))), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + } \ + else \ + { \ + if (XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param)) \ + { \ + XAI_CHECK_ERROR((((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >> 5) >= (kWidthMOD / 2)) && \ + ((XAI_TILE3D_GET_DIM1_EDGE2(inTile) >> 5) >= ((kWidthMOD / 2) - 1)) && \ + (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= ((kHeightMOD / 2) - 1)) && \ + (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= (kHeightMOD / 2))), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + else \ + { \ + XAI_CHECK_ERROR((((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >> 5) >= ((kWidthMOD / 2) - 1)) && \ + ((XAI_TILE3D_GET_DIM1_EDGE2(inTile) >> 5) >= (kWidthMOD / 2)) && \ + (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= ((kHeightMOD / 2) - 1)) && \ + (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= (kHeightMOD / 2))), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + } \ + } \ + } \ +} +#define XAI_CHECK_CONSISTENCY_DEPTHWISE_MOD_ID32WH(inTile, coeffT, outTile, param) \ + { \ + uint16_t dilationX = XAI_CNN_CONV_GET_DILATIONX(param); \ + uint16_t dilationY = XAI_CNN_CONV_GET_DILATIONY(param); \ + int32_t dilatedkWidth = dilationX * (XAI_TILE3D_GET_DIM2(coeffT) - 1) + 1; \ + int32_t dilatedkHeight = dilationY * (XAI_TILE3D_GET_DIM3(coeffT) - 1) + 1; \ + XAI_CHECK_ERROR(((XAI_TILE3D_GET_DIM2(inTile) << 5) == (XAI_TILE3D_GET_DIM2(outTile) << 5)), \ + XAI_ERR_DATASIZE, "Number of input and output channel should be equal."); \ + if (dilatedkWidth % 2 != 0) \ + { \ + XAI_CHECK_ERROR(((XAI_TILE3D_GET_DIM1(outTile) >> 5) <= ((((XAI_TILE3D_GET_DIM1(inTile) >> 5) + \ + (dilatedkWidth >> 1) + (dilatedkWidth >> 1) - dilatedkWidth) / (XAI_CNN_CONV_GET_STRIDEX(param))) + 1)), \ + XAI_ERR_DATASIZE, "Input and Output tile widths are inconsistent."); \ + } \ + else \ + { \ + 
XAI_CHECK_ERROR(((XAI_TILE3D_GET_DIM1(outTile) >> 5) <= ((((XAI_TILE3D_GET_DIM1(inTile) >> 5) + \ + (dilatedkWidth >> 1) + ((dilatedkWidth >> 1) - 1) - \ + dilatedkWidth) / (XAI_CNN_CONV_GET_STRIDEX(param))) + 1)), \ + XAI_ERR_DATASIZE, "Input and Output tile widths are inconsistent."); \ + } \ + if (dilatedkHeight % 2 != 0) \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM3(outTile) <= (((XAI_TILE3D_GET_DIM3(inTile) + (dilatedkHeight >> 1) + \ + (dilatedkHeight >> 1) - dilatedkHeight) / (XAI_CNN_CONV_GET_STRIDEY(param))) + 1)), \ + XAI_ERR_DATASIZE, "Input and Output tile heights are inconsistent."); \ + } \ + else \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM3(outTile) <= (((XAI_TILE3D_GET_DIM3(inTile) + (dilatedkHeight >> 1) + \ + ((dilatedkHeight >> 1) - 1) - dilatedkHeight) / (XAI_CNN_CONV_GET_STRIDEY(param))) + 1)), \ + XAI_ERR_DATASIZE, "Input and Output tile heights are inconsistent."); \ + } \ + } + +#define XAI_CHECK_CONV_RELU_LIMITS_IX(param, outTile) { \ + if (XAI_CNN_CONV_GET_FLAG_RELU(param)) \ + { \ + XAI_CHECK_ERROR((XAI_CNN_CONV_GET_RELU_MIN(param) <= XAI_CNN_CONV_GET_RELU_MAX(param)), XAI_ERR_BADARG, \ + "\nMinimum Value of RELU = %d,\nMaximum Value of RELU = %d , Min Limit should not be greater than Max Limit", \ + XAI_CNN_CONV_GET_RELU_MIN(param), XAI_CNN_CONV_GET_RELU_MAX(param)); \ + if (XAI_TYPE_ELEMENT_TYPE(outTile->type) == XAI_U8) \ + { \ + XAI_CHECK_ERROR((XAI_CNN_CONV_GET_RELU_MIN(param) >= 0 && \ + XAI_CNN_CONV_GET_RELU_MAX(param) <= UCHAR_MAX), XAI_ERR_BADARG, \ + "\nMinimum Value of RELU = %d, value should be greater than or equal to 0 \nMaximum Value of RELU = %d, value should be less than or equal to 255", \ + XAI_CNN_CONV_GET_RELU_MIN(param), XAI_CNN_CONV_GET_RELU_MAX(param)); \ + } \ + else if (XAI_TYPE_ELEMENT_TYPE(outTile->type) == XAI_S8) \ + { \ + XAI_CHECK_ERROR((XAI_CNN_CONV_GET_RELU_MIN(param) >= SCHAR_MIN && \ + XAI_CNN_CONV_GET_RELU_MAX(param) <= SCHAR_MAX), XAI_ERR_BADARG, \ + "\nMinimum Value of RELU = %d, value should be 
greater than or equal to -128 \nMaximum Value of RELU = %d, value should be less than or equal to 127", \ + XAI_CNN_CONV_GET_RELU_MIN(param), XAI_CNN_CONV_GET_RELU_MAX(param)); \ + } \ + else if (XAI_TYPE_ELEMENT_TYPE(outTile->type) == XAI_S16) \ + { \ + XAI_CHECK_ERROR((XAI_CNN_CONV_GET_RELU_MIN(param) >= SHRT_MIN && \ + XAI_CNN_CONV_GET_RELU_MAX(param) <= SHRT_MAX), XAI_ERR_BADARG, \ + "\nMinimum Value of RELU = %d, value should be greater than or equal to -32768 \nMaximum Value of RELU = %d, value should be less than or equal to 32767", \ + XAI_CNN_CONV_GET_RELU_MIN(param), XAI_CNN_CONV_GET_RELU_MAX(param)); \ + } \ + else if (XAI_TYPE_ELEMENT_TYPE(outTile->type) == XAI_U16) \ + { \ + XAI_CHECK_ERROR((XAI_CNN_CONV_GET_RELU_MIN(param) >= 0 && \ + XAI_CNN_CONV_GET_RELU_MAX(param) <= USHRT_MAX), XAI_ERR_BADARG, \ + "\nMinimum Value of RELU = %d, value should be greater than or equal to 0 \nMaximum Value of RELU = %d, value should be less than or equal to 65535", \ + XAI_CNN_CONV_GET_RELU_MIN(param), XAI_CNN_CONV_GET_RELU_MAX(param)); \ + } \ + else \ + { \ + XAI_CHECK_ERROR(0, XAI_ERR_NO_VARIANT, "Output tile datatype is not supported by XAI_CHECK_CONV_RELU_LIMITS_IX"); \ + } \ + } \ +} + +#define XAI_CHECK_DEPTHWISE_DILATED_CONV_RELU_LIMITS_IX(param, outTile) { /* RELU limit validation for depthwise dilated convolution parameters. Does nothing unless the RELU flag is set on param. When set, requires RELU min <= RELU max, then requires both limits to fit the element type of outTile: U8 in [0, UCHAR_MAX], S8 in [SCHAR_MIN, SCHAR_MAX], S16 in [SHRT_MIN, SHRT_MAX], U16 in [0, USHRT_MAX]. Any other element type is rejected with XAI_ERR_NO_VARIANT; range failures use XAI_ERR_BADARG. The range messages are built from two adjacent string literals; the first one ends with a comma followed by a space so the concatenated text reads the same as in XAI_CHECK_CONV_RELU_LIMITS_IX. */ \ + if (XAI_CNN_DEPTHWISE_DILATED_CONV_GET_FLAG_RELU(param)) \ + { \ + XAI_CHECK_ERROR((XAI_CNN_DEPTHWISE_DILATED_CONV_GET_RELU_MIN(param) <= XAI_CNN_DEPTHWISE_DILATED_CONV_GET_RELU_MAX(param)), \ + XAI_ERR_BADARG, "\nMinimum Value of RELU = %d,\nMaximum Value of RELU = %d , Min Limit should not be greater than Max Limit", \ + XAI_CNN_DEPTHWISE_DILATED_CONV_GET_RELU_MIN(param), XAI_CNN_DEPTHWISE_DILATED_CONV_GET_RELU_MAX(param)); \ + if (XAI_TYPE_ELEMENT_TYPE(outTile->type) == XAI_U8) \ + { \ + XAI_CHECK_ERROR((XAI_CNN_DEPTHWISE_DILATED_CONV_GET_RELU_MIN(param) >= 0 && \ + XAI_CNN_DEPTHWISE_DILATED_CONV_GET_RELU_MAX(param) <= UCHAR_MAX), XAI_ERR_BADARG, \ + "\nMinimum Value of RELU = %d, value should be 
greater than or equal to 0 \nMaximum Value of RELU = %d, " \ + "value should be less than or equal to 255", \ + XAI_CNN_DEPTHWISE_DILATED_CONV_GET_RELU_MIN(param), XAI_CNN_DEPTHWISE_DILATED_CONV_GET_RELU_MAX(param)); \ + } \ + else if (XAI_TYPE_ELEMENT_TYPE(outTile->type) == XAI_S8) \ + { \ + XAI_CHECK_ERROR((XAI_CNN_DEPTHWISE_DILATED_CONV_GET_RELU_MIN(param) >= SCHAR_MIN && \ + XAI_CNN_DEPTHWISE_DILATED_CONV_GET_RELU_MAX(param) <= SCHAR_MAX), XAI_ERR_BADARG, \ + "\nMinimum Value of RELU = %d, value should be greater than or equal to -128 \nMaximum Value of RELU = %d, " \ + "value should be less than or equal to 127", \ + XAI_CNN_DEPTHWISE_DILATED_CONV_GET_RELU_MIN(param), XAI_CNN_DEPTHWISE_DILATED_CONV_GET_RELU_MAX(param)); \ + } \ + else if (XAI_TYPE_ELEMENT_TYPE(outTile->type) == XAI_S16) \ + { \ + XAI_CHECK_ERROR((XAI_CNN_DEPTHWISE_DILATED_CONV_GET_RELU_MIN(param) >= SHRT_MIN && \ + XAI_CNN_DEPTHWISE_DILATED_CONV_GET_RELU_MAX(param) <= SHRT_MAX), XAI_ERR_BADARG, \ + "\nMinimum Value of RELU = %d, value should be greater than or equal to -32768 \nMaximum Value of RELU = %d, " \ + "value should be less than or equal to 32767", \ + XAI_CNN_DEPTHWISE_DILATED_CONV_GET_RELU_MIN(param), XAI_CNN_DEPTHWISE_DILATED_CONV_GET_RELU_MAX(param)); \ + } \ + else if (XAI_TYPE_ELEMENT_TYPE(outTile->type) == XAI_U16) \ + { \ + XAI_CHECK_ERROR((XAI_CNN_DEPTHWISE_DILATED_CONV_GET_RELU_MIN(param) >= 0 && \ + XAI_CNN_DEPTHWISE_DILATED_CONV_GET_RELU_MAX(param) <= USHRT_MAX), XAI_ERR_BADARG, \ + "\nMinimum Value of RELU = %d, value should be greater than or equal to 0 \nMaximum Value of RELU = %d, " \ + "value should be less than or equal to 65535", \ + XAI_CNN_DEPTHWISE_DILATED_CONV_GET_RELU_MIN(param), XAI_CNN_DEPTHWISE_DILATED_CONV_GET_RELU_MAX(param)); \ + } \ + else \ + { \ + XAI_CHECK_ERROR(0, XAI_ERR_NO_VARIANT, "Output tile datatype is not supported by XAI_CHECK_DEPTHWISE_DILATED_CONV_RELU_LIMITS_IX"); \ + } \ + } \ +} + +#define XAI_CHECK_CONSISTENCY_DEPTHWISE_MOD_DWH(inT, coeffT, 
biasArr, outT, param) { \ + int32_t KW_MOD = XAI_TILE3D_GET_DIM2(coeffT); \ + int32_t KH_MOD = XAI_TILE3D_GET_DIM3(coeffT); \ + XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM1(inT) == XAI_TILE3D_GET_DIM1(coeffT), XAI_ERR_DATASIZE, \ + "Number of Input Channels not equal to the number of channels in the Kernel."); \ + XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM1(outT) == XAI_TILE3D_GET_DIM1(coeffT), XAI_ERR_DATASIZE, \ + "Number of Output Channels not equal to the number of channels in the Kernel."); \ + if (KW_MOD % 2 != 0) \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2(outT) <= (((XAI_TILE3D_GET_DIM2(inT) + (KW_MOD >> 1) \ + + (KW_MOD >> 1) - KW_MOD) / (XAI_CNN_CONV_GET_STRIDEX(param))) + 1)), \ + XAI_ERR_DATASIZE, "Input and Output tile widths are inconsistent."); \ + } \ + else \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2(outT) <= (((XAI_TILE3D_GET_DIM2(inT) + (KW_MOD >> 1) \ + + ((KW_MOD >> 1) - 1) - KW_MOD) / (XAI_CNN_CONV_GET_STRIDEX(param))) + 1)), \ + XAI_ERR_DATASIZE, "Input and Output tile widths are inconsistent."); \ + } \ + if (KH_MOD % 2 != 0) \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM3(outT) <= (((XAI_TILE3D_GET_DIM3(inT) + (KH_MOD >> 1) \ + + (KH_MOD >> 1) - KH_MOD) / (XAI_CNN_CONV_GET_STRIDEY(param))) + 1)), \ + XAI_ERR_DATASIZE, "Input and Output tile heights are inconsistent."); \ + } \ + else \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM3(outT) <= (((XAI_TILE3D_GET_DIM3(inT) + (KH_MOD >> 1) \ + + ((KH_MOD >> 1) - 1) - KH_MOD) / (XAI_CNN_CONV_GET_STRIDEY(param))) + 1)), \ + XAI_ERR_DATASIZE, "Input and Output tile heights are inconsistent."); \ + } \ + XAI_CHECK_ERROR(XAI_ARRAY_GET_WIDTH(biasArr) >= XAI_TILE3D_GET_DIM1(coeffT), XAI_ERR_DATASIZE, \ + "Width of Bias Array is less than number of channels in the Kernel."); \ + XAI_CHECK_ERROR(XAI_ARRAY_GET_HEIGHT(biasArr) > 0, XAI_ERR_DATASIZE, \ + "Height of Bias Array should be greater than zero."); \ +} + +#if (((XCHAL_HAVE_VISION_HP_VFPU == 1) || (XCHAL_HAVE_CONNX_B_HP_VFPU == 1)) || (defined(__clang__) && 
defined(XAI_REF_ONLY_COMPILATION))) + #define XAI_CHECK_CONSISTENCY_F16_MOD_DWH(inT, coeffT, biasArr, outT, param) { /* Consistency validation for the F16 MOD_DWH check. KW_MOD and KH_MOD are the dilated kernel extents, (dim - 1) * dilation + 1, taken from DIM2 and DIM3 of coeffT. Checks, in order: input and output channel counts against coeffT, an upper bound on the output tile width and height, and that biasArr is at least DIM1(coeffT) wide with a positive height. The stride is applied as a right shift by (stride >> 1), which equals a division only for strides 1, 2 and 4 - presumably the only strides this variant supports; confirm. The bias-height check reads the biasArr parameter; it must not name any identifier that is not a macro parameter, otherwise the expansion silently binds to whatever the invoking scope happens to define. NOTE(review): the input-channel check compares DIM1(inT) with DIM2(coeffT), the same dimension KW_MOD is derived from - confirm this is intended. */ \ + int32_t KW_MOD = (XAI_TILE3D_GET_DIM2(coeffT) - 1) * XAI_CNN_CONV_GET_DILATIONX(param) + 1; \ + int32_t KH_MOD = (XAI_TILE3D_GET_DIM3(coeffT) - 1) * XAI_CNN_CONV_GET_DILATIONY(param) + 1; \ + XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM1(inT) == XAI_TILE3D_GET_DIM2(coeffT), XAI_ERR_DATASIZE, \ + "Number of Input Channels not equal to the number of channels in the Kernel."); \ + XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM1(outT) == XAI_TILE3D_GET_DIM1(coeffT), XAI_ERR_DATASIZE, \ + "Number of Output Channels not equal to the number of channels in the Kernel."); \ + if (KW_MOD % 2 != 0) \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2(outT) <= (((XAI_TILE3D_GET_DIM2(inT) + (KW_MOD >> 1) \ + + (KW_MOD >> 1) - KW_MOD) >> (XAI_CNN_CONV_GET_STRIDEX(param) >> 1)) + 1)), \ + XAI_ERR_DATASIZE, "Output Width is invalid."); \ + } \ + else \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2(outT) <= (((XAI_TILE3D_GET_DIM2(inT) + (KW_MOD >> 1) \ + + ((KW_MOD >> 1) - 1) - KW_MOD) >> (XAI_CNN_CONV_GET_STRIDEX(param) >> 1)) + 1)), \ + XAI_ERR_DATASIZE, "Output Width is invalid."); \ + } \ + if (KH_MOD % 2 != 0) \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM3(outT) <= (((XAI_TILE3D_GET_DIM3(inT) + (KH_MOD >> 1) \ + + (KH_MOD >> 1) - KH_MOD) >> (XAI_CNN_CONV_GET_STRIDEY(param) >> 1)) + 1)), \ + XAI_ERR_DATASIZE, "Output Height is invalid."); \ + } \ + else \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM3(outT) <= (((XAI_TILE3D_GET_DIM3(inT) + (KH_MOD >> 1) \ + + ((KH_MOD >> 1) - 1) - KH_MOD) >> (XAI_CNN_CONV_GET_STRIDEY(param) >> 1)) + 1)), \ + XAI_ERR_DATASIZE, "Output Height is invalid."); \ + } \ + XAI_CHECK_ERROR(XAI_ARRAY_GET_WIDTH(biasArr) >= XAI_TILE3D_GET_DIM1(coeffT), XAI_ERR_DATASIZE, \ + "Width of Bias Array is less than number of channels in the Kernel."); \ + XAI_CHECK_ERROR(XAI_ARRAY_GET_HEIGHT(biasArr) > 0, XAI_ERR_DATASIZE, \ + "Height of Bias Array should be greater than 
zero."); \ +} +#endif //if ((XCHAL_HAVE_VISION_HP_VFPU == 1) || (XCHAL_HAVE_CONNX_B_HP_VFPU == 1) || (defined(__clang__) && defined(XAI_REF_ONLY_COMPILATION))) + +#define XAI_CHECK_CONSISTENCY_DEPTHWISE_DILATED_MOD_DWH(inT, coeffT, biasArr, outT, param) \ + int32_t KW_MOD = (XAI_TILE3D_GET_DIM2(coeffT) - 1) * XAI_CNN_DEPTHWISE_DILATED_CONV_GET_DILATIONX(param) + 1; \ + int32_t KH_MOD = (XAI_TILE3D_GET_DIM3(coeffT) - 1) * XAI_CNN_DEPTHWISE_DILATED_CONV_GET_DILATIONY(param) + 1; \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM1(inT) * XAI_CNN_DEPTHWISE_DILATED_CONV_GET_DEPTH_MULTIPLIER(param)) \ + == XAI_TILE3D_GET_DIM1(coeffT), \ + XAI_ERR_DATASIZE, \ + "Number of Input Channels not equal to the number of channels in the Kernel."); \ + XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM1(outT) == XAI_TILE3D_GET_DIM1(coeffT), XAI_ERR_DATASIZE, \ + "Number of Output Channels not equal to the number of channels in the Kernel."); \ + if (KW_MOD % 2 != 0) \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2(outT) <= (((XAI_TILE3D_GET_DIM2(inT) + (KW_MOD >> 1) \ + + (KW_MOD >> 1) - KW_MOD) / (XAI_CNN_DEPTHWISE_DILATED_CONV_GET_STRIDEX(param))) + 1)), \ + XAI_ERR_DATASIZE, "Input and Output tile widths are inconsistent."); \ + } \ + else \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2(outT) <= (((XAI_TILE3D_GET_DIM2(inT) + (KW_MOD >> 1) \ + + ((KW_MOD >> 1) - 1) - KW_MOD) / (XAI_CNN_DEPTHWISE_DILATED_CONV_GET_STRIDEX(param))) + 1)), \ + XAI_ERR_DATASIZE, "Input and Output tile widths are inconsistent."); \ + } \ + if (KH_MOD % 2 != 0) \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM3(outT) <= (((XAI_TILE3D_GET_DIM3(inT) + (KH_MOD >> 1) \ + + (KH_MOD >> 1) - KH_MOD) / (XAI_CNN_DEPTHWISE_DILATED_CONV_GET_STRIDEY(param))) + 1)), \ + XAI_ERR_DATASIZE, "Input and Output tile heights are inconsistent."); \ + } \ + else \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM3(outT) <= (((XAI_TILE3D_GET_DIM3(inT) + (KH_MOD >> 1) \ + + ((KH_MOD >> 1) - 1) - KH_MOD) / (XAI_CNN_DEPTHWISE_DILATED_CONV_GET_STRIDEY(param))) + 1)), 
\ + XAI_ERR_DATASIZE, "Input and Output tile heights are inconsistent."); \ + } \ + XAI_CHECK_ERROR(XAI_ARRAY_GET_WIDTH(biasArr) >= XAI_TILE3D_GET_DIM1(coeffT), XAI_ERR_DATASIZE, \ + "Width of Bias Array is less than number of channels in the Kernel."); \ + XAI_CHECK_ERROR(XAI_ARRAY_GET_HEIGHT(biasArr) > 0, XAI_ERR_DATASIZE, \ + "Height of Bias Array should be greater than zero."); + +#define XAI_CHECK_CONSISTENCY_DEPTHWISE_DILATED_VQ_MOD_DWH(inT, coeffT, biasArr, outputScaleArray, outT, param) \ + int32_t KW_MOD = (XAI_TILE3D_GET_DIM2(coeffT) - 1) * XAI_CNN_DEPTHWISE_DILATED_CONV_GET_DILATIONX(param) + 1; \ + int32_t KH_MOD = (XAI_TILE3D_GET_DIM3(coeffT) - 1) * XAI_CNN_DEPTHWISE_DILATED_CONV_GET_DILATIONY(param) + 1; \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM1(inT) * XAI_CNN_DEPTHWISE_DILATED_CONV_GET_DEPTH_MULTIPLIER(param)) \ + == XAI_TILE3D_GET_DIM1(coeffT), \ + XAI_ERR_DATASIZE, \ + "Number of Input Channels not equal to the number of channels in the Kernel."); \ + XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM1(outT) == XAI_TILE3D_GET_DIM1(coeffT), XAI_ERR_DATASIZE, \ + "Number of Output Channels not equal to the number of channels in the Kernel."); \ + if (KW_MOD % 2 != 0) \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2(outT) <= (((XAI_TILE3D_GET_DIM2(inT) + (KW_MOD >> 1) \ + + (KW_MOD >> 1) - KW_MOD) / (XAI_CNN_DEPTHWISE_DILATED_CONV_GET_STRIDEX(param))) + 1)), \ + XAI_ERR_DATASIZE, "Input and Output tile widths are inconsistent."); \ + } \ + else \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2(outT) <= (((XAI_TILE3D_GET_DIM2(inT) + (KW_MOD >> 1) \ + + ((KW_MOD >> 1) - 1) - KW_MOD) / (XAI_CNN_DEPTHWISE_DILATED_CONV_GET_STRIDEX(param))) + 1)), \ + XAI_ERR_DATASIZE, "Input and Output tile widths are inconsistent."); \ + } \ + if (KH_MOD % 2 != 0) \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM3(outT) <= (((XAI_TILE3D_GET_DIM3(inT) + (KH_MOD >> 1) \ + + (KH_MOD >> 1) - KH_MOD) / (XAI_CNN_DEPTHWISE_DILATED_CONV_GET_STRIDEY(param))) + 1)), \ + XAI_ERR_DATASIZE, "Input and Output 
tile heights are inconsistent."); \ + } \ + else \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM3(outT) <= (((XAI_TILE3D_GET_DIM3(inT) + (KH_MOD >> 1) \ + + ((KH_MOD >> 1) - 1) - KH_MOD) / (XAI_CNN_DEPTHWISE_DILATED_CONV_GET_STRIDEY(param))) + 1)), \ + XAI_ERR_DATASIZE, "Input and Output tile heights are inconsistent."); \ + } \ + XAI_CHECK_ERROR(XAI_ARRAY_GET_WIDTH(biasArr) >= XAI_TILE3D_GET_DIM1(coeffT), XAI_ERR_DATASIZE, \ + "Width of Bias Array is less than number of channels in the Kernel."); \ + XAI_CHECK_ERROR(XAI_ARRAY_GET_HEIGHT(biasArr) > 0, XAI_ERR_DATASIZE, \ + "Height of Bias Array should be greater than zero."); \ + XAI_CHECK_ERROR(XAI_ARRAY_GET_WIDTH(outputScaleArray) >= XAI_TILE3D_GET_DIM1(coeffT), XAI_ERR_DATASIZE, \ + "Width of Bias Array is less than number of channels in the Kernel."); \ + XAI_CHECK_ERROR(XAI_ARRAY_GET_HEIGHT(outputScaleArray) > 0, XAI_ERR_DATASIZE, \ + "Height of Bias Array should be greater than zero."); + +#define XAI_CHECK_CONSISTENCY_DEPTHWISE_DILATED_VQ_MOW_WHD(inT, coeffT, biasArr, outputScaleArray, outT, param) \ + int32_t KW_MOW = (XAI_TILE3D_GET_DIM1(coeffT) - 1) * XAI_CNN_DEPTHWISE_DILATED_CONV_GET_DILATIONX(param) + 1; \ + int32_t KH_MOW = (XAI_TILE3D_GET_DIM2(coeffT) - 1) * XAI_CNN_DEPTHWISE_DILATED_CONV_GET_DILATIONY(param) + 1; \ + XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM3(inT) * XAI_CNN_DEPTHWISE_DILATED_CONV_GET_DEPTH_MULTIPLIER(param) \ + == XAI_TILE3D_GET_DIM3(coeffT), XAI_ERR_DATASIZE, \ + "Number of Input Channels not equal to the number of channels in the Kernel."); \ + XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM3(outT) == XAI_TILE3D_GET_DIM3(coeffT), XAI_ERR_DATASIZE, \ + "Number of Output Channels not equal to the number of channels in the Kernel."); \ + if (KW_MOW % 2 != 0) \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM1(outT) <= (((XAI_TILE3D_GET_DIM1(inT) + (KW_MOW >> 1) \ + + (KW_MOW >> 1) - KW_MOW) / (XAI_CNN_DEPTHWISE_DILATED_CONV_GET_STRIDE(param))) + 1)), \ + XAI_ERR_DATASIZE, "Input and Output tile widths are 
inconsistent."); \ + } \ + else \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM1(outT) <= (((XAI_TILE3D_GET_DIM1(inT) + (KW_MOW >> 1) \ + + ((KW_MOW >> 1) - 1) - KW_MOW) / (XAI_CNN_DEPTHWISE_DILATED_CONV_GET_STRIDE(param))) + 1)), \ + XAI_ERR_DATASIZE, "Input and Output tile widths are inconsistent."); \ + } \ + if (KH_MOW % 2 != 0) \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2(outT) <= (((XAI_TILE3D_GET_DIM2(inT) + (KH_MOW >> 1) \ + + (KH_MOW >> 1) - KH_MOW) / (XAI_CNN_DEPTHWISE_DILATED_CONV_GET_STRIDE(param))) + 1)), \ + XAI_ERR_DATASIZE, "Input and Output tile heights are inconsistent."); \ + } \ + else \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2(outT) <= (((XAI_TILE3D_GET_DIM2(inT) + (KH_MOW >> 1) \ + + ((KH_MOW >> 1) - 1) - KH_MOW) / (XAI_CNN_DEPTHWISE_DILATED_CONV_GET_STRIDE(param))) + 1)), \ + XAI_ERR_DATASIZE, "Input and Output tile heights are inconsistent."); \ + } \ + XAI_CHECK_ERROR(XAI_ARRAY_GET_WIDTH(biasArr) >= XAI_TILE3D_GET_DIM3(coeffT), XAI_ERR_DATASIZE, \ + "Width of Bias Array is less than number of channels in the Kernel."); \ + XAI_CHECK_ERROR(XAI_ARRAY_GET_HEIGHT(biasArr) > 0, XAI_ERR_DATASIZE, \ + "Height of Bias Array should be greater than zero."); \ + XAI_CHECK_ERROR(XAI_ARRAY_GET_WIDTH(outputScaleArray) >= XAI_TILE3D_GET_DIM3(coeffT), XAI_ERR_DATASIZE, \ + "Width of Bias Array is less than number of channels in the Kernel."); \ + XAI_CHECK_ERROR(XAI_ARRAY_GET_HEIGHT(outputScaleArray) > 0, XAI_ERR_DATASIZE, \ + "Height of Bias Array should be greater than zero."); + +#define XAI_CHECK_CONSISTENCY_DEPTHWISE_MOW_WHD(inT, coeffT, biasArr, outT, param) \ + XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM3(inT) == XAI_TILE3D_GET_DIM3(coeffT), XAI_ERR_DATASIZE, \ + "Number of Input Channels not equal to the number of channels in the Kernel."); \ + XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM3(outT) == XAI_TILE3D_GET_DIM3(coeffT), XAI_ERR_DATASIZE, \ + "Number of Output Channels not equal to the number of channels in the Kernel."); \ + int32_t kW_MOW = 
XAI_TILE3D_GET_DIM1(coeffT); \ + int32_t kH_MOW = XAI_TILE3D_GET_DIM2(coeffT); \ + if (kW_MOW % 2 != 0) \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM1(outT) <= (((XAI_TILE3D_GET_DIM1(inT) + (kW_MOW >> 1) + \ + (kW_MOW >> 1) - kW_MOW) / (XAI_CNN_CONV_GET_STRIDE(param))) + 1)), \ + XAI_ERR_DATASIZE, "Input and Output tile widths are inconsistent."); \ + } \ + else \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM1(outT) <= (((XAI_TILE3D_GET_DIM1(inT) + (kW_MOW >> 1) + \ + ((kW_MOW >> 1) - 1) - kW_MOW) / (XAI_CNN_CONV_GET_STRIDE(param))) + 1)), \ + XAI_ERR_DATASIZE, "Input and Output tile widths are inconsistent."); \ + } \ + if (kH_MOW % 2 != 0) \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2(outT) <= (((XAI_TILE3D_GET_DIM2(inT) + (kH_MOW >> 1) + \ + (kH_MOW >> 1) - kH_MOW) / (XAI_CNN_CONV_GET_STRIDE(param))) + 1)), \ + XAI_ERR_DATASIZE, "Input and Output tile heights are inconsistent."); \ + } \ + else \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2(outT) <= (((XAI_TILE3D_GET_DIM2(inT) + (kH_MOW >> 1) + \ + ((kH_MOW >> 1) - 1) - kH_MOW) / (XAI_CNN_CONV_GET_STRIDE(param))) + 1)), \ + XAI_ERR_DATASIZE, "Input and Output tile heights are inconsistent."); \ + } \ + XAI_CHECK_ERROR(XAI_ARRAY_GET_WIDTH(biasArr) >= XAI_TILE3D_GET_DIM3(coeffT), XAI_ERR_DATASIZE, \ + "Width of Bias Array is less than number of channels in the Kernel."); \ + XAI_CHECK_ERROR(XAI_ARRAY_GET_HEIGHT(biasArr) > 0, XAI_ERR_DATASIZE, \ + "Height of Bias Array should be greater than zero."); + +#define XAI_CHECK_KERNEL_SIZE_DEPTHWISE(coeffT, size) \ + if (XAI_TILE3D_GET_DATA_ORDER(coeffT) == XAI_WHD) \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM1(coeffT) == size) && (XAI_TILE3D_GET_DIM2(coeffT) == size), \ + XAI_ERR_KSIZE, "The Coefficient Kernel Size is not supported"); \ + } \ + else if (XAI_TILE3D_GET_DATA_ORDER(coeffT) == XAI_DWH) \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2(coeffT) == size) && (XAI_TILE3D_GET_DIM3(coeffT) == size), \ + XAI_ERR_KSIZE, "The Coefficient Kernel Size is not supported"); \ 
+ } + +#define XAI_CHECK_EDGES_DEPTHWISE_MOW_WHD(inTile, coeffTile, param) \ + int32_t kW = XAI_TILE3D_GET_DIM1(coeffTile); \ + int32_t kH = XAI_TILE3D_GET_DIM2(coeffTile); \ + if (kW % 2 != 0) \ + { \ + if (kH % 2 != 0) \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >= kW / 2) \ + && (XAI_TILE3D_GET_DIM1_EDGE2(inTile) >= kW / 2) \ + && (XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= kH / 2) \ + && (XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= kH / 2), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + else \ + { \ + if (XAI_CNN_CONV_GET_FLAG_TOPEDGE(param)) \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >= kW / 2) \ + && (XAI_TILE3D_GET_DIM1_EDGE2(inTile) >= kW / 2) \ + && (XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= kH / 2) \ + && (XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= ((kH / 2) - 1)), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + else \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >= kW / 2) \ + && (XAI_TILE3D_GET_DIM1_EDGE2(inTile) >= kW / 2) \ + && (XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= ((kH / 2) - 1)) \ + && (XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= kH / 2), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + } \ + } \ + else \ + { \ + if (kH % 2 != 0) \ + { \ + if (XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param)) \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >= kW / 2) \ + && (XAI_TILE3D_GET_DIM1_EDGE2(inTile) >= ((kW / 2) - 1)) \ + && (XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= kH / 2) \ + && (XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= kH / 2), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + else \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >= ((kW / 2) - 1)) \ + && (XAI_TILE3D_GET_DIM1_EDGE2(inTile) >= kW / 2) \ + && (XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= kH / 2) \ + && (XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= kH / 2), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + } \ + 
else \ + { \ + if (XAI_CNN_CONV_GET_FLAG_TOPEDGE(param)) \ + { \ + if (XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param)) \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >= (kW / 2) && \ + XAI_TILE3D_GET_DIM1_EDGE2(inTile) >= ((kW / 2) - 1) && \ + XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= (kH / 2) && \ + XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= ((kH / 2) - 1)), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + else \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >= ((kW / 2) - 1) && \ + XAI_TILE3D_GET_DIM1_EDGE2(inTile) >= (kW / 2) && \ + XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= (kH / 2) && \ + XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= ((kH / 2) - 1)), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + } \ + else \ + { \ + if (XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param)) \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >= (kW / 2) && \ + XAI_TILE3D_GET_DIM1_EDGE2(inTile) >= (kW / 2 - 1) && \ + XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= ((kH / 2) - 1) && \ + XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= (kH / 2)), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + else \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM1_EDGE1(inTile) >= ((kW / 2) - 1) && \ + XAI_TILE3D_GET_DIM1_EDGE2(inTile) >= (kW / 2) && \ + XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= ((kH / 2) - 1) && \ + XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= (kH / 2)), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + } \ + } \ + } + +#define XAI_CHECK_EDGES_DEPTHWISE_MOD_DWH(inTile, coeffTile, param) \ + int32_t kW = XAI_TILE3D_GET_DIM2(coeffTile); \ + int32_t kH = XAI_TILE3D_GET_DIM3(coeffTile); \ + if (kW % 2 != 0) \ + { \ + if (kH % 2 != 0) \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= kW / 2) \ + && (XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= kW / 2) \ + && (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= kH / 2) \ + && (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= kH / 2), \ + XAI_ERR_EDGE, "The input Tile 
doesn't have the required Edge Data"); \ + } \ + else \ + { \ + if (XAI_CNN_CONV_GET_FLAG_TOPEDGE(param)) \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= kW / 2) \ + && (XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= kW / 2) \ + && (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= kH / 2) \ + && (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= ((kH / 2) - 1)), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + else \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= kW / 2) \ + && (XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= kW / 2) \ + && (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= ((kH / 2) - 1)) \ + && (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= kH / 2), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + } \ + } \ + else \ + { \ + if (kH % 2 != 0) \ + { \ + if (XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param)) \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= kW / 2) \ + && (XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= (kW / 2) - 1) \ + && (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= kH / 2) \ + && (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= kH / 2), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + else \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= ((kW / 2) - 1)) \ + && (XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= kW / 2) \ + && (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= kH / 2) \ + && (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= kH / 2), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + } \ + else \ + { \ + if (XAI_CNN_CONV_GET_FLAG_TOPEDGE(param)) \ + { \ + if (XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param)) \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= (kW / 2) && \ + XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= ((kW / 2) - 1) && \ + XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= (kH / 2) && \ + XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= ((kH / 2) - 1)), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + else \ + { \ + 
XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= ((kW / 2) - 1) && \ + XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= (kW / 2) && \ + XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= (kH / 2) && \ + XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= ((kH / 2) - 1)), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + } \ + else \ + { \ + if (XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param)) \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= (kW / 2) && \ + XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= ((kW / 2) - 1) && \ + XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= ((kH / 2) - 1) && \ + XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= (kH / 2)), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + else \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= ((kW / 2) - 1) && \ + XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= (kW / 2) && \ + XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= ((kH / 2) - 1) && \ + XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= (kH / 2)), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + } \ + } \ + } + +#define XAI_CHECK_EDGES_DEPTHWISE_DILATED_MOD_DWH(inTile, coeffTile, param) \ + int32_t kW = (XAI_TILE3D_GET_DIM2(coeffTile) - 1) * XAI_CNN_DEPTHWISE_DILATED_CONV_GET_DILATIONX(param) + 1; \ + int32_t kH = (XAI_TILE3D_GET_DIM3(coeffTile) - 1) * XAI_CNN_DEPTHWISE_DILATED_CONV_GET_DILATIONY(param) + 1; \ + if (kW % 2 != 0) \ + { \ + if (kH % 2 != 0) \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= kW / 2) \ + && (XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= kW / 2) \ + && (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= kH / 2) \ + && (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= kH / 2), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + else \ + { \ + if (XAI_CNN_CONV_GET_FLAG_TOPEDGE(param)) \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= kW / 2) \ + && (XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= kW / 2) \ + && (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= kH / 2) \ + && 
(XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= ((kH / 2) - 1)), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + else \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= kW / 2) \ + && (XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= kW / 2) \ + && (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= ((kH / 2) - 1)) \ + && (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= kH / 2), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + } \ + } \ + else \ + { \ + if (kH % 2 != 0) \ + { \ + if (XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param)) \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= kW / 2) \ + && (XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= (kW / 2) - 1) \ + && (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= kH / 2) \ + && (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= kH / 2), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + else \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= ((kW / 2) - 1)) \ + && (XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= kW / 2) \ + && (XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= kH / 2) \ + && (XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= kH / 2), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + } \ + else \ + { \ + if (XAI_CNN_CONV_GET_FLAG_TOPEDGE(param)) \ + { \ + if (XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param)) \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= (kW / 2) && \ + XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= ((kW / 2) - 1) && \ + XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= (kH / 2) && \ + XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= ((kH / 2) - 1)), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + else \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= ((kW / 2) - 1) && \ + XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= (kW / 2) && \ + XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= (kH / 2) && \ + XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= ((kH / 2) - 1)), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + } \ + 
else \ + { \ + if (XAI_CNN_CONV_GET_FLAG_LEFTEDGE(param)) \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= (kW / 2) && \ + XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= ((kW / 2) - 1) && \ + XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= ((kH / 2) - 1) && \ + XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= (kH / 2)), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + else \ + { \ + XAI_CHECK_ERROR((XAI_TILE3D_GET_DIM2_EDGE1(inTile) >= ((kW / 2) - 1) && \ + XAI_TILE3D_GET_DIM2_EDGE2(inTile) >= (kW / 2) && \ + XAI_TILE3D_GET_DIM3_EDGE1(inTile) >= ((kH / 2) - 1) && \ + XAI_TILE3D_GET_DIM3_EDGE2(inTile) >= (kH / 2)), \ + XAI_ERR_EDGE, "The input Tile doesn't have the required Edge Data"); \ + } \ + } \ + } \ + } + +#define XAI_CHECK_ROI_POOLING_PARAMS(param) \ + XAI_CHECK_ERROR(((XAI_CNN_ROI_POOLING_GET_SPATIAL_SCALEX(param) <= 32767) && (XAI_CNN_ROI_POOLING_GET_SPATIAL_SCALEY(param) <= 32767)), \ + XAI_ERR_NORM, "spatialScaleX & spatialScaleY should be less than U15_MAX"); \ + XAI_CHECK_ERROR(((XAI_CNN_ROI_POOLING_GET_ONE_BY_POOLED_WIDTH_SCALE(param) <= 32767) && (XAI_CNN_ROI_POOLING_GET_ONE_BY_POOLED_HEIGHT_SCALE(param) <= 32767)), \ + XAI_ERR_NORM, "oneByPooledWidth & oneByPooledHeight should be less than U15_MAX"); \ + XAI_CHECK_ERROR(((XAI_CNN_ROI_POOLING_GET_SPATIAL_SCALE_SHIFTX(param) < 32) && (XAI_CNN_ROI_POOLING_GET_SPATIAL_SCALE_SHIFTY(param) < 32)), \ + XAI_ERR_NORM, "spatialScaleShiftX & spatialScaleShiftY should be less than 32 (scalar shift value)"); \ + XAI_CHECK_ERROR(((XAI_CNN_ROI_POOLING_GET_ONE_BY_POOLED_WIDTH_SHIFT(param) < 32) && (XAI_CNN_ROI_POOLING_GET_ONE_BY_POOLED_HEIGHT_SHIFT(param) < 32)), \ + XAI_ERR_NORM, "shiftPool should be less than 32 (scalar shift value)"); \ + +#define XAI_CHECK_REORG_PARAMS_DWH(inTile, outTile, params) \ + if (XAI_CNN_REORG_GET_REVERSE(params)) \ + { \ + XAI_CHECK_ERROR(XAI_CNN_REORG_GET_STRIDE(params) * XAI_CNN_REORG_GET_STRIDE(params) * \ + XAI_TILE3D_GET_DIM1(outTile) == 
XAI_TILE3D_GET_DIM1(inTile), \ + XAI_ERR_DATASIZE, "The depth dimension of inTile and outTile is inconsistent"); \ + \ + XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM2(outTile) == XAI_TILE3D_GET_DIM2(inTile) * \ + XAI_CNN_REORG_GET_STRIDE(params), XAI_ERR_DATASIZE, \ + "The width dimension of inTile and outTile is inconsistent"); \ + \ + XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM3(outTile) == XAI_TILE3D_GET_DIM3(inTile) * \ + XAI_CNN_REORG_GET_STRIDE(params), XAI_ERR_DATASIZE, \ + "The height dimension of inTile and outTile is inconsistent"); \ + } \ + else \ + { \ + XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM1(outTile) == XAI_CNN_REORG_GET_STRIDE(params) * \ + XAI_CNN_REORG_GET_STRIDE(params) * XAI_TILE3D_GET_DIM1(inTile), \ + XAI_ERR_DATASIZE, "The depth dimension of inTile and outTile is inconsistent"); \ + \ + XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM2(outTile) * XAI_CNN_REORG_GET_STRIDE(params) == \ + XAI_TILE3D_GET_DIM2(inTile), XAI_ERR_DATASIZE, \ + "The width dimension of inTile and outTile is inconsistent"); \ + \ + XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM3(outTile) * XAI_CNN_REORG_GET_STRIDE(params) == \ + XAI_TILE3D_GET_DIM3(inTile), XAI_ERR_DATASIZE, \ + "The height dimension of inTile and outTile is inconsistent"); \ + } + +#define XAI_CHECK_REORG4D_PARAMS_WHDN(inTile, outTile, params) \ + if (XAI_CNN_REORG_GET_REVERSE(params)) \ + { \ + XAI_CHECK_ERROR(XAI_CNN_REORG4D_GET_STRIDEX(params) * XAI_CNN_REORG4D_GET_STRIDEY(params) * \ + XAI_TILE4D_GET_DIM4(outTile) == XAI_TILE4D_GET_DIM4(inTile), \ + XAI_ERR_DATASIZE, "The batch dimension of inTile and outTile is inconsistent"); \ + \ + XAI_CHECK_ERROR(XAI_TILE4D_GET_DIM1(outTile) == XAI_TILE4D_GET_DIM1(inTile) * \ + XAI_CNN_REORG4D_GET_STRIDEX(params), XAI_ERR_DATASIZE, \ + "The width dimension of inTile and outTile is inconsistent"); \ + \ + XAI_CHECK_ERROR(XAI_TILE4D_GET_DIM2(outTile) == XAI_TILE4D_GET_DIM2(inTile) * \ + XAI_CNN_REORG4D_GET_STRIDEY(params), XAI_ERR_DATASIZE, \ + "The height dimension of inTile and outTile is inconsistent"); \ + 
} \ + else \ + { \ + XAI_CHECK_ERROR(XAI_TILE4D_GET_DIM4(outTile) == XAI_CNN_REORG4D_GET_STRIDEX(params) * \ + XAI_CNN_REORG4D_GET_STRIDEY(params) * XAI_TILE4D_GET_DIM4(inTile), \ + XAI_ERR_DATASIZE, "The batch dimension of inTile and outTile is inconsistent"); \ + \ + XAI_CHECK_ERROR(XAI_TILE4D_GET_DIM1(outTile) * XAI_CNN_REORG4D_GET_STRIDEX(params) == \ + XAI_TILE4D_GET_DIM1(inTile), XAI_ERR_DATASIZE, \ + "The width dimension of inTile and outTile is inconsistent"); \ + \ + XAI_CHECK_ERROR(XAI_TILE4D_GET_DIM2(outTile) * XAI_CNN_REORG4D_GET_STRIDEY(params) == \ + XAI_TILE4D_GET_DIM2(inTile), XAI_ERR_DATASIZE, \ + "The height dimension of inTile and outTile is inconsistent"); \ + } +#define XAI_CHECK_REORG4D_PARAMS_DWHN(inTile, outTile, params) \ + if (XAI_CNN_REORG_GET_REVERSE(params)) \ + { \ + XAI_CHECK_ERROR(XAI_CNN_REORG4D_GET_STRIDEX(params) * XAI_CNN_REORG4D_GET_STRIDEY(params) * \ + XAI_TILE4D_GET_DIM4(outTile) == XAI_TILE4D_GET_DIM4(inTile), \ + XAI_ERR_DATASIZE, "The batch dimension of inTile and outTile is inconsistent"); \ + \ + XAI_CHECK_ERROR(XAI_TILE4D_GET_DIM2(outTile) == XAI_TILE4D_GET_DIM2(inTile) * \ + XAI_CNN_REORG4D_GET_STRIDEX(params), XAI_ERR_DATASIZE, \ + "The width dimension of inTile and outTile is inconsistent"); \ + \ + XAI_CHECK_ERROR(XAI_TILE4D_GET_DIM3(outTile) == XAI_TILE4D_GET_DIM3(inTile) * \ + XAI_CNN_REORG4D_GET_STRIDEY(params), XAI_ERR_DATASIZE, \ + "The height dimension of inTile and outTile is inconsistent"); \ + } \ + else \ + { \ + XAI_CHECK_ERROR(XAI_TILE4D_GET_DIM4(outTile) == XAI_CNN_REORG4D_GET_STRIDEX(params) * \ + XAI_CNN_REORG4D_GET_STRIDEY(params) * XAI_TILE4D_GET_DIM4(inTile), \ + XAI_ERR_DATASIZE, "The batch dimension of inTile and outTile is inconsistent"); \ + \ + XAI_CHECK_ERROR(XAI_TILE4D_GET_DIM2(outTile) * XAI_CNN_REORG4D_GET_STRIDEX(params) == \ + XAI_TILE4D_GET_DIM2(inTile), XAI_ERR_DATASIZE, \ + "The width dimension of inTile and outTile is inconsistent"); \ + \ + 
XAI_CHECK_ERROR(XAI_TILE4D_GET_DIM3(outTile) * XAI_CNN_REORG4D_GET_STRIDEY(params) == \ + XAI_TILE4D_GET_DIM3(inTile), XAI_ERR_DATASIZE, \ + "The height dimension of inTile and outTile is inconsistent"); \ + } + +#define XAI_CHECK_REORG_PARAMS_WHD(inT, outT, param) \ + if (XAI_CNN_REORG_GET_REVERSE(param)) \ + { \ + XAI_CHECK_ERROR((XAI_CNN_REORG_GET_STRIDE(param) * XAI_CNN_REORG_GET_STRIDE(param) * \ + XAI_TILE3D_GET_DIM3(outT)) == XAI_TILE3D_GET_DIM3(inT), XAI_ERR_DATASIZE, \ + "Number of output channels is strideX * strideY times number of input channels"); \ + \ + XAI_CHECK_ERROR(XAI_CNN_REORG_GET_STRIDE(param) * XAI_TILE3D_GET_DIM1(inT) == \ + XAI_TILE3D_GET_DIM1(outT), XAI_ERR_DATASIZE, \ + "Input width is strideX times output width"); \ + \ + XAI_CHECK_ERROR(XAI_CNN_REORG_GET_STRIDE(param) * XAI_TILE3D_GET_DIM2(inT) == \ + XAI_TILE3D_GET_DIM2(outT), XAI_ERR_DATASIZE, \ + "Input height is strideY times output height"); \ + \ + } \ + else \ + { \ + XAI_CHECK_ERROR((XAI_CNN_REORG_GET_STRIDE(param) * XAI_CNN_REORG_GET_STRIDE(param) * \ + XAI_TILE3D_GET_DIM3(inT)) == XAI_TILE3D_GET_DIM3(outT), XAI_ERR_DATASIZE, \ + "Number of output channels is strideX * strideY times number of input channels"); \ + \ + XAI_CHECK_ERROR(XAI_CNN_REORG_GET_STRIDE(param) * XAI_TILE3D_GET_DIM1(outT) == \ + XAI_TILE3D_GET_DIM1(inT), XAI_ERR_DATASIZE, \ + "Input width is strideX times output width"); \ + \ + XAI_CHECK_ERROR(XAI_CNN_REORG_GET_STRIDE(param) * XAI_TILE3D_GET_DIM2(outT) == \ + XAI_TILE3D_GET_DIM2(inT), XAI_ERR_DATASIZE, \ + "Input height is strideY times output height"); \ + } + +#if XAI_ERROR_LEVEL != XAI_ERROR_LEVEL_NO_ERROR +#define XAI_CHECK_INTERP_BOUNDARY(xDstCoordinate, yDstCoordinate, zDstCoordinate, xSrcCoordinate, ySrcCoordinate, zSrcCoordinate, \ + xScale, yScale, xShift, yShift, inDataWidth, inDataHeight, inDataDepth, outDataWidth, outDataHeight, outDataDepth, \ + edge1AcrossWidth, edge2AcrossWidth, edge1AcrossHeight, edge2AcrossHeight, \ + inFrameWidth, 
inFrameHeight) \ + { \ + int32_t insideFrameX; \ + int32_t insideFrameY; \ + \ + int32_t xmax = (((xDstCoordinate + outDataWidth - 1) * xScale + xShift) >> 18) + 1; \ + int32_t ymax = (((yDstCoordinate + outDataHeight - 1) * yScale + yShift) >> 18) + 1; \ + int32_t zmax = (zDstCoordinate + outDataDepth); \ + \ + insideFrameX = (xmax < inFrameWidth); \ + insideFrameY = (ymax < inFrameHeight); \ + \ + XAI_CHECK_ERROR(((((xDstCoordinate * xScale + xShift < 0) || ((xDstCoordinate * xScale + xShift) >> 18) >= (xSrcCoordinate - edge1AcrossWidth))) && \ + (((yDstCoordinate * yScale + yShift < 0) || ((yDstCoordinate * yScale + yShift) >> 18) >= (ySrcCoordinate - edge1AcrossHeight))) && \ + (((zDstCoordinate) >= (zSrcCoordinate))) && \ + (((xmax + insideFrameX) <= (xSrcCoordinate + inDataWidth + edge2AcrossWidth))) && \ + (((ymax + insideFrameY) <= (ySrcCoordinate + inDataHeight + edge2AcrossHeight))) && \ + ((zmax <= (zSrcCoordinate + inDataDepth)))), \ + XAI_ERR_DATASIZE, "The input tile size requirements is in sufficient"); \ + } +#else +#define XAI_CHECK_INTERP_BOUNDARY(xDstCoordinate, yDstCoordinate, zDstCoordinate, xSrcCoordinate, ySrcCoordinate, zSrcCoordinate, \ + xScale, yScale, xShift, yShift, inDataWidth, inDataHeight, inDataDepth, outDataWidth, outDataHeight, outDataDepth, \ + edge1AcrossWidth, edge2AcrossWidth, edge1AcrossHeight, edge2AcrossHeight, \ + inFrameWidth, inFrameHeight) +#endif + +#if XAI_ERROR_LEVEL != XAI_ERROR_LEVEL_NO_ERROR +#define XAI_CHECK_RESIZENEAREST_BOUNDARY(xDstCoordinate, yDstCoordinate, zDstCoordinate, \ + xSrcCoordinate, ySrcCoordinate, zSrcCoordinate, \ + xScale, yScale, xShift, yShift, \ + inDataWidth, inDataHeight, inDataDepth, \ + outDataWidth, outDataHeight, outDataDepth, \ + edge1AcrossWidth, edge2AcrossWidth, edge1AcrossHeight, \ + edge2AcrossHeight, inFrameWidth, inFrameHeight) \ + { \ + int32_t xmin = ((xDstCoordinate * xScale) + xShift); \ + int32_t ymin = ((yDstCoordinate * yScale) + yShift); \ + int32_t zmin = 
(zDstCoordinate); \ + int32_t xmax = (((xDstCoordinate + outDataWidth - 1) * xScale + xShift) >> 18) + 1; \ + int32_t ymax = (((yDstCoordinate + outDataHeight - 1) * yScale + yShift) >> 18) + 1; \ + int32_t zmax = (zDstCoordinate + outDataDepth); \ + \ + int32_t insideFrameX = (xmax < inFrameWidth); \ + int32_t insideFrameY = (ymax < inFrameHeight); \ + \ + XAI_CHECK_ERROR((((xmin < 0 || (xmin >> 18) >= (xSrcCoordinate - edge1AcrossWidth))) && \ + ((ymin < 0 || (ymin >> 18) >= (ySrcCoordinate - edge1AcrossHeight))) && \ + (zmin >= (zSrcCoordinate)) && \ + ((xmax + insideFrameX) <= (xSrcCoordinate + inDataWidth + edge2AcrossWidth)) && \ + ((ymax + insideFrameY) <= (ySrcCoordinate + inDataHeight + edge2AcrossHeight)) && \ + (zmax <= (zSrcCoordinate + inDataDepth))), \ + XAI_ERR_DATASIZE, "The input tile size requirements is in sufficient"); \ + } +#else +#define XAI_CHECK_RESIZENEAREST_BOUNDARY(xDstCoordinate, yDstCoordinate, zDstCoordinate, \ + xSrcCoordinate, ySrcCoordinate, zSrcCoordinate, \ + xScale, yScale, xShift, yShift, \ + inDataWidth, inDataHeight, inDataDepth, \ + outDataWidth, outDataHeight, outDataDepth, \ + edge1AcrossWidth, edge2AcrossWidth, edge1AcrossHeight, \ + edge2AcrossHeight, inFrameWidth, inFrameHeight) +#endif + +#define XAI_CHECK_CONSISTENCY_MAXVALARR8(maxValArr, params, tileFlag) \ + { \ + if (XAI_CNN_MAXVAL_GET_TILEFLAG(params) != tileFlag) \ + { \ + XAI_CHECK_ARRAY_S8(maxValArr); \ + XAI_CHECK_ERROR((XAI_ARRAY_GET_WIDTH(maxValArr) >= XCHAL_IVPN_SIMD_WIDTH), \ + XAI_ERR_BADARG, "Length of maxValArr should not be less than XCHAL_IVPN_SIMD_WIDTH"); \ + XAI_CHECK_ERROR((XAI_ARRAY_GET_HEIGHT(maxValArr) > 0), XAI_ERR_BADARG, \ + "maxValArr height parameter is not set as required"); \ + } \ + } +#define XAI_CHECK_CONSISTENCY_MAXVALARR(maxValArr, params, tileFlag) \ + { \ + if (XAI_CNN_MAXVAL_GET_TILEFLAG(params) != tileFlag) \ + { \ + XAI_CHECK_ARRAY_S16(maxValArr); \ + XAI_CHECK_ERROR((XAI_ARRAY_GET_WIDTH(maxValArr) >= XCHAL_IVPN_SIMD_WIDTH), \ 
+ XAI_ERR_BADARG, "Length of maxValArr should not be less than XCHAL_IVPN_SIMD_WIDTH"); \ + XAI_CHECK_ERROR((XAI_ARRAY_GET_HEIGHT(maxValArr) > 0), XAI_ERR_BADARG, \ + "maxValArr height parameter is not set as required"); \ + } \ + } + +#define XAI_CHECK_PERMUTE_PARAMS(params) \ + XAI_CHECK_ERROR((XAI_CNN_PERMUTE4D_GET_ORDER1(params) > 0 && XAI_CNN_PERMUTE4D_GET_ORDER2(params) > 0 && \ + XAI_CNN_PERMUTE4D_GET_ORDER3(params) > 0 && XAI_CNN_PERMUTE4D_GET_ORDER4(params) > 0), \ + XAI_ERR_BADARG, "The order should be greater than 0"); \ + XAI_CHECK_ERROR((XAI_CNN_PERMUTE4D_GET_ORDER1(params) < 5 && XAI_CNN_PERMUTE4D_GET_ORDER2(params) < 5 && \ + XAI_CNN_PERMUTE4D_GET_ORDER3(params) < 5 && XAI_CNN_PERMUTE4D_GET_ORDER4(params) < 5), \ + XAI_ERR_BADARG, "The order should be greater than 0"); \ + XAI_CHECK_ERROR(((XAI_CNN_PERMUTE4D_GET_ORDER1(params) != XAI_CNN_PERMUTE4D_GET_ORDER2(params)) && \ + (XAI_CNN_PERMUTE4D_GET_ORDER1(params) != XAI_CNN_PERMUTE4D_GET_ORDER3(params)) && \ + (XAI_CNN_PERMUTE4D_GET_ORDER1(params) != XAI_CNN_PERMUTE4D_GET_ORDER4(params)) && \ + (XAI_CNN_PERMUTE4D_GET_ORDER2(params) != XAI_CNN_PERMUTE4D_GET_ORDER3(params)) && \ + (XAI_CNN_PERMUTE4D_GET_ORDER2(params) != XAI_CNN_PERMUTE4D_GET_ORDER4(params)) && \ + (XAI_CNN_PERMUTE4D_GET_ORDER3(params) != XAI_CNN_PERMUTE4D_GET_ORDER4(params))), \ + XAI_ERR_BADARG, "The order values should not be equal to one another"); + +#if XAI_ERROR_LEVEL != XAI_ERROR_LEVEL_NO_ERROR +#define XAI_CHECK_CONSISTENCY_PERMUTE(inT, outT, params) \ + { \ + uint8_t order[4] = { XAI_CNN_PERMUTE4D_GET_ORDER1(params), \ + XAI_CNN_PERMUTE4D_GET_ORDER2(params), \ + XAI_CNN_PERMUTE4D_GET_ORDER3(params), \ + XAI_CNN_PERMUTE4D_GET_ORDER4(params) }; \ + int32_t inDim[4] = { XAI_TILE4D_GET_DIM1(inT), \ + XAI_TILE4D_GET_DIM2(inT), \ + XAI_TILE4D_GET_DIM3(inT), \ + XAI_TILE4D_GET_DIM4(inT) }; \ + \ + const int32_t transposedDim1 = inDim[order[0] - 1]; \ + const int32_t transposedDim2 = inDim[order[1] - 1]; \ + const int32_t 
transposedDim3 = inDim[order[2] - 1]; \ + const int32_t transposedDim4 = inDim[order[3] - 1]; \ + XAI_CHECK_ERROR((transposedDim1 == XAI_TILE4D_GET_DIM1(outT) && transposedDim2 == XAI_TILE4D_GET_DIM2(outT) \ + && transposedDim3 == XAI_TILE4D_GET_DIM3(outT) && transposedDim4 == XAI_TILE4D_GET_DIM4(outT)), \ + XAI_ERR_DATASIZE, "The dimensions of the output tile should be equal to the transposed dimensions of the \ + input tile whose order is specified by the parameter in the xai_cnn_permute4D_params structure"); \ + } +#else +#define XAI_CHECK_CONSISTENCY_PERMUTE(inT, outT, params) +#endif +#endif + +#define XAI_CHECK_CONSISTENCY_ARGMAX_3D_DIM1(inTile, outTileIdx, outTileVal, numLargestVal) \ + { \ + if (outTileIdx != NULL) \ + { \ + XAI_CHECK_TILE3D_U16(outTileIdx); \ + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTileIdx); \ + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTileIdx); \ + XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM1(outTileIdx) == numLargestVal, XAI_ERR_DATASIZE, \ + "Output index tile size is incorrect"); \ + XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM2(outTileIdx) == XAI_TILE3D_GET_DIM2(inTile), XAI_ERR_DATASIZE, \ + "Output index tile size is incorrect"); \ + XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM3(outTileIdx) == XAI_TILE3D_GET_DIM3(inTile), XAI_ERR_DATASIZE, \ + "Output index tile size is incorrect"); \ + } \ + if (outTileVal != NULL) \ + { \ + XAI_CHECK_TILE3D(outTileVal); \ + XAI_CHECK_ERROR(XAI_TILE3D_GET_TYPE(inTile) == XAI_TILE3D_GET_TYPE(outTileVal), XAI_ERR_DATATYPE, \ + "Data type of output tile must be same as input tile"); \ + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTileVal); \ + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTileVal); \ + XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM1(outTileVal) == numLargestVal, XAI_ERR_DATASIZE, \ + "Output tile size is incorrect"); \ + XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM2(outTileVal) == XAI_TILE3D_GET_DIM2(inTile), XAI_ERR_DATASIZE, \ + "Output tile size is incorrect"); \ + XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM3(outTileVal) == 
XAI_TILE3D_GET_DIM3(inTile), XAI_ERR_DATASIZE, \ + "Output tile size is incorrect"); \ + } \ + if ((outTileVal != NULL) && (outTileIdx != NULL)) \ + { \ + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(outTileIdx, outTileVal); \ + } \ + } + +#define XAI_CHECK_CONSISTENCY_MERGE_TOPK_ARGMAX_ARGMIN_3D_DIM1(inTileIdx, inTileVal, outTileIdx, outTileVal, numVal) \ + { \ + if (outTileIdx != NULL) \ + { \ + XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM1(outTileIdx) == numVal, XAI_ERR_DATASIZE, \ + "Output index tile size is incorrect"); \ + XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM2(outTileIdx) == XAI_TILE3D_GET_DIM2(inTileVal), XAI_ERR_DATASIZE, \ + "Output index tile size is incorrect"); \ + XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM3(outTileIdx) == XAI_TILE3D_GET_DIM3(inTileVal), XAI_ERR_DATASIZE, \ + "Output index tile size is incorrect"); \ + XAI_CHECK_TILE3D_S32(outTileIdx); \ + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTileIdx); \ + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTileIdx, outTileIdx); \ + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTileVal, outTileIdx); \ + } \ + if (outTileVal != NULL) \ + { \ + XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM1(outTileVal) == numVal, XAI_ERR_DATASIZE, \ + "Output tile size is incorrect"); \ + XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM2(outTileVal) == XAI_TILE3D_GET_DIM2(inTileVal), XAI_ERR_DATASIZE, \ + "Output tile size is incorrect"); \ + XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM3(outTileVal) == XAI_TILE3D_GET_DIM3(inTileVal), XAI_ERR_DATASIZE, \ + "Output tile size is incorrect"); \ + XAI_CHECK_TILE3D(outTileVal); \ + XAI_CHECK_ERROR(XAI_TILE3D_GET_TYPE(inTileVal) == XAI_TILE3D_GET_TYPE(outTileVal), XAI_ERR_DATATYPE, \ + "Data type of output tile must be same as input tile"); \ + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTileVal); \ + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTileVal, outTileVal); \ + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTileIdx, outTileVal); \ + } \ + if ((outTileVal != NULL) && (outTileIdx != NULL)) \ + { \ + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(outTileIdx, outTileVal); \ + } \ + } +#define 
XAI_CHECK_CONSISTENCY_ARGMAX_3D_DIM2(inTile, outTileIdx, outTileVal, numLargestVal) \ + { \ + if (outTileIdx != NULL) \ + { \ + XAI_CHECK_TILE3D_U16(outTileIdx); \ + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTileIdx); \ + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTileIdx); \ + XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM2(outTileIdx) == numLargestVal, XAI_ERR_DATASIZE, \ + "Output tile size is incorrect"); \ + XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM1(outTileIdx) == XAI_TILE3D_GET_DIM1(inTile), XAI_ERR_DATASIZE, \ + "Output tile size is incorrect"); \ + XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM3(outTileIdx) == XAI_TILE3D_GET_DIM3(inTile), XAI_ERR_DATASIZE, \ + "Output tile size is incorrect"); \ + } \ + if (outTileVal != NULL) \ + { \ + XAI_CHECK_TILE3D(outTileVal); \ + XAI_CHECK_ERROR(XAI_TILE3D_GET_TYPE(inTile) == XAI_TILE3D_GET_TYPE(outTileVal), XAI_ERR_DATATYPE, \ + "Data type of output tile must be same as input tile"); \ + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTileVal); \ + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTileVal); \ + XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM2(outTileVal) == numLargestVal, XAI_ERR_DATASIZE, \ + "Output tile size is incorrect"); \ + XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM1(outTileVal) == XAI_TILE3D_GET_DIM1(inTile), XAI_ERR_DATASIZE, \ + "Output tile size is incorrect"); \ + XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM3(outTileVal) == XAI_TILE3D_GET_DIM3(inTile), XAI_ERR_DATASIZE, \ + "Output tile size is incorrect"); \ + } \ + if ((outTileVal != NULL) && (outTileIdx != NULL)) \ + { \ + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(outTileIdx, outTileVal); \ + } \ + } + +#define XAI_CHECK_CONSISTENCY_ARGMAX_3D_DIM3(inTile, outTileIdx, outTileVal, numLargestVal) \ + { \ + if (outTileIdx != NULL) \ + { \ + XAI_CHECK_TILE3D_U16(outTileIdx); \ + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTileIdx); \ + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTileIdx); \ + XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM3(outTileIdx) == numLargestVal, XAI_ERR_DATASIZE, \ + "Output index tile size is incorrect"); \ + 
XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM1(outTileIdx) == XAI_TILE3D_GET_DIM1(inTile), XAI_ERR_DATASIZE, \ + "Output index tile size is incorrect"); \ + XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM2(outTileIdx) == XAI_TILE3D_GET_DIM2(inTile), XAI_ERR_DATASIZE, \ + "Output index tile size is incorrect"); \ + } \ + if (outTileVal != NULL) \ + { \ + XAI_CHECK_TILE3D(outTileVal); \ + XAI_CHECK_ERROR(XAI_TILE3D_GET_TYPE(inTile) == XAI_TILE3D_GET_TYPE(outTileVal), XAI_ERR_DATATYPE, \ + "Data type of output tile must be same as input tile"); \ + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTileVal); \ + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTileVal); \ + XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM3(outTileVal) == numLargestVal, XAI_ERR_DATASIZE, \ + "Output value tile size is incorrect"); \ + XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM1(outTileVal) == XAI_TILE3D_GET_DIM1(inTile), XAI_ERR_DATASIZE, \ + "Output value tile size is incorrect"); \ + XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM2(outTileVal) == XAI_TILE3D_GET_DIM2(inTile), XAI_ERR_DATASIZE, \ + "Output value tile size is incorrect"); \ + } \ + if ((outTileVal != NULL) && (outTileIdx != NULL)) \ + { \ + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(outTileIdx, outTileVal); \ + } \ + } + +#define XAI_CHECK_CONSISTENCY_ARGMAX_3D_DIM1_F32(inTile, outTileIdx, outTileVal, numLargestVal) \ + { \ + if (outTileIdx != NULL) \ + { \ + XAI_CHECK_TILE3D_U32(outTileIdx); \ + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTileIdx); \ + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTileIdx); \ + XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM1(outTileIdx) == numLargestVal, XAI_ERR_DATASIZE, \ + "Output index tile size is incorrect"); \ + XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM2(outTileIdx) == XAI_TILE3D_GET_DIM2(inTile), XAI_ERR_DATASIZE, \ + "Output index tile size is incorrect"); \ + XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM3(outTileIdx) == XAI_TILE3D_GET_DIM3(inTile), XAI_ERR_DATASIZE, \ + "Output index tile size is incorrect"); \ + } \ + if (outTileVal != NULL) \ + { \ + XAI_CHECK_TILE3D(outTileVal); \ + 
XAI_CHECK_ERROR(XAI_TILE3D_GET_TYPE(inTile) == XAI_TILE3D_GET_TYPE(outTileVal), XAI_ERR_DATATYPE, \ + "Data type of output tile must be same as input tile"); \ + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTileVal); \ + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTileVal); \ + XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM1(outTileVal) == numLargestVal, XAI_ERR_DATASIZE, \ + "Output tile size is incorrect"); \ + XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM2(outTileVal) == XAI_TILE3D_GET_DIM2(inTile), XAI_ERR_DATASIZE, \ + "Output tile size is incorrect"); \ + XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM3(outTileVal) == XAI_TILE3D_GET_DIM3(inTile), XAI_ERR_DATASIZE, \ + "Output tile size is incorrect"); \ + } \ + if ((outTileVal != NULL) && (outTileIdx != NULL)) \ + { \ + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(outTileIdx, outTileVal); \ + } \ + } + +#define XAI_CHECK_CONSISTENCY_ARGMAX_3D_DIM2_F32(inTile, outTileIdx, outTileVal, numLargestVal) \ + { \ + if (outTileIdx != NULL) \ + { \ + XAI_CHECK_TILE3D_U32(outTileIdx); \ + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTileIdx); \ + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTileIdx); \ + XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM2(outTileIdx) == numLargestVal, XAI_ERR_DATASIZE, \ + "Output tile size is incorrect"); \ + XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM1(outTileIdx) == XAI_TILE3D_GET_DIM1(inTile), XAI_ERR_DATASIZE, \ + "Output tile size is incorrect"); \ + XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM3(outTileIdx) == XAI_TILE3D_GET_DIM3(inTile), XAI_ERR_DATASIZE, \ + "Output tile size is incorrect"); \ + } \ + if (outTileVal != NULL) \ + { \ + XAI_CHECK_TILE3D(outTileVal); \ + XAI_CHECK_ERROR(XAI_TILE3D_GET_TYPE(inTile) == XAI_TILE3D_GET_TYPE(outTileVal), XAI_ERR_DATATYPE, \ + "Data type of output tile must be same as input tile"); \ + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTileVal); \ + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTileVal); \ + XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM2(outTileVal) == numLargestVal, XAI_ERR_DATASIZE, \ + "Output tile size is incorrect"); \ + 
XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM1(outTileVal) == XAI_TILE3D_GET_DIM1(inTile), XAI_ERR_DATASIZE, \ + "Output tile size is incorrect"); \ + XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM3(outTileVal) == XAI_TILE3D_GET_DIM3(inTile), XAI_ERR_DATASIZE, \ + "Output tile size is incorrect"); \ + } \ + if ((outTileVal != NULL) && (outTileIdx != NULL)) \ + { \ + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(outTileIdx, outTileVal); \ + } \ + } + +#define XAI_CHECK_CONSISTENCY_ARGMAX_3D_DIM3_F32(inTile, outTileIdx, outTileVal, numLargestVal) \ + { \ + if (outTileIdx != NULL) \ + { \ + XAI_CHECK_TILE3D_U32(outTileIdx); \ + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTileIdx); \ + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTileIdx); \ + XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM3(outTileIdx) == numLargestVal, XAI_ERR_DATASIZE, \ + "Output index tile size is incorrect"); \ + XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM1(outTileIdx) == XAI_TILE3D_GET_DIM1(inTile), XAI_ERR_DATASIZE, \ + "Output index tile size is incorrect"); \ + XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM2(outTileIdx) == XAI_TILE3D_GET_DIM2(inTile), XAI_ERR_DATASIZE, \ + "Output index tile size is incorrect"); \ + } \ + if (outTileVal != NULL) \ + { \ + XAI_CHECK_TILE3D(outTileVal); \ + XAI_CHECK_ERROR(XAI_TILE3D_GET_TYPE(inTile) == XAI_TILE3D_GET_TYPE(outTileVal), XAI_ERR_DATATYPE, \ + "Data type of output tile must be same as input tile"); \ + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTileVal); \ + XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(inTile, outTileVal); \ + XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM3(outTileVal) == numLargestVal, XAI_ERR_DATASIZE, \ + "Output value tile size is incorrect"); \ + XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM1(outTileVal) == XAI_TILE3D_GET_DIM1(inTile), XAI_ERR_DATASIZE, \ + "Output value tile size is incorrect"); \ + XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM2(outTileVal) == XAI_TILE3D_GET_DIM2(inTile), XAI_ERR_DATASIZE, \ + "Output value tile size is incorrect"); \ + } \ + if ((outTileVal != NULL) && (outTileIdx != NULL)) \ + { \ + 
XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(outTileIdx, outTileVal); \ + } \ + } + +#define XAI_CHECK_DIM_IN128DWH(coeffIn, coeffOut) /* coeffOut dim1 must be a multiple of 128; coeffOut dim2 * 128 must equal coeffIn dim4 (WHDN/DWHN order) or dim1 (other orders) aligned to 2 * XCHAL_IVPN_SIMD_WIDTH */ \ + { \ + XAI_CHECK_ERROR((XAI_TILE4D_GET_DIM1(coeffOut) % 128) == 0, XAI_ERR_DATASIZE, \ + "The dimension 1 of the output tile should be a multiple of 128"); \ + \ + if ((XAI_TILE4D_GET_DATA_ORDER(coeffIn) == XAI_WHDN) || \ + (XAI_TILE4D_GET_DATA_ORDER(coeffIn) == XAI_DWHN)) \ + { \ + XAI_CHECK_ERROR((XAI_TILE4D_GET_DIM2(coeffOut) << 7) == \ + (XAI_ALIGN_VAL(XAI_TILE4D_GET_DIM4(coeffIn), 2 * XCHAL_IVPN_SIMD_WIDTH)), XAI_ERR_DATASIZE, \ + "The allocated output channels size in the IN128DWH tile is a multiple of 128"); \ + } \ + else \ + { \ + XAI_CHECK_ERROR((XAI_TILE4D_GET_DIM2(coeffOut) << 7) == \ + (XAI_ALIGN_VAL(XAI_TILE4D_GET_DIM1(coeffIn), 2 * XCHAL_IVPN_SIMD_WIDTH)), XAI_ERR_DATASIZE, \ + "The dimension 2 of the output tile should be a multiple of 128"); \ + } \ + } +#if (XCHAL_IVPN_SIMD_WIDTH == 64) +#define XAI_CHECK_DIM_IN64DWH(coeffIn, coeffOut) /* same checks on coeffIn/coeffOut with factor 64 and alignment XCHAL_IVPN_SIMD_WIDTH */ \ + { \ + XAI_CHECK_ERROR((XAI_TILE4D_GET_DIM1(coeffOut) % 64) == 0, XAI_ERR_DATASIZE, \ + "The dimension 1 of the output tile should be a multiple of 64"); \ + \ + if ((XAI_TILE4D_GET_DATA_ORDER(coeffIn) == XAI_WHDN) || \ + (XAI_TILE4D_GET_DATA_ORDER(coeffIn) == XAI_DWHN)) \ + { \ + XAI_CHECK_ERROR((XAI_TILE4D_GET_DIM2(coeffOut) << 6) == \ + (XAI_ALIGN_VAL(XAI_TILE4D_GET_DIM4(coeffIn), XCHAL_IVPN_SIMD_WIDTH)), XAI_ERR_DATASIZE, \ + "The allocated output channels size in the IN64DWH tile is a multiple of 64"); \ + } \ + else \ + { \ + XAI_CHECK_ERROR((XAI_TILE4D_GET_DIM2(coeffOut) << 6) == \ + (XAI_ALIGN_VAL(XAI_TILE4D_GET_DIM1(coeffIn), XCHAL_IVPN_SIMD_WIDTH)), XAI_ERR_DATASIZE, \ + "The dimension 2 of the output tile should be a multiple of 64"); \ + } \ + } + +#define XAI_CHECK_DIM_IN32DWH(coeffIn, coeffOut) /* same checks on coeffIn/coeffOut with factor 32 and alignment XCHAL_IVPN_SIMD_WIDTH */ \ + { \ + XAI_CHECK_ERROR((XAI_TILE4D_GET_DIM1(coeffOut) % 32) == 0, XAI_ERR_DATASIZE, \ + "The dimension 1 of the output
tile should be a multiple of 32"); \ + \ + if ((XAI_TILE4D_GET_DATA_ORDER(coeffIn) == XAI_WHDN) || \ + (XAI_TILE4D_GET_DATA_ORDER(coeffIn) == XAI_DWHN)) \ + { \ + XAI_CHECK_ERROR((XAI_TILE4D_GET_DIM2(coeffOut) << 5) == \ + (XAI_ALIGN_VAL(XAI_TILE4D_GET_DIM4(coeffIn), XCHAL_IVPN_SIMD_WIDTH)), XAI_ERR_DATASIZE, \ + "The allocated output channels size in the IN32DWH tile is a multiple of 32"); \ + } \ + else \ + { \ + XAI_CHECK_ERROR((XAI_TILE4D_GET_DIM2(coeffOut) << 5) == \ + (XAI_ALIGN_VAL(XAI_TILE4D_GET_DIM1(coeffIn), XCHAL_IVPN_SIMD_WIDTH)), XAI_ERR_DATASIZE, \ + "The dimension 2 of the output tile should be a multiple of 32"); \ + } \ + } + +#else +#define XAI_CHECK_DIM_IN64DWH(coeffIn, coeffOut) /* same checks on coeffIn/coeffOut with factor and alignment 2 * XCHAL_IVPN_SIMD_WIDTH */ \ + { \ + XAI_CHECK_ERROR((XAI_TILE4D_GET_DIM1(coeffOut) % (2 * XCHAL_IVPN_SIMD_WIDTH)) == 0, XAI_ERR_DATASIZE, \ + "The dimension 1 of the output tile should be a multiple of 64"); \ + \ + if ((XAI_TILE4D_GET_DATA_ORDER(coeffIn) == XAI_WHDN) || \ + (XAI_TILE4D_GET_DATA_ORDER(coeffIn) == XAI_DWHN)) \ + { \ + XAI_CHECK_ERROR((XAI_TILE4D_GET_DIM2(coeffOut) * (2 * XCHAL_IVPN_SIMD_WIDTH)) == \ + (XAI_ALIGN_VAL(XAI_TILE4D_GET_DIM4(coeffIn), 2 * XCHAL_IVPN_SIMD_WIDTH)), XAI_ERR_DATASIZE, \ + "The allocated output channels size in the IN64DWH tile is a multiple of 64"); \ + } \ + else \ + { \ + XAI_CHECK_ERROR((XAI_TILE4D_GET_DIM2(coeffOut) * (2 * XCHAL_IVPN_SIMD_WIDTH)) == \ + (XAI_ALIGN_VAL(XAI_TILE4D_GET_DIM1(coeffIn), 2 * XCHAL_IVPN_SIMD_WIDTH)), XAI_ERR_DATASIZE, \ + "The dimension 2 of the output tile should be a multiple of 64"); \ + } \ + } + +#define XAI_CHECK_DIM_IN32DWH(coeffIn, coeffOut) /* same checks on coeffIn/coeffOut with factor and alignment XCHAL_IVPN_SIMD_WIDTH */ \ + { \ + XAI_CHECK_ERROR((XAI_TILE4D_GET_DIM1(coeffOut) % XCHAL_IVPN_SIMD_WIDTH) == 0, XAI_ERR_DATASIZE, \ + "The dimension 1 of the output tile should be a multiple of 32"); \ + \ + if ((XAI_TILE4D_GET_DATA_ORDER(coeffIn) == XAI_WHDN) || \ + (XAI_TILE4D_GET_DATA_ORDER(coeffIn) == XAI_DWHN)) \ + { \ +
XAI_CHECK_ERROR((XAI_TILE4D_GET_DIM2(coeffOut) * XCHAL_IVPN_SIMD_WIDTH) == \ + (XAI_ALIGN_VAL(XAI_TILE4D_GET_DIM4(coeffIn), XCHAL_IVPN_SIMD_WIDTH)), XAI_ERR_DATASIZE, \ + "The allocated output channels size in the IN32DWH tile is a multiple of 32"); \ + } \ + else \ + { \ + XAI_CHECK_ERROR((XAI_TILE4D_GET_DIM2(coeffOut) * XCHAL_IVPN_SIMD_WIDTH) == \ + (XAI_ALIGN_VAL(XAI_TILE4D_GET_DIM1(coeffIn), XCHAL_IVPN_SIMD_WIDTH)), XAI_ERR_DATASIZE, \ + "The dimension 2 of the output tile should be a multiple of 32"); \ + } \ + } +#endif + +#define XAI_CHECK_COEFF_IN_DATA_ORDER_FC(coeffIn) \ + { \ + XAI_CHECK_ERROR(((XAI_TILE3D_GET_DATA_ORDER(coeffIn) == XAI_NWHD) || \ + (XAI_TILE3D_GET_DATA_ORDER(coeffIn) == XAI_NDWH)), \ + XAI_ERR_BADARG, "\nData Order of the given tiles not supported"); \ + } + +/* To set appropriate pitch size for broadcast/normal elementwise operations */ +#define XAI_TILE3D_GET_BCAST23_PITCH(inTile1, inTile2, outTile, in1Stride, in2Stride, \ + in1Pitch1, in1Pitch2, in2Pitch1, in2Pitch2) \ + { \ + int32_t m_in1Dim2, m_in1Dim3, m_in2Dim2, m_in2Dim3; \ + m_in1Dim2 = (XAI_TILE3D_GET_DIM2(inTile1) + in1Stride - 1) / in1Stride; \ + m_in1Dim3 = (XAI_TILE3D_GET_DIM3(inTile1) + in1Stride - 1) / in1Stride; \ + m_in2Dim2 = (XAI_TILE3D_GET_DIM2(inTile2) + in2Stride - 1) / in2Stride; \ + m_in2Dim3 = (XAI_TILE3D_GET_DIM3(inTile2) + in2Stride - 1) / in2Stride; \ + in1Pitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile1); \ + in1Pitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile1); \ + in2Pitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile2); \ + in2Pitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile2); \ + in1Pitch1 = m_in1Dim2 == XAI_TILE3D_GET_DIM2(outTile) ? in1Pitch1 : 0; \ + in1Pitch2 = m_in1Dim3 == XAI_TILE3D_GET_DIM3(outTile) ? in1Pitch2 : 0; \ + in2Pitch1 = m_in2Dim2 == XAI_TILE3D_GET_DIM2(outTile) ? in2Pitch1 : 0; \ + in2Pitch2 = m_in2Dim3 == XAI_TILE3D_GET_DIM3(outTile) ?
in2Pitch2 : 0; \ + } + +#define XAI_CHECK_REDUCE_DIM(inTile, outTile, params) \ + { \ + if ((XAI_CNN_REDUCE_GET_CONFIG(params) & XAI_CNN_REDUCE_DIM1) != XAI_CNN_REDUCE_DIM1) \ + { \ + XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM1(inTile) == XAI_TILE3D_GET_DIM1(outTile), XAI_ERR_DATASIZE, \ + "\nInput tile dim1size = %d, Output tile dim1size = %d\nFirst dimension of input and output tile must be equal", \ + XAI_TILE3D_GET_DIM1(inTile), XAI_TILE3D_GET_DIM1(outTile)); \ + } \ + else \ + { \ + XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM1(outTile) == 1, XAI_ERR_DATASIZE, \ + "\nOutput tile dim1size = %d, size should be equal to 1", XAI_TILE3D_GET_DIM1(outTile)); \ + } \ + if ((XAI_CNN_REDUCE_GET_CONFIG(params) & XAI_CNN_REDUCE_DIM2) != XAI_CNN_REDUCE_DIM2) \ + { \ + XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM2(inTile) == XAI_TILE3D_GET_DIM2(outTile), XAI_ERR_DATASIZE, \ + "\nInput tile dim2size = %d, Output tile dim2size = %d\nSecond dimension of input and output tile must be equal", \ + XAI_TILE3D_GET_DIM2(inTile), XAI_TILE3D_GET_DIM2(outTile)); \ + } \ + else \ + { \ + XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM2(outTile) == 1, XAI_ERR_DATASIZE, \ + "\nOutput tile dim2size = %d, size should be equal to 1", XAI_TILE3D_GET_DIM2(outTile)); \ + } \ + if ((XAI_CNN_REDUCE_GET_CONFIG(params) & XAI_CNN_REDUCE_DIM3) != XAI_CNN_REDUCE_DIM3) \ + { \ + XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM3(inTile) == XAI_TILE3D_GET_DIM3(outTile), XAI_ERR_DATASIZE, \ + "\nInput tile dim3size = %d, Output tile dim3size = %d\nThird dimension of input and output tile must be equal", \ + XAI_TILE3D_GET_DIM3(inTile), XAI_TILE3D_GET_DIM3(outTile)); \ + } \ + else \ + { \ + XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM3(outTile) == 1, XAI_ERR_DATASIZE, \ + "\nOutput tile dim3size = %d, size should be equal to 1", XAI_TILE3D_GET_DIM3(outTile)); \ + } \ + } + +#define XAI_CHECK_REDUCE_DIM4D(inTile, outTile, params) \ + { \ + if ((XAI_CNN_REDUCE_GET_CONFIG(params) & XAI_CNN_REDUCE_DIM1) != XAI_CNN_REDUCE_DIM1) \ + { \ + 
XAI_CHECK_ERROR(XAI_TILE4D_GET_DIM1(inTile) == XAI_TILE4D_GET_DIM1(outTile), XAI_ERR_DATASIZE, \ + "\nInput tile dim1size = %d, Output tile dim1size = %d\nInequality in first dimension", \ + XAI_TILE4D_GET_DIM1(inTile), XAI_TILE4D_GET_DIM1(outTile)); \ + } \ + else \ + { \ + XAI_CHECK_ERROR(XAI_TILE4D_GET_DIM1(outTile) == 1, XAI_ERR_DATASIZE, \ + "\nOutput tile dim1size = %d, output first dimension should be 1", XAI_TILE4D_GET_DIM1(outTile)); \ + } \ + if ((XAI_CNN_REDUCE_GET_CONFIG(params) & XAI_CNN_REDUCE_DIM2) != XAI_CNN_REDUCE_DIM2) \ + { \ + XAI_CHECK_ERROR(XAI_TILE4D_GET_DIM2(inTile) == XAI_TILE4D_GET_DIM2(outTile), XAI_ERR_DATASIZE, \ + "\nInput tile dim2size = %d, Output tile dim2size = %d\nInequality in second dimension", \ + XAI_TILE4D_GET_DIM2(inTile), XAI_TILE4D_GET_DIM2(outTile)); \ + } \ + else \ + { \ + XAI_CHECK_ERROR(XAI_TILE4D_GET_DIM2(outTile) == 1, XAI_ERR_DATASIZE, \ + "\nOutput tile dim2size = %d, output second dimension should be 1", XAI_TILE4D_GET_DIM2(outTile)); \ + } \ + if ((XAI_CNN_REDUCE_GET_CONFIG(params) & XAI_CNN_REDUCE_DIM3) != XAI_CNN_REDUCE_DIM3) \ + { \ + XAI_CHECK_ERROR(XAI_TILE4D_GET_DIM3(inTile) == XAI_TILE4D_GET_DIM3(outTile), XAI_ERR_DATASIZE, \ + "\nInput tile dim3size = %d, Output tile dim3size = %d\nInequality in third dimension", \ + XAI_TILE4D_GET_DIM3(inTile), XAI_TILE4D_GET_DIM3(outTile)); \ + } \ + else \ + { \ + XAI_CHECK_ERROR(XAI_TILE4D_GET_DIM3(outTile) == 1, XAI_ERR_DATASIZE, \ + "\nOutput tile dim3size = %d, output third dimension should be 1", XAI_TILE4D_GET_DIM3(outTile)); \ + } \ + if ((XAI_CNN_REDUCE_GET_CONFIG(params) & XAI_CNN_REDUCE_DIM4) != XAI_CNN_REDUCE_DIM4) \ + { \ + XAI_CHECK_ERROR(XAI_TILE4D_GET_DIM4(inTile) == XAI_TILE4D_GET_DIM4(outTile), XAI_ERR_DATASIZE, \ + "\nInput tile dim4size = %d, Output tile dim4size = %d\nInequality in fourth dimension", \ + XAI_TILE4D_GET_DIM4(inTile), XAI_TILE4D_GET_DIM4(outTile)); \ + } \ + else \ + { \ + XAI_CHECK_ERROR(XAI_TILE4D_GET_DIM4(outTile) == 1, 
XAI_ERR_DATASIZE, \ + "\nOutput tile dim4size = %d, output fourth dimension should be 1", XAI_TILE4D_GET_DIM4(outTile)); \ + } \ + XAI_CHECK_ERROR(XAI_CNN_REDUCE_GET_TILEFLAG(params) <= XAI_CNN_REDUCE_FIRST_LAST_TILE, XAI_ERR_BADARG, \ + "\nTile Flag = %hhu, Incorrect Tile Flag", XAI_CNN_REDUCE_GET_TILEFLAG(params)); \ + } + +#define XAI_CHECK_TILE3D_SIZE_BCAST_EQ(in, out, inStride) /* each dim of (in), with the strided dims ceil-divided by inStride, must equal the (out) dim or be 1 */ \ + if (XAI_TILE3D_GET_DATA_ORDER(in) == XAI_WHD) \ + { \ + XAI_CHECK_ERROR( \ + ((((XAI_TILE3D_GET_DIM1(in) + inStride - 1) / inStride == XAI_TILE3D_GET_DIM1(out)) || (XAI_TILE3D_GET_DIM1(in) == 1)) && \ + (((XAI_TILE3D_GET_DIM2(in) + inStride - 1) / inStride == XAI_TILE3D_GET_DIM2(out)) || (XAI_TILE3D_GET_DIM2(in) == 1)) && \ + ((XAI_TILE3D_GET_DIM3(in) == XAI_TILE3D_GET_DIM3(out)) || XAI_TILE3D_GET_DIM3(in) == 1)), XAI_ERR_DATASIZE, \ + "Invalid dimension in (" #in ") or (" #out ") to perform Elementwise broadcast operation"); \ + } \ + else if (XAI_TILE3D_GET_DATA_ORDER(in) == XAI_DWH) \ + { \ + XAI_CHECK_ERROR( \ + ((((XAI_TILE3D_GET_DIM2(in) + inStride - 1) / inStride == XAI_TILE3D_GET_DIM2(out)) || (XAI_TILE3D_GET_DIM2(in) == 1)) && \ + (((XAI_TILE3D_GET_DIM3(in) + inStride - 1) / inStride == XAI_TILE3D_GET_DIM3(out)) || (XAI_TILE3D_GET_DIM3(in) == 1)) && \ + ((XAI_TILE3D_GET_DIM1(in) == XAI_TILE3D_GET_DIM1(out)) || XAI_TILE3D_GET_DIM1(in) == 1)), XAI_ERR_DATASIZE, \ + "Invalid dimension in (" #in ") or (" #out ") to perform Elementwise broadcast operation"); \ + } + +#define XAI_CHECK_TILE3D_SIZE_EQ_OR_BCAST(in, out, inStride) \ + if (XAI_TILE3D_GET_DATA_ORDER(in) == XAI_WHD) \ + { \ + if ((XAI_TILE3D_GET_DIM3(in) == XAI_TILE3D_GET_DIM3(out)) && \ + ((XAI_TILE3D_GET_DIM1(in) + inStride - 1) / inStride == XAI_TILE3D_GET_DIM1(out)) && \ + ((XAI_TILE3D_GET_DIM2(in) + inStride - 1) / inStride == XAI_TILE3D_GET_DIM2(out))) \ + { \ + if (XAI_TILE3D_GET_DATA_PTR(in) == XAI_TILE3D_GET_DATA_PTR(out)) \ + { \ + XAI_CHECK_ERROR(XAI_TILE3D_PITCH_EQ(in, out), XAI_ERR_INPLACE, "Inplace
operation not " \ + "supported when pitch of ("#in ") and ("#out ") are not same"); \ + } \ + } \ + else \ + { \ + XAI_CHECK_ERROR(XAI_TILE3D_GET_DATA_PTR(in) != XAI_TILE3D_GET_DATA_PTR(out), XAI_ERR_INPLACE, \ + "Inplace operation not supported for Broadcast Operation for ("#in ") and ("#out ")"); \ + XAI_CHECK_TILE3D_SIZE_BCAST_EQ(in, out, inStride); \ + } \ + } \ + else if (XAI_TILE3D_GET_DATA_ORDER(in) == XAI_DWH) \ + { \ + if ((XAI_TILE3D_GET_DIM1(in) == XAI_TILE3D_GET_DIM1(out)) && \ + ((XAI_TILE3D_GET_DIM2(in) + inStride - 1) / inStride == XAI_TILE3D_GET_DIM2(out)) && \ + ((XAI_TILE3D_GET_DIM3(in) + inStride - 1) / inStride == XAI_TILE3D_GET_DIM3(out))) \ + { \ + if (XAI_TILE3D_GET_DATA_PTR(in) == XAI_TILE3D_GET_DATA_PTR(out)) \ + { \ + XAI_CHECK_ERROR(XAI_TILE3D_PITCH_EQ(in, out), XAI_ERR_INPLACE, "Inplace operation not " \ + "supported when pitch of ("#in ") and ("#out ") are not same"); \ + } \ + } \ + else \ + { \ + XAI_CHECK_ERROR(XAI_TILE3D_GET_DATA_PTR(in) != XAI_TILE3D_GET_DATA_PTR(out), XAI_ERR_INPLACE, \ + "Inplace operation not supported for Broadcast Operation for ("#in ") and ("#out ")"); \ + XAI_CHECK_TILE3D_SIZE_BCAST_EQ(in, out, inStride); \ + } \ + } + +#define XAI_CHECK_TILE3D_BCAST_DIMENSIONS(inTile1, inTile2, outTile, in1Stride, in2Stride) \ + XAI_CHECK_TILE3D_SIZE_EQ_OR_BCAST(inTile1, outTile, in1Stride); \ + XAI_CHECK_TILE3D_SIZE_EQ_OR_BCAST(inTile2, outTile, in2Stride); \ + if (XAI_TILE3D_GET_DATA_ORDER(outTile) == XAI_WHD) \ + { \ + XAI_CHECK_ERROR((MAX2(XAI_TILE3D_GET_DIM3(inTile1), XAI_TILE3D_GET_DIM3(inTile2)) == XAI_TILE3D_GET_DIM3(outTile)) && \ + (MAX2((XAI_TILE3D_GET_DIM1(inTile1) + in1Stride - 1) / in1Stride, \ + (XAI_TILE3D_GET_DIM1(inTile2) + in2Stride - 1) / in2Stride) == XAI_TILE3D_GET_DIM1(outTile)) && \ + (MAX2((XAI_TILE3D_GET_DIM2(inTile1) + in1Stride - 1) / in1Stride, \ + (XAI_TILE3D_GET_DIM2(inTile2) + in2Stride - 1) / in2Stride) == XAI_TILE3D_GET_DIM2(outTile)), \ + XAI_ERR_DATASIZE, "Invalid dimension to perform 
BroadCast/ElementWise operations"); \ + } \ + else if (XAI_TILE3D_GET_DATA_ORDER(outTile) == XAI_DWH) \ + { \ + XAI_CHECK_ERROR((MAX2(XAI_TILE3D_GET_DIM1(inTile1), XAI_TILE3D_GET_DIM1(inTile2)) == XAI_TILE3D_GET_DIM1(outTile)) && \ + (MAX2((XAI_TILE3D_GET_DIM2(inTile1) + in1Stride - 1) / in1Stride, \ + (XAI_TILE3D_GET_DIM2(inTile2) + in2Stride - 1) / in2Stride) == XAI_TILE3D_GET_DIM2(outTile)) && \ + (MAX2((XAI_TILE3D_GET_DIM3(inTile1) + in1Stride - 1) / in1Stride, \ + (XAI_TILE3D_GET_DIM3(inTile2) + in2Stride - 1) / in2Stride) == XAI_TILE3D_GET_DIM3(outTile)), \ + XAI_ERR_DATASIZE, "Invalid dimension to perform BroadCast/ElementWise operations"); \ + } + +#define XAI_TILE3D_GET_BCAST123_PITCH(inTile1, inTile2, inTile1Pitch0, inTile2Pitch0, inTile1Pitch1, \ + inTile2Pitch1, inTile1Pitch2, inTile2Pitch2) \ + int32_t inTile1Pitch0 = 1; \ + int32_t inTile1Pitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile1); \ + int32_t inTile1Pitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile1); \ + int32_t inTile2Pitch0 = 1; \ + int32_t inTile2Pitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile2); \ + int32_t inTile2Pitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile2); \ + if (XAI_TILE3D_GET_DIM1(inTile1) == 1) { \ + inTile1Pitch0 = 0; } \ + else if (XAI_TILE3D_GET_DIM1(inTile2) == 1) { \ + inTile2Pitch0 = 0; } \ + if (XAI_TILE3D_GET_DIM2(inTile1) == 1) { \ + inTile1Pitch1 = 0; } \ + else if (XAI_TILE3D_GET_DIM2(inTile2) == 1) { \ + inTile2Pitch1 = 0; } \ + if (XAI_TILE3D_GET_DIM3(inTile1) == 1) { \ + inTile1Pitch2 = 0; } \ + else if (XAI_TILE3D_GET_DIM3(inTile2) == 1) { \ + inTile2Pitch2 = 0; } + +#define XAI_TILE3D_SIZE_BCAST23_EQ(in, out, inStride) \ + ((((XAI_TILE3D_GET_DIM2(in) + inStride - 1) / inStride == XAI_TILE3D_GET_DIM2(out)) || (XAI_TILE3D_GET_DIM2(in) == 1)) && \ + (((XAI_TILE3D_GET_DIM3(in) + inStride - 1) / inStride == XAI_TILE3D_GET_DIM3(out)) || (XAI_TILE3D_GET_DIM3(in) == 1)) && \ + (XAI_TILE3D_GET_DIM1(in) == XAI_TILE3D_GET_DIM1(out))) + +#define XAI_CHECK_TILE3D_SIZE_EQ_OR_BCAST23(in, out, 
inStride) \ + if ((XAI_TILE3D_GET_DIM1(in) == XAI_TILE3D_GET_DIM1(out)) && \ + ((XAI_TILE3D_GET_DIM2(in) + inStride - 1) / inStride == XAI_TILE3D_GET_DIM2(out)) && \ + ((XAI_TILE3D_GET_DIM3(in) + inStride - 1) / inStride == XAI_TILE3D_GET_DIM3(out))) \ + { \ + if (XAI_TILE3D_GET_DATA_PTR(in) == XAI_TILE3D_GET_DATA_PTR(out)) \ + { \ + XAI_CHECK_ERROR(XAI_TILE3D_PITCH_EQ(in, out), XAI_ERR_INPLACE, "Inplace operation not " \ + "supported when pitch of ("#in ") and ("#out ") are not same"); \ + } \ + } \ + else \ + { \ + XAI_CHECK_ERROR(XAI_TILE3D_GET_DATA_PTR(in) != XAI_TILE3D_GET_DATA_PTR(out), XAI_ERR_INPLACE, \ + "Inplace operation not supported for Broadcast Operation for ("#in ") and ("#out ")"); \ + XAI_CHECK_ERROR(XAI_TILE3D_SIZE_BCAST23_EQ(in, out, inStride), XAI_ERR_DATASIZE, \ + "Invalid dimension in (" #in ") or (" #out ") to perform Elementwise broadcast operation"); \ + } + +#define XAI_CHECK_TILE3D_BCAST23_DIMENSIONS(inTile1, inTile2, outTile, in1Stride, in2Stride) \ + XAI_CHECK_TILE3D_SIZE_EQ_OR_BCAST23(inTile1, outTile, in1Stride) \ + XAI_CHECK_TILE3D_SIZE_EQ_OR_BCAST23(inTile2, outTile, in2Stride) \ + XAI_CHECK_ERROR((MAX2(XAI_TILE3D_GET_DIM1(inTile1), XAI_TILE3D_GET_DIM1(inTile2)) == \ + XAI_TILE3D_GET_DIM1(outTile)) && \ + (MAX2((XAI_TILE3D_GET_DIM2(inTile1) + in1Stride - 1) / in1Stride, \ + (XAI_TILE3D_GET_DIM2(inTile2) + in2Stride - 1) / in2Stride) == XAI_TILE3D_GET_DIM2(outTile)) && \ + (MAX2((XAI_TILE3D_GET_DIM3(inTile1) + in1Stride - 1) / in1Stride, \ + (XAI_TILE3D_GET_DIM3(inTile2) + in2Stride - 1) / in2Stride) == XAI_TILE3D_GET_DIM3(outTile)), \ + XAI_ERR_DATASIZE, "Invalid dimension to perform BroadCast/ElementWise operations") + +#define XAI_CHECK_LSTM_BLOCK(functionCall) \ + { \ + int32_t retVal = (functionCall); \ + (void) retVal; \ + XAI_ERROR_CHECKS_CONTINUE() \ + { \ + XAI_CHECK_ERROR((retVal == XAI_ERR_OK), retVal, \ + "\nError in file: %s, function: %s, LSTM block: %s, line: %d\n", \ + __FILE__, __func__, #functionCall, __LINE__); \ 
+ } \ + } \ + diff --git a/backends/cadence/vision/third-party/libxai_common/include/xai_cnn_version.h b/backends/cadence/vision/third-party/libxai_common/include/xai_cnn_version.h new file mode 100644 index 00000000000..18e97cc9d49 --- /dev/null +++ b/backends/cadence/vision/third-party/libxai_common/include/xai_cnn_version.h @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2018 by Cadence Design Systems, Inc. ALL RIGHTS RESERVED. + * These coded instructions, statements, and computer programs are the + * copyrighted works and confidential proprietary information of + * Cadence Design Systems Inc. They may be adapted and modified by bona fide + * purchasers for internal use, but neither the original nor any adapted + * or modified version may be disclosed or distributed to third parties + * in any manner, medium, or form, in whole or in part, without the prior + * written consent of Cadence Design Systems Inc. This software and its + * derivatives are to be executed solely on products incorporating a Cadence + * Design Systems processor. 
+ */ + +#ifndef __XAI_CNN_VERSION_H__ +#define __XAI_CNN_VERSION_H__ + +#if ((XCHAL_VISION_TYPE >= 6) || (XCHAL_HAVE_BBENEP == 1)) +#if (!defined(GLOW_BUILD) && !defined(MLIR_BUILD) && !defined(XNNC_PROJ_MGR_PROJECT)) +#include +#endif +#endif + +#if (XCHAL_VISION_TYPE == 6 && XCHAL_VISION_SIMD16 == 8) //VP1, V110 + +#define XAI_CNN_LIBRARY_DSP_PROCESSOR P1 +#define XAI_CNN_LIBRARY_VERSION_MAJOR 2 +#define XAI_CNN_LIBRARY_VERSION_MINOR 0 +#define XAI_CNN_LIBRARY_VERSION_PATCH 0 +#define XAI_CNN_LIBRARY_VERSION_INTERNAL_TRACKING 0 + +#elif (XCHAL_VISION_TYPE == 6) // VP6, V130 + +#define XAI_CNN_LIBRARY_DSP_PROCESSOR P6 +#define XAI_CNN_LIBRARY_VERSION_MAJOR 2 +#define XAI_CNN_LIBRARY_VERSION_MINOR 0 +#define XAI_CNN_LIBRARY_VERSION_PATCH 0 +#define XAI_CNN_LIBRARY_VERSION_INTERNAL_TRACKING 0 + +#elif ((XCHAL_VISION_TYPE == 7) || ((XCHAL_VISION_TYPE == 9) && (XCHAL_IVPN_SIMD_WIDTH == 32))) //VQ7, V240, V331, NeuroEdge +#define XAI_CNN_LIBRARY_DSP_PROCESSOR Q7 +#define XAI_CNN_LIBRARY_VERSION_MAJOR 2 +#define XAI_CNN_LIBRARY_VERSION_MINOR 0 +#define XAI_CNN_LIBRARY_VERSION_PATCH 0 +#define XAI_CNN_LIBRARY_VERSION_INTERNAL_TRACKING 0 + +#elif ((XCHAL_VISION_TYPE >= 8) || ((XCHAL_HAVE_BBENEP == 1) && (XCHAL_BBEN_SIMD_WIDTH == 64))) // VQ8, V240, V341, MathX_240 + +#define XAI_CNN_LIBRARY_DSP_PROCESSOR Q8 +#define XAI_CNN_LIBRARY_VERSION_MAJOR 2 +#define XAI_CNN_LIBRARY_VERSION_MINOR 0 +#define XAI_CNN_LIBRARY_VERSION_PATCH 0 +#define XAI_CNN_LIBRARY_VERSION_INTERNAL_TRACKING 0 + +#elif (XCHAL_HAVE_HIFI1 || XCHAL_HAVE_HIFI3Z || XCHAL_HAVE_HIFI4 || XCHAL_HAVE_HIFI5) //HiFi + +#define XAI_CNN_LIBRARY_DSP_PROCESSOR HIFI +#define XAI_CNN_LIBRARY_VERSION_MAJOR 2 +#define XAI_CNN_LIBRARY_VERSION_MINOR 0 +#define XAI_CNN_LIBRARY_VERSION_PATCH 0 +#define XAI_CNN_LIBRARY_VERSION_INTERNAL_TRACKING 0 + +#else + +#define XAI_CNN_LIBRARY_DSP_PROCESSOR REFF +#define XAI_CNN_LIBRARY_VERSION_MAJOR 2 +#define XAI_CNN_LIBRARY_VERSION_MINOR 0 +#define XAI_CNN_LIBRARY_VERSION_PATCH 0 
+#define XAI_CNN_LIBRARY_VERSION_INTERNAL_TRACKING 0 +#endif //if Processor type + +#define XAI_AUX_STR_EXP(__A) #__A +#define XAI_AUX_STR(__A) XAI_AUX_STR_EXP(__A) +#define XAI_CNN_LIBRARY_VERSION_STR XAI_AUX_STR(XAI_CNN_LIBRARY_DSP_PROCESSOR) "." XAI_AUX_STR(XAI_CNN_LIBRARY_VERSION_MAJOR) "." XAI_AUX_STR(XAI_CNN_LIBRARY_VERSION_MINOR) "." XAI_AUX_STR(XAI_CNN_LIBRARY_VERSION_PATCH) "." XAI_AUX_STR(XAI_CNN_LIBRARY_VERSION_INTERNAL_TRACKING) +#endif /* __XAI_CNN_VERSION_H__ */ diff --git a/backends/cadence/vision/third-party/libxai_common/include/xai_config_api.h b/backends/cadence/vision/third-party/libxai_common/include/xai_config_api.h new file mode 100644 index 00000000000..2e2b6811fea --- /dev/null +++ b/backends/cadence/vision/third-party/libxai_common/include/xai_config_api.h @@ -0,0 +1,127 @@ +/* + * Copyright (c) 2021 by Cadence Design Systems, Inc. ALL RIGHTS RESERVED. + * These coded instructions, statements, and computer programs are the + * copyrighted works and confidential proprietary information of + * Cadence Design Systems Inc. They may be adapted and modified by bona fide + * purchasers for internal use, but neither the original nor any adapted + * or modified version may be disclosed or distributed to third parties + * in any manner, medium, or form, in whole or in part, without the prior + * written consent of Cadence Design Systems Inc. This software and its + * derivatives are to be executed solely on products incorporating a Cadence + * Design Systems processor. 
+ */ + +#ifndef __XAI_CONFIG_API_H__ +#define __XAI_CONFIG_API_H__ + +#ifndef XAI_REF_ONLY_COMPILATION +#include +#endif + +// Contains IVP to BBE mappings +#if (XCHAL_HAVE_BBENEP == 1) +#include +#endif + +#include "xai_cnn_version.h" + +#ifndef __XTENSA__ + #if defined(_MSC_VER) + #pragma warning (disable : 4005 ) + #endif + #ifdef __cplusplus + #if defined(_MSC_VER) && (_MSC_VER >= 1900) + #define restrict __restrict + #else + #define restrict + #endif + #endif + #ifndef XCHAL_NUM_DATARAM + #define XCHAL_NUM_DATARAM 2 + #endif +#endif + +#if !defined(__XTENSA__) || !(defined(XCHAL_HAVE_VISION) || defined(XCHAL_HAVE_BBENEP)) || !(XCHAL_HAVE_VISION || XCHAL_HAVE_BBENEP) +# define XV_EMULATE_DMA +#endif + +// #define XAI_EMULATE_LOCAL_RAM 0 +#ifndef XAI_EMULATE_LOCAL_RAM +# define XAI_EMULATE_LOCAL_RAM 1 +#endif + +/* XI Library API qualifiers */ + +#if XAI_EMULATE_LOCAL_RAM && __XTENSA__ +#if XCHAL_NUM_DATARAM == 2 +# define _XAI_LOCAL_RAM0_ __attribute__((section(".dram0.data"))) +# define _XAI_LOCAL_RAM1_ __attribute__((section(".dram1.data"))) +#elif XCHAL_NUM_DATARAM == 1 +# define _XAI_LOCAL_RAM0_ __attribute__((section(".dram0.data"))) +#endif +# define _XAI_LOCAL_IRAM_ __attribute__((section(".iram0.text"))) +#else +# define _XAI_LOCAL_RAM0_ +# define _XAI_LOCAL_RAM1_ +# define _XAI_LOCAL_IRAM_ +#endif + +#if !defined(_XAI_EXPORTS_) +# if defined __GNUC__ && __GNUC__ >= 4 +# define _XAI_EXPORTS_ __attribute__((visibility("default"))) +# elif defined(_MSC_VER) +# if defined(XAI_CREATE_SHARED_LIBRARY) +# define _XAI_EXPORTS_ __declspec(dllexport) +# else +# define _XAI_EXPORTS_ __declspec(dllimport) +# endif +# else +# define _XAI_EXPORTS_ +# endif +#endif + +#ifdef __cplusplus +# define _XAI_EXTERN_C_ extern "C" +#else +# define _XAI_EXTERN_C_ extern +#endif + +#ifdef __cplusplus +# define XAI_DEFAULT(value) = (value) +#else +# define XAI_DEFAULT(value) +#endif + +#if defined(__XTENSA__) && (!defined(DISABLE_AGGRESSIVE_INLINE)) +#define _XAI_INLINE_ 
__attribute((always_inline)) +#else +#define _XAI_INLINE_ +#endif + +#ifdef GLOW_SPECIAL_BUILD +# define _XAI_API_ _XAI_EXTERN_C_ +# define _XAI_API_VAR_ _XAI_API_ +#else +# define _XAI_API_ _XAI_EXTERN_C_ _XAI_EXPORTS_ _XAI_INLINE_ +# define _XAI_API_VAR_ _XAI_EXTERN_C_ _XAI_EXPORTS_ +#endif + +/* error check levels */ + +/* do not check arguments for errors */ +#define XAI_ERROR_LEVEL_NO_ERROR 0 +/* call exit(-1) in case of error */ +#define XAI_ERROR_LEVEL_TERMINATE_ON_ERROR 1 +/* return corresponding error code on error without any processing (recommended)*/ +#define XAI_ERROR_LEVEL_RETURN_ON_ERROR 2 +/* capture error but attempt continue processing (dangerous!) */ +#define XAI_ERROR_LEVEL_CONTINUE_ON_ERROR 3 +/* print error message to stdout and return without any processing */ +#define XAI_ERROR_LEVEL_PRINT_ON_ERROR 4 +/* print error message but attempt continue processing (dangerous!) */ +#define XAI_ERROR_LEVEL_PRINT_AND_CONTINUE_ON_ERROR 5 + +#ifndef XAI_ERROR_LEVEL +# define XAI_ERROR_LEVEL XAI_ERROR_LEVEL_RETURN_ON_ERROR +#endif +#endif diff --git a/backends/cadence/vision/third-party/libxai_common/include/xai_core.h b/backends/cadence/vision/third-party/libxai_common/include/xai_core.h new file mode 100644 index 00000000000..010a3a48ad9 --- /dev/null +++ b/backends/cadence/vision/third-party/libxai_common/include/xai_core.h @@ -0,0 +1,624 @@ +/* + * Copyright (c) 2013-2018 Tensilica Inc. ALL RIGHTS RESERVED. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#ifndef __XAI_CORE_H__ +#define __XAI_CORE_H__ + +/* Force-disable DRAM boundary checks so XAI kernels accept system memory pointers. + Required for cache-variant convolution which operates on system memory directly. 
*/ +#ifndef SYS_MEM_TESTING +#define SYS_MEM_TESTING 1 +#endif +#ifndef XAI_ERROR_CHECKS_RELAXED_REF +#define XAI_ERROR_CHECKS_RELAXED_REF 1 +#endif + +#include "xai_core_api.h" + +#if defined(_MSC_VER) +#define isfinite _finite +#define __func__ __FUNCTION__ +#endif + +/* Linear congruential generator */ +#define RND_A 1103515245 +#define RND_LOG_M 31 +#define RND_C 12345 +#define GET_NEXT_RND(x_pr) (((RND_A) *(x_pr) + (RND_C)) & ((unsigned int) (1 << (RND_LOG_M)) - 1)) + +/* return 0 on success or required memory size on failure */ +_XAI_EXTERN_C_ size_t xaiFitArray_U8(const xai_pArray donor, xai_pArray rec, int width, int height, xai_bool aligned); +_XAI_EXTERN_C_ size_t xaiFitArray_U8S16(const xai_pArray donor, xai_pArray rec, int width, int height, xai_bool aligned); +_XAI_EXTERN_C_ size_t xaiFitArray_S16(const xai_pArray donor, xai_pArray rec, int width, int height, xai_bool aligned); +_XAI_EXTERN_C_ size_t xaiFitTile_U8(const xai_pTile2D donor, xai_pTile2D rec, int width, int height, xai_bool aligned); +_XAI_EXTERN_C_ size_t xaiFitTile_S16(const xai_pTile2D donor, xai_pTile2D rec, int width, int height, xai_bool aligned); + +#define XAI_FIT_ALIGNED 1 +#define XAI_FIT_ANY 0 + + +// error check macro +#if XAI_ERROR_LEVEL == XAI_ERROR_LEVEL_PRINT_ON_ERROR || XAI_ERROR_LEVEL == XAI_ERROR_LEVEL_PRINT_AND_CONTINUE_ON_ERROR +# include +#endif + +#if XAI_ERROR_LEVEL == XAI_ERROR_LEVEL_TERMINATE_ON_ERROR +# include +#endif + +#define MARK_VAR_AS_USED(var) (void) (var) + +#if (XAI_ERROR_LEVEL != XAI_ERROR_LEVEL_NO_ERROR) +# define XAI_ERROR_CHECKS() XAI_ERR_TYPE __xai_local_err_code = XAI_ERR_OK; +# define XAI_ERROR_CHECKS_CONTINUE() +# define XAI_ERROR_STATUS() __xai_local_err_code +#else +# define XAI_ERROR_CHECKS() while (0) +# define XAI_ERROR_CHECKS_CONTINUE() while (0) +# define XAI_ERROR_STATUS() XAI_ERR_OK +#endif + +#if XAI_ERROR_LEVEL == XAI_ERROR_LEVEL_TERMINATE_ON_ERROR +# define XAI_CHECK_ERROR(condition, code, ...) 
\ + if (condition) {} else exit(-1) +#elif XAI_ERROR_LEVEL == XAI_ERROR_LEVEL_RETURN_ON_ERROR +# define XAI_CHECK_ERROR(condition, code, ...) \ + if (condition) {} else return (code) +#elif XAI_ERROR_LEVEL == XAI_ERROR_LEVEL_CONTINUE_ON_ERROR +# define XAI_CHECK_ERROR(condition, code, ...) \ + if (condition) {} else __xai_local_err_code = (code) +#elif XAI_ERROR_LEVEL == XAI_ERROR_LEVEL_PRINT_ON_ERROR +# define XAI_CHECK_ERROR(condition, code, ...) \ + do { if (!(condition)) { printf("%s:%d: Error #%d (%s) in function %s: ", __FILE__, __LINE__, (int) (code), xaiErrStr(code), __func__); \ + printf(__VA_ARGS__); \ + printf("\n"); \ + fflush(stdout); return code; } } while (0) +#elif XAI_ERROR_LEVEL == XAI_ERROR_LEVEL_PRINT_AND_CONTINUE_ON_ERROR +# define XAI_CHECK_ERROR(condition, code, ...) \ + do { if (!(condition)) { printf("%s:%d: Error #%d (%s) in function %s: ", __FILE__, __LINE__, (int) (code), xaiErrStr(code), __func__); \ + printf(__VA_ARGS__); \ + printf("\n"); \ + fflush(stdout); return code; } } while (0) +#else +# define XAI_CHECK_ERROR(condition, code, ...) 
+#endif + +// helper macro +#define XAI_ARRAY_USEFUL_CAPACITY(array, ptr) ((ptrdiff_t) XAI_ARRAY_GET_BUFF_SIZE(array) - ((uint8_t *) (ptr) - (uint8_t *) XAI_ARRAY_GET_BUFF_PTR(array))) + +// macro for standard array/tile checks: + +// check that array/tile data is placed in the DRAM +#if XAI_EMULATE_LOCAL_RAM && __XTENSA__ +#if XCHAL_NUM_DATARAM == 2 +#define XAI_ARRAY_STARTS_IN_DRAM(t) \ + (XAI_PTR_TO_ADDR(XAI_ARRAY_GET_BUFF_PTR(t)) >= ((uint32_t) XCHAL_DATARAM0_VADDR) || \ + (XAI_PTR_TO_ADDR(XAI_ARRAY_GET_BUFF_PTR(t)) >= ((uint32_t) XCHAL_DATARAM1_VADDR))) +#define XAI_ARRAY_ENDS_IN_DRAM(t) \ + (XAI_PTR_TO_ADDR(XAI_ARRAY_GET_BUFF_PTR(t)) + XAI_ARRAY_GET_BUFF_SIZE(t) <= (((uint32_t) XCHAL_DATARAM0_VADDR) + ((uint32_t) XCHAL_DATARAM0_SIZE)) || \ + (XAI_PTR_TO_ADDR(XAI_ARRAY_GET_BUFF_PTR(t)) + XAI_ARRAY_GET_BUFF_SIZE(t) <= (((uint32_t) XCHAL_DATARAM1_VADDR) + ((uint32_t) XCHAL_DATARAM1_SIZE)))) +#define XAI_TILE2D_STARTS_IN_DRAM(t) \ + (XAI_PTR_TO_ADDR(XAI_TILE2D_GET_BUFF_PTR(t)) >= ((uint32_t) XCHAL_DATARAM0_VADDR) || \ + (XAI_PTR_TO_ADDR(XAI_TILE2D_GET_BUFF_PTR(t)) >= ((uint32_t) XCHAL_DATARAM1_VADDR))) +#define XAI_TILE2D_ENDS_IN_DRAM(t) \ + (XAI_PTR_TO_ADDR(XAI_TILE2D_GET_BUFF_PTR(t)) + XAI_TILE2D_GET_BUFF_SIZE(t) <= (((uint32_t) XCHAL_DATARAM0_VADDR) + ((uint32_t) XCHAL_DATARAM0_SIZE)) || \ + (XAI_PTR_TO_ADDR(XAI_TILE2D_GET_BUFF_PTR(t)) + XAI_TILE2D_GET_BUFF_SIZE(t) <= (((uint32_t) XCHAL_DATARAM1_VADDR) + ((uint32_t) XCHAL_DATARAM1_SIZE)))) +#elif XCHAL_NUM_DATARAM == 1 +#define XAI_ARRAY_STARTS_IN_DRAM(t) \ + (XAI_PTR_TO_ADDR(XAI_ARRAY_GET_BUFF_PTR(t)) >= ((uint32_t) XCHAL_DATARAM0_VADDR)) +#define XAI_ARRAY_ENDS_IN_DRAM(t) \ + (XAI_PTR_TO_ADDR(XAI_ARRAY_GET_BUFF_PTR(t)) + XAI_ARRAY_GET_BUFF_SIZE(t) <= (((uint32_t) XCHAL_DATARAM0_VADDR) + ((uint32_t) XCHAL_DATARAM0_SIZE))) +#define XAI_TILE2D_STARTS_IN_DRAM(t) \ + (XAI_PTR_TO_ADDR(XAI_TILE2D_GET_BUFF_PTR(t)) >= ((uint32_t) XCHAL_DATARAM0_VADDR)) +#define XAI_TILE2D_ENDS_IN_DRAM(t) \ + 
(XAI_PTR_TO_ADDR(XAI_TILE2D_GET_BUFF_PTR(t)) + XAI_TILE2D_GET_BUFF_SIZE(t) <= (((uint32_t) XCHAL_DATARAM0_VADDR) + ((uint32_t) XCHAL_DATARAM0_SIZE))) +#endif + +#else //#XAI_EMULATE_LOCAL_RAM && __XTENSA__ +#define XAI_ARRAY_STARTS_IN_DRAM(t) 1 +#define XAI_ARRAY_ENDS_IN_DRAM(t) 1 +#define XAI_TILE2D_STARTS_IN_DRAM(t) 1 +#define XAI_TILE2D_ENDS_IN_DRAM(t) 1 +#endif //#XAI_EMULATE_LOCAL_RAM && __XTENSA__ + +// check the minimal alignment requirements +#define XAI_ARRAY_IS_WIDTH_ALIGNED(t) ((XAI_ARRAY_GET_WIDTH(t) & (XCHAL_IVPN_SIMD_WIDTH - 1)) == 0) +#define XAI_ARRAY_IS_WIDTH_ALIGNED2(t) ((XAI_ARRAY_GET_WIDTH(t) & (2 * XCHAL_IVPN_SIMD_WIDTH - 1)) == 0) +#define XAI_ARRAY_IS_WIDTH_ALIGNED_2(t) ((XAI_ARRAY_GET_WIDTH(t) & (XCHAL_IVPN_SIMD_WIDTH / 2 - 1)) == 0) +#define XAI_ARRAY_IS_STRIDE_ALIGNED(t) ((XAI_ARRAY_GET_PITCH(t) & (XCHAL_IVPN_SIMD_WIDTH - 1)) == 0) +#define XAI_ARRAY_IS_STRIDE_ALIGNED2(t) ((XAI_ARRAY_GET_PITCH(t) & (2 * XCHAL_IVPN_SIMD_WIDTH - 1)) == 0) +#define XAI_ARRAY_IS_STRIDE_ALIGNED_2(t) ((XAI_ARRAY_GET_PITCH(t) & (XCHAL_IVPN_SIMD_WIDTH / 2 - 1)) == 0) +#define XAI_ARRAY_IS_PTR_ALIGNED_NX8(t) ((XAI_PTR_TO_ADDR(XAI_ARRAY_GET_DATA_PTR(t)) & (XCHAL_IVPN_SIMD_WIDTH - 1)) == 0) +#define XAI_ARRAY_IS_PTR_ALIGNED_2NX8(t) ((XAI_PTR_TO_ADDR(XAI_ARRAY_GET_DATA_PTR(t)) & (2 * XCHAL_IVPN_SIMD_WIDTH - 1)) == 0) +#define XAI_ARRAY_IS_PTR_ALIGNED_NX16(t) ((XAI_PTR_TO_ADDR(XAI_ARRAY_GET_DATA_PTR(t)) & (2 * XCHAL_IVPN_SIMD_WIDTH - 1)) == 0) +#define XAI_ARRAY_IS_PTR_ALIGNED_N_2X32(t) ((XAI_PTR_TO_ADDR(XAI_ARRAY_GET_DATA_PTR(t)) & (2 * XCHAL_IVPN_SIMD_WIDTH - 1)) == 0) + +#define XAI_ARRAY_IS_ALIGNED_NX8(t) (XAI_ARRAY_IS_PTR_ALIGNED_NX8(t) && XAI_ARRAY_IS_WIDTH_ALIGNED(t) && XAI_ARRAY_IS_STRIDE_ALIGNED(t)) +#define XAI_ARRAY_IS_ALIGNED_2NX8(t) (XAI_ARRAY_IS_PTR_ALIGNED_2NX8(t) && XAI_ARRAY_IS_WIDTH_ALIGNED2(t) && XAI_ARRAY_IS_STRIDE_ALIGNED2(t)) +#define XAI_ARRAY_IS_ALIGNED_NX16(t) (XAI_ARRAY_IS_PTR_ALIGNED_NX16(t) && XAI_ARRAY_IS_WIDTH_ALIGNED(t) && 
XAI_ARRAY_IS_STRIDE_ALIGNED(t)) +#define XAI_ARRAY_IS_ALIGNED_N_2X32(t) (XAI_ARRAY_IS_PTR_ALIGNED_N_2X32(t) && XAI_ARRAY_IS_WIDTH_ALIGNED_2(t) && XAI_ARRAY_IS_STRIDE_ALIGNED_2(t)) + +#define XAI_TILE2D_IS_WIDTH_ALIGNED(t) XAI_ARRAY_IS_WIDTH_ALIGNED(t) +#define XAI_TILE2D_IS_WIDTH_ALIGNED2(t) XAI_ARRAY_IS_WIDTH_ALIGNED2(t) +#define XAI_TILE2D_IS_WIDTH_ALIGNED_2(t) XAI_ARRAY_IS_WIDTH_ALIGNED_2(t) +#define XAI_TILE2D_IS_STRIDE_ALIGNED(t) XAI_ARRAY_IS_STRIDE_ALIGNED(t) +#define XAI_TILE2D_IS_STRIDE_ALIGNED2(t) XAI_ARRAY_IS_STRIDE_ALIGNED2(t) +#define XAI_TILE2D_IS_STRIDE_ALIGNED_2(t) XAI_ARRAY_IS_STRIDE_ALIGNED_2(t) +#define XAI_TILE2D_IS_PTR_ALIGNED_NX8(t) XAI_ARRAY_IS_PTR_ALIGNED_NX8(t) +#define XAI_TILE2D_IS_PTR_ALIGNED_2NX8(t) XAI_ARRAY_IS_PTR_ALIGNED_2NX8(t) +#define XAI_TILE2D_IS_PTR_ALIGNED_NX16(t) XAI_ARRAY_IS_PTR_ALIGNED_NX16(t) +#define XAI_TILE2D_IS_PTR_ALIGNED_N_2X32(t) XAI_ARRAY_IS_PTR_ALIGNED_N_2X32(t) + +// check array invariants +#define XAI_ARRAY_IS_1D(t) (XAI_ARRAY_GET_HEIGHT(t) == 1) + +#define XAI_ARRAY_CHECK_TYPE(a, type) (XAI_TYPE_ELEMENT_TYPE(XAI_ARRAY_GET_TYPE(a)) == type) + +#define XAI_ARRAY_CHECK_ELEMENT_SIZE(a, size) (XAI_ARRAY_GET_ELEMENT_SIZE(a) == (size)) + +#define XAI_ARRAY_SIZE_EQ(t1, t2) (XAI_ARRAY_GET_WIDTH(t1) == XAI_ARRAY_GET_WIDTH(t2) && XAI_ARRAY_GET_HEIGHT(t1) == XAI_ARRAY_GET_HEIGHT(t2)) + +#define XAI_ARRAY_SIZE_GEQ(t1, t2) (XAI_ARRAY_GET_WIDTH(t1) >= XAI_ARRAY_GET_WIDTH(t2) && XAI_ARRAY_GET_HEIGHT(t1) >= XAI_ARRAY_GET_HEIGHT(t2)) + +#define XAI_ARRAYS_ARE_NOT_OVERLAP(t1, t2) (XAI_ARRAY_GET_DATA_PTR(t1) != XAI_ARRAY_GET_DATA_PTR(t2)) + +#define XAI_ARRAY_IS_CONSISTENT(a) \ + ((XAI_ARRAY_GET_PITCH(a) >= XAI_ARRAY_GET_WIDTH(a)) && \ + (XAI_ARRAY_GET_WIDTH(a) > 0) && (XAI_ARRAY_GET_HEIGHT(a) > 0) && \ + ((uint8_t *) XAI_ARRAY_GET_DATA_PTR(a) >= (uint8_t *) XAI_ARRAY_GET_BUFF_PTR(a)) && \ + ((uint8_t *) XAI_ARRAY_GET_DATA_PTR(a) + (XAI_ARRAY_GET_PITCH(a) * (XAI_ARRAY_GET_HEIGHT(a) - 1) + XAI_ARRAY_GET_WIDTH(a)) * 
XAI_ARRAY_GET_ELEMENT_SIZE(a) \ + <= (uint8_t *) XAI_ARRAY_GET_BUFF_PTR(a) + XAI_ARRAY_GET_BUFF_SIZE(a))) + +// common array error checks +#define XAI_CHECK_POINTER(pointer) \ + XAI_CHECK_ERROR(pointer != 0, XAI_ERR_NULLARG, "The pointer (" #pointer ") is NULL") + +#if ((defined(XCHAL_VISION_TYPE) && (XCHAL_VISION_TYPE >= 6)) || (defined(XCHAL_HAVE_BBENEP) && (XCHAL_HAVE_BBENEP == 1))) + +#define XAI_CHECK_BUFFER(array) \ + XAI_CHECK_POINTER(array); \ + XAI_CHECK_ERROR(XAI_ARRAY_STARTS_IN_DRAM(array), XAI_ERR_MEMLOCAL, "The argument (" #array ") data does not start in DRAM"); \ + XAI_CHECK_ERROR(XAI_ARRAY_ENDS_IN_DRAM(array), XAI_ERR_MEMLOCAL, "Complete data for the argument (" #array ") does not lie in DRAM") + +#else + +#define XAI_CHECK_BUFFER(array) \ + XAI_CHECK_POINTER(array); +#endif + +#define XAI_CHECK_ARRAY(array) \ + XAI_CHECK_BUFFER(array); \ + XAI_CHECK_ERROR(XAI_ARRAY_IS_CONSISTENT(array), XAI_ERR_BADARG, "The argument (" #array ") is invalid") + +#define XAI_CHECK_ARRAY_I(array, element_size) \ + XAI_CHECK_ARRAY(array); \ + XAI_CHECK_ERROR(XAI_ARRAY_CHECK_ELEMENT_SIZE(array, element_size) && \ + !((XAI_ARRAY_GET_TYPE(array)) & (XAI_TYPE_FLOAT_BIT)), \ + XAI_ERR_DATATYPE, "The argument (" #array ") has wrong type") + +#define XAI_CHECK_ARRAY_X(array, element_size) \ + XAI_CHECK_ARRAY(array); \ + XAI_CHECK_ERROR(XAI_ARRAY_CHECK_ELEMENT_SIZE(array, element_size), \ + XAI_ERR_DATATYPE, "The argument (" #array ") has wrong type") + +#define XAI_CHECK_ARRAY_T(array, type) \ + XAI_CHECK_ARRAY(array); \ + XAI_CHECK_ERROR(XAI_ARRAY_CHECK_TYPE(array, type), XAI_ERR_DATATYPE, "The argument (" #array ") has wrong type") + +#define XAI_CHECK_ARRAY_I8(array) XAI_CHECK_ARRAY_I(array, sizeof(int8_t)) +#define XAI_CHECK_ARRAY_I16(array) XAI_CHECK_ARRAY_I(array, sizeof(int16_t)) +#define XAI_CHECK_ARRAY_I32(array) XAI_CHECK_ARRAY_I(array, sizeof(int32_t)) + +#define XAI_CHECK_ARRAY_X16(array) XAI_CHECK_ARRAY_X(array, sizeof(int16_t)) +#define 
XAI_CHECK_ARRAY_X32(array) XAI_CHECK_ARRAY_X(array, sizeof(int32_t)) + +#define XAI_CHECK_ARRAY_U8(array) XAI_CHECK_ARRAY_T(array, XAI_U8) +#define XAI_CHECK_ARRAY_S8(array) XAI_CHECK_ARRAY_T(array, XAI_S8) +#define XAI_CHECK_ARRAY_U16(array) XAI_CHECK_ARRAY_T(array, XAI_U16) +#define XAI_CHECK_ARRAY_S16(array) XAI_CHECK_ARRAY_T(array, XAI_S16) +#define XAI_CHECK_ARRAY_U32(array) XAI_CHECK_ARRAY_T(array, XAI_U32) +#define XAI_CHECK_ARRAY_S32(array) XAI_CHECK_ARRAY_T(array, XAI_S32) +#define XAI_CHECK_ARRAY_S64(array) XAI_CHECK_ARRAY_T(array, XAI_S64) +#define XAI_CHECK_ARRAY_F16(array) XAI_CHECK_ARRAY_T(array, XAI_F16) +#define XAI_CHECK_ARRAY_F32(array) XAI_CHECK_ARRAY_T(array, XAI_F32) + +#define XAI_CHECK_ARRAY_IS_1D(array) \ + XAI_CHECK_ERROR(XAI_ARRAY_GET_HEIGHT(array) == 1, XAI_ERR_BADARG, "The argument (" #array ") must be a 1D array") + +#define XAI_CHECK_ARRAYS_ARE_NOT_OVERLAP(array0, array1) \ + XAI_CHECK_ERROR(XAI_ARRAYS_ARE_NOT_OVERLAP(array0, array1), XAI_ERR_INPLACE, "Inplace operation is not supported") + +#define XAI_CHECK_ARRAY_ELEMENT_SIZE_EQ(array0, array1) \ + XAI_CHECK_ERROR(XAI_ARRAY_GET_ELEMENT_SIZE(array0) == XAI_ARRAY_GET_ELEMENT_SIZE(array1), \ + XAI_ERR_DATATYPE, "The (" #array0 ") element size must be equal to the (" #array1 ") element size") + +#define XAI_CHECK_ARRAY_SIZE_EQ(array0, array1) \ + XAI_CHECK_ERROR(XAI_ARRAY_SIZE_EQ(array0, array1), XAI_ERR_DATASIZE, "The (" #array0 ") argument size is not equal to the (" #array1 ") argument size") + +#define XAI_CHECK_ARRAY_SIZE_GEQ(array0, array1) \ + XAI_CHECK_ERROR(XAI_ARRAY_SIZE_GEQ(array0, array1), XAI_ERR_DATASIZE, "The (" #array0 ") argument size is not equal to OR greater than the (" #array1 ") argument size") + +#define XAI_CHECK_ARRAY_ALIGNMENT(array, DEPTH, ERR) \ + XAI_CHECK_ERROR(XAI_ARRAY_IS_ALIGNED_ ## DEPTH(array), XAI_ERR_ ## ERR, "The argument (" #array ") is not fully aligned") + +#define XAI_CHECK_ARRAY_IALIGNMENT_NX8(array) XAI_CHECK_ARRAY_ALIGNMENT(array, NX8, 
IALIGNMENT) +#define XAI_CHECK_ARRAY_IALIGNMENT_2NX8(array) XAI_CHECK_ARRAY_ALIGNMENT(array, 2NX8, IALIGNMENT) +#define XAI_CHECK_ARRAY_IALIGNMENT_NX16(array) XAI_CHECK_ARRAY_ALIGNMENT(array, NX16, IALIGNMENT) +#define XAI_CHECK_ARRAY_IALIGNMENT_N_2X32(array) XAI_CHECK_ARRAY_ALIGNMENT(array, N_2X32, IALIGNMENT) +#define XAI_CHECK_ARRAY_OALIGNMENT_NX8(array) XAI_CHECK_ARRAY_ALIGNMENT(array, NX8, OALIGNMENT) +#define XAI_CHECK_ARRAY_OALIGNMENT_2NX8(array) XAI_CHECK_ARRAY_ALIGNMENT(array, 2NX8, OALIGNMENT) +#define XAI_CHECK_ARRAY_OALIGNMENT_NX16(array) XAI_CHECK_ARRAY_ALIGNMENT(array, NX16, OALIGNMENT) +#define XAI_CHECK_ARRAY_OALIGNMENT_N_2X32(array) XAI_CHECK_ARRAY_ALIGNMENT(array, N_2X32, OALIGNMENT) + + +// check tile invariants +#define XAI_TILE2D_IS_CONSISTENT(t) \ + ((XAI_TILE2D_GET_PITCH(t) >= XAI_TILE2D_GET_WIDTH(t) + XAI_TILE2D_GET_EDGE_WIDTH(t) * 2) && \ + ((uint8_t *) XAI_TILE2D_GET_DATA_PTR(t) - (XAI_TILE2D_GET_EDGE_WIDTH(t) + XAI_TILE2D_GET_PITCH(t) * XAI_TILE2D_GET_EDGE_HEIGHT(t)) * XAI_TILE2D_GET_ELEMENT_SIZE(t) \ + >= (uint8_t *) XAI_TILE2D_GET_BUFF_PTR(t)) && \ + ((uint8_t *) XAI_TILE2D_GET_DATA_PTR(t) + (XAI_TILE2D_GET_PITCH(t) * (XAI_TILE2D_GET_HEIGHT(t) + XAI_TILE2D_GET_EDGE_HEIGHT(t) - 1) + XAI_TILE2D_GET_WIDTH(t) + XAI_TILE2D_GET_EDGE_WIDTH(t)) * XAI_TILE2D_GET_ELEMENT_SIZE(t) \ + <= (uint8_t *) XAI_TILE2D_GET_BUFF_PTR(t) + XAI_TILE2D_GET_BUFF_SIZE(t))) + +// common tile error checks +#define XAI_CHECK_TILE2D(tile) \ + XAI_CHECK_POINTER(tile); \ + XAI_CHECK_ERROR(XAI_TILE2D_IS_CONSISTENT(tile), XAI_ERR_BADARG, "The argument (" #tile ") is invalid"); \ + XAI_CHECK_ERROR(XAI_TILE2D_IS_TILE2D(tile), XAI_ERR_BADARG, "The argument (" #tile ") is not a tile"); \ + XAI_CHECK_ERROR(XAI_TILE2D_STARTS_IN_DRAM(tile), XAI_ERR_MEMLOCAL, "The argument (" #tile ") data does not start in DRAM"); \ + XAI_CHECK_ERROR(XAI_TILE2D_ENDS_IN_DRAM(tile), XAI_ERR_MEMLOCAL, "Complete data for the argument (" #tile ") does not lie in DRAM") + +#define 
XAI_TILE2D_CHECK_TYPE(a, type) \ + ((XAI_TYPE_ELEMENT_TYPE(XAI_TILE2D_GET_TYPE(a)) == type) && (XAI_TILE2D_IS_TILE2D(a))) + +#define XAI_CHECK_TILE2D_I(tile, element_size) \ + XAI_CHECK_TILE2D(tile); \ + XAI_CHECK_ERROR(XAI_ARRAY_CHECK_ELEMENT_SIZE(tile, element_size), \ + XAI_ERR_DATATYPE, "The argument (" #tile ") has wrong type") + +#define XAI_CHECK_TILE2D_T(tile, type) \ + XAI_CHECK_TILE2D(tile); \ + XAI_CHECK_ERROR(XAI_TILE2D_CHECK_TYPE(tile, type), XAI_ERR_DATATYPE, "The argument (" #tile ") has wrong type") + +#define XAI_CHECK_TILE2D_I8(array) XAI_CHECK_TILE2D_I(array, sizeof(int8_t)) +#define XAI_CHECK_TILE2D_I16(array) XAI_CHECK_TILE2D_I(array, sizeof(int16_t)) +#define XAI_CHECK_TILE2D_I32(array) XAI_CHECK_TILE2D_I(array, sizeof(int32_t)) + +#define XAI_CHECK_TILE2D_U8(array) XAI_CHECK_TILE2D_T(array, XAI_U8) +#define XAI_CHECK_TILE2D_S8(array) XAI_CHECK_TILE2D_T(array, XAI_S8) +#define XAI_CHECK_TILE2D_U16(array) XAI_CHECK_TILE2D_T(array, XAI_U16) +#define XAI_CHECK_TILE2D_S16(array) XAI_CHECK_TILE2D_T(array, XAI_S16) +#define XAI_CHECK_TILE2D_U32(array) XAI_CHECK_TILE2D_T(array, XAI_U32) +#define XAI_CHECK_TILE2D_S32(array) XAI_CHECK_TILE2D_T(array, XAI_S32) + +#define XAI_CHECK_TILE2D_EDGE(tile, edge) \ + XAI_CHECK_ERROR(XAI_TILE2D_GET_EDGE_WIDTH(tile) >= edge && XAI_TILE2D_GET_EDGE_HEIGHT(tile) >= edge, \ + XAI_ERR_EDGE, "The (" #tile ") tile must have at least " #edge "-pixel edge extension") + +#define XAI_CHECK_TILES_ARE_NOT_OVERLAP(tile0, tile1) XAI_CHECK_ARRAYS_ARE_NOT_OVERLAP(tile0, tile1) + +#define XAI_CHECK_TILE2D_IALIGNMENT_NX8(tile) XAI_CHECK_ARRAY_IALIGNMENT_NX8(tile) +#define XAI_CHECK_TILE2D_IALIGNMENT_2NX8(tile) XAI_CHECK_ARRAY_IALIGNMENT_2NX8(tile) +#define XAI_CHECK_TILE2D_IALIGNMENT_NX16(tile) XAI_CHECK_ARRAY_IALIGNMENT_NX16(tile) +#define XAI_CHECK_TILE2D_IALIGNMENT_N_2X32(tile) XAI_CHECK_ARRAY_IALIGNMENT_N_2X32(tile) +#define XAI_CHECK_TILE2D_OALIGNMENT_NX8(tile) XAI_CHECK_ARRAY_OALIGNMENT_NX8(tile) +#define 
XAI_CHECK_TILE2D_OALIGNMENT_2NX8(tile) XAI_CHECK_ARRAY_OALIGNMENT_2NX8(tile) +#define XAI_CHECK_TILE2D_OALIGNMENT_NX16(tile) XAI_CHECK_ARRAY_OALIGNMENT_NX16(tile) +#define XAI_CHECK_TILE2D_OALIGNMENT_N_2X32(tile) XAI_CHECK_ARRAY_OALIGNMENT_N_2X32(tile) + +// Checks for confinement of 3D and 4D tiles in single DRAM +#if XAI_EMULATE_LOCAL_RAM && __XTENSA__ && !SYS_MEM_TESTING +#if XCHAL_NUM_DATARAM == 2 +#define XAI_ARRAY_START_AND_END_IN_SINGLE_DRAM(t) \ + ((XAI_PTR_TO_ADDR(XAI_ARRAY_GET_BUFF_PTR(t)) >= ((uint32_t) XCHAL_DATARAM0_VADDR) && \ + XAI_PTR_TO_ADDR(XAI_ARRAY_GET_BUFF_PTR(t)) + XAI_ARRAY_GET_BUFF_SIZE(t) \ + <= (((uint32_t) XCHAL_DATARAM0_VADDR) + ((uint32_t) XCHAL_DATARAM0_SIZE))) || \ + ((XAI_PTR_TO_ADDR(XAI_ARRAY_GET_BUFF_PTR(t)) >= ((uint32_t) XCHAL_DATARAM1_VADDR)) && \ + (XAI_PTR_TO_ADDR(XAI_ARRAY_GET_BUFF_PTR(t)) + XAI_ARRAY_GET_BUFF_SIZE(t) \ + <= (((uint32_t) XCHAL_DATARAM1_VADDR) + ((uint32_t) XCHAL_DATARAM1_SIZE))))) +#elif XCHAL_NUM_DATARAM == 1 +#define XAI_ARRAY_START_AND_END_IN_SINGLE_DRAM(t) \ + (XAI_PTR_TO_ADDR(XAI_ARRAY_GET_BUFF_PTR(t)) >= ((uint32_t) XCHAL_DATARAM0_VADDR) && \ + XAI_PTR_TO_ADDR(XAI_ARRAY_GET_BUFF_PTR(t)) + XAI_ARRAY_GET_BUFF_SIZE(t) \ + <= (((uint32_t) XCHAL_DATARAM0_VADDR) + ((uint32_t) XCHAL_DATARAM0_SIZE))) +#endif +#define XAI_TILE3D_START_AND_END_IN_SINGLE_DRAM(t) XAI_ARRAY_START_AND_END_IN_SINGLE_DRAM(t) +#define XAI_TILE4D_START_AND_END_IN_SINGLE_DRAM(t) XAI_ARRAY_START_AND_END_IN_SINGLE_DRAM(t) +#else +#define XAI_TILE3D_START_AND_END_IN_SINGLE_DRAM(t) 1 +#define XAI_TILE4D_START_AND_END_IN_SINGLE_DRAM(t) 1 +#define XAI_ARRAY_START_AND_END_IN_SINGLE_DRAM(t) 1 +#endif + +#define XAI_CHECK_TILE3D_FITS_IN_SINGLE_DRAM(t) \ + XAI_CHECK_ERROR(XAI_TILE3D_START_AND_END_IN_SINGLE_DRAM(t), XAI_ERR_MEMLOCAL, \ + "Complete data for the argument (" #t ") does not fit in single DRAM"); + +#define XAI_CHECK_TILE4D_FITS_IN_SINGLE_DRAM(t) \ + XAI_CHECK_ERROR(XAI_TILE4D_START_AND_END_IN_SINGLE_DRAM(t), XAI_ERR_MEMLOCAL, \ + 
"Complete data for the argument (" #t ") does not fit in single DRAM");
+
+#define XAI_CHECK_ARRAY_FITS_IN_SINGLE_DRAM(parray) \
+    XAI_CHECK_ERROR(XAI_ARRAY_START_AND_END_IN_SINGLE_DRAM(parray), XAI_ERR_MEMLOCAL, \
+                    "Complete data for the argument (" #parray ") does not fit in single DRAM");
+
+#define XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(tile) \
+    XAI_CHECK_ERROR(XAI_TILE2D_STARTS_IN_DRAM(tile), XAI_ERR_MEMLOCAL, "The argument (" #tile ") data does not start in DRAM"); \
+    XAI_CHECK_ERROR(XAI_TILE2D_ENDS_IN_DRAM(tile), XAI_ERR_MEMLOCAL, "Complete data for the argument (" #tile ") does not lie in DRAM");
+
+#define XAI_CHECK_TILE4D_IN_DRAM_BOUNDARY(tile) XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(tile)
+
+
+// Checks for tile consistency: the data region including its edges lies inside the buffer, the buffer and every dimension are non-empty, and the dim1 pitch covers dim1 plus both dim1 edges. The whole expression is parenthesized so it can be negated or combined safely.
+#define XAI_TILE3D_IS_CONSISTENT(t) \
+    (((uint8_t *) XAI_TILE3D_GET_DATA_PTR(t) - (XAI_TILE3D_GET_DIM1_EDGE1(t) + XAI_TILE3D_GET_DIM1_PITCH(t) * XAI_TILE3D_GET_DIM2_EDGE1(t) \
+    + XAI_TILE3D_GET_DIM2_PITCH(t) * XAI_TILE3D_GET_DIM3_EDGE1(t)) * XAI_TILE3D_GET_ELEMENT_SIZE(t) >= (uint8_t *) XAI_TILE3D_GET_BUFF_PTR(t)) && \
+    ((uint8_t *) XAI_TILE3D_GET_DATA_PTR(t) + (XAI_TILE3D_GET_DIM2_PITCH(t) * (XAI_TILE3D_GET_DIM3(t) + XAI_TILE3D_GET_DIM3_EDGE2(t) - 1) \
+    + XAI_TILE3D_GET_DIM1_PITCH(t) * (XAI_TILE3D_GET_DIM2(t) + XAI_TILE3D_GET_DIM2_EDGE2(t) - 1) \
+    + XAI_TILE3D_GET_DIM1(t) + XAI_TILE3D_GET_DIM1_EDGE2(t)) * XAI_TILE3D_GET_ELEMENT_SIZE(t) \
+    <= (uint8_t *) XAI_TILE3D_GET_BUFF_PTR(t) + XAI_TILE3D_GET_BUFF_SIZE(t)) && \
+    (XAI_TILE3D_GET_BUFF_SIZE(t) != 0) && \
+    (XAI_TILE3D_GET_DIM1(t) > 0) && (XAI_TILE3D_GET_DIM2(t) > 0) && (XAI_TILE3D_GET_DIM3(t) > 0) && \
+    (XAI_TILE3D_GET_DIM1_PITCH(t) >= XAI_TILE3D_GET_DIM1(t) + XAI_TILE3D_GET_DIM1_EDGE1(t) + XAI_TILE3D_GET_DIM1_EDGE2(t)))
+
+#define XAI_TILE4D_IS_CONSISTENT(t) \
+    (((uint8_t *) XAI_TILE4D_GET_DATA_PTR(t) - (XAI_TILE4D_GET_DIM1_EDGE1(t) + XAI_TILE4D_GET_DIM1_PITCH(t) * XAI_TILE4D_GET_DIM2_EDGE1(t) \
+    + XAI_TILE4D_GET_DIM2_PITCH(t) * XAI_TILE4D_GET_DIM3_EDGE1(t)) * XAI_TILE4D_GET_ELEMENT_SIZE(t) >= (uint8_t *) XAI_TILE4D_GET_BUFF_PTR(t)) && \
+    ((uint8_t *) XAI_TILE4D_GET_DATA_PTR(t) + (XAI_TILE4D_GET_DIM3_PITCH(t) * (XAI_TILE4D_GET_DIM4(t) - 1) \
+    + XAI_TILE4D_GET_DIM2_PITCH(t) * (XAI_TILE4D_GET_DIM3(t) + XAI_TILE4D_GET_DIM3_EDGE2(t) - 1) \
+    + XAI_TILE4D_GET_DIM1_PITCH(t) * (XAI_TILE4D_GET_DIM2(t) + XAI_TILE4D_GET_DIM2_EDGE2(t) - 1) \
+    + XAI_TILE4D_GET_DIM1(t) + XAI_TILE4D_GET_DIM1_EDGE2(t)) * XAI_TILE4D_GET_ELEMENT_SIZE(t) \
+    <= (uint8_t *) XAI_TILE4D_GET_BUFF_PTR(t) + XAI_TILE4D_GET_BUFF_SIZE(t)) && \
+    (XAI_TILE4D_GET_BUFF_SIZE(t) != 0) && \
+    (XAI_TILE4D_GET_DIM1(t) > 0) && (XAI_TILE4D_GET_DIM2(t) > 0) && (XAI_TILE4D_GET_DIM3(t) > 0) && (XAI_TILE4D_GET_DIM4(t) > 0) && \
+    (XAI_TILE4D_GET_DIM1_PITCH(t) >= XAI_TILE4D_GET_DIM1(t) + XAI_TILE4D_GET_DIM1_EDGE1(t) + XAI_TILE4D_GET_DIM1_EDGE2(t)))
+
+#define XAI_TILE3D_SIZE_EQ(t1, t2) \
+    (XAI_TILE3D_GET_DIM1(t1) == XAI_TILE3D_GET_DIM1(t2) && XAI_TILE3D_GET_DIM2(t1) == XAI_TILE3D_GET_DIM2(t2) && \
+     XAI_TILE3D_GET_DIM3(t1) == XAI_TILE3D_GET_DIM3(t2))
+
+#define XAI_TILE3D_PITCH_EQ(t1, t2) \
+    (XAI_FRAME3D_GET_DIM1_PITCH(t1) == XAI_FRAME3D_GET_DIM1_PITCH(t2) && \
+     XAI_FRAME3D_GET_DIM2_PITCH(t1) == XAI_FRAME3D_GET_DIM2_PITCH(t2))
+
+// common tile error checks
+#define XAI_CHECK_TILE3D(tile) \
+    XAI_CHECK_POINTER(tile); \
+    XAI_CHECK_ERROR(XAI_TILE3D_IS_CONSISTENT(tile), XAI_ERR_BADARG, "The argument (" #tile ") is invalid"); \
+    XAI_CHECK_ERROR(XAI_TYPE_IS_TILE3D(XAI_TILE3D_GET_TYPE(tile)), XAI_ERR_BADARG, "The argument (" #tile ") is not a tile");
+
+
+#define XAI_TILE3D_CHECK_TYPE(a, type) \
+    ((XAI_TYPE_ELEMENT_TYPE(XAI_TILE3D_GET_TYPE(a)) == type) && (XAI_TYPE_IS_TILE3D(XAI_TILE3D_GET_TYPE(a))))
+
+#define XAI_TILE3D_CHECK_ELEMENT_SIZE(a, size) (XAI_ARRAY_GET_ELEMENT_SIZE(a) == (size))
+
+#define XAI_CHECK_TILE3D_SIZE_EQ(t1, t2) \
+    XAI_CHECK_ERROR(XAI_TILE3D_SIZE_EQ(t1, t2), XAI_ERR_DATASIZE, "Size of the ("#t1 ") and ("#t2 ") are not same"); \
+    if
(XAI_TILE3D_GET_DATA_PTR(t1) == XAI_TILE3D_GET_DATA_PTR(t2)) \ + { \ + XAI_CHECK_ERROR(XAI_TILE3D_PITCH_EQ(t1, t2), XAI_ERR_INPLACE, "Inplace operation not supported when pitch of " \ + "("#t1 ") and ("#t2 ") are not same"); \ + } + +#define XAI_CHECK_TILE3D_I(tile, element_size) \ + XAI_CHECK_TILE3D(tile); \ + XAI_CHECK_ERROR(XAI_TILE3D_CHECK_ELEMENT_SIZE(tile, element_size) && \ + !((XAI_TILE3D_GET_TYPE(tile)) & (XAI_TYPE_FLOAT_BIT)), \ + XAI_ERR_DATATYPE, "The argument (" #tile ") has wrong type") + +#define XAI_CHECK_TILE3D_X(tile, element_size) \ + XAI_CHECK_TILE3D(tile); \ + XAI_CHECK_ERROR(XAI_TILE3D_CHECK_ELEMENT_SIZE(tile, element_size), \ + XAI_ERR_DATATYPE, "The argument (" #tile ") has wrong type") + +#define XAI_CHECK_TILE3D_T(tile, type) \ + XAI_CHECK_TILE3D(tile); \ + XAI_CHECK_ERROR(XAI_TILE3D_CHECK_TYPE(tile, type), XAI_ERR_DATATYPE, "The argument (" #tile ") has wrong type") + +#define XAI_CHECK_TILE3D_I8(array) XAI_CHECK_TILE3D_I(array, sizeof(int8_t)) +#define XAI_CHECK_TILE3D_I16(array) XAI_CHECK_TILE3D_I(array, sizeof(int16_t)) +#define XAI_CHECK_TILE3D_I32(array) XAI_CHECK_TILE3D_I(array, sizeof(int32_t)) +#define XAI_CHECK_TILE3D_I64(array) XAI_CHECK_TILE3D_I(array, sizeof(int64_t)) + +#define XAI_CHECK_TILE3D_X16(array) XAI_CHECK_TILE3D_X(array, sizeof(int16_t)) +#define XAI_CHECK_TILE3D_X32(array) XAI_CHECK_TILE3D_X(array, sizeof(int32_t)) + +#define XAI_CHECK_TILE3D_U8(array) XAI_CHECK_TILE3D_T(array, XAI_U8) +#define XAI_CHECK_TILE3D_S8(array) XAI_CHECK_TILE3D_T(array, XAI_S8) +#define XAI_CHECK_TILE3D_U16(array) XAI_CHECK_TILE3D_T(array, XAI_U16) +#define XAI_CHECK_TILE3D_S16(array) XAI_CHECK_TILE3D_T(array, XAI_S16) +#define XAI_CHECK_TILE3D_U32(array) XAI_CHECK_TILE3D_T(array, XAI_U32) +#define XAI_CHECK_TILE3D_S32(array) XAI_CHECK_TILE3D_T(array, XAI_S32) +#define XAI_CHECK_TILE3D_S64(array) XAI_CHECK_TILE3D_T(array, XAI_S64) +#define XAI_CHECK_TILE3D_F16(array) XAI_CHECK_TILE3D_T(array, XAI_F16) +#define XAI_CHECK_TILE3D_F32(array) 
XAI_CHECK_TILE3D_T(array, XAI_F32) + +// checks for 4D tiles +#define XAI_CHECK_TILE4D(tile) \ + XAI_CHECK_POINTER(tile); \ + XAI_CHECK_ERROR(XAI_TILE4D_IS_CONSISTENT(tile), XAI_ERR_BADARG, "The argument (" #tile ") is invalid"); \ + XAI_CHECK_ERROR(XAI_TYPE_IS_TILE4D(XAI_TILE4D_GET_TYPE(tile)), XAI_ERR_BADARG, "The argument (" #tile ") is not a tile"); + +#define XAI_TILE4D_SIZE_EQ(t1, t2) \ + (XAI_TILE4D_GET_DIM1(t1) == XAI_TILE4D_GET_DIM1(t2) && XAI_TILE4D_GET_DIM2(t1) == XAI_TILE4D_GET_DIM2(t2) && \ + XAI_TILE4D_GET_DIM3(t1) == XAI_TILE4D_GET_DIM3(t2) && XAI_TILE4D_GET_DIM4(t1) == XAI_TILE4D_GET_DIM4(t2)) + +#define XAI_TILE4D_CHECK_TYPE(a, type) \ + ((XAI_TYPE_ELEMENT_TYPE(XAI_TILE4D_GET_TYPE(a)) == type) && (XAI_TYPE_IS_TILE4D(XAI_TILE4D_GET_TYPE(a)))) + +#define XAI_TILE4D_CHECK_ELEMENT_SIZE(a, size) (XAI_ARRAY_GET_ELEMENT_SIZE(a) == (size)) + +#define XAI_CHECK_TILE4D_I(tile, element_size) \ + XAI_CHECK_TILE4D(tile); \ + XAI_CHECK_ERROR(XAI_TILE4D_CHECK_ELEMENT_SIZE(tile, element_size) && \ + !((XAI_TILE4D_GET_TYPE(tile)) & (XAI_TYPE_FLOAT_BIT)), \ + XAI_ERR_DATATYPE, "The argument (" #tile ") has wrong type") + +#define XAI_CHECK_TILE4D_X(tile, element_size) \ + XAI_CHECK_TILE4D(tile); \ + XAI_CHECK_ERROR(XAI_TILE4D_CHECK_ELEMENT_SIZE(tile, element_size), \ + XAI_ERR_DATATYPE, "The argument (" #tile ") has wrong type") + +#define XAI_CHECK_TILE4D_T(tile, type) \ + XAI_CHECK_TILE4D(tile); \ + XAI_CHECK_ERROR(XAI_TILE4D_CHECK_TYPE(tile, type), XAI_ERR_DATATYPE, "The argument (" #tile ") has wrong type") + +#define XAI_CHECK_TILE4D_SIZE_EQ(t1, t2) \ + XAI_CHECK_ERROR(XAI_TILE4D_SIZE_EQ(t1, t2), XAI_ERR_DATASIZE, "Size of the ("#t1 ") and ("#t2 ") is not same") + +#define XAI_CHECK_TILE4D_I8(array) XAI_CHECK_TILE4D_I(array, sizeof(int8_t)) +#define XAI_CHECK_TILE4D_I16(array) XAI_CHECK_TILE4D_I(array, sizeof(int16_t)) +#define XAI_CHECK_TILE4D_I32(array) XAI_CHECK_TILE4D_I(array, sizeof(int32_t)) + +#define XAI_CHECK_TILE4D_X16(array) XAI_CHECK_TILE4D_X(array, 
sizeof(int16_t)) +#define XAI_CHECK_TILE4D_X32(array) XAI_CHECK_TILE4D_X(array, sizeof(int32_t)) + +#define XAI_CHECK_TILE4D_U8(array) XAI_CHECK_TILE4D_T(array, XAI_U8) +#define XAI_CHECK_TILE4D_S8(array) XAI_CHECK_TILE4D_T(array, XAI_S8) +#define XAI_CHECK_TILE4D_U16(array) XAI_CHECK_TILE4D_T(array, XAI_U16) +#define XAI_CHECK_TILE4D_S16(array) XAI_CHECK_TILE4D_T(array, XAI_S16) +#define XAI_CHECK_TILE4D_F16(array) XAI_CHECK_TILE4D_T(array, XAI_F16) +#define XAI_CHECK_TILE4D_U32(array) XAI_CHECK_TILE4D_T(array, XAI_U32) +#define XAI_CHECK_TILE4D_S32(array) XAI_CHECK_TILE4D_T(array, XAI_S32) +#define XAI_CHECK_TILE4D_F32(array) XAI_CHECK_TILE4D_T(array, XAI_F32) + +// check the minimal alignment requirements for 3D tile +#define XAI_TILE3D_IS_STRIDE_ALIGNED(t) ((XAI_TILE3D_GET_DIM1_PITCH(t) & (XCHAL_IVPN_SIMD_WIDTH - 1)) == 0) +#define XAI_TILE3D_IS_STRIDE_ALIGNED2(t) ((XAI_TILE3D_GET_DIM1_PITCH(t) & (2 * XCHAL_IVPN_SIMD_WIDTH - 1)) == 0) +#define XAI_TILE3D_IS_STRIDE_ALIGNED_2(t) ((XAI_TILE3D_GET_DIM1_PITCH(t) & (XCHAL_IVPN_SIMD_WIDTH / 2 - 1)) == 0) +#define XAI_TILE3D_IS_STRIDE_ALIGNED_4B(t) ((XAI_TILE3D_GET_DIM1_PITCH(t) & (3)) == 0) +#define XAI_TILE3D_IS_PTR_ALIGNED_NX8(t) ((XAI_PTR_TO_ADDR(XAI_TILE3D_GET_DATA_PTR(t)) & (XCHAL_IVPN_SIMD_WIDTH - 1)) == 0) +#define XAI_TILE3D_IS_PTR_ALIGNED_2NX8(t) ((XAI_PTR_TO_ADDR(XAI_TILE3D_GET_DATA_PTR(t)) & (2 * XCHAL_IVPN_SIMD_WIDTH - 1)) == 0) +#define XAI_TILE3D_IS_PTR_ALIGNED_NX16(t) ((XAI_PTR_TO_ADDR(XAI_TILE3D_GET_DATA_PTR(t)) & (2 * XCHAL_IVPN_SIMD_WIDTH - 1)) == 0) +#define XAI_TILE3D_IS_PTR_ALIGNED_N_2X32(t) ((XAI_PTR_TO_ADDR(XAI_TILE3D_GET_DATA_PTR(t)) & (2 * XCHAL_IVPN_SIMD_WIDTH - 1)) == 0) +#define XAI_TILE3D_IS_PTR_ALIGNED_4B(t) ((XAI_PTR_TO_ADDR(XAI_TILE3D_GET_DATA_PTR(t)) & 3) == 0) + + +#define XAI_TILE3D_IS_ALIGNED_NX8(t) (XAI_TILE3D_IS_PTR_ALIGNED_NX8(t) && XAI_TILE3D_IS_STRIDE_ALIGNED(t)) +#define XAI_TILE3D_IS_ALIGNED_2NX8(t) (XAI_TILE3D_IS_PTR_ALIGNED_2NX8(t) && XAI_TILE3D_IS_STRIDE_ALIGNED2(t)) 
+#define XAI_TILE3D_IS_ALIGNED_NX16(t) (XAI_TILE3D_IS_PTR_ALIGNED_NX16(t) && XAI_TILE3D_IS_STRIDE_ALIGNED(t)) +#define XAI_TILE3D_IS_ALIGNED_N_2X32(t) (XAI_TILE3D_IS_PTR_ALIGNED_N_2X32(t) && XAI_TILE3D_IS_STRIDE_ALIGNED_2(t)) +#define XAI_TILE3D_IS_ALIGNED_4B(t) (XAI_TILE3D_IS_PTR_ALIGNED_4B(t) && XAI_TILE3D_IS_STRIDE_ALIGNED_4B(t)) + +#define XAI_CHECK_TILE3D_ALIGNMENT(array, DEPTH, ERR) \ + XAI_CHECK_ERROR(XAI_TILE3D_IS_ALIGNED_ ## DEPTH(array), XAI_ERR_ ## ERR, "The argument (" #array ") is not fully aligned") + +#define XAI_CHECK_TILE3D_IALIGNMENT_NX8(array) XAI_CHECK_TILE3D_ALIGNMENT(array, NX8, IALIGNMENT) +#define XAI_CHECK_TILE3D_IALIGNMENT_2NX8(array) XAI_CHECK_TILE3D_ALIGNMENT(array, 2NX8, IALIGNMENT) +#define XAI_CHECK_TILE3D_IALIGNMENT_NX16(array) XAI_CHECK_TILE3D_ALIGNMENT(array, NX16, IALIGNMENT) +#define XAI_CHECK_TILE3D_IALIGNMENT_N_2X32(array) XAI_CHECK_TILE3D_ALIGNMENT(array, N_2X32, IALIGNMENT) +#define XAI_CHECK_TILE3D_OALIGNMENT_NX8(array) XAI_CHECK_TILE3D_ALIGNMENT(array, NX8, OALIGNMENT) +#define XAI_CHECK_TILE3D_OALIGNMENT_2NX8(array) XAI_CHECK_TILE3D_ALIGNMENT(array, 2NX8, OALIGNMENT) +#define XAI_CHECK_TILE3D_OALIGNMENT_NX16(array) XAI_CHECK_TILE3D_ALIGNMENT(array, NX16, OALIGNMENT) +#define XAI_CHECK_TILE3D_OALIGNMENT_N_2X32(array) XAI_CHECK_TILE3D_ALIGNMENT(array, N_2X32, OALIGNMENT) +#define XAI_CHECK_TILE3D_CALIGNMENT_NX8(array) XAI_CHECK_TILE3D_ALIGNMENT(array, NX8, IALIGNMENT) +#define XAI_CHECK_TILE3D_CALIGNMENT_2NX8(array) XAI_CHECK_TILE3D_ALIGNMENT(array, 2NX8, IALIGNMENT) +#define XAI_CHECK_TILE3D_CALIGNMENT_NX16(array) XAI_CHECK_TILE3D_ALIGNMENT(array, NX16, IALIGNMENT) +#define XAI_CHECK_TILE3D_CALIGNMENT_N_2X32(array) XAI_CHECK_TILE3D_ALIGNMENT(array, N_2X32, IALIGNMENT) + +// check the minimal alignment requirements for 4D tile +#define XAI_TILE4D_IS_STRIDE_ALIGNED(t) ((XAI_TILE4D_GET_DIM1_PITCH(t) & (XCHAL_IVPN_SIMD_WIDTH - 1)) == 0) +#define XAI_TILE4D_IS_STRIDE_ALIGNED2(t) ((XAI_TILE4D_GET_DIM1_PITCH(t) & (2 * 
XCHAL_IVPN_SIMD_WIDTH - 1)) == 0) +#define XAI_TILE4D_IS_STRIDE_ALIGNED_2(t) ((XAI_TILE4D_GET_DIM1_PITCH(t) & (XCHAL_IVPN_SIMD_WIDTH / 2 - 1)) == 0) +#define XAI_TILE4D_IS_PTR_ALIGNED_NX8(t) ((XAI_PTR_TO_ADDR(XAI_TILE4D_GET_DATA_PTR(t)) & (XCHAL_IVPN_SIMD_WIDTH - 1)) == 0) +#define XAI_TILE4D_IS_PTR_ALIGNED_2NX8(t) ((XAI_PTR_TO_ADDR(XAI_TILE4D_GET_DATA_PTR(t)) & (2 * XCHAL_IVPN_SIMD_WIDTH - 1)) == 0) +#define XAI_TILE4D_IS_PTR_ALIGNED_NX16(t) ((XAI_PTR_TO_ADDR(XAI_TILE4D_GET_DATA_PTR(t)) & (2 * XCHAL_IVPN_SIMD_WIDTH - 1)) == 0) +#define XAI_TILE4D_IS_PTR_ALIGNED_N_2X32(t) ((XAI_PTR_TO_ADDR(XAI_TILE4D_GET_DATA_PTR(t)) & (2 * XCHAL_IVPN_SIMD_WIDTH - 1)) == 0) + +#define XAI_TILE4D_IS_ALIGNED_NX8(t) (XAI_TILE4D_IS_PTR_ALIGNED_NX8(t) && XAI_TILE4D_IS_STRIDE_ALIGNED(t)) +#define XAI_TILE4D_IS_ALIGNED_2NX8(t) (XAI_TILE4D_IS_PTR_ALIGNED_2NX8(t) && XAI_TILE4D_IS_STRIDE_ALIGNED2(t)) +#define XAI_TILE4D_IS_ALIGNED_NX16(t) (XAI_TILE4D_IS_PTR_ALIGNED_NX16(t) && XAI_TILE4D_IS_STRIDE_ALIGNED(t)) +#define XAI_TILE4D_IS_ALIGNED_N_2X32(t) (XAI_TILE4D_IS_PTR_ALIGNED_N_2X32(t) && XAI_TILE4D_IS_STRIDE_ALIGNED_2(t)) + +#define XAI_CHECK_TILE4D_ALIGNMENT(array, DEPTH, ERR) \ + XAI_CHECK_ERROR(XAI_TILE4D_IS_ALIGNED_ ## DEPTH(array), XAI_ERR_ ## ERR, "The argument (" #array ") is not fully aligned") + +#define XAI_CHECK_TILE4D_IALIGNMENT_NX8(array) XAI_CHECK_TILE4D_ALIGNMENT(array, NX8, IALIGNMENT) +#define XAI_CHECK_TILE4D_IALIGNMENT_2NX8(array) XAI_CHECK_TILE4D_ALIGNMENT(array, 2NX8, IALIGNMENT) +#define XAI_CHECK_TILE4D_IALIGNMENT_NX16(array) XAI_CHECK_TILE4D_ALIGNMENT(array, NX16, IALIGNMENT) +#define XAI_CHECK_TILE4D_IALIGNMENT_N_2X32(array) XAI_CHECK_TILE4D_ALIGNMENT(array, N_2X32, IALIGNMENT) +#define XAI_CHECK_TILE4D_OALIGNMENT_NX8(array) XAI_CHECK_TILE4D_ALIGNMENT(array, NX8, OALIGNMENT) +#define XAI_CHECK_TILE4D_OALIGNMENT_2NX8(array) XAI_CHECK_TILE4D_ALIGNMENT(array, 2NX8, OALIGNMENT) +#define XAI_CHECK_TILE4D_OALIGNMENT_NX16(array) XAI_CHECK_TILE4D_ALIGNMENT(array, NX16, 
OALIGNMENT) +#define XAI_CHECK_TILE4D_OALIGNMENT_N_2X32(array) XAI_CHECK_TILE4D_ALIGNMENT(array, N_2X32, OALIGNMENT) + +#define XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(tile0, tile1) XAI_CHECK_ARRAYS_ARE_NOT_OVERLAP(tile0, tile1) +#define XAI_CHECK_TILES4D_ARE_NOT_OVERLAP(tile0, tile1) XAI_CHECK_TILES3D_ARE_NOT_OVERLAP(tile0, tile1) + +#define XAI_CHECK_TILE3D_EQUAL(tile1, tile2) \ + XAI_CHECK_ERROR(XAI_TILE3D_GET_DIM1(tile1) == XAI_TILE3D_GET_DIM1(tile2) && \ + XAI_TILE3D_GET_DIM2(tile1) == XAI_TILE3D_GET_DIM2(tile2) && \ + XAI_TILE3D_GET_DIM3(tile1) == XAI_TILE3D_GET_DIM3(tile2), XAI_ERR_DATASIZE, \ + "Tiles sizes are not equal."); + +#define XAI_CHECK_TILE4D_EQUAL(tile1, tile2) \ + XAI_CHECK_ERROR(XAI_TILE4D_GET_DIM1(tile1) == XAI_TILE4D_GET_DIM1(tile2) && \ + XAI_TILE4D_GET_DIM2(tile1) == XAI_TILE4D_GET_DIM2(tile2) && \ + XAI_TILE4D_GET_DIM3(tile1) == XAI_TILE4D_GET_DIM3(tile2) && \ + XAI_TILE4D_GET_DIM4(tile1) == XAI_TILE4D_GET_DIM4(tile2), XAI_ERR_DATASIZE, \ + "Tiles sizes are not equal."); + +#define XAI_CHECK_TILE3D_ELEMENT_SIZE_EQ(inT, outT) \ + XAI_CHECK_ERROR(XAI_TILE3D_GET_ELEMENT_SIZE(inT) == XAI_TILE3D_GET_ELEMENT_SIZE(outT), \ + XAI_ERR_DATATYPE, "Input tile element element size must be equal to output tile element size") + +#define XAI_CHECK_TILE4D_ELEMENT_SIZE_EQ(inT, outT) \ + XAI_CHECK_ERROR(XAI_TILE4D_GET_ELEMENT_SIZE(inT) == XAI_TILE4D_GET_ELEMENT_SIZE(outT), \ + XAI_ERR_DATATYPE, "Input tile element element size must be equal to output tile element size") + +#ifdef XAI_ERROR_CHECKS_RELAXED_REF +#undef XAI_CHECK_TILE4D_IALIGNMENT_2NX8 +#undef XAI_ARRAY_STARTS_IN_DRAM +#undef XAI_ARRAY_ENDS_IN_DRAM +#undef XAI_TILE2D_STARTS_IN_DRAM +#undef XAI_TILE2D_ENDS_IN_DRAM +#undef XAI_TILE3D_START_AND_END_IN_SINGLE_DRAM +#undef XAI_TILE4D_START_AND_END_IN_SINGLE_DRAM +#undef XAI_ARRAY_START_AND_END_IN_SINGLE_DRAM +#undef XAI_ARRAYS_ARE_NOT_OVERLAP + +#define XAI_CHECK_TILE4D_IALIGNMENT_2NX8(array) +#define XAI_ARRAY_STARTS_IN_DRAM(t) 1 +#define 
XAI_ARRAY_ENDS_IN_DRAM(t) 1 +#define XAI_TILE2D_STARTS_IN_DRAM(t) 1 +#define XAI_TILE2D_ENDS_IN_DRAM(t) 1 +#define XAI_TILE3D_START_AND_END_IN_SINGLE_DRAM(t) 1 +#define XAI_TILE4D_START_AND_END_IN_SINGLE_DRAM(t) 1 +#define XAI_ARRAY_START_AND_END_IN_SINGLE_DRAM(t) 1 +#define XAI_ARRAYS_ARE_NOT_OVERLAP(t1, t2) 1 +#endif + +#if defined SYS_MEM_TESTING || defined XAI_ERROR_CHECKS_RELAXED_REF +#undef XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY +#define XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(tile) +#endif + +// other macros +#define XAI_TO_Q15(val) ((int16_t) ((val) * (1 << 15) + 0.5)) +#define XAI_TO_Q1_14(val) ((int16_t) ((val) * (1 << 14) + 0.5)) +#define XAI_TO_Q2_13(val) ((int16_t) ((val) * (1 << 13) + 0.5)) +#define XAI_TO_Q3_12(val) ((int16_t) ((val) * (1 << 12) + 0.5)) +#define XAI_TO_Q4_11(val) ((int16_t) ((val) * (1 << 11) + 0.5)) +#define XAI_TO_Q5_10(val) ((int16_t) ((val) * (1 << 10) + 0.5)) +#define XAI_TO_Q13_18(val) ((int) ((val) * (1 << 18) + 0.5)) +#define XAI_Q0_16_HALF 0x8000 +#endif diff --git a/backends/cadence/vision/third-party/libxai_common/include/xai_core_api.h b/backends/cadence/vision/third-party/libxai_common/include/xai_core_api.h new file mode 100644 index 00000000000..894ae20b9d7 --- /dev/null +++ b/backends/cadence/vision/third-party/libxai_common/include/xai_core_api.h @@ -0,0 +1,272 @@ +/* + * Copyright (c) 2013-2018 Tensilica Inc. ALL RIGHTS RESERVED. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#ifndef __XAI_CORE_API_H__ +#define __XAI_CORE_API_H__ + +#include +#include +#include + +#include "xai_config_api.h" +#include "xai_tile_manager.h" + +/* library information */ +// _XAI_API_ is defined in glow/externalbackends/Xtensa/Backends/libxai/libxai.h and xtensa-mlir-dialect/include/xtensa/Conversion/xaicnn.h +// They dont use _XAI_API_ from xaicnn/libxai/include/xai_config_api.h and hence they dont get _XAI_API_VAR_ +// defining _XAI_API_VAR_ for those cases. + +#ifndef _XAI_API_VAR_ +#define _XAI_API_VAR_ _XAI_API_ +#endif + +_XAI_API_VAR_ char XAI_BUILD_CONFIGURATION[]; +_XAI_API_VAR_ char XAI_BUILD_TOOLS_VERSION[]; +_XAI_API_VAR_ char XAI_BUILD_CORE_ID[]; +_XAI_API_VAR_ char XAI_BUILD_ERROR_LEVEL[]; +_XAI_API_VAR_ char XAI_BUILD_FEATURES_STR[]; + +/* Math constants */ + +#define XAI_PI 3.14159265358979323846 +#define XAI_PI_F 3.14159265358979323846f + +/* IVP library data types */ + +typedef int32_t XAI_ERR_TYPE; +typedef uint8_t xai_bool; + +typedef int16_t XAI_Q0_15; +typedef int16_t XAI_Q5_10; +typedef int16_t XAI_Q6_9; +typedef int16_t XAI_Q7_8; +typedef int16_t XAI_Q8_7; +typedef int16_t XAI_Q12_3; +typedef int16_t XAI_Q13_2; + +typedef int32_t XAI_Q0_31; +typedef int32_t XAI_Q1_30; +typedef int32_t XAI_Q12_19; +typedef int32_t XAI_Q13_18; +typedef int32_t XAI_Q15_16; +typedef int32_t XAI_Q16_15; +typedef int32_t XAI_Q22_9; +typedef int32_t XAI_Q28_3; + +typedef XAI_Q0_15 XAI_Q15; +typedef uint16_t XAI_Q0_16; + + +typedef struct +{ + int16_t x; + int16_t y; 
+} xai_point; + +typedef struct +{ + int32_t x; + int32_t y; +} xai_point32; + +typedef struct +{ + XAI_Q16_15 x; + XAI_Q16_15 y; +} xai_point_fpt; + +typedef struct +{ + float x; + float y; +} xai_point_f; + +typedef struct +{ + int32_t width; + int32_t height; +} xai_size; + +typedef struct +{ + float a11; + float a12; + float a21; + float a22; + float xt; + float yt; +} xai_affine; + +typedef struct +{ + XAI_Q13_18 a11; + XAI_Q13_18 a12; + XAI_Q13_18 a21; + XAI_Q13_18 a22; + XAI_Q13_18 xt; + XAI_Q13_18 yt; +} xai_affine_fpt; + +typedef struct +{ + float a11; + float a12; + float a13; + float a21; + float a22; + float a23; + float a31; + float a32; + float a33; +} xai_perspective; + +typedef struct +{ + XAI_Q13_18 a11; + XAI_Q13_18 a12; + XAI_Q13_18 a13; + XAI_Q13_18 a21; + XAI_Q13_18 a22; + XAI_Q13_18 a23; + XAI_Q13_18 a31; + XAI_Q13_18 a32; + XAI_Q13_18 a33; +} xai_perspective_fpt; + +typedef struct +{ + int16_t x; + int16_t y; + uint16_t width; + uint16_t height; +} xai_rect; + +typedef struct +{ + int16_t x; + int16_t y; + uint16_t width; + uint16_t height; + XAI_Q5_10 angle; +} xai_rotated_rect; + +typedef struct +{ + float x; + float y; + float width; + float height; + float angle; +} xai_rotated_rect_f; + +typedef struct +{ + int32_t M00; + int64_t M10; + int64_t M01; + int64_t M11; + int64_t M20; + int64_t M02; +} xai_moments; + +typedef struct +{ + XAI_Q13_18 rho; + XAI_Q13_18 theta; +} xai_line_polar_fpt; + +typedef struct +{ + uint32_t size; // number of pyramid levels + float scale; + xai_tile2D **levels; // array of pyramid levels +} xai_pyramid, *xai_pPyramid; +#define XAI_HAS_PYRAMID 1 + + +/* Error codes */ + +#define XAI_ERR_OK 0 // no error +#define XAI_ERR_IALIGNMENT 1 // input alignment requirements are not satisfied +#define XAI_ERR_OALIGNMENT 2 // output alignment requirements are not satisfied +#define XAI_ERR_MALIGNMENT 3 // same modulo alignment requirement is not satisfied +#define XAI_ERR_BADARG 4 // arguments are somehow invalid 
+#define XAI_ERR_MEMLOCAL 5 // tile is not placed in local memory +#define XAI_ERR_INPLACE 6 // inplace operation is not supported +#define XAI_ERR_EDGE 7 // edge extension size is too small +#define XAI_ERR_DATASIZE 8 // input/output tile size is too small or too big or otherwise inconsistent +#define XAI_ERR_TMPSIZE 9 // temporary tile size is too small or otherwise inconsistent +#define XAI_ERR_KSIZE 10 // filter kernel size is not supported +#define XAI_ERR_NORM 11 // invalid normalization divisor or shift value +#define XAI_ERR_COORD 12 // invalid coordinates +#define XAI_ERR_BADTRANSFORM 13 // the transform is singular or otherwise invalid +#define XAI_ERR_NULLARG 14 // one of required arguments is null +#define XAI_ERR_THRESH_INVALID 15 // threshold value is somehow invalid +#define XAI_ERR_SCALE 16 // provided scale factor is not supported +#define XAI_ERR_OVERFLOW 17 // tile size can lead to sum overflow +#define XAI_ERR_NOTIMPLEMENTED 18 // the requested functionality is absent in current version +#define XAI_ERR_CHANNEL_INVALID 19 // invalid channel number +#define XAI_ERR_DATATYPE 20 // argument has invalid data type +#define XAI_ERR_NO_VARIANT 21 // No suitable variant found for the function +#define XAI_ERR_PTR_NULL 22 // Pointer is NULL +#define XAI_ERR_CUSTOMACC_PREPARE 23 // fails to prepare the custom acc hardware +#define XAI_ERR_CUSTOMACC_EXECUTE 24 // fails to execute ops on the custom acc hardware +#define XAI_ERR_CUSTOMACC_REMOVE 25 // fails to remove a network for the custom acc hardware +#define XAI_ERR_LAST 25 // same value as XAI_ERR_CUSTOMACC_REMOVE, the last code in the list above + +/* non-fatal errors */ + +#define XAI_ERR_POOR_DECOMPOSITION 1024 // computed transform decomposition can produce visual artifacts +#define XAI_ERR_OUTOFTILE 1025 // arguments or results are out of tile +#define XAI_ERR_OBJECTLOST 1026 // tracked object is lost +#define XAI_ERR_RANSAC_NOTFOUND 1027 // no appropriate model was found for RANSAC +#define XAI_ERR_REPLAY 1028 // function has to be called again for completion + +
+/* helper macro */ + +#ifdef XCHAL_IVPN_SIMD_WIDTH +# define XAI_SIMD_WIDTH XCHAL_IVPN_SIMD_WIDTH +#else +# define XAI_SIMD_WIDTH 32 +#endif + +#define XAI_SIZE_AREA(sz) ((size_t) sz.width * sz.height) +#define XAI_ALIGN_VAL(val, pow2) (((val) + ((pow2) - 1)) & ~((pow2) - 1)) +#define XAI_ALIGN_VALN(val) XAI_ALIGN_VAL(val, XAI_SIMD_WIDTH) + +#define XAI_PTR_TO_ADDR(ptr) ((uintptr_t) (ptr)) +#define XAI_ALIGN_PTR(ptr, alignment) ((void *) XAI_ALIGN_VAL(XAI_PTR_TO_ADDR((ptr)), (alignment))) + +/* temporary space requirement for xaiSort */ +#if XCHAL_HAVE_GRIVPEP_HISTOGRAM || XCHAL_HAVE_VISION_HISTOGRAM +# define XAI_SORT_TMP_SIZE 0 // use vector registers only +#elif XCHAL_HAVE_VISION +# define XAI_SORT_TMP_SIZE (XAI_SIMD_WIDTH * 256 + XAI_SIMD_WIDTH) // SIMD_WIDTH histograms by 256 bins + 32 for pointer alignment inside optimized function +#else +# define XAI_SORT_TMP_SIZE (2 * 256 + XAI_SIMD_WIDTH) // 3 histograms by 256 bins + 32 for pointer alignment inside optimized function +#endif + + +/* error code to text conversion */ +_XAI_API_ const char* xaiErrStr(XAI_ERR_TYPE code); +#endif diff --git a/backends/cadence/vision/third-party/libxai_common/include/xai_tile_manager.h b/backends/cadence/vision/third-party/libxai_common/include/xai_tile_manager.h new file mode 100644 index 00000000000..71227adbf34 --- /dev/null +++ b/backends/cadence/vision/third-party/libxai_common/include/xai_tile_manager.h @@ -0,0 +1,1246 @@ +/* + * Copyright (c) 2013-2018 Tensilica Inc. ALL RIGHTS RESERVED. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +#ifndef __XAI_TILE_MANAGER_H__ +#define __XAI_TILE_MANAGER_H__ + +#include +#include +#include "xai_config_api.h" + +typedef struct xaiFrameStruct +{ + void *pFrameBuff; + uint32_t frameBuffSize; + void *pFrameData; + int32_t frameWidth; + int32_t frameHeight; + int32_t framePitch; + uint8_t pixelRes; + uint8_t pixelPackFormat; +} xai_frame, *xai_pFrame; + +#define XAI_ARRAY_FIELDS \ + void *pBuffer; \ + void *pData; \ + uint32_t bufferSize; \ + int32_t width; \ + int32_t pitch; \ + uint32_t status; \ + uint16_t type; \ + int32_t height; + +typedef struct xaiArrayStruct +{ + XAI_ARRAY_FIELDS +} xai_array, *xai_pArray; + +#define XAI_ARRAY_FIELDS_COEFF_32 \ + uintptr_t pBuffer; \ + uintptr_t pData; \ + uint64_t bufferSize; \ + uint64_t width; \ + int64_t pitch; \ + uint32_t status; \ + uint16_t type; \ + int32_t height; + +typedef struct xaiArrayStruct_coeff_32 +{ + XAI_ARRAY_FIELDS_COEFF_32 +} xai_array_coeff_32, *xai_pArray_coeff_32; + +#define XAI_ARRAY_FIELDS_COEFF_64 \ + uint64_t pBuffer; \ + uint64_t pData; \ + uint64_t bufferSize; \ + uint64_t width; \ + int64_t pitch; \ + uint32_t status; \ + uint16_t type; \ + int32_t height; + +typedef struct xaiArrayStruct_coeff_64 +{ + XAI_ARRAY_FIELDS_COEFF_64 +} xai_array_coeff_64, *xai_pArray_coeff_64; + +typedef struct xaiTile2DStruct +{ + XAI_ARRAY_FIELDS + xai_frame *pFrame; + int32_t x; + int32_t y; + uint16_t edgeWidth; + uint16_t edgeHeight; +} xai_tile2D, *xai_pTile2D; + +/***************************************** +* Data type definitions +*****************************************/ + +//** 16 bit data type, bit 0 - 3 for data encoded depth(bits/bytes), 5 - 7 free bits (reserved for future use), bit 8 - 10 for encoded special type float +//** 11 bit for float (denotes whether float or not), 12 - 14 bit for encoded tile type, and 15 bit for data sign + +#define XAI_TYPE_SIGNED_BIT (1 << 15) + +#define XAI_TYPE_ARRAY_BITS (1 << 12) +#define XAI_TYPE_TILE2D_BITS (2 << 12) +#define XAI_TYPE_TILE3D_BITS (3 << 
12) +#define XAI_TYPE_TILE4D_BITS (4 << 12) +#define XAI_TYPE_TILE5D_BITS (5 << 12) +#define XAI_TYPE_TILE6D_BITS (6 << 12) +#define XAI_TYPE_TILE_BITS 3 +#define XAI_TYPE_TILE_MASK (((1 << XAI_TYPE_TILE_BITS) - 1) << 12) + +#define XAI_TYPE_FLOAT_BIT (1 << 11) +#define XAI_TYPE_SPECIAL_FLOAT_BITS 3 +#define XAI_TYPE_SPECIAL_FLOAT_MASK (((1 << XAI_TYPE_SPECIAL_FLOAT_BITS) - 1) << 8) +#define XAI_TYPE_BFLOAT_BIT (XAI_TYPE_FLOAT_BIT | (1 << 8)) + +#define XAI_TYPE_ELEMENT_SIZE_BITS 4 +#define XAI_TYPE_ELEMENT_SIZE_MASK ((1 << XAI_TYPE_ELEMENT_SIZE_BITS) - 1) + +#define XAI_MAKETYPE(flags, depth) ((flags) | (depth)) +#define XAI_CUSTOMTYPE(type) XAI_MAKETYPE(0, (sizeof(type) + 2)) //convert byte to representation sequence + +#define XAI_TYPE_ELEMENT_SIZE_IN_BYTES(type) (1 << (((type) & (XAI_TYPE_ELEMENT_SIZE_MASK)) - 3)) +#define XAI_TYPE_ELEMENT_SIZE_IN_BITS(type) (XAI_TYPE_ELEMENT_SIZE_IN_BYTES(type) << 3) +#define XAI_TYPE_ELEMENT_SIZE(type) XAI_TYPE_ELEMENT_SIZE_IN_BYTES(type) +#define XAI_TYPE_ELEMENT_TYPE(type) ((type) & (XAI_TYPE_SIGNED_BIT | XAI_TYPE_ELEMENT_SIZE_MASK | XAI_TYPE_FLOAT_BIT | XAI_TYPE_SPECIAL_FLOAT_MASK)) +#define XAI_TYPE_IS_ARRAY(type) (!(((type) & (XAI_TYPE_TILE_MASK)) ^ XAI_TYPE_ARRAY_BITS)) +#define XAI_TYPE_IS_TILE2D(type) (!(((type) & (XAI_TYPE_TILE_MASK)) ^ XAI_TYPE_TILE2D_BITS)) +#define XAI_TYPE_IS_SIGNED(type) ((type) & (XAI_TYPE_SIGNED_BIT)) + +// XAI_MAKETYPE accepts 2 parameters +// 1: Denotes whether the entity is a tile(XAI_TYPE_TILE2D_BITS, XAI_TYPE_TILE3D_BITS etc. flag set) or an array(XAI_TYPE_ARRAY_BITS flag set) , +// ,if the data is a signed or unsigned(XAI_TYPE_SIGNED_BIT) and also if data is float(XAI_TYPE_FLOAT_BIT) and float type(XAI_TYPE_BFLOAT_BIT etc.) +// 2: Denotes encoded number of bits/bytes +// 0 implies the data is bool, 1 implies the data is 2 bit, 2 implies the data is 4bit, 3 implies the data is 8bit, 4 implies the data is 16bit. 
+// 5 implies the data is 32bit, 6 implies the data is 64bit and 7 implies the data is 128bit + +#define XAI_BOOL XAI_MAKETYPE(0, 0) +#define XAI_U2 XAI_MAKETYPE(0, 1) +#define XAI_U4 XAI_MAKETYPE(0, 2) +#define XAI_U8 XAI_MAKETYPE(0, 3) +#define XAI_U16 XAI_MAKETYPE(0, 4) +#define XAI_U32 XAI_MAKETYPE(0, 5) +#define XAI_U64 XAI_MAKETYPE(0, 6) +#define XAI_U128 XAI_MAKETYPE(0, 7) + +#define XAI_S2 XAI_MAKETYPE(XAI_TYPE_SIGNED_BIT, 1) +#define XAI_S4 XAI_MAKETYPE(XAI_TYPE_SIGNED_BIT, 2) +#define XAI_S8 XAI_MAKETYPE(XAI_TYPE_SIGNED_BIT, 3) +#define XAI_S16 XAI_MAKETYPE(XAI_TYPE_SIGNED_BIT, 4) +#define XAI_S32 XAI_MAKETYPE(XAI_TYPE_SIGNED_BIT, 5) +#define XAI_S64 XAI_MAKETYPE(XAI_TYPE_SIGNED_BIT, 6) +#define XAI_S128 XAI_MAKETYPE(XAI_TYPE_SIGNED_BIT, 7) + +#define XAI_F8 (XAI_MAKETYPE(XAI_TYPE_SIGNED_BIT | XAI_TYPE_FLOAT_BIT, 3)) +#define XAI_F16 (XAI_MAKETYPE(XAI_TYPE_SIGNED_BIT | XAI_TYPE_FLOAT_BIT, 4)) +#define XAI_F32 (XAI_MAKETYPE(XAI_TYPE_SIGNED_BIT | XAI_TYPE_FLOAT_BIT, 5)) +#define XAI_F64 (XAI_MAKETYPE(XAI_TYPE_SIGNED_BIT | XAI_TYPE_FLOAT_BIT, 6)) +#define XAI_F128 (XAI_MAKETYPE(XAI_TYPE_SIGNED_BIT | XAI_TYPE_FLOAT_BIT, 7)) + +#define XAI_ARRAY_BOOL (XAI_BOOL | XAI_TYPE_ARRAY_BITS) +#define XAI_ARRAY_U4 (XAI_U4 | XAI_TYPE_ARRAY_BITS) +#define XAI_ARRAY_U8 (XAI_U8 | XAI_TYPE_ARRAY_BITS) +#define XAI_ARRAY_U16 (XAI_U16 | XAI_TYPE_ARRAY_BITS) +#define XAI_ARRAY_U32 (XAI_U32 | XAI_TYPE_ARRAY_BITS) +#define XAI_ARRAY_U64 (XAI_U64 | XAI_TYPE_ARRAY_BITS) +#define XAI_ARRAY_U128 (XAI_U128 | XAI_TYPE_ARRAY_BITS) + +#define XAI_ARRAY_S4 (XAI_S4 | XAI_TYPE_ARRAY_BITS) +#define XAI_ARRAY_S8 (XAI_S8 | XAI_TYPE_ARRAY_BITS) +#define XAI_ARRAY_S16 (XAI_S16 | XAI_TYPE_ARRAY_BITS) +#define XAI_ARRAY_S32 (XAI_S32 | XAI_TYPE_ARRAY_BITS) +#define XAI_ARRAY_S64 (XAI_S64 | XAI_TYPE_ARRAY_BITS) +#define XAI_ARRAY_S128 (XAI_S128 | XAI_TYPE_ARRAY_BITS) + +#define XAI_ARRAY_F8 (XAI_F8 | XAI_TYPE_ARRAY_BITS) +#define XAI_ARRAY_F16 (XAI_F16 | XAI_TYPE_ARRAY_BITS) +#define XAI_ARRAY_F32 
(XAI_F32 | XAI_TYPE_ARRAY_BITS) +#define XAI_ARRAY_F64 (XAI_F64 | XAI_TYPE_ARRAY_BITS) +#define XAI_ARRAY_F128 (XAI_F128 | XAI_TYPE_ARRAY_BITS) + +#define XAI_TILE2D_BOOL (XAI_BOOL | XAI_TYPE_TILE2D_BITS) +#define XAI_TILE2D_U4 (XAI_U4 | XAI_TYPE_TILE2D_BITS) +#define XAI_TILE2D_U8 (XAI_U8 | XAI_TYPE_TILE2D_BITS) +#define XAI_TILE2D_U16 (XAI_U16 | XAI_TYPE_TILE2D_BITS) +#define XAI_TILE2D_U32 (XAI_U32 | XAI_TYPE_TILE2D_BITS) +#define XAI_TILE2D_U64 (XAI_U64 | XAI_TYPE_TILE2D_BITS) +#define XAI_TILE2D_U128 (XAI_U128 | XAI_TYPE_TILE2D_BITS) + +#define XAI_TILE2D_S4 (XAI_S4 | XAI_TYPE_TILE2D_BITS) +#define XAI_TILE2D_S8 (XAI_S8 | XAI_TYPE_TILE2D_BITS) +#define XAI_TILE2D_S16 (XAI_S16 | XAI_TYPE_TILE2D_BITS) +#define XAI_TILE2D_S32 (XAI_S32 | XAI_TYPE_TILE2D_BITS) +#define XAI_TILE2D_S64 (XAI_S64 | XAI_TYPE_TILE2D_BITS) +#define XAI_TILE2D_S128 (XAI_S128 | XAI_TYPE_TILE2D_BITS) + +#define XAI_TILE2D_F8 (XAI_F8 | XAI_TYPE_TILE2D_BITS) +#define XAI_TILE2D_F16 (XAI_F16 | XAI_TYPE_TILE2D_BITS) +#define XAI_TILE2D_F32 (XAI_F32 | XAI_TYPE_TILE2D_BITS) +#define XAI_TILE2D_F64 (XAI_F64 | XAI_TYPE_TILE2D_BITS) +#define XAI_TILE2D_F128 (XAI_F128 | XAI_TYPE_TILE2D_BITS) + +/***************************************** +* Frame Access Macros +*****************************************/ +#define XAI_FRAME_GET_BUFF_PTR(pFrame) ((pFrame)->pFrameBuff) +#define XAI_FRAME_SET_BUFF_PTR(pFrame, pBuff) (pFrame)->pFrameBuff = ((void *) (pBuff)) + +#define XAI_FRAME_GET_BUFF_SIZE(pFrame) ((pFrame)->frameBuffSize) +#define XAI_FRAME_SET_BUFF_SIZE(pFrame, buffSize) (pFrame)->frameBuffSize = ((uint32_t) (buffSize)) + +#define XAI_FRAME_GET_DATA_PTR(pFrame) ((pFrame)->pFrameData) +#define XAI_FRAME_SET_DATA_PTR(pFrame, pData) (pFrame)->pFrameData = ((void *) (pData)) + +#define XAI_FRAME_GET_WIDTH(pFrame) ((pFrame)->frameWidth) +#define XAI_FRAME_SET_WIDTH(pFrame, width) (pFrame)->frameWidth = ((int32_t) (width)) + +#define XAI_FRAME_GET_HEIGHT(pFrame) ((pFrame)->frameHeight) +#define 
XAI_FRAME_SET_HEIGHT(pFrame, height) (pFrame)->frameHeight = ((int32_t) (height)) + +#define XAI_FRAME_GET_PITCH(pFrame) ((pFrame)->framePitch) +#define XAI_FRAME_SET_PITCH(pFrame, pitch) (pFrame)->framePitch = ((int32_t) (pitch)) + +#define XAI_FRAME_GET_PIXEL_RES(pFrame) ((pFrame)->pixelRes) +#define XAI_FRAME_SET_PIXEL_RES(pFrame, pixRes) (pFrame)->pixelRes = ((uint8_t) (pixRes)) + +#define XAI_FRAME_GET_PIXEL_FORMAT(pFrame) ((pFrame)->pixelPackFormat) +#define XAI_FRAME_SET_PIXEL_FORMAT(pFrame, pixelFormat) (pFrame)->pixelPackFormat = ((uint8_t) (pixelFormat)) + +/***************************************** +* Array Access Macros +*****************************************/ +#define XAI_ARRAY_GET_BUFF_PTR(pArray) ((pArray)->pBuffer) +#define XAI_ARRAY_SET_BUFF_PTR(pArray, pBuff) (pArray)->pBuffer = ((void *) (pBuff)) + +#define XAI_ARRAY_GET_BUFF_SIZE(pArray) ((pArray)->bufferSize) +#define XAI_ARRAY_SET_BUFF_SIZE(pArray, buffSize) (pArray)->bufferSize = (buffSize) + +#define XAI_ARRAY_GET_DATA_PTR(pArray) ((pArray)->pData) +#define XAI_ARRAY_SET_DATA_PTR(pArray, pArrayData) (pArray)->pData = ((void *) (pArrayData)) + +#define XAI_ARRAY_SET_BUFF_PTR_COEFF(pArray, pBuff) (pArray)->pBuffer = ((uint64_t) (pBuff)) +#define XAI_ARRAY_SET_DATA_PTR_COEFF(pArray, pArrayData) (pArray)->pData = ((uint64_t) (pArrayData)) + +#define XAI_ARRAY_GET_WIDTH(pArray) ((pArray)->width) +#define XAI_ARRAY_SET_WIDTH(pArray, value) (pArray)->width = ((int32_t) (value)) +#define XAI_ARRAY_SET_WIDTH_COEFF(pArray, value) (pArray)->width = ((uint64_t) (value)) + +#define XAI_ARRAY_GET_PITCH(pArray) ((pArray)->pitch) +#define XAI_ARRAY_SET_PITCH(pArray, value) (pArray)->pitch = ((int32_t) (value)) + +#define XAI_ARRAY_GET_HEIGHT(pArray) ((pArray)->height) +#define XAI_ARRAY_SET_HEIGHT(pArray, value) (pArray)->height = ((int32_t) (value)) /* height is an int32_t field; cast matches SET_WIDTH/SET_PITCH so values >= 65536 are not truncated */ + +#define XAI_ARRAY_GET_STATUS_FLAGS(pArray) ((pArray)->status) +#define XAI_ARRAY_SET_STATUS_FLAGS(pArray, value) (pArray)->status = ((uint8_t) (value)) /* NOTE(review): status is a uint32_t field but only the low 8 bits of value are kept - confirm this is intended */
+ +#define XAI_ARRAY_GET_TYPE(pArray) ((pArray)->type) +#define XAI_ARRAY_SET_TYPE(pArray, value) (pArray)->type = ((uint16_t) (value)) + +#define XAI_ARRAY_GET_CAPACITY(pArray) ((pArray)->pitch) +#define XAI_ARRAY_SET_CAPACITY(pArray, value) (pArray)->pitch = ((int32_t) (value)) +#define XAI_ARRAY_SET_CAPACITY_COEFF(pArray, value) (pArray)->pitch = ((int64_t) (value)) + +#define XAI_ARRAY_GET_ELEMENT_TYPE(pArray) (XAI_TYPE_ELEMENT_TYPE(XAI_ARRAY_GET_TYPE(pArray))) +#define XAI_ARRAY_GET_ELEMENT_SIZE(pArray) (XAI_TYPE_ELEMENT_SIZE(XAI_ARRAY_GET_TYPE(pArray))) +#define XAI_ARRAY_IS_TILE2D(pArray) (!(((XAI_ARRAY_GET_TYPE(pArray)) & (XAI_TYPE_TILE_MASK)) ^ XAI_TYPE_TILE2D_BITS)) + +#define XAI_ARRAY_GET_AREA(pArray) (((pArray)->width) * ((int32_t) (pArray)->height)) + +/***************************************** +* Tile Access Macros +*****************************************/ +#define XAI_TILE2D_GET_BUFF_PTR XAI_ARRAY_GET_BUFF_PTR +#define XAI_TILE2D_SET_BUFF_PTR XAI_ARRAY_SET_BUFF_PTR + +#define XAI_TILE2D_GET_BUFF_SIZE XAI_ARRAY_GET_BUFF_SIZE +#define XAI_TILE2D_SET_BUFF_SIZE XAI_ARRAY_SET_BUFF_SIZE + +#define XAI_TILE2D_GET_DATA_PTR XAI_ARRAY_GET_DATA_PTR +#define XAI_TILE2D_SET_DATA_PTR XAI_ARRAY_SET_DATA_PTR + +#define XAI_TILE2D_GET_WIDTH XAI_ARRAY_GET_WIDTH +#define XAI_TILE2D_SET_WIDTH XAI_ARRAY_SET_WIDTH + +#define XAI_TILE2D_GET_PITCH XAI_ARRAY_GET_PITCH +#define XAI_TILE2D_SET_PITCH XAI_ARRAY_SET_PITCH + +#define XAI_TILE2D_GET_HEIGHT XAI_ARRAY_GET_HEIGHT +#define XAI_TILE2D_SET_HEIGHT XAI_ARRAY_SET_HEIGHT + +#define XAI_TILE2D_GET_STATUS_FLAGS XAI_ARRAY_GET_STATUS_FLAGS +#define XAI_TILE2D_SET_STATUS_FLAGS XAI_ARRAY_SET_STATUS_FLAGS + +#define XAI_TILE2D_GET_TYPE XAI_ARRAY_GET_TYPE +#define XAI_TILE2D_SET_TYPE XAI_ARRAY_SET_TYPE + +#define XAI_TILE2D_GET_ELEMENT_TYPE XAI_ARRAY_GET_ELEMENT_TYPE +#define XAI_TILE2D_GET_ELEMENT_SIZE XAI_ARRAY_GET_ELEMENT_SIZE +#define XAI_TILE2D_IS_TILE2D XAI_ARRAY_IS_TILE2D + +#define XAI_TILE2D_GET_FRAME_PTR(pTile) 
((pTile)->pFrame) +#define XAI_TILE2D_SET_FRAME_PTR(pTile, ptrFrame) (pTile)->pFrame = ((xai_frame *) (ptrFrame)) + +#define XAI_TILE2D_GET_X_COORD(pTile) ((pTile)->x) +#define XAI_TILE2D_SET_X_COORD(pTile, xcoord) (pTile)->x = ((int32_t) (xcoord)) + +#define XAI_TILE2D_GET_Y_COORD(pTile) ((pTile)->y) +#define XAI_TILE2D_SET_Y_COORD(pTile, ycoord) (pTile)->y = ((int32_t) (ycoord)) + +#define XAI_TILE2D_GET_EDGE_WIDTH(pTile) ((pTile)->edgeWidth) +#define XAI_TILE2D_SET_EDGE_WIDTH(pTile, eWidth) ((pTile)->edgeWidth = (uint16_t) (eWidth)) /* argument parenthesized so the cast applies to the whole expression */ + +#define XAI_TILE2D_GET_EDGE_HEIGHT(pTile) ((pTile)->edgeHeight) +#define XAI_TILE2D_SET_EDGE_HEIGHT(pTile, eHeight) ((pTile)->edgeHeight = (uint16_t) (eHeight)) /* argument parenthesized so the cast applies to the whole expression */ + +/*********************************** +* Other Macros +***********************************/ +#define XAI_TILE2D_CHECK_VIRTUAL_FRAME(pTile) ((pTile)->pFrame->pFrameBuff == NULL) +#define XAI_FRAME_CHECK_VIRTUAL_FRAME(pFrame) ((pFrame)->pFrameBuff == NULL) + +typedef enum { XAI_WHD, XAI_DWH, XAI_ID4WH, XAI_ID16WH, XAI_ID32WH, XAI_WHDN, XAI_NWHD, XAI_NDWH, XAI_DWHN, XAI_IN64DWH, XAI_IN32DWH, XAI_RMOD, XAI_IN16DWH, XAI_MTILE, XAI_CMTILE, XAI_RMOD_DWH_ID16WH, XAI_RMOD_InOutDepth32X, XAI_RMOD_ID4WH, XAI_ID16WHN, XAI_ID32WHN, XAI_IN128DWH, XAI_RMOD_DWH_I16_ID16WH, XAI_RMOD_ID16WH, XAI_RMOD_InOutDepth64X, XAI_UNKNOWN } xai_cnn_data_order; + +/****************************************************************************************************************** +* +* 3D definitions - extension of 2D definitions +* +* ****************************************************************************************************************/ +typedef struct xai_frame3DStruct +{ + void *pFrameBuff; + uint32_t frameBuffSize; + void *pFrameData; + int32_t dim1Size; + int32_t dim2Size; + int32_t dim1Pitch; // pitch in width dimension + uint8_t pixelRes; // in bits + uint8_t pixelPackFormat; // not used in XI library + uint16_t dim1Edge1; + uint16_t dim1Edge2; + uint16_t dim2Edge1; + uint16_t dim2Edge2; + uint16_t dim3Edge1; +
uint16_t dim3Edge1; + uint16_t dim3Edge2; + uint8_t paddingType; + // new fields + int32_t dim2Pitch; + int32_t dim3Size; + xai_cnn_data_order dataOrder; // WHD, DWH, etc. +} xai_frame3D, *xai_pFrame3D; + +// new access macros +#define XAI_FRAME3D_GET_DIM1(x) ((x)->dim1Size) +#define XAI_FRAME3D_SET_DIM1(x, v) ((x)->dim1Size = (v)) +#define XAI_FRAME3D_GET_DIM1_PITCH(x) ((x)->dim1Pitch) +#define XAI_FRAME3D_SET_DIM1_PITCH(x, v) ((x)->dim1Pitch = (v)) +#define XAI_FRAME3D_GET_DIM1_PITCH_IN_BYTES(x) ((x)->dim1Pitch * ((x)->pixelRes / 8 + (((x)->pixelRes & 7) != 0))) /* pitch * ceil(pixelRes / 8); the mask is parenthesized because != binds tighter than & */ +#define XAI_FRAME3D_GET_DIM2(x) ((x)->dim2Size) +#define XAI_FRAME3D_SET_DIM2(x, v) ((x)->dim2Size = (v)) +#define XAI_FRAME3D_GET_DIM2_PITCH(x) ((x)->dim2Pitch) +#define XAI_FRAME3D_SET_DIM2_PITCH(x, v) ((x)->dim2Pitch = (v)) +#define XAI_FRAME3D_GET_DIM2_PITCH_IN_BYTES(x) ((x)->dim2Pitch * ((x)->pixelRes / 8 + (((x)->pixelRes & 7) != 0))) /* pitch * ceil(pixelRes / 8); the mask is parenthesized because != binds tighter than & */ +#define XAI_FRAME3D_GET_DIM3(x) ((x)->dim3Size) +#define XAI_FRAME3D_SET_DIM3(x, v) ((x)->dim3Size = (v)) +#define XAI_FRAME3D_GET_DIM1_EDGE1(x) ((x)->dim1Edge1) +#define XAI_FRAME3D_SET_DIM1_EDGE1(x, v) ((x)->dim1Edge1 = (v)) +#define XAI_FRAME3D_GET_DIM1_EDGE2(x) ((x)->dim1Edge2) +#define XAI_FRAME3D_SET_DIM1_EDGE2(x, v) ((x)->dim1Edge2 = (v)) +#define XAI_FRAME3D_GET_DIM2_EDGE1(x) ((x)->dim2Edge1) +#define XAI_FRAME3D_SET_DIM2_EDGE1(x, v) ((x)->dim2Edge1 = (v)) +#define XAI_FRAME3D_GET_DIM2_EDGE2(x) ((x)->dim2Edge2) +#define XAI_FRAME3D_SET_DIM2_EDGE2(x, v) ((x)->dim2Edge2 = (v)) +#define XAI_FRAME3D_GET_DIM3_EDGE1(x) ((x)->dim3Edge1) +#define XAI_FRAME3D_SET_DIM3_EDGE1(x, v) ((x)->dim3Edge1 = (v)) +#define XAI_FRAME3D_GET_DIM3_EDGE2(x) ((x)->dim3Edge2) +#define XAI_FRAME3D_SET_DIM3_EDGE2(x, v) ((x)->dim3Edge2 = (v)) +#define XAI_FRAME3D_GET_DATA_ORDER(x) ((x)->dataOrder) +#define XAI_FRAME3D_SET_DATA_ORDER(x, v) ((x)->dataOrder = (v)) + +typedef struct +{ + int32_t dim1Size; + int32_t dim2Size; + int32_t dim3Size; +} xai_size3D; + +typedef struct +{ + int32_t
dim1Size; + int32_t dim2Size; + int32_t dim3Size; + int32_t dim4Size; +} xai_size4D; + +typedef struct +{ + uint16_t dim1Edge1; + uint16_t dim1Edge2; + uint16_t dim2Edge1; + uint16_t dim2Edge2; + uint16_t dim3Edge1; + uint16_t dim3Edge2; +} xai_edge3D; + +typedef struct +{ + int32_t dataType; +} xai_dataType; + +// 3D tile +#define XAI_TILE3D_FIELDS \ + uint32_t bufferSize; \ + int32_t dim1Size; \ + int32_t dim1Pitch; \ + uint32_t status; /* Currently not used, planned to be obsolete */ \ + uint16_t type; \ + int32_t dim2Size; \ + xai_frame3D *pFrame; /* changed to 3D frame */ \ + int32_t dim1Loc; /* dim1-loc of top-left active pixel in src frame */ \ + int32_t dim2Loc; /* dim2-loc of top-left active pixel in src frame */ \ + uint16_t dim1Edge1; \ + uint16_t dim2Edge1; \ + uint16_t dim1Edge2; \ + uint16_t dim2Edge2; \ + /* new fields */ \ + int32_t dim2Pitch; \ + int32_t dim3Size; \ + xai_cnn_data_order dataOrder; \ + int32_t dim3Loc; /* dim3-loc of top-left active pixel in src frame */ \ + uint16_t dim3Edge1; \ + uint16_t dim3Edge2; \ + /* Number of PTILES in a MEMTILE along a particular dimension. 
Used for MEMTILES only */ \ + int16_t numPtilesDim1; \ + int16_t numPtilesDim2; \ + int16_t numPtilesDim3; + +typedef struct xai_tile3DStruct +{ + void *pBuffer; + void *pData; + XAI_TILE3D_FIELDS +} xai_tile3D, *xai_pTile3D; + +typedef struct xai_tile3DStruct_64 +{ + uint64_t pBuffer; + uint64_t pData; + XAI_TILE3D_FIELDS +} xai_tile3D_64, *xai_pTile3D_64; + +#define XAI_TILE3D_GET_DIM1(x) ((x)->dim1Size) +#define XAI_TILE3D_SET_DIM1(x, v) ((x)->dim1Size = (v)) +#define XAI_TILE3D_GET_DIM1_PITCH(x) ((x)->dim1Pitch) +#define XAI_TILE3D_SET_DIM1_PITCH(x, v) ((x)->dim1Pitch = (v)) +#define XAI_TILE3D_GET_DIM2(x) ((x)->dim2Size) +#define XAI_TILE3D_SET_DIM2(x, v) ((x)->dim2Size = (v)) +#define XAI_TILE3D_GET_DIM2_PITCH(x) ((x)->dim2Pitch) +#define XAI_TILE3D_SET_DIM2_PITCH(x, v) ((x)->dim2Pitch = (v)) +#define XAI_TILE3D_GET_DIM3(x) ((x)->dim3Size) +#define XAI_TILE3D_SET_DIM3(x, v) ((x)->dim3Size = (v)) +#define XAI_TILE3D_GET_DATA_ORDER(x) ((x)->dataOrder) +#define XAI_TILE3D_SET_DATA_ORDER(x, v) ((x)->dataOrder = (v)) +#define XAI_TILE3D_GET_DIM1_COORD(x) ((x)->dim1Loc) +#define XAI_TILE3D_SET_DIM1_COORD(x, v) ((x)->dim1Loc = (v)) +#define XAI_TILE3D_GET_DIM2_COORD(x) ((x)->dim2Loc) +#define XAI_TILE3D_SET_DIM2_COORD(x, v) ((x)->dim2Loc = (v)) +#define XAI_TILE3D_GET_DIM3_COORD(x) ((x)->dim3Loc) +#define XAI_TILE3D_SET_DIM3_COORD(x, v) ((x)->dim3Loc = (v)) +#define XAI_TILE3D_GET_DIM1_EDGE1(x) ((x)->dim1Edge1) +#define XAI_TILE3D_SET_DIM1_EDGE1(x, v) ((x)->dim1Edge1 = (v)) +#define XAI_TILE3D_GET_DIM1_EDGE2(x) ((x)->dim1Edge2) +#define XAI_TILE3D_SET_DIM1_EDGE2(x, v) ((x)->dim1Edge2 = (v)) +#define XAI_TILE3D_GET_DIM2_EDGE1(x) ((x)->dim2Edge1) +#define XAI_TILE3D_SET_DIM2_EDGE1(x, v) ((x)->dim2Edge1 = (v)) +#define XAI_TILE3D_GET_DIM2_EDGE2(x) ((x)->dim2Edge2) +#define XAI_TILE3D_SET_DIM2_EDGE2(x, v) ((x)->dim2Edge2 = (v)) +#define XAI_TILE3D_GET_DIM3_EDGE1(x) ((x)->dim3Edge1) +#define XAI_TILE3D_SET_DIM3_EDGE1(x, v) ((x)->dim3Edge1 = (v)) +#define 
XAI_TILE3D_GET_DIM3_EDGE2(x) ((x)->dim3Edge2) +#define XAI_TILE3D_SET_DIM3_EDGE2(x, v) ((x)->dim3Edge2 = (v)) + +/***************************************** +* Data type definitions +*****************************************/ +#define XAI_TYPE_IS_TILE3D(type) (!(((type) & (XAI_TYPE_TILE_MASK)) ^ XAI_TYPE_TILE3D_BITS)) + +#define XAI_TILE3D_U4 (XAI_U4 | XAI_TYPE_TILE3D_BITS) +#define XAI_TILE3D_U8 (XAI_U8 | XAI_TYPE_TILE3D_BITS) +#define XAI_TILE3D_U16 (XAI_U16 | XAI_TYPE_TILE3D_BITS) +#define XAI_TILE3D_U32 (XAI_U32 | XAI_TYPE_TILE3D_BITS) +#define XAI_TILE3D_U64 (XAI_U64 | XAI_TYPE_TILE3D_BITS) +#define XAI_TILE3D_U128 (XAI_U128 | XAI_TYPE_TILE3D_BITS) + +#define XAI_TILE3D_S4 (XAI_S4 | XAI_TYPE_TILE3D_BITS) +#define XAI_TILE3D_S8 (XAI_S8 | XAI_TYPE_TILE3D_BITS) +#define XAI_TILE3D_S16 (XAI_S16 | XAI_TYPE_TILE3D_BITS) +#define XAI_TILE3D_S32 (XAI_S32 | XAI_TYPE_TILE3D_BITS) +#define XAI_TILE3D_S64 (XAI_S64 | XAI_TYPE_TILE3D_BITS) +#define XAI_TILE3D_S128 (XAI_S128 | XAI_TYPE_TILE3D_BITS) + +#define XAI_TILE3D_F8 (XAI_F8 | XAI_TYPE_TILE3D_BITS) +#define XAI_TILE3D_F16 (XAI_F16 | XAI_TYPE_TILE3D_BITS) +#define XAI_TILE3D_F32 (XAI_F32 | XAI_TYPE_TILE3D_BITS) +#define XAI_TILE3D_F64 (XAI_F64 | XAI_TYPE_TILE3D_BITS) +#define XAI_TILE3D_F128 (XAI_F128 | XAI_TYPE_TILE3D_BITS) + +/***************************************** +* 3D Frame Access Macros +*****************************************/ +#define XAI_FRAME3D_GET_BUFF_PTR XAI_FRAME_GET_BUFF_PTR +#define XAI_FRAME3D_SET_BUFF_PTR XAI_FRAME_SET_BUFF_PTR + +#define XAI_FRAME3D_GET_BUFF_SIZE XAI_FRAME_GET_BUFF_SIZE +#define XAI_FRAME3D_SET_BUFF_SIZE XAI_FRAME_SET_BUFF_SIZE + +#define XAI_FRAME3D_GET_DATA_PTR XAI_FRAME_GET_DATA_PTR +#define XAI_FRAME3D_SET_DATA_PTR XAI_FRAME_SET_DATA_PTR + +#define XAI_FRAME3D_GET_PIXEL_RES XAI_FRAME_GET_PIXEL_RES +#define XAI_FRAME3D_SET_PIXEL_RES XAI_FRAME_SET_PIXEL_RES + +#define XAI_FRAME3D_GET_PIXEL_FORMAT XAI_FRAME_GET_PIXEL_FORMAT +#define XAI_FRAME3D_SET_PIXEL_FORMAT 
XAI_FRAME_SET_PIXEL_FORMAT + +#define XAI_FRAME3D_GET_PADDING_TYPE XAI_FRAME_GET_PADDING_TYPE +#define XAI_FRAME3D_SET_PADDING_TYPE XAI_FRAME_SET_PADDING_TYPE + +/***************************************** +* 3D Tile Access Macros (aliases of the 2D tile accessors) +*****************************************/ +#define XAI_TILE3D_GET_BUFF_PTR XAI_TILE2D_GET_BUFF_PTR +#define XAI_TILE3D_SET_BUFF_PTR XAI_TILE2D_SET_BUFF_PTR +#define XAI_TILE3D_SET_BUFF_PTR_COEFF XAI_TILE2D_SET_BUFF_PTR_COEFF + +#define XAI_TILE3D_GET_BUFF_SIZE XAI_TILE2D_GET_BUFF_SIZE +#define XAI_TILE3D_SET_BUFF_SIZE XAI_TILE2D_SET_BUFF_SIZE + +#define XAI_TILE3D_GET_DATA_PTR XAI_TILE2D_GET_DATA_PTR +#define XAI_TILE3D_SET_DATA_PTR XAI_TILE2D_SET_DATA_PTR +#define XAI_TILE3D_SET_DATA_PTR_COEFF XAI_TILE2D_SET_DATA_PTR_COEFF + +#define XAI_TILE3D_GET_STATUS_FLAGS XAI_TILE2D_GET_STATUS_FLAGS +#define XAI_TILE3D_SET_STATUS_FLAGS XAI_TILE2D_SET_STATUS_FLAGS + +#define XAI_TILE3D_GET_TYPE XAI_TILE2D_GET_TYPE +#define XAI_TILE3D_SET_TYPE XAI_TILE2D_SET_TYPE + +#define XAI_TILE3D_GET_ELEMENT_TYPE XAI_TILE2D_GET_ELEMENT_TYPE +#define XAI_TILE3D_GET_ELEMENT_SIZE XAI_TILE2D_GET_ELEMENT_SIZE +#define XAI_TILE3D_IS_TILE XAI_TILE2D_IS_TILE2D /* NOTE(review): expands to the 2D tile check, while XAI_TYPE_IS_TILE3D tests the 3D tile bits - confirm this alias is intended */ + +#define XAI_TILE3D_GET_FRAME_PTR(pTile3D) ((pTile3D)->pFrame) +#define XAI_TILE3D_SET_FRAME_PTR(pTile3D, ptrFrame) (pTile3D)->pFrame = ((xai_pFrame3D) (ptrFrame)) + +#define XAI_TILE3D_CHECK_STATUS_FLAGS_DMA_ONGOING XAI_TILE2D_CHECK_STATUS_FLAGS_DMA_ONGOING + +/*********************************** +* Other Macros +***********************************/ +#define XAI_TILE3D_CHECK_VIRTUAL_FRAME XAI_TILE2D_CHECK_VIRTUAL_FRAME +#define XAI_FRAME3D_CHECK_VIRTUAL_FRAME XAI_FRAME_CHECK_VIRTUAL_FRAME + +typedef enum +{ + XAI_TILE_UNALIGNED, + XAI_EDGE_ALIGNED_32, + XAI_DATA_ALIGNED_32, + XAI_EDGE_ALIGNED_64, + XAI_DATA_ALIGNED_64, + EDGE_ALIGNED_128, + DATA_ALIGNED_128, +} xai_buffer_align_type_t; + +// Only Q8, 240 and 341 uses alignment = 127.
for P6,P1 and Q7 like dsps alignment = 127 is not supported +#define XAI_SETUP_TILE3D(type, pTile, pBuf, pFrame, bufSize, dim1Size, dim2Size, dim3Size, dim1Pitch, dim2Pitch, \ + dim1Edge1, dim1Edge2, dim2Edge1, dim2Edge2, dim3Edge1, dim3Edge2, dim1Loc, dim2Loc, dim3Loc, dataOrder, \ + alignType) \ + { \ + XAI_TILE3D_SET_TYPE(pTile, type); \ + XAI_TILE3D_SET_FRAME_PTR(pTile, pFrame); \ + XAI_TILE3D_SET_BUFF_PTR(pTile, pBuf); \ + XAI_TILE3D_SET_BUFF_SIZE(pTile, bufSize); \ + XAI_TILE3D_SET_DIM1(pTile, dim1Size); \ + XAI_TILE3D_SET_DIM2(pTile, dim2Size); \ + XAI_TILE3D_SET_DIM3(pTile, dim3Size); \ + XAI_TILE3D_SET_DIM1_PITCH(pTile, dim1Pitch); \ + XAI_TILE3D_SET_DIM2_PITCH(pTile, dim2Pitch); \ + uint8_t *edgePtr = (uint8_t *) pBuf, *dataPtr; \ + int32_t alignment = 127; \ + if ((alignType == XAI_EDGE_ALIGNED_64) || (alignType == XAI_DATA_ALIGNED_64)) { alignment = 63; } \ + if ((alignType == XAI_EDGE_ALIGNED_32) || (alignType == XAI_DATA_ALIGNED_32)) { alignment = 31; } \ + if ((alignType == XAI_EDGE_ALIGNED_32) || (alignType == XAI_EDGE_ALIGNED_64) || (alignType == EDGE_ALIGNED_128)) \ + { \ + edgePtr = (uint8_t *) (((uintptr_t) (pBuf) + alignment) & (~alignment)); \ + } \ + XAI_TILE3D_SET_DATA_PTR(pTile, edgePtr + ((dim3Edge1) * (dim2Pitch) + \ + (dim2Edge1) * (dim1Pitch) + (dim1Edge1)) * XAI_TILE3D_GET_ELEMENT_SIZE(pTile)); \ + if ((alignType == XAI_DATA_ALIGNED_32) || (alignType == XAI_DATA_ALIGNED_64) || (alignType == DATA_ALIGNED_128)) \ + { \ + dataPtr = (uint8_t *) XAI_TILE3D_GET_DATA_PTR(pTile); \ + dataPtr = (uint8_t *) (((uintptr_t) (dataPtr) + alignment) & (~alignment)); \ + XAI_TILE3D_SET_DATA_PTR(pTile, dataPtr); \ + } \ + XAI_TILE3D_SET_DIM1_EDGE1(pTile, dim1Edge1); \ + XAI_TILE3D_SET_DIM1_EDGE2(pTile, dim1Edge2); \ + XAI_TILE3D_SET_DIM2_EDGE1(pTile, dim2Edge1); \ + XAI_TILE3D_SET_DIM2_EDGE2(pTile, dim2Edge2); \ + XAI_TILE3D_SET_DIM3_EDGE1(pTile, dim3Edge1); \ + XAI_TILE3D_SET_DIM3_EDGE2(pTile, dim3Edge2); \ + XAI_TILE3D_SET_DIM1_COORD(pTile, dim1Loc); 
\ + XAI_TILE3D_SET_DIM2_COORD(pTile, dim2Loc); \ + XAI_TILE3D_SET_DIM3_COORD(pTile, dim3Loc); \ + XAI_TILE3D_SET_DATA_ORDER(pTile, dataOrder); \ + } + +#define XAI_SETUP_FRAME3D(pFrame, pFrameBuffer, bufSize, dim1Size, dim2Size, dim3Size, dim1Pitch, dim2Pitch, \ + dim1Edge1, dim1Edge2, dim2Edge1, dim2Edge2, dim3Edge1, dim3Edge2, pixRes, pixPackFormat, paddingType, \ + dataOrder) \ + { \ + XAI_FRAME3D_SET_BUFF_PTR(pFrame, pFrameBuffer); \ + XAI_FRAME3D_SET_BUFF_SIZE(pFrame, bufSize); \ + XAI_FRAME3D_SET_DIM1(pFrame, dim1Size); \ + XAI_FRAME3D_SET_DIM2(pFrame, dim2Size); \ + XAI_FRAME3D_SET_DIM3(pFrame, dim3Size); \ + XAI_FRAME3D_SET_DIM1_PITCH(pFrame, dim1Pitch); \ + XAI_FRAME3D_SET_DIM2_PITCH(pFrame, dim2Pitch); \ + XAI_FRAME3D_SET_DATA_PTR(pFrame, pFrameBuffer + ((dim3Edge1) * (dim2Pitch) + \ + (dim2Edge1) * (dim1Pitch) + (dim1Edge1)) * pixRes); \ + XAI_FRAME3D_SET_DIM1_EDGE1(pFrame, dim1Edge1); \ + XAI_FRAME3D_SET_DIM1_EDGE2(pFrame, dim1Edge2); \ + XAI_FRAME3D_SET_DIM2_EDGE1(pFrame, dim2Edge1); \ + XAI_FRAME3D_SET_DIM2_EDGE2(pFrame, dim2Edge2); \ + XAI_FRAME3D_SET_DIM3_EDGE1(pFrame, dim3Edge1); \ + XAI_FRAME3D_SET_DIM3_EDGE2(pFrame, dim3Edge2); \ + XAI_FRAME3D_SET_PIXEL_RES(pFrame, pixRes); \ + XAI_FRAME3D_SET_PIXEL_FORMAT(pFrame, pixPackFormat); \ + XAI_FRAME3D_SET_PADDING_TYPE(pFrame, paddingType); \ + XAI_FRAME3D_SET_DATA_ORDER(pFrame, dataOrder); \ + } + +#define XAI_COPY_FRAME3D_TO_TILE3D(frame, tile) { \ + XAI_TILE3D_SET_DIM1(tile, XAI_FRAME3D_GET_DIM1(frame)); \ + XAI_TILE3D_SET_DIM1_PITCH(tile, XAI_FRAME3D_GET_DIM1_PITCH(frame)); \ + XAI_TILE3D_SET_DIM1_EDGE1(tile, XAI_FRAME3D_GET_DIM1_EDGE1(frame)); \ + XAI_TILE3D_SET_DIM1_EDGE2(tile, XAI_FRAME3D_GET_DIM1_EDGE2(frame)); \ + XAI_TILE3D_SET_DIM2(tile, XAI_FRAME3D_GET_DIM2(frame)); \ + XAI_TILE3D_SET_DIM2_PITCH(tile, XAI_FRAME3D_GET_DIM2_PITCH(frame)); \ + XAI_TILE3D_SET_DIM2_EDGE1(tile, XAI_FRAME3D_GET_DIM2_EDGE1(frame)); \ + XAI_TILE3D_SET_DIM2_EDGE2(tile, XAI_FRAME3D_GET_DIM2_EDGE2(frame)); \ + 
XAI_TILE3D_SET_DIM3(tile, XAI_FRAME3D_GET_DIM3(frame)); \ + XAI_TILE3D_SET_DIM3_EDGE1(tile, XAI_FRAME3D_GET_DIM3_EDGE1(frame)); \ + XAI_TILE3D_SET_DIM3_EDGE2(tile, XAI_FRAME3D_GET_DIM3_EDGE2(frame)); \ + XAI_TILE3D_SET_DATA_PTR(tile, XAI_FRAME3D_GET_DATA_PTR(frame)); \ + XAI_TILE3D_SET_DATA_ORDER(tile, XAI_FRAME3D_GET_DATA_ORDER(frame)); \ +} + +#define XAI_COPY_FRAME3D_TO_FRAME3D(frameIn, frameOut) { /* copies sizes, pitches, edges, data pointer, data order and pixel resolution from frameIn to frameOut */ \ + XAI_FRAME3D_SET_DIM1(frameOut, XAI_FRAME3D_GET_DIM1(frameIn)); \ + XAI_FRAME3D_SET_DIM1_PITCH(frameOut, XAI_FRAME3D_GET_DIM1_PITCH(frameIn)); \ + XAI_FRAME3D_SET_DIM1_EDGE1(frameOut, XAI_FRAME3D_GET_DIM1_EDGE1(frameIn)); \ + XAI_FRAME3D_SET_DIM1_EDGE2(frameOut, XAI_FRAME3D_GET_DIM1_EDGE2(frameIn)); \ + XAI_FRAME3D_SET_DIM2(frameOut, XAI_FRAME3D_GET_DIM2(frameIn)); \ + XAI_FRAME3D_SET_DIM2_PITCH(frameOut, XAI_FRAME3D_GET_DIM2_PITCH(frameIn)); \ + XAI_FRAME3D_SET_DIM2_EDGE1(frameOut, XAI_FRAME3D_GET_DIM2_EDGE1(frameIn)); \ + XAI_FRAME3D_SET_DIM2_EDGE2(frameOut, XAI_FRAME3D_GET_DIM2_EDGE2(frameIn)); \ + XAI_FRAME3D_SET_DIM3(frameOut, XAI_FRAME3D_GET_DIM3(frameIn)); \ + XAI_FRAME3D_SET_DIM3_EDGE1(frameOut, XAI_FRAME3D_GET_DIM3_EDGE1(frameIn)); /* dim3 edges come from the dim3 getters */ \ + XAI_FRAME3D_SET_DIM3_EDGE2(frameOut, XAI_FRAME3D_GET_DIM3_EDGE2(frameIn)); \ + XAI_FRAME3D_SET_DATA_PTR(frameOut, XAI_FRAME3D_GET_DATA_PTR(frameIn)); \ + XAI_FRAME3D_SET_DATA_ORDER(frameOut, XAI_FRAME3D_GET_DATA_ORDER(frameIn)); \ + XAI_FRAME3D_SET_PIXEL_RES(frameOut, XAI_FRAME3D_GET_PIXEL_RES(frameIn)); \ +} + +#define XAI_COPY_TILE3D_TO_TILE3D(tileIn, tileOut) { \ + XAI_TILE3D_SET_DIM1(tileOut, XAI_TILE3D_GET_DIM1(tileIn)); \ + XAI_TILE3D_SET_DIM1_PITCH(tileOut, XAI_TILE3D_GET_DIM1_PITCH(tileIn)); \ + XAI_TILE3D_SET_DIM1_EDGE1(tileOut, XAI_TILE3D_GET_DIM1_EDGE1(tileIn)); \ + XAI_TILE3D_SET_DIM1_EDGE2(tileOut, XAI_TILE3D_GET_DIM1_EDGE2(tileIn)); \ + XAI_TILE3D_SET_DIM2(tileOut, XAI_TILE3D_GET_DIM2(tileIn)); \ + XAI_TILE3D_SET_DIM2_PITCH(tileOut, XAI_TILE3D_GET_DIM2_PITCH(tileIn)); \ + XAI_TILE3D_SET_DIM2_EDGE1(tileOut,
XAI_TILE3D_GET_DIM2_EDGE1(tileIn)); \ + XAI_TILE3D_SET_DIM2_EDGE2(tileOut, XAI_TILE3D_GET_DIM2_EDGE2(tileIn)); \ + XAI_TILE3D_SET_DIM3(tileOut, XAI_TILE3D_GET_DIM3(tileIn)); \ + XAI_TILE3D_SET_DIM3_EDGE1(tileOut, XAI_TILE3D_GET_DIM3_EDGE1(tileIn)); \ + XAI_TILE3D_SET_DIM3_EDGE2(tileOut, XAI_TILE3D_GET_DIM3_EDGE2(tileIn)); \ + XAI_TILE3D_SET_DATA_PTR(tileOut, XAI_TILE3D_GET_DATA_PTR(tileIn)); \ + XAI_TILE3D_SET_DATA_ORDER(tileOut, XAI_TILE3D_GET_DATA_ORDER(tileIn)); \ +} + +// Assumes 8 bit pixRes and Edge1 = Edge2 +#define XAI_TILE3D_UPDATE_EDGE_DIM1(pTile, newEdgeSize) \ + { \ + uint16_t currEdgeSize = (uint16_t) XAI_TILE3D_GET_DIM1_EDGE1(pTile); \ + uint32_t dim1Pitch = (uint32_t) XAI_TILE3D_GET_DIM1_PITCH(pTile); \ + uintptr_t dataU32 = (uintptr_t) XAI_TILE3D_GET_DATA_PTR(pTile); \ + dataU32 = dataU32 + newEdgeSize - currEdgeSize; \ + XAI_TILE3D_SET_DATA_PTR(pTile, (void *) dataU32); \ + XAI_TILE3D_SET_DIM1_EDGE1(pTile, newEdgeSize); \ + XAI_TILE3D_SET_DIM1_EDGE2(pTile, newEdgeSize); \ + XAI_TILE3D_SET_DIM1(pTile, dim1Pitch - 2 * newEdgeSize); \ + } + +// Assumes 8 bit pixRes and Edge1 = Edge2 +#define XAI_TILE3D_UPDATE_EDGE_DIM2(pTile, newEdgeSize) \ + { \ + uint16_t currEdgeSize = (uint16_t) XAI_TILE3D_GET_DIM2_EDGE1(pTile); \ + uint32_t dim1Pitch = (uint32_t) XAI_TILE3D_GET_DIM1_PITCH(pTile); \ + uint16_t dim2Size = (uint16_t) XAI_TILE3D_GET_DIM2(pTile); \ + uintptr_t dataU32 = (uintptr_t) XAI_TILE3D_GET_DATA_PTR(pTile); \ + dataU32 = dataU32 + dim1Pitch * (newEdgeSize - currEdgeSize); \ + XAI_TILE3D_SET_DATA_PTR(pTile, (void *) dataU32); \ + XAI_TILE3D_SET_DIM2_EDGE1(pTile, newEdgeSize); \ + XAI_TILE3D_SET_DIM2_EDGE2(pTile, newEdgeSize); \ + XAI_TILE3D_SET_DIM2(pTile, dim2Size + 2 * (currEdgeSize - newEdgeSize)); \ + } + +// Assumes 8 bit pixRes and Edge1 = Edge2 +#define XAI_TILE3D_UPDATE_EDGE_DIM3(pTile, newEdgeSize) \ + { \ + uint16_t currEdgeSize = (uint16_t) XAI_TILE3D_GET_DIM3_EDGE1(pTile); \ + uint32_t dim2Pitch = (uint32_t) 
XAI_TILE3D_GET_DIM2_PITCH(pTile); \ + uint16_t dim3Size = (uint16_t) XAI_TILE3D_GET_DIM3(pTile); \ + uintptr_t dataU32 = (uintptr_t) XAI_TILE3D_GET_DATA_PTR(pTile); \ + dataU32 = dataU32 + dim2Pitch * (newEdgeSize - currEdgeSize); \ + XAI_TILE3D_SET_DATA_PTR(pTile, (void *) dataU32); \ + XAI_TILE3D_SET_DIM3_EDGE1(pTile, newEdgeSize); \ + XAI_TILE3D_SET_DIM3_EDGE2(pTile, newEdgeSize); \ + XAI_TILE3D_SET_DIM3(pTile, dim3Size + 2 * (currEdgeSize - newEdgeSize)); \ + } + +#define XAI_TILE3D_UPDATE_DIMENSIONS(pTile, dim1Loc, dim2Loc, dim3Loc, dim1Size, dim2Size, dim3Size, \ + dim1Pitch, dim2Pitch) \ + { \ + XAI_TILE3D_SET_DIM1_COORD(pTile, dim1Loc); \ + XAI_TILE3D_SET_DIM2_COORD(pTile, dim2Loc); \ + XAI_TILE3D_SET_DIM3_COORD(pTile, dim3Loc); \ + XAI_TILE3D_SET_DIM1(pTile, dim1Size); \ + XAI_TILE3D_SET_DIM2(pTile, dim2Size); \ + XAI_TILE3D_SET_DIM3(pTile, dim3Size); \ + XAI_TILE3D_SET_DIM1_PITCH(pTile, dim1Pitch); \ + XAI_TILE3D_SET_DIM2_PITCH(pTile, dim2Pitch); \ + } + +/****************************************************************************************************************** +* +* 4D definitions - extension of 3D definitions +* +* ****************************************************************************************************************/ +typedef struct xai_frame4DStruct +{ + void *pFrameBuff; + uint32_t frameBuffSize; + void *pFrameData; + int32_t dim1Size; + int32_t dim2Size; + int32_t dim1Pitch; // pitch in width dimension + uint8_t pixelRes; // in bits + uint8_t pixelPackFormat; + uint16_t dim1Edge1; + uint16_t dim1Edge2; + uint16_t dim2Edge1; + uint16_t dim2Edge2; + uint16_t dim3Edge1; + uint16_t dim3Edge2; + uint8_t paddingType; + // new fields + int32_t dim2Pitch; + int32_t dim3Size; + xai_cnn_data_order dataOrder; // WHD, DWH, WHDN, NWHD, etc. 
+ // new fields + int32_t dim3Pitch; + int32_t dim4Size; +} xai_frame4D, *xai_pFrame4D; + +// new access macros +#define XAI_FRAME4D_GET_DIM1 XAI_FRAME3D_GET_DIM1 +#define XAI_FRAME4D_SET_DIM1 XAI_FRAME3D_SET_DIM1 +#define XAI_FRAME4D_GET_DIM1_PITCH XAI_FRAME3D_GET_DIM1_PITCH +#define XAI_FRAME4D_SET_DIM1_PITCH XAI_FRAME3D_SET_DIM1_PITCH +#define XAI_FRAME4D_GET_DIM1_PITCH_IN_BYTES XAI_FRAME3D_GET_DIM1_PITCH_IN_BYTES +#define XAI_FRAME4D_GET_DIM2 XAI_FRAME3D_GET_DIM2 +#define XAI_FRAME4D_SET_DIM2 XAI_FRAME3D_SET_DIM2 +#define XAI_FRAME4D_GET_DIM2_PITCH XAI_FRAME3D_GET_DIM2_PITCH +#define XAI_FRAME4D_SET_DIM2_PITCH XAI_FRAME3D_SET_DIM2_PITCH +#define XAI_FRAME4D_GET_DIM2_PITCH_IN_BYTES XAI_FRAME3D_GET_DIM2_PITCH_IN_BYTES +#define XAI_FRAME4D_GET_DIM3 XAI_FRAME3D_GET_DIM3 +#define XAI_FRAME4D_SET_DIM3 XAI_FRAME3D_SET_DIM3 +#define XAI_FRAME4D_GET_DATA_ORDER XAI_FRAME3D_GET_DATA_ORDER +#define XAI_FRAME4D_SET_DATA_ORDER XAI_FRAME3D_SET_DATA_ORDER +#define XAI_FRAME4D_GET_DIM1_EDGE1 XAI_FRAME3D_GET_DIM1_EDGE1 +#define XAI_FRAME4D_SET_DIM1_EDGE1 XAI_FRAME3D_SET_DIM1_EDGE1 +#define XAI_FRAME4D_GET_DIM1_EDGE2 XAI_FRAME3D_GET_DIM1_EDGE2 +#define XAI_FRAME4D_SET_DIM1_EDGE2 XAI_FRAME3D_SET_DIM1_EDGE2 +#define XAI_FRAME4D_GET_DIM2_EDGE1 XAI_FRAME3D_GET_DIM2_EDGE1 +#define XAI_FRAME4D_SET_DIM2_EDGE1 XAI_FRAME3D_SET_DIM2_EDGE1 +#define XAI_FRAME4D_GET_DIM2_EDGE2 XAI_FRAME3D_GET_DIM2_EDGE2 +#define XAI_FRAME4D_SET_DIM2_EDGE2 XAI_FRAME3D_SET_DIM2_EDGE2 +#define XAI_FRAME4D_GET_DIM4(x) ((x)->dim4Size) +#define XAI_FRAME4D_SET_DIM4(x, v) ((x)->dim4Size = (v)) +#define XAI_FRAME4D_GET_DIM3_PITCH(x) ((x)->dim3Pitch) +#define XAI_FRAME4D_SET_DIM3_PITCH(x, v) ((x)->dim3Pitch = (v)) +#define XAI_FRAME4D_GET_DIM3_EDGE1(x) ((x)->dim3Edge1) +#define XAI_FRAME4D_SET_DIM3_EDGE1(x, v) ((x)->dim3Edge1 = (v)) +#define XAI_FRAME4D_GET_DIM3_EDGE2(x) ((x)->dim3Edge2) +#define XAI_FRAME4D_SET_DIM3_EDGE2(x, v) ((x)->dim3Edge2 = (v)) + +// 4D tile +#define XAI_TILE4D_FIELDS \ + uint32_t bufferSize; \ 
+ int32_t dim1Size; \ + int32_t dim1Pitch; \ + uint32_t status; /*Currently not used, planned to be obsolete*/ \ + uint16_t type; \ + int32_t dim2Size; \ + xai_frame4D *pFrame; \ + int32_t dim1Loc; /* dim1-loc of top-left active pixel in src frame */ \ + int32_t dim2Loc; /* dim2-loc of top-left active pixel in src frame */ \ + uint16_t dim1Edge1; \ + uint16_t dim2Edge1; \ + uint16_t dim1Edge2; \ + uint16_t dim2Edge2; \ + /* new fields */ \ + int32_t dim2Pitch; \ + int32_t dim3Size; \ + xai_cnn_data_order dataOrder; \ + int32_t dim3Loc; /* dim3-loc of top-left active pixel in src frame */ \ + uint16_t dim3Edge1; \ + uint16_t dim3Edge2; \ + /* new fields */ \ + int32_t dim3Pitch; \ + int32_t dim4Size; /* 4th dimension is num for lack of better term */ \ + int32_t dim4Loc; /* dim4-loc of top-left active pixel in src frame */ \ + /* Number of PTILES in a MEMTILE along a particular dimension. Used for MEMTILES only */ \ + int16_t numPtilesDim1; \ + int16_t numPtilesDim2; \ + int16_t numPtilesDim3; + +typedef struct xai_tile4DStruct +{ + void *pBuffer; + void *pData; + XAI_TILE4D_FIELDS +#ifdef GLOW_BUILD + int8_t printFlag; + const char *nodeName; + const char *fileName; +#endif // GLOW_BUILD +} xai_tile4D, *xai_pTile4D; + +typedef struct xai_tile4DStruct_64 +{ + uint64_t pBuffer; + uint64_t pData; + XAI_TILE4D_FIELDS +#ifdef GLOW_BUILD + int8_t printFlag; + const char *nodeName; + const char *fileName; +#endif // GLOW_BUILD +} xai_tile4D_64, *xai_pTile4D_64; + +#define XAI_TILE4D_GET_DIM1 XAI_TILE3D_GET_DIM1 +#define XAI_TILE4D_SET_DIM1 XAI_TILE3D_SET_DIM1 +#define XAI_TILE4D_GET_DIM1_PITCH XAI_TILE3D_GET_DIM1_PITCH +#define XAI_TILE4D_SET_DIM1_PITCH XAI_TILE3D_SET_DIM1_PITCH +#define XAI_TILE4D_GET_DIM2 XAI_TILE3D_GET_DIM2 +#define XAI_TILE4D_SET_DIM2 XAI_TILE3D_SET_DIM2 +#define XAI_TILE4D_GET_DIM2_PITCH XAI_TILE3D_GET_DIM2_PITCH +#define XAI_TILE4D_SET_DIM2_PITCH XAI_TILE3D_SET_DIM2_PITCH +#define XAI_TILE4D_GET_DIM3 XAI_TILE3D_GET_DIM3 +#define XAI_TILE4D_SET_DIM3 
XAI_TILE3D_SET_DIM3 +#define XAI_TILE4D_GET_DIM3_PITCH(x) ((x)->dim3Pitch) +#define XAI_TILE4D_SET_DIM3_PITCH(x, v) ((x)->dim3Pitch = (v)) +#define XAI_TILE4D_GET_DIM4(x) ((x)->dim4Size) +#define XAI_TILE4D_SET_DIM4(x, v) ((x)->dim4Size = (v)) +#define XAI_TILE4D_GET_DIM1_EDGE1 XAI_TILE3D_GET_DIM1_EDGE1 +#define XAI_TILE4D_SET_DIM1_EDGE1 XAI_TILE3D_SET_DIM1_EDGE1 +#define XAI_TILE4D_GET_DIM1_EDGE2 XAI_TILE3D_GET_DIM1_EDGE2 +#define XAI_TILE4D_SET_DIM1_EDGE2 XAI_TILE3D_SET_DIM1_EDGE2 +#define XAI_TILE4D_GET_DIM2_EDGE1 XAI_TILE3D_GET_DIM2_EDGE1 +#define XAI_TILE4D_SET_DIM2_EDGE1 XAI_TILE3D_SET_DIM2_EDGE1 +#define XAI_TILE4D_GET_DIM2_EDGE2 XAI_TILE3D_GET_DIM2_EDGE2 +#define XAI_TILE4D_SET_DIM2_EDGE2 XAI_TILE3D_SET_DIM2_EDGE2 +#define XAI_TILE4D_GET_DIM3_EDGE1 XAI_TILE3D_GET_DIM3_EDGE1 +#define XAI_TILE4D_SET_DIM3_EDGE1 XAI_TILE3D_SET_DIM3_EDGE1 +#define XAI_TILE4D_GET_DIM3_EDGE2 XAI_TILE3D_GET_DIM3_EDGE2 +#define XAI_TILE4D_SET_DIM3_EDGE2 XAI_TILE3D_SET_DIM3_EDGE2 +#define XAI_TILE4D_GET_DATA_ORDER XAI_TILE3D_GET_DATA_ORDER +#define XAI_TILE4D_SET_DATA_ORDER XAI_TILE3D_SET_DATA_ORDER +#define XAI_TILE4D_GET_DIM1_COORD XAI_TILE3D_GET_DIM1_COORD +#define XAI_TILE4D_SET_DIM1_COORD XAI_TILE3D_SET_DIM1_COORD +#define XAI_TILE4D_GET_DIM2_COORD XAI_TILE3D_GET_DIM2_COORD +#define XAI_TILE4D_SET_DIM2_COORD XAI_TILE3D_SET_DIM2_COORD +#define XAI_TILE4D_GET_DIM3_COORD XAI_TILE3D_GET_DIM3_COORD +#define XAI_TILE4D_SET_DIM3_COORD XAI_TILE3D_SET_DIM3_COORD +#define XAI_TILE4D_GET_DIM4_COORD(x) ((x)->dim4Loc) +#define XAI_TILE4D_SET_DIM4_COORD(x, v) ((x)->dim4Loc = (v)) +#ifdef GLOW_BUILD +#define XAI_TILE4D_GET_PRINT_FLAG(x) ((x)->printFlag) +#define XAI_TILE4D_SET_PRINT_FLAG(x, v) ((x)->printFlag = (v)) +#define XAI_TILE4D_GET_NODE_NAME(x) ((x)->nodeName) +#define XAI_TILE4D_SET_NODE_NAME(x, v) ((x)->nodeName = (v)) +#define XAI_TILE4D_GET_FILE_NAME(x) ((x)->fileName) +#define XAI_TILE4D_SET_FILE_NAME(x, v) ((x)->fileName = (v)) +#endif + 
+/***************************************** +* Data type definitions: element type OR'ed with the 4D tile bits +*****************************************/ +#define XAI_TYPE_IS_TILE4D(type) (!(((type) & (XAI_TYPE_TILE_MASK)) ^ XAI_TYPE_TILE4D_BITS)) + +#define XAI_TILE4D_U4 (XAI_U4 | XAI_TYPE_TILE4D_BITS) +#define XAI_TILE4D_U8 (XAI_U8 | XAI_TYPE_TILE4D_BITS) +#define XAI_TILE4D_U16 (XAI_U16 | XAI_TYPE_TILE4D_BITS) +#define XAI_TILE4D_U32 (XAI_U32 | XAI_TYPE_TILE4D_BITS) +#define XAI_TILE4D_U64 (XAI_U64 | XAI_TYPE_TILE4D_BITS) +#define XAI_TILE4D_U128 (XAI_U128 | XAI_TYPE_TILE4D_BITS) + +#define XAI_TILE4D_S4 (XAI_S4 | XAI_TYPE_TILE4D_BITS) /* signed 4-bit element type, matching XAI_TILE3D_S4 */ +#define XAI_TILE4D_S8 (XAI_S8 | XAI_TYPE_TILE4D_BITS) +#define XAI_TILE4D_S16 (XAI_S16 | XAI_TYPE_TILE4D_BITS) +#define XAI_TILE4D_S32 (XAI_S32 | XAI_TYPE_TILE4D_BITS) +#define XAI_TILE4D_S64 (XAI_S64 | XAI_TYPE_TILE4D_BITS) +#define XAI_TILE4D_S128 (XAI_S128 | XAI_TYPE_TILE4D_BITS) + +#define XAI_TILE4D_F8 (XAI_F8 | XAI_TYPE_TILE4D_BITS) +#define XAI_TILE4D_F16 (XAI_F16 | XAI_TYPE_TILE4D_BITS) +#define XAI_TILE4D_F32 (XAI_F32 | XAI_TYPE_TILE4D_BITS) +#define XAI_TILE4D_F64 (XAI_F64 | XAI_TYPE_TILE4D_BITS) +#define XAI_TILE4D_F128 (XAI_F128 | XAI_TYPE_TILE4D_BITS) + +/***************************************** +* 4D Frame Access Macros +*****************************************/ +#define XAI_FRAME4D_GET_BUFF_PTR XAI_FRAME_GET_BUFF_PTR +#define XAI_FRAME4D_SET_BUFF_PTR XAI_FRAME_SET_BUFF_PTR + +#define XAI_FRAME4D_GET_BUFF_SIZE XAI_FRAME_GET_BUFF_SIZE +#define XAI_FRAME4D_SET_BUFF_SIZE XAI_FRAME_SET_BUFF_SIZE + +#define XAI_FRAME4D_GET_DATA_PTR XAI_FRAME_GET_DATA_PTR +#define XAI_FRAME4D_SET_DATA_PTR XAI_FRAME_SET_DATA_PTR + +#define XAI_FRAME4D_GET_PIXEL_RES XAI_FRAME_GET_PIXEL_RES +#define XAI_FRAME4D_SET_PIXEL_RES XAI_FRAME_SET_PIXEL_RES + +#define XAI_FRAME4D_GET_PIXEL_FORMAT XAI_FRAME_GET_PIXEL_FORMAT +#define XAI_FRAME4D_SET_PIXEL_FORMAT XAI_FRAME_SET_PIXEL_FORMAT + +#define XAI_FRAME4D_GET_PADDING_TYPE XAI_FRAME_GET_PADDING_TYPE +#define
XAI_FRAME4D_SET_PADDING_TYPE XAI_FRAME_SET_PADDING_TYPE + +/***************************************** +* 4D Tile Access Macros (aliases of the 2D tile accessors) +*****************************************/ +#define XAI_TILE4D_GET_BUFF_PTR XAI_TILE2D_GET_BUFF_PTR +#define XAI_TILE4D_SET_BUFF_PTR XAI_TILE2D_SET_BUFF_PTR +#define XAI_TILE4D_SET_BUFF_PTR_COEFF XAI_TILE2D_SET_BUFF_PTR_COEFF + +#define XAI_TILE4D_GET_BUFF_SIZE XAI_TILE2D_GET_BUFF_SIZE +#define XAI_TILE4D_SET_BUFF_SIZE XAI_TILE2D_SET_BUFF_SIZE + +#define XAI_TILE4D_GET_DATA_PTR XAI_TILE2D_GET_DATA_PTR +#define XAI_TILE4D_SET_DATA_PTR XAI_TILE2D_SET_DATA_PTR +#define XAI_TILE4D_SET_DATA_PTR_COEFF XAI_TILE2D_SET_DATA_PTR_COEFF + +#define XAI_TILE4D_GET_STATUS_FLAGS XAI_TILE2D_GET_STATUS_FLAGS +#define XAI_TILE4D_SET_STATUS_FLAGS XAI_TILE2D_SET_STATUS_FLAGS + +#define XAI_TILE4D_GET_TYPE XAI_TILE2D_GET_TYPE +#define XAI_TILE4D_SET_TYPE XAI_TILE2D_SET_TYPE + +#define XAI_TILE4D_GET_ELEMENT_TYPE XAI_TILE2D_GET_ELEMENT_TYPE +#define XAI_TILE4D_GET_ELEMENT_SIZE XAI_TILE2D_GET_ELEMENT_SIZE +#define XAI_TILE4D_IS_TILE XAI_TILE2D_IS_TILE2D /* NOTE(review): expands to the 2D tile check, while XAI_TYPE_IS_TILE4D tests the 4D tile bits - confirm this alias is intended */ + +#define XAI_TILE4D_GET_FRAME_PTR(pTile4D) ((pTile4D)->pFrame) +#define XAI_TILE4D_SET_FRAME_PTR(pTile4D, ptrFrame) (pTile4D)->pFrame = ((xai_pFrame4D) (ptrFrame)) + +#define XAI_TILE4D_CHECK_STATUS_FLAGS_DMA_ONGOING XAI_TILE2D_CHECK_STATUS_FLAGS_DMA_ONGOING +#define XAI_TILE4D_CHECK_STATUS_FLAGS_EDGE_PADDING_NEEDED XAI_TILE2D_CHECK_STATUS_FLAGS_EDGE_PADDING_NEEDED + +/*********************************** +* Other Macros +***********************************/ +#define XAI_TILE4D_CHECK_VIRTUAL_FRAME XAI_TILE2D_CHECK_VIRTUAL_FRAME +#define XAI_FRAME4D_CHECK_VIRTUAL_FRAME XAI_FRAME_CHECK_VIRTUAL_FRAME + +// Only Q8, 240 and 341 uses alignment = 127.
for P6,P1 and Q7 like dsps alignment = 127 is not supported +#define XAI_SETUP_TILE4D(type, pTile, pBuf, pFrame, bufSize, dim1Size, dim2Size, dim3Size, dim4Size, dim1Pitch, dim2Pitch, \ + dim3Pitch, dim1Edge1, dim1Edge2, dim2Edge1, dim2Edge2, dim3Edge1, dim3Edge2, \ + dim1Loc, dim2Loc, dim3Loc, dim4Loc, dataOrder, alignType) \ + { \ + XAI_TILE4D_SET_TYPE(pTile, type); \ + XAI_TILE4D_SET_FRAME_PTR(pTile, pFrame); \ + XAI_TILE4D_SET_BUFF_PTR(pTile, pBuf); \ + XAI_TILE4D_SET_BUFF_SIZE(pTile, bufSize); \ + XAI_TILE4D_SET_DIM1(pTile, dim1Size); \ + XAI_TILE4D_SET_DIM2(pTile, dim2Size); \ + XAI_TILE4D_SET_DIM3(pTile, dim3Size); \ + XAI_TILE4D_SET_DIM4(pTile, dim4Size); \ + XAI_TILE4D_SET_DIM1_PITCH(pTile, dim1Pitch); \ + XAI_TILE4D_SET_DIM2_PITCH(pTile, dim2Pitch); \ + XAI_TILE4D_SET_DIM3_PITCH(pTile, dim3Pitch); \ + uint8_t *edgePtr = (uint8_t *) pBuf, *dataPtr; \ + int32_t alignment = 127; \ + if ((alignType == XAI_EDGE_ALIGNED_64) || (alignType == XAI_DATA_ALIGNED_64)) { alignment = 63; } \ + if ((alignType == XAI_EDGE_ALIGNED_32) || (alignType == XAI_DATA_ALIGNED_32)) { alignment = 31; } \ + if ((alignType == XAI_EDGE_ALIGNED_32) || (alignType == XAI_EDGE_ALIGNED_64) || (alignType == EDGE_ALIGNED_128)) \ + { \ + edgePtr = (uint8_t *) (((uintptr_t) (pBuf) + alignment) & (~alignment)); \ + } \ + XAI_TILE4D_SET_DATA_PTR(pTile, edgePtr + ((dim3Edge1) * (dim2Pitch) + (dim2Edge1) * (dim1Pitch) + (dim1Edge1)) \ + * XAI_TILE4D_GET_ELEMENT_SIZE(pTile)); \ + if ((alignType == XAI_DATA_ALIGNED_32) || (alignType == XAI_DATA_ALIGNED_64) || (alignType == DATA_ALIGNED_128)) \ + { \ + dataPtr = (uint8_t *) XAI_TILE4D_GET_DATA_PTR(pTile); \ + dataPtr = (uint8_t *) (((uintptr_t) (dataPtr) + alignment) & (~alignment)); \ + XAI_TILE4D_SET_DATA_PTR(pTile, dataPtr); \ + } \ + XAI_TILE4D_SET_DIM1_EDGE1(pTile, dim1Edge1); \ + XAI_TILE4D_SET_DIM1_EDGE2(pTile, dim1Edge2); \ + XAI_TILE4D_SET_DIM2_EDGE1(pTile, dim2Edge1); \ + XAI_TILE4D_SET_DIM2_EDGE2(pTile, dim2Edge2); \ + 
XAI_TILE4D_SET_DIM3_EDGE1(pTile, dim3Edge1); \ + XAI_TILE4D_SET_DIM3_EDGE2(pTile, dim3Edge2); \ + XAI_TILE4D_SET_DIM1_COORD(pTile, dim1Loc); \ + XAI_TILE4D_SET_DIM2_COORD(pTile, dim2Loc); \ + XAI_TILE4D_SET_DIM3_COORD(pTile, dim3Loc); \ + XAI_TILE4D_SET_DIM4_COORD(pTile, dim4Loc); \ + XAI_TILE4D_SET_DATA_ORDER(pTile, dataOrder); \ + } + +#define XAI_SETUP_FRAME4D(pFrame, pFrameBuffer, bufSize, dim1Size, dim2Size, dim3Size, dim4Size, dim1Pitch, dim2Pitch, dim3Pitch, \ + dim1Edge1, dim1Edge2, dim2Edge1, dim2Edge2, dim3Edge1, dim3Edge2, pixRes, pixPackFormat, paddingType, dataOrder) \ + { /* fills every xai_frame4D field of pFrame; the data pointer is the buffer pointer advanced past the leading dim3/dim2/dim1 edges */ \ + XAI_FRAME4D_SET_BUFF_PTR(pFrame, pFrameBuffer); \ + XAI_FRAME4D_SET_BUFF_SIZE(pFrame, bufSize); \ + XAI_FRAME4D_SET_DIM1(pFrame, dim1Size); \ + XAI_FRAME4D_SET_DIM2(pFrame, dim2Size); \ + XAI_FRAME4D_SET_DIM3(pFrame, dim3Size); \ + XAI_FRAME4D_SET_DIM4(pFrame, dim4Size); \ + XAI_FRAME4D_SET_DIM1_PITCH(pFrame, dim1Pitch); \ + XAI_FRAME4D_SET_DIM2_PITCH(pFrame, dim2Pitch); \ + XAI_FRAME4D_SET_DIM3_PITCH(pFrame, dim3Pitch); \ + XAI_FRAME4D_SET_DATA_PTR(pFrame, pFrameBuffer + ((dim3Edge1) * (dim2Pitch) + (dim2Edge1) * (dim1Pitch) + \ + (dim1Edge1)) * pixRes); /* NOTE(review): xai_frame4D documents pixelRes as bits, yet it scales the element offset here - confirm the unit callers pass */ \ + XAI_FRAME4D_SET_DIM1_EDGE1(pFrame, dim1Edge1); \ + XAI_FRAME4D_SET_DIM1_EDGE2(pFrame, dim1Edge2); \ + XAI_FRAME4D_SET_DIM2_EDGE1(pFrame, dim2Edge1); \ + XAI_FRAME4D_SET_DIM2_EDGE2(pFrame, dim2Edge2); \ + XAI_FRAME4D_SET_DIM3_EDGE1(pFrame, dim3Edge1); \ + XAI_FRAME4D_SET_DIM3_EDGE2(pFrame, dim3Edge2); \ + XAI_FRAME4D_SET_PIXEL_RES(pFrame, pixRes); \ + XAI_FRAME4D_SET_PIXEL_FORMAT(pFrame, pixPackFormat); \ + XAI_FRAME4D_SET_PADDING_TYPE(pFrame, paddingType); \ + XAI_FRAME4D_SET_DATA_ORDER(pFrame, dataOrder); /* targets the frame; this macro has no pTile parameter */ \ + } + +#define XAI_COPY_FRAME4D_TO_TILE4D(frame, tile) { \ + XAI_TILE4D_SET_DIM1(tile, XAI_FRAME4D_GET_DIM1(frame)); \ + XAI_TILE4D_SET_DIM1_PITCH(tile, XAI_FRAME4D_GET_DIM1_PITCH(frame)); \ + XAI_TILE4D_SET_DIM1_EDGE1(tile, XAI_FRAME4D_GET_DIM1_EDGE1(frame)); \ + XAI_TILE4D_SET_DIM1_EDGE2(tile,
XAI_FRAME4D_GET_DIM1_EDGE2(frame)); \ + XAI_TILE4D_SET_DIM2(tile, XAI_FRAME4D_GET_DIM2(frame)); \ + XAI_TILE4D_SET_DIM2_PITCH(tile, XAI_FRAME4D_GET_DIM2_PITCH(frame)); \ + XAI_TILE4D_SET_DIM2_EDGE1(tile, XAI_FRAME4D_GET_DIM2_EDGE1(frame)); \ + XAI_TILE4D_SET_DIM2_EDGE2(tile, XAI_FRAME4D_GET_DIM2_EDGE2(frame)); \ + XAI_TILE4D_SET_DIM3(tile, XAI_FRAME4D_GET_DIM3(frame)); \ + XAI_TILE4D_SET_DIM3_PITCH(tile, XAI_FRAME4D_GET_DIM3_PITCH(frame)); \ + XAI_TILE4D_SET_DIM3_EDGE1(tile, XAI_FRAME4D_GET_DIM3_EDGE1(frame)); \ + XAI_TILE4D_SET_DIM3_EDGE2(tile, XAI_FRAME4D_GET_DIM3_EDGE2(frame)); \ + XAI_TILE4D_SET_DIM4(tile, XAI_FRAME4D_GET_DIM4(frame)); \ + XAI_TILE4D_SET_DATA_PTR(tile, XAI_FRAME4D_GET_DATA_PTR(frame)); \ + XAI_TILE4D_SET_DATA_ORDER(tile, XAI_FRAME4D_GET_DATA_ORDER(frame)); \ +} + +#define XAI_COPY_FRAME4D_TO_FRAME4D(frameIn, frameOut) { \ + XAI_FRAME4D_SET_DIM1(frameOut, XAI_FRAME4D_GET_DIM1(frameIn)); \ + XAI_FRAME4D_SET_DIM1_PITCH(frameOut, XAI_FRAME4D_GET_DIM1_PITCH(frameIn)); \ + XAI_FRAME4D_SET_DIM1_EDGE1(frameOut, XAI_FRAME4D_GET_DIM1_EDGE1(frameIn)); \ + XAI_FRAME4D_SET_DIM1_EDGE2(frameOut, XAI_FRAME4D_GET_DIM1_EDGE2(frameIn)); \ + XAI_FRAME4D_SET_DIM2(frameOut, XAI_FRAME4D_GET_DIM2(frameIn)); \ + XAI_FRAME4D_SET_DIM2_PITCH(frameOut, XAI_FRAME4D_GET_DIM2_PITCH(frameIn)); \ + XAI_FRAME4D_SET_DIM2_EDGE1(frameOut, XAI_FRAME4D_GET_DIM2_EDGE1(frameIn)); \ + XAI_FRAME4D_SET_DIM2_EDGE2(frameOut, XAI_FRAME4D_GET_DIM2_EDGE2(frameIn)); \ + XAI_FRAME4D_SET_DIM3(frameOut, XAI_FRAME4D_GET_DIM3(frameIn)); \ + XAI_FRAME4D_SET_DIM3_PITCH(frameOut, XAI_FRAME4D_GET_DIM3_PITCH(frameIn)); \ + XAI_FRAME4D_SET_DIM3_EDGE1(frameOut, XAI_FRAME4D_GET_DIM3_EDGE1(frameIn)); \ + XAI_FRAME4D_SET_DIM3_EDGE2(frameOut, XAI_FRAME4D_GET_DIM3_EDGE2(frameIn)); \ + XAI_FRAME4D_SET_DIM4(frameOut, XAI_FRAME4D_GET_DIM4(frameIn)); \ + XAI_FRAME4D_SET_DATA_PTR(frameOut, XAI_FRAME4D_GET_DATA_PTR(frameIn)); \ + XAI_FRAME4D_SET_DATA_ORDER(frameOut, XAI_FRAME4D_GET_DATA_ORDER(frameIn)); \ + 
XAI_FRAME4D_SET_PIXEL_RES(frameOut, XAI_FRAME4D_GET_PIXEL_RES(frameIn)); \ +} + +#define XAI_COPY_TILE4D_TO_TILE4D(tileIn, tileOut) { \ + XAI_TILE4D_SET_DIM1(tileOut, XAI_TILE4D_GET_DIM1(tileIn)); \ + XAI_TILE4D_SET_DIM1_PITCH(tileOut, XAI_TILE4D_GET_DIM1_PITCH(tileIn)); \ + XAI_TILE4D_SET_DIM1_EDGE1(tileOut, XAI_TILE4D_GET_DIM1_EDGE1(tileIn)); \ + XAI_TILE4D_SET_DIM1_EDGE2(tileOut, XAI_TILE4D_GET_DIM1_EDGE2(tileIn)); \ + XAI_TILE4D_SET_DIM2(tileOut, XAI_TILE4D_GET_DIM2(tileIn)); \ + XAI_TILE4D_SET_DIM2_PITCH(tileOut, XAI_TILE4D_GET_DIM2_PITCH(tileIn)); \ + XAI_TILE4D_SET_DIM2_EDGE1(tileOut, XAI_TILE4D_GET_DIM2_EDGE1(tileIn)); \ + XAI_TILE4D_SET_DIM2_EDGE2(tileOut, XAI_TILE4D_GET_DIM2_EDGE2(tileIn)); \ + XAI_TILE4D_SET_DIM3(tileOut, XAI_TILE4D_GET_DIM3(tileIn)); \ + XAI_TILE4D_SET_DIM3_PITCH(tileOut, XAI_TILE4D_GET_DIM3_PITCH(tileIn)); \ + XAI_TILE4D_SET_DIM3_EDGE1(tileOut, XAI_TILE4D_GET_DIM3_EDGE1(tileIn)); \ + XAI_TILE4D_SET_DIM3_EDGE2(tileOut, XAI_TILE4D_GET_DIM3_EDGE2(tileIn)); \ + XAI_TILE4D_SET_DIM4(tileOut, XAI_TILE4D_GET_DIM4(tileIn)); \ + XAI_TILE4D_SET_DATA_PTR(tileOut, XAI_TILE4D_GET_DATA_PTR(tileIn)); \ + XAI_TILE4D_SET_DATA_ORDER(tileOut, XAI_TILE4D_GET_DATA_ORDER(tileIn)); \ +} + +// Assumes 8 bit pixRes and Edge1 = Edge2 +#define XAI_TILE4D_UPDATE_EDGE_DIM1(pTile, newEdgeSize) \ + { \ + uint16_t currEdgeSize = (uint16_t) XAI_TILE4D_GET_DIM1_EDGE1(pTile); \ + uint32_t dim1Pitch = (uint32_t) XAI_TILE4D_GET_DIM1_PITCH(pTile); \ + uintptr_t dataU32 = (uintptr_t) XAI_TILE4D_GET_DATA_PTR(pTile); \ + dataU32 = dataU32 + newEdgeSize - currEdgeSize; \ + XAI_TILE4D_SET_DATA_PTR(pTile, (void *) dataU32); \ + XAI_TILE4D_SET_DIM1_EDGE1(pTile, newEdgeSize); \ + XAI_TILE4D_SET_DIM1_EDGE2(pTile, newEdgeSize); \ + XAI_TILE4D_SET_DIM1(pTile, dim1Pitch - 2 * newEdgeSize); \ + } + +// Assumes 8 bit pixRes and Edge1 = Edge2 +#define XAI_TILE4D_UPDATE_EDGE_DIM2(pTile, newEdgeSize) \ + { \ + uint16_t currEdgeSize = (uint16_t) XAI_TILE4D_GET_DIM2_EDGE1(pTile); \ + uint32_t 
dim1Pitch = (uint32_t) XAI_TILE4D_GET_DIM1_PITCH(pTile); \ + uint16_t dim2Size = (uint16_t) XAI_TILE4D_GET_DIM2(pTile); \ + uintptr_t dataU32 = (uintptr_t) XAI_TILE4D_GET_DATA_PTR(pTile); \ + dataU32 = dataU32 + dim1Pitch * (newEdgeSize - currEdgeSize); \ + XAI_TILE4D_SET_DATA_PTR(pTile, (void *) dataU32); \ + XAI_TILE4D_SET_DIM2_EDGE1(pTile, newEdgeSize); \ + XAI_TILE4D_SET_DIM2_EDGE2(pTile, newEdgeSize); \ + XAI_TILE4D_SET_DIM2(pTile, dim2Size + 2 * (currEdgeSize - newEdgeSize)); \ + } + +// Assumes 8 bit pixRes and Edge1 = Edge2 +#define XAI_TILE4D_UPDATE_EDGE_DIM3(pTile, newEdgeSize) \ + { \ + uint16_t currEdgeSize = (uint16_t) XAI_TILE4D_GET_DIM3_EDGE1(pTile); \ + uint32_t dim2Pitch = (uint32_t) XAI_TILE4D_GET_DIM2_PITCH(pTile); \ + uint16_t dim3Size = (uint16_t) XAI_TILE4D_GET_DIM3(pTile); \ + uintptr_t dataU32 = (uintptr_t) XAI_TILE4D_GET_DATA_PTR(pTile); \ + dataU32 = dataU32 + dim2Pitch * (newEdgeSize - currEdgeSize); \ + XAI_TILE4D_SET_DATA_PTR(pTile, (void *) dataU32); \ + XAI_TILE4D_SET_DIM3_EDGE1(pTile, newEdgeSize); \ + XAI_TILE4D_SET_DIM3_EDGE2(pTile, newEdgeSize); \ + XAI_TILE4D_SET_DIM3(pTile, dim3Size + 2 * (currEdgeSize - newEdgeSize)); \ + } + +#define XAI_TILE4D_UPDATE_DIMENSIONS(pTile, dim1Loc, dim2Loc, dim3Loc, dim4Loc, dim1Size, dim2Size, dim3Size, dim4Size, \ + dim1Pitch, dim2Pitch, dim3Pitch) \ + { \ + XAI_TILE4D_SET_DIM1_COORD(pTile, dim1Loc); \ + XAI_TILE4D_SET_DIM2_COORD(pTile, dim2Loc); \ + XAI_TILE4D_SET_DIM3_COORD(pTile, dim3Loc); \ + XAI_TILE4D_SET_DIM4_COORD(pTile, dim4Loc); \ + XAI_TILE4D_SET_DIM1(pTile, dim1Size); \ + XAI_TILE4D_SET_DIM2(pTile, dim2Size); \ + XAI_TILE4D_SET_DIM3(pTile, dim3Size); \ + XAI_TILE4D_SET_DIM4(pTile, dim4Size); \ + XAI_TILE4D_SET_DIM1_PITCH(pTile, dim1Pitch); \ + XAI_TILE4D_SET_DIM2_PITCH(pTile, dim2Pitch); \ + XAI_TILE4D_SET_DIM3_PITCH(pTile, dim3Pitch); \ + } + +// 5D tile +#define XAI_TILE5D_FIELDS \ + uint32_t bufferSize; \ + int32_t dim1Size; \ + int32_t dim1Pitch; \ + uint16_t type; \ + int32_t 
dim2Size; \ + int32_t dim2Pitch; \ + int32_t dim3Size; \ + int32_t dim3Pitch; \ + int32_t dim4Size; \ + int32_t dim4Pitch; \ + int32_t dim5Size; \ + xai_cnn_data_order dataOrder; + +// 5D tile +typedef struct xai_tile5DStruct +{ + void *pBuffer; + void *pData; + XAI_TILE5D_FIELDS +} xai_tile5D, *xai_pTile5D; + +/***************************************** +* 5D Tile Access Macros +*****************************************/ +#define XAI_TILE5D_GET_BUFF_PTR XAI_TILE2D_GET_BUFF_PTR +#define XAI_TILE5D_SET_BUFF_PTR XAI_TILE2D_SET_BUFF_PTR + +#define XAI_TILE5D_GET_BUFF_SIZE XAI_TILE2D_GET_BUFF_SIZE +#define XAI_TILE5D_SET_BUFF_SIZE XAI_TILE2D_SET_BUFF_SIZE + +#define XAI_TILE5D_GET_DATA_PTR XAI_TILE2D_GET_DATA_PTR +#define XAI_TILE5D_SET_DATA_PTR XAI_TILE2D_SET_DATA_PTR + +#define XAI_TILE5D_GET_TYPE XAI_TILE2D_GET_TYPE +#define XAI_TILE5D_SET_TYPE XAI_TILE2D_SET_TYPE + +#define XAI_TILE5D_GET_DIM1 XAI_TILE4D_GET_DIM1 +#define XAI_TILE5D_SET_DIM1 XAI_TILE4D_SET_DIM1 +#define XAI_TILE5D_GET_DIM1_PITCH XAI_TILE4D_GET_DIM1_PITCH +#define XAI_TILE5D_SET_DIM1_PITCH XAI_TILE4D_SET_DIM1_PITCH +#define XAI_TILE5D_GET_DIM2 XAI_TILE4D_GET_DIM2 +#define XAI_TILE5D_SET_DIM2 XAI_TILE4D_SET_DIM2 +#define XAI_TILE5D_GET_DIM2_PITCH XAI_TILE4D_GET_DIM2_PITCH +#define XAI_TILE5D_SET_DIM2_PITCH XAI_TILE4D_SET_DIM2_PITCH +#define XAI_TILE5D_GET_DIM3 XAI_TILE4D_GET_DIM3 +#define XAI_TILE5D_SET_DIM3 XAI_TILE4D_SET_DIM3 +#define XAI_TILE5D_GET_DIM3_PITCH XAI_TILE4D_GET_DIM3_PITCH +#define XAI_TILE5D_SET_DIM3_PITCH XAI_TILE4D_SET_DIM3_PITCH +#define XAI_TILE5D_GET_DIM4 XAI_TILE4D_GET_DIM4 +#define XAI_TILE5D_SET_DIM4 XAI_TILE4D_SET_DIM4 +#define XAI_TILE5D_GET_DIM4_PITCH(x) ((x)->dim4Pitch) +#define XAI_TILE5D_SET_DIM4_PITCH(x, v) ((x)->dim4Pitch = (v)) +#define XAI_TILE5D_GET_DIM5(x) ((x)->dim5Size) +#define XAI_TILE5D_SET_DIM5(x, v) ((x)->dim5Size = (v)) +#define XAI_TILE5D_GET_DATA_ORDER(x) ((x)->dataOrder) +#define XAI_TILE5D_SET_DATA_ORDER(x, v) ((x)->dataOrder = (v)) +#define 
XAI_TILE5D_GET_ELEMENT_TYPE XAI_TILE2D_GET_ELEMENT_TYPE +#define XAI_TILE5D_GET_ELEMENT_SIZE XAI_TILE2D_GET_ELEMENT_SIZE + +#if USE_64BIT_COEFF +#define xai_pArray_coeff xai_pArray_coeff_64 +#define xai_pTile3D_coeff xai_pTile3D_64 +#define xai_pTile4D_coeff xai_pTile4D_64 +#else +#define xai_pArray_coeff xai_pArray_coeff_32 +#define xai_pTile3D_coeff xai_pTile3D +#define xai_pTile4D_coeff xai_pTile4D +#endif // #if USE_64BIT_COEFF +#endif // #ifndef __XAI_TILE_MANAGER_H__ diff --git a/backends/cadence/vision/third-party/libxai_common/src/cnn_cast.c b/backends/cadence/vision/third-party/libxai_common/src/cnn_cast.c new file mode 100644 index 00000000000..92fd72ebb91 --- /dev/null +++ b/backends/cadence/vision/third-party/libxai_common/src/cnn_cast.c @@ -0,0 +1,1622 @@ +/* + * Copyright (c) 2025 by Cadence Design Systems, Inc. ALL RIGHTS RESERVED. + * These coded instructions, statements, and computer programs are the + * copyrighted works and confidential proprietary information of + * Cadence Design Systems Inc. They may be adapted and modified by bona fide + * purchasers for internal use, but neither the original nor any adapted + * or modified version may be disclosed or distributed to third parties + * in any manner, medium, or form, in whole or in part, without the prior + * written consent of Cadence Design Systems Inc. This software and its + * derivatives are to be executed solely on products incorporating a Cadence + * Design Systems processor. 
+ */ + +#include "xai_cnn_common.h" +#include <math.h> + +/* ----------------------------------------------------------------------------------------------------------------------- */ +#if XCHAL_HAVE_VISION // Optimized code is called for Vision DSPs +/* ----------------------------------------------------------------------------------------------------------------------- */ +#include "cnn_cast_scalar.h" + +#ifdef IN_DATA_TYPE +#undef IN_DATA_TYPE +#endif +#ifdef OUT_DATA_TYPE +#undef OUT_DATA_TYPE +#endif + +#define IN_DATA_TYPE UNSIGNED8BIT +#define OUT_DATA_TYPE SIGNED8BIT +#include "cnn_cast.h" +#undef OUT_DATA_TYPE + +#define OUT_DATA_TYPE UNSIGNED16BIT +#include "cnn_cast.h" +#undef OUT_DATA_TYPE + +#define OUT_DATA_TYPE SIGNED16BIT +#include "cnn_cast.h" +#undef OUT_DATA_TYPE + +#define OUT_DATA_TYPE UNSIGNED32BIT +#include "cnn_cast.h" +#undef OUT_DATA_TYPE + +#define OUT_DATA_TYPE SIGNED32BIT +#include "cnn_cast.h" +#undef OUT_DATA_TYPE + +#ifdef IVP_LAVN_4X64U_XP +#define OUT_DATA_TYPE UNSIGNED64BIT +#include "cnn_cast.h" +#undef OUT_DATA_TYPE + +#define OUT_DATA_TYPE SIGNED64BIT +#include "cnn_cast.h" +#undef OUT_DATA_TYPE +#endif // #ifdef IVP_LAVN_4X64U_XP + +#if ((XCHAL_HAVE_VISION_HP_VFPU == 1) || (XCHAL_HAVE_BBENEP_HP_VFPU == 1)) +#define OUT_DATA_TYPE FLOAT16BIT +#include "cnn_cast.h" +#undef OUT_DATA_TYPE +#endif + +#if ((XCHAL_HAVE_VISION_SP_VFPU == 1) || (XCHAL_HAVE_BBENEP_SP_VFPU == 1)) +#define OUT_DATA_TYPE FLOAT32BIT +#include "cnn_cast.h" +#undef OUT_DATA_TYPE +#endif +#undef IN_DATA_TYPE + + +#define IN_DATA_TYPE SIGNED8BIT +#define OUT_DATA_TYPE UNSIGNED8BIT +#include "cnn_cast.h" +#undef OUT_DATA_TYPE + +#define OUT_DATA_TYPE UNSIGNED16BIT +#include "cnn_cast.h" +#undef OUT_DATA_TYPE + +#define OUT_DATA_TYPE SIGNED16BIT +#include "cnn_cast.h" +#undef OUT_DATA_TYPE + +#define OUT_DATA_TYPE UNSIGNED32BIT +#include "cnn_cast.h" +#undef OUT_DATA_TYPE + +#define OUT_DATA_TYPE SIGNED32BIT +#include "cnn_cast.h" +#undef OUT_DATA_TYPE + +#ifdef 
IVP_LAVN_4X64U_XP +#define OUT_DATA_TYPE UNSIGNED64BIT +#include "cnn_cast.h" +#undef OUT_DATA_TYPE + +#define OUT_DATA_TYPE SIGNED64BIT +#include "cnn_cast.h" +#undef OUT_DATA_TYPE +#endif //#ifdef IVP_LAVN_4X64U_XP + +#if ((XCHAL_HAVE_VISION_HP_VFPU == 1) || (XCHAL_HAVE_BBENEP_HP_VFPU == 1)) +#define OUT_DATA_TYPE FLOAT16BIT +#include "cnn_cast.h" +#undef OUT_DATA_TYPE +#endif + +#if ((XCHAL_HAVE_VISION_SP_VFPU == 1) || (XCHAL_HAVE_BBENEP_SP_VFPU == 1)) +#define OUT_DATA_TYPE FLOAT32BIT +#include "cnn_cast.h" +#undef OUT_DATA_TYPE +#endif +#undef IN_DATA_TYPE + + +#define IN_DATA_TYPE UNSIGNED16BIT +#define OUT_DATA_TYPE UNSIGNED8BIT +#include "cnn_cast.h" +#undef OUT_DATA_TYPE + +#define OUT_DATA_TYPE SIGNED8BIT +#include "cnn_cast.h" +#undef OUT_DATA_TYPE + +#define OUT_DATA_TYPE SIGNED16BIT +#include "cnn_cast.h" +#undef OUT_DATA_TYPE + +#define OUT_DATA_TYPE UNSIGNED32BIT +#include "cnn_cast.h" +#undef OUT_DATA_TYPE + +#define OUT_DATA_TYPE SIGNED32BIT +#include "cnn_cast.h" +#undef OUT_DATA_TYPE + +#ifdef IVP_LAVN_4X64U_XP +#define OUT_DATA_TYPE UNSIGNED64BIT +#include "cnn_cast.h" +#undef OUT_DATA_TYPE + +#define OUT_DATA_TYPE SIGNED64BIT +#include "cnn_cast.h" +#undef OUT_DATA_TYPE +#endif //#ifdef IVP_LAVN_4X64U_XP + +#if ((XCHAL_HAVE_VISION_HP_VFPU == 1) || (XCHAL_HAVE_BBENEP_HP_VFPU == 1)) +#define OUT_DATA_TYPE FLOAT16BIT +#include "cnn_cast.h" +#undef OUT_DATA_TYPE +#endif + +#if ((XCHAL_HAVE_VISION_SP_VFPU == 1) || (XCHAL_HAVE_BBENEP_SP_VFPU == 1)) +#define OUT_DATA_TYPE FLOAT32BIT +#include "cnn_cast.h" +#undef OUT_DATA_TYPE +#endif +#undef IN_DATA_TYPE + + +#define IN_DATA_TYPE SIGNED16BIT +#define OUT_DATA_TYPE UNSIGNED8BIT +#include "cnn_cast.h" +#undef OUT_DATA_TYPE + +#define OUT_DATA_TYPE SIGNED8BIT +#include "cnn_cast.h" +#undef OUT_DATA_TYPE + +#define OUT_DATA_TYPE UNSIGNED16BIT +#include "cnn_cast.h" +#undef OUT_DATA_TYPE + +#define OUT_DATA_TYPE UNSIGNED32BIT +#include "cnn_cast.h" +#undef OUT_DATA_TYPE + +#define OUT_DATA_TYPE 
SIGNED32BIT +#include "cnn_cast.h" +#undef OUT_DATA_TYPE + +#ifdef IVP_LAVN_4X64U_XP +#define OUT_DATA_TYPE UNSIGNED64BIT +#include "cnn_cast.h" +#undef OUT_DATA_TYPE + +#define OUT_DATA_TYPE SIGNED64BIT +#include "cnn_cast.h" +#undef OUT_DATA_TYPE +#endif //#ifdef IVP_LAVN_4X64U_XP + +#if ((XCHAL_HAVE_VISION_HP_VFPU == 1) || (XCHAL_HAVE_BBENEP_HP_VFPU == 1)) +#define OUT_DATA_TYPE FLOAT16BIT +#include "cnn_cast.h" +#undef OUT_DATA_TYPE +#endif + +#if ((XCHAL_HAVE_VISION_SP_VFPU == 1) || (XCHAL_HAVE_BBENEP_SP_VFPU == 1)) +#define OUT_DATA_TYPE FLOAT32BIT +#include "cnn_cast.h" +#undef OUT_DATA_TYPE +#endif +#undef IN_DATA_TYPE + + +#define IN_DATA_TYPE UNSIGNED32BIT +#define OUT_DATA_TYPE UNSIGNED8BIT +#include "cnn_cast.h" +#undef OUT_DATA_TYPE + +#define OUT_DATA_TYPE SIGNED8BIT +#include "cnn_cast.h" +#undef OUT_DATA_TYPE + +#define OUT_DATA_TYPE UNSIGNED16BIT +#include "cnn_cast.h" +#undef OUT_DATA_TYPE + +#define OUT_DATA_TYPE SIGNED16BIT +#include "cnn_cast.h" +#undef OUT_DATA_TYPE + +#define OUT_DATA_TYPE SIGNED32BIT +#include "cnn_cast.h" +#undef OUT_DATA_TYPE + +#ifdef IVP_LAVN_4X64U_XP +#define OUT_DATA_TYPE UNSIGNED64BIT +#include "cnn_cast.h" +#undef OUT_DATA_TYPE + +#define OUT_DATA_TYPE SIGNED64BIT +#include "cnn_cast.h" +#undef OUT_DATA_TYPE +#endif //#ifdef IVP_LAVN_4X64U_XP + +#if ((XCHAL_HAVE_VISION_HP_VFPU == 1) || (XCHAL_HAVE_BBENEP_HP_VFPU == 1)) +#define OUT_DATA_TYPE FLOAT16BIT +#include "cnn_cast.h" +#undef OUT_DATA_TYPE +#endif + +#if ((XCHAL_HAVE_VISION_SP_VFPU == 1) || (XCHAL_HAVE_BBENEP_SP_VFPU == 1)) +#define OUT_DATA_TYPE FLOAT32BIT +#include "cnn_cast.h" +#undef OUT_DATA_TYPE +#endif +#undef IN_DATA_TYPE + + +#define IN_DATA_TYPE SIGNED32BIT +#define OUT_DATA_TYPE UNSIGNED8BIT +#include "cnn_cast.h" +#undef OUT_DATA_TYPE + +#define OUT_DATA_TYPE SIGNED8BIT +#include "cnn_cast.h" +#undef OUT_DATA_TYPE + +#define OUT_DATA_TYPE UNSIGNED16BIT +#include "cnn_cast.h" +#undef OUT_DATA_TYPE + +#define OUT_DATA_TYPE SIGNED16BIT +#include 
"cnn_cast.h" +#undef OUT_DATA_TYPE + +#define OUT_DATA_TYPE UNSIGNED32BIT +#include "cnn_cast.h" +#undef OUT_DATA_TYPE + +#ifdef IVP_LAVN_4X64U_XP +#define OUT_DATA_TYPE UNSIGNED64BIT +#include "cnn_cast.h" +#undef OUT_DATA_TYPE + +#define OUT_DATA_TYPE SIGNED64BIT +#include "cnn_cast.h" +#undef OUT_DATA_TYPE +#endif //#ifdef IVP_LAVN_4X64U_XP + +#if ((XCHAL_HAVE_VISION_HP_VFPU == 1) || (XCHAL_HAVE_BBENEP_HP_VFPU == 1)) +#define OUT_DATA_TYPE FLOAT16BIT +#include "cnn_cast.h" +#undef OUT_DATA_TYPE +#endif + +#if ((XCHAL_HAVE_VISION_SP_VFPU == 1) || (XCHAL_HAVE_BBENEP_SP_VFPU == 1)) +#define OUT_DATA_TYPE FLOAT32BIT +#include "cnn_cast.h" +#undef OUT_DATA_TYPE +#endif +#undef IN_DATA_TYPE + +#ifdef IVP_LAVN_4X64U_XP +#define IN_DATA_TYPE UNSIGNED64BIT +#define OUT_DATA_TYPE UNSIGNED8BIT +#include "cnn_cast.h" +#undef OUT_DATA_TYPE + +#define OUT_DATA_TYPE SIGNED8BIT +#include "cnn_cast.h" +#undef OUT_DATA_TYPE + +#define OUT_DATA_TYPE UNSIGNED16BIT +#include "cnn_cast.h" +#undef OUT_DATA_TYPE + +#define OUT_DATA_TYPE SIGNED16BIT +#include "cnn_cast.h" +#undef OUT_DATA_TYPE + +#define OUT_DATA_TYPE UNSIGNED32BIT +#include "cnn_cast.h" +#undef OUT_DATA_TYPE + +#define OUT_DATA_TYPE SIGNED32BIT +#include "cnn_cast.h" +#undef OUT_DATA_TYPE + +#define OUT_DATA_TYPE SIGNED64BIT +#include "cnn_cast.h" +#undef OUT_DATA_TYPE + +#if ((XCHAL_HAVE_VISION_HP_VFPU == 1) || (XCHAL_HAVE_BBENEP_HP_VFPU == 1)) +#define OUT_DATA_TYPE FLOAT16BIT +#include "cnn_cast.h" +#undef OUT_DATA_TYPE +#endif + +#if ((XCHAL_HAVE_VISION_SP_VFPU == 1) || (XCHAL_HAVE_BBENEP_SP_VFPU == 1)) +#define OUT_DATA_TYPE FLOAT32BIT +#include "cnn_cast.h" +#undef OUT_DATA_TYPE +#endif +#undef IN_DATA_TYPE + + +#define IN_DATA_TYPE SIGNED64BIT +#define OUT_DATA_TYPE UNSIGNED8BIT +#include "cnn_cast.h" +#undef OUT_DATA_TYPE + +#define OUT_DATA_TYPE SIGNED8BIT +#include "cnn_cast.h" +#undef OUT_DATA_TYPE + +#define OUT_DATA_TYPE UNSIGNED16BIT +#include "cnn_cast.h" +#undef OUT_DATA_TYPE + +#define OUT_DATA_TYPE 
SIGNED16BIT +#include "cnn_cast.h" +#undef OUT_DATA_TYPE + +#define OUT_DATA_TYPE UNSIGNED32BIT +#include "cnn_cast.h" +#undef OUT_DATA_TYPE + +#define OUT_DATA_TYPE SIGNED32BIT +#include "cnn_cast.h" +#undef OUT_DATA_TYPE + +#define OUT_DATA_TYPE UNSIGNED64BIT +#include "cnn_cast.h" +#undef OUT_DATA_TYPE + +#if ((XCHAL_HAVE_VISION_HP_VFPU == 1) || (XCHAL_HAVE_BBENEP_HP_VFPU == 1)) +#define OUT_DATA_TYPE FLOAT16BIT +#include "cnn_cast.h" +#undef OUT_DATA_TYPE +#endif + +#if ((XCHAL_HAVE_VISION_SP_VFPU == 1) || (XCHAL_HAVE_BBENEP_SP_VFPU == 1)) +#define OUT_DATA_TYPE FLOAT32BIT +#include "cnn_cast.h" +#undef OUT_DATA_TYPE +#endif +#undef IN_DATA_TYPE +#endif //#ifdef IVP_LAVN_4X64U_XP + + +#if ((XCHAL_HAVE_VISION_HP_VFPU == 1) || (XCHAL_HAVE_BBENEP_HP_VFPU == 1)) +#define IN_DATA_TYPE FLOAT16BIT +#define OUT_DATA_TYPE UNSIGNED8BIT +#include "cnn_cast.h" +#undef OUT_DATA_TYPE + +#define OUT_DATA_TYPE SIGNED8BIT +#include "cnn_cast.h" +#undef OUT_DATA_TYPE + +#define OUT_DATA_TYPE UNSIGNED16BIT +#include "cnn_cast.h" +#undef OUT_DATA_TYPE + +#define OUT_DATA_TYPE SIGNED16BIT +#include "cnn_cast.h" +#undef OUT_DATA_TYPE + +#define OUT_DATA_TYPE UNSIGNED32BIT +#include "cnn_cast.h" +#undef OUT_DATA_TYPE + +#define OUT_DATA_TYPE SIGNED32BIT +#include "cnn_cast.h" +#undef OUT_DATA_TYPE + +#ifdef IVP_LAVN_4X64U_XP +#define OUT_DATA_TYPE UNSIGNED64BIT +#include "cnn_cast.h" +#undef OUT_DATA_TYPE + +#define OUT_DATA_TYPE SIGNED64BIT +#include "cnn_cast.h" +#undef OUT_DATA_TYPE +#endif //#ifdef IVP_LAVN_4X64U_XP + +#if ((XCHAL_HAVE_VISION_SP_VFPU == 1) || (XCHAL_HAVE_BBENEP_SP_VFPU == 1)) +#define OUT_DATA_TYPE FLOAT32BIT +#include "cnn_cast.h" +#undef OUT_DATA_TYPE +#endif +#undef IN_DATA_TYPE +#endif //#if ((XCHAL_HAVE_VISION_HP_VFPU == 1) || (XCHAL_HAVE_BBENEP_HP_VFPU == 1)) + + +#if ((XCHAL_HAVE_VISION_SP_VFPU == 1) || (XCHAL_HAVE_BBENEP_SP_VFPU == 1)) +#define IN_DATA_TYPE FLOAT32BIT +#define OUT_DATA_TYPE UNSIGNED8BIT +#include "cnn_cast.h" +#undef OUT_DATA_TYPE + 
+#define OUT_DATA_TYPE SIGNED8BIT +#include "cnn_cast.h" +#undef OUT_DATA_TYPE + +#define OUT_DATA_TYPE UNSIGNED16BIT +#include "cnn_cast.h" +#undef OUT_DATA_TYPE + +#define OUT_DATA_TYPE SIGNED16BIT +#include "cnn_cast.h" +#undef OUT_DATA_TYPE + +#define OUT_DATA_TYPE UNSIGNED32BIT +#include "cnn_cast.h" +#undef OUT_DATA_TYPE + +#define OUT_DATA_TYPE SIGNED32BIT +#include "cnn_cast.h" +#undef OUT_DATA_TYPE + +#ifdef IVP_LAVN_4X64U_XP +#define OUT_DATA_TYPE UNSIGNED64BIT +#include "cnn_cast.h" +#undef OUT_DATA_TYPE + +#define OUT_DATA_TYPE SIGNED64BIT +#include "cnn_cast.h" +#undef OUT_DATA_TYPE +#endif //#ifdef IVP_LAVN_4X64U_XP + +#if ((XCHAL_HAVE_VISION_HP_VFPU == 1) || (XCHAL_HAVE_BBENEP_HP_VFPU == 1)) +#define OUT_DATA_TYPE FLOAT16BIT +#include "cnn_cast.h" +#undef OUT_DATA_TYPE +#endif +#undef IN_DATA_TYPE +#endif //#if ((XCHAL_HAVE_VISION_SP_VFPU == 1) || (XCHAL_HAVE_BBENEP_SP_VFPU == 1)) + +/**************************** xaiCast3D *****************************************/ +/* Description : General API for data casting */ +/* Inputs : inTile */ +/* Outputs : XAI Error Code */ +/* InOuts : outTile */ +/********************************************************************************/ +XAI_ERR_TYPE xaiCast3D(const xai_pTile3D inTile, + xai_pTile3D outTile) +{ + XAI_ERROR_CHECKS() + { + XAI_CHECK_POINTER(inTile); + XAI_CHECK_POINTER(outTile); + XAI_CHECK_TILE3D_SIZE_EQ(inTile, outTile); + XAI_CHECK_ERROR(XAI_TILE3D_GET_DATA_ORDER(inTile) == XAI_TILE3D_GET_DATA_ORDER(outTile), + XAI_ERR_BADARG, "\nInput Data Order %d and Output Data Order %d are not same", \ + XAI_TILE3D_GET_DATA_ORDER(inTile), XAI_TILE3D_GET_DATA_ORDER(outTile)); + } + + if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_U8)) + { + switch (XAI_TILE3D_GET_ELEMENT_TYPE(outTile)) + { + case XAI_S8: + xaiCast3DFromU8ToS8(inTile, outTile); + break; + case XAI_U16: + xaiCast3DFromU8ToU16(inTile, outTile); + break; + case XAI_S16: + xaiCast3DFromU8ToS16(inTile, outTile); + break; + case XAI_U32: + 
xaiCast3DFromU8ToU32(inTile, outTile); + break; + case XAI_S32: + xaiCast3DFromU8ToS32(inTile, outTile); + break; + case XAI_U64: +#ifdef IVP_LAVN_4X64U_XP + xaiCast3DFromU8ToU64(inTile, outTile); +#else + xaiCast3DScalar_I64(inTile, outTile); +#endif + break; + case XAI_S64: +#ifdef IVP_LAVN_4X64U_XP + xaiCast3DFromU8ToS64(inTile, outTile); +#else + xaiCast3DScalar_I64(inTile, outTile); +#endif + break; +#if ((XCHAL_HAVE_VISION_HP_VFPU == 1) || (XCHAL_HAVE_BBENEP_HP_VFPU == 1)) + case XAI_F16: + xaiCast3DFromU8ToF16(inTile, outTile); + break; +#endif +#if ((XCHAL_HAVE_VISION_SP_VFPU == 1) || (XCHAL_HAVE_BBENEP_SP_VFPU == 1)) + case XAI_F32: + xaiCast3DFromU8ToF32(inTile, outTile); + break; +#endif + default: + return(XAI_ERR_DATATYPE); + break; + } + } + else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S8)) + { + switch (XAI_TILE3D_GET_ELEMENT_TYPE(outTile)) + { + case XAI_U8: + xaiCast3DFromS8ToU8(inTile, outTile); + break; + case XAI_U16: + xaiCast3DFromS8ToU16(inTile, outTile); + break; + case XAI_S16: + xaiCast3DFromS8ToS16(inTile, outTile); + break; + case XAI_U32: + xaiCast3DFromS8ToU32(inTile, outTile); + break; + case XAI_S32: + xaiCast3DFromS8ToS32(inTile, outTile); + break; + case XAI_U64: +#ifdef IVP_LAVN_4X64U_XP + xaiCast3DFromS8ToU64(inTile, outTile); +#else + xaiCast3DScalar_I64(inTile, outTile); +#endif + break; + case XAI_S64: +#ifdef IVP_LAVN_4X64U_XP + xaiCast3DFromS8ToS64(inTile, outTile); +#else + xaiCast3DScalar_I64(inTile, outTile); +#endif + break; +#if ((XCHAL_HAVE_VISION_HP_VFPU == 1) || (XCHAL_HAVE_BBENEP_HP_VFPU == 1)) + case XAI_F16: + xaiCast3DFromS8ToF16(inTile, outTile); + break; +#endif +#if ((XCHAL_HAVE_VISION_SP_VFPU == 1) || (XCHAL_HAVE_BBENEP_SP_VFPU == 1)) + case XAI_F32: + xaiCast3DFromS8ToF32(inTile, outTile); + break; +#endif + default: + return(XAI_ERR_DATATYPE); + break; + } + } + else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_U16)) + { + switch (XAI_TILE3D_GET_ELEMENT_TYPE(outTile)) + { + case XAI_U8: + xaiCast3DFromU16ToU8(inTile, 
outTile); + break; + case XAI_S8: + xaiCast3DFromU16ToS8(inTile, outTile); + break; + case XAI_S16: + xaiCast3DFromU16ToS16(inTile, outTile); + break; + case XAI_U32: + xaiCast3DFromU16ToU32(inTile, outTile); + break; + case XAI_S32: + xaiCast3DFromU16ToS32(inTile, outTile); + break; + case XAI_U64: +#ifdef IVP_LAVN_4X64U_XP + xaiCast3DFromU16ToU64(inTile, outTile); +#else + xaiCast3DScalar_I64(inTile, outTile); +#endif + break; + case XAI_S64: +#ifdef IVP_LAVN_4X64U_XP + xaiCast3DFromU16ToS64(inTile, outTile); +#else + xaiCast3DScalar_I64(inTile, outTile); +#endif + break; +#if ((XCHAL_HAVE_VISION_HP_VFPU == 1) || (XCHAL_HAVE_BBENEP_HP_VFPU == 1)) + case XAI_F16: + xaiCast3DFromU16ToF16(inTile, outTile); + break; +#endif +#if ((XCHAL_HAVE_VISION_SP_VFPU == 1) || (XCHAL_HAVE_BBENEP_SP_VFPU == 1)) + case XAI_F32: + xaiCast3DFromU16ToF32(inTile, outTile); + break; +#endif + default: + return(XAI_ERR_DATATYPE); + break; + } + } + else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S16)) + { + switch (XAI_TILE3D_GET_ELEMENT_TYPE(outTile)) + { + case XAI_U8: + xaiCast3DFromS16ToU8(inTile, outTile); + break; + case XAI_S8: + xaiCast3DFromS16ToS8(inTile, outTile); + break; + case XAI_U16: + xaiCast3DFromS16ToU16(inTile, outTile); + break; + case XAI_U32: + xaiCast3DFromS16ToU32(inTile, outTile); + break; + case XAI_S32: + xaiCast3DFromS16ToS32(inTile, outTile); + break; + case XAI_U64: +#ifdef IVP_LAVN_4X64U_XP + xaiCast3DFromS16ToU64(inTile, outTile); +#else + xaiCast3DScalar_I64(inTile, outTile); +#endif + break; + case XAI_S64: +#ifdef IVP_LAVN_4X64U_XP + xaiCast3DFromS16ToS64(inTile, outTile); +#else + xaiCast3DScalar_I64(inTile, outTile); +#endif + break; +#if ((XCHAL_HAVE_VISION_HP_VFPU == 1) || (XCHAL_HAVE_BBENEP_HP_VFPU == 1)) + case XAI_F16: + xaiCast3DFromS16ToF16(inTile, outTile); + break; +#endif +#if ((XCHAL_HAVE_VISION_SP_VFPU == 1) || (XCHAL_HAVE_BBENEP_SP_VFPU == 1)) + case XAI_F32: + xaiCast3DFromS16ToF32(inTile, outTile); + break; +#endif + default: + 
return(XAI_ERR_DATATYPE); + break; + } + } + else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_U32)) + { + switch (XAI_TILE3D_GET_ELEMENT_TYPE(outTile)) + { + case XAI_U8: + xaiCast3DFromU32ToU8(inTile, outTile); + break; + case XAI_S8: + xaiCast3DFromU32ToS8(inTile, outTile); + break; + case XAI_U16: + xaiCast3DFromU32ToU16(inTile, outTile); + break; + case XAI_S16: + xaiCast3DFromU32ToS16(inTile, outTile); + break; + case XAI_S32: + xaiCast3DFromU32ToS32(inTile, outTile); + break; + case XAI_U64: +#ifdef IVP_LAVN_4X64U_XP + xaiCast3DFromU32ToU64(inTile, outTile); +#else + xaiCast3DScalar_I64(inTile, outTile); +#endif + break; + case XAI_S64: +#ifdef IVP_LAVN_4X64U_XP + xaiCast3DFromU32ToS64(inTile, outTile); +#else + xaiCast3DScalar_I64(inTile, outTile); +#endif + break; +#if ((XCHAL_HAVE_VISION_HP_VFPU == 1) || (XCHAL_HAVE_BBENEP_HP_VFPU == 1)) + case XAI_F16: + xaiCast3DFromU32ToF16(inTile, outTile); + break; +#endif +#if ((XCHAL_HAVE_VISION_SP_VFPU == 1) || (XCHAL_HAVE_BBENEP_SP_VFPU == 1)) + case XAI_F32: + xaiCast3DFromU32ToF32(inTile, outTile); + break; +#endif + default: + return(XAI_ERR_DATATYPE); + break; + } + } + else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S32)) + { + switch (XAI_TILE3D_GET_ELEMENT_TYPE(outTile)) + { + case XAI_U8: + xaiCast3DFromS32ToU8(inTile, outTile); + break; + case XAI_S8: + xaiCast3DFromS32ToS8(inTile, outTile); + break; + case XAI_U16: + xaiCast3DFromS32ToU16(inTile, outTile); + break; + case XAI_S16: + xaiCast3DFromS32ToS16(inTile, outTile); + break; + case XAI_U32: + xaiCast3DFromS32ToU32(inTile, outTile); + break; + case XAI_U64: +#ifdef IVP_LAVN_4X64U_XP + xaiCast3DFromS32ToU64(inTile, outTile); +#else + xaiCast3DScalar_I64(inTile, outTile); +#endif + break; + case XAI_S64: +#ifdef IVP_LAVN_4X64U_XP + xaiCast3DFromS32ToS64(inTile, outTile); +#else + xaiCast3DScalar_I64(inTile, outTile); +#endif + break; +#if ((XCHAL_HAVE_VISION_HP_VFPU == 1) || (XCHAL_HAVE_BBENEP_HP_VFPU == 1)) + case XAI_F16: + xaiCast3DFromS32ToF16(inTile, 
outTile); + break; +#endif +#if ((XCHAL_HAVE_VISION_SP_VFPU == 1) || (XCHAL_HAVE_BBENEP_SP_VFPU == 1)) + case XAI_F32: + xaiCast3DFromS32ToF32(inTile, outTile); + break; +#endif + default: + return(XAI_ERR_DATATYPE); + break; + } + } + else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_U64)) + { +#ifdef IVP_LAVN_4X64U_XP + switch (XAI_TILE3D_GET_ELEMENT_TYPE(outTile)) + { + case XAI_U8: + xaiCast3DFromU64ToU8(inTile, outTile); + break; + case XAI_S8: + xaiCast3DFromU64ToS8(inTile, outTile); + break; + case XAI_U16: + xaiCast3DFromU64ToU16(inTile, outTile); + break; + case XAI_S16: + xaiCast3DFromU64ToS16(inTile, outTile); + break; + case XAI_U32: + xaiCast3DFromU64ToU32(inTile, outTile); + break; + case XAI_S32: + xaiCast3DFromU64ToS32(inTile, outTile); + break; + case XAI_S64: + xaiCast3DFromU64ToS64(inTile, outTile); + break; +#if ((XCHAL_HAVE_VISION_HP_VFPU == 1) || (XCHAL_HAVE_BBENEP_HP_VFPU == 1)) + case XAI_F16: + xaiCast3DFromU64ToF16(inTile, outTile); + break; +#endif +#if ((XCHAL_HAVE_VISION_SP_VFPU == 1) || (XCHAL_HAVE_BBENEP_SP_VFPU == 1)) + case XAI_F32: + xaiCast3DFromU64ToF32(inTile, outTile); + break; +#endif + default: + return(XAI_ERR_DATATYPE); + break; + } +#else //#ifdef IVP_LAVN_4X64U_XP + if (!XAI_TILE3D_CHECK_TYPE(outTile, XAI_U64)) + { + xaiCast3DScalar_I64(inTile, outTile); + } + else + { + return(XAI_ERR_DATATYPE); + } +#endif //#ifdef IVP_LAVN_4X64U_XP + } + else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_S64)) + { +#ifdef IVP_LAVN_4X64U_XP + switch (XAI_TILE3D_GET_ELEMENT_TYPE(outTile)) + { + case XAI_U8: + xaiCast3DFromS64ToU8(inTile, outTile); + break; + case XAI_S8: + xaiCast3DFromS64ToS8(inTile, outTile); + break; + case XAI_U16: + xaiCast3DFromS64ToU16(inTile, outTile); + break; + case XAI_S16: + xaiCast3DFromS64ToS16(inTile, outTile); + break; + case XAI_U32: + xaiCast3DFromS64ToU32(inTile, outTile); + break; + case XAI_S32: + xaiCast3DFromS64ToS32(inTile, outTile); + break; + case XAI_U64: + xaiCast3DFromS64ToU64(inTile, outTile); + break; 
+#if ((XCHAL_HAVE_VISION_HP_VFPU == 1) || (XCHAL_HAVE_BBENEP_HP_VFPU == 1)) + case XAI_F16: + xaiCast3DFromS64ToF16(inTile, outTile); + break; +#endif +#if ((XCHAL_HAVE_VISION_SP_VFPU == 1) || (XCHAL_HAVE_BBENEP_SP_VFPU == 1)) + case XAI_F32: + xaiCast3DFromS64ToF32(inTile, outTile); + break; +#endif + default: + return(XAI_ERR_DATATYPE); + break; + } +#else //#ifdef IVP_LAVN_4X64U_XP + if (!XAI_TILE3D_CHECK_TYPE(outTile, XAI_S64)) + { + xaiCast3DScalar_I64(inTile, outTile); + } + else + { + return(XAI_ERR_DATATYPE); + } +#endif //#ifdef IVP_LAVN_4X64U_XP + } +#if ((XCHAL_HAVE_VISION_HP_VFPU == 1) || (XCHAL_HAVE_BBENEP_HP_VFPU == 1)) + else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_F16)) + { + switch (XAI_TILE3D_GET_ELEMENT_TYPE(outTile)) + { + case XAI_U8: + xaiCast3DFromF16ToU8(inTile, outTile); + break; + case XAI_S8: + xaiCast3DFromF16ToS8(inTile, outTile); + break; + case XAI_U16: + xaiCast3DFromF16ToU16(inTile, outTile); + break; + case XAI_S16: + xaiCast3DFromF16ToS16(inTile, outTile); + break; + case XAI_U32: + xaiCast3DFromF16ToU32(inTile, outTile); + break; + case XAI_S32: + xaiCast3DFromF16ToS32(inTile, outTile); + break; + case XAI_U64: +#ifdef IVP_LAVN_4X64U_XP + xaiCast3DFromF16ToU64(inTile, outTile); +#else + xaiCast3DScalar_I64(inTile, outTile); +#endif + break; + case XAI_S64: +#ifdef IVP_LAVN_4X64U_XP + xaiCast3DFromF16ToS64(inTile, outTile); +#else + xaiCast3DScalar_I64(inTile, outTile); +#endif + break; +#if ((XCHAL_HAVE_VISION_SP_VFPU == 1) || (XCHAL_HAVE_BBENEP_SP_VFPU == 1)) + case XAI_F32: + xaiCast3DFromF16ToF32(inTile, outTile); + break; +#endif + default: + return(XAI_ERR_DATATYPE); + break; + } + } +#endif +#if ((XCHAL_HAVE_VISION_SP_VFPU == 1) || (XCHAL_HAVE_BBENEP_SP_VFPU == 1)) + else if (XAI_TILE3D_CHECK_TYPE(inTile, XAI_F32)) + { + switch (XAI_TILE3D_GET_ELEMENT_TYPE(outTile)) + { + case XAI_U8: + xaiCast3DFromF32ToU8(inTile, outTile); + break; + case XAI_S8: + xaiCast3DFromF32ToS8(inTile, outTile); + break; + case XAI_U16: + 
xaiCast3DFromF32ToU16(inTile, outTile); + break; + case XAI_S16: + xaiCast3DFromF32ToS16(inTile, outTile); + break; + case XAI_U32: + xaiCast3DFromF32ToU32(inTile, outTile); + break; + case XAI_S32: + xaiCast3DFromF32ToS32(inTile, outTile); + break; + case XAI_U64: +#ifdef IVP_LAVN_4X64U_XP + xaiCast3DFromF32ToU64(inTile, outTile); +#else + xaiCast3DScalar_I64(inTile, outTile); +#endif + break; + case XAI_S64: +#ifdef IVP_LAVN_4X64U_XP + xaiCast3DFromF32ToS64(inTile, outTile); +#else + xaiCast3DScalar_I64(inTile, outTile); +#endif + break; +#if ((XCHAL_HAVE_VISION_HP_VFPU == 1) || (XCHAL_HAVE_BBENEP_HP_VFPU == 1)) + case XAI_F16: + xaiCast3DFromF32ToF16(inTile, outTile); + break; +#endif + default: + return(XAI_ERR_DATATYPE); + break; + } + } +#endif + else + { + return(XAI_ERR_DATATYPE); + } + return(XAI_ERROR_STATUS()); +} + +/* ----------------------------------------------------------------------------------------------------------------------- */ +#else // Call the reference code only for MathX DSPs for now +/* ----------------------------------------------------------------------------------------------------------------------- */ +#if ((XCHAL_HAVE_CONNX_B_HP_VFPU == 1) || (XCHAL_HAVE_BBENEP_SP_VFPU == 1)) +static float fp32_from_bits1(uint32_t w) +{ + union + { + uint32_t as_bits; + float as_value; + } fp32 = { w }; + return(fp32.as_value); +} + +static uint32_t fp32_to_bits1(float f) +{ + union + { + float as_value; + uint32_t as_bits; + } fp32 = { f }; + return(fp32.as_bits); +} + +static float convert_fp16_to_fp32(uint16_t h) +{ + const uint32_t w = (uint32_t) h << 16; + const uint32_t sign = w & UINT32_C(0x80000000); + const uint32_t two_w = w + w; + const uint32_t exp_offset = UINT32_C(0xE0) << 23; + const float exp_scale = fp32_from_bits1(UINT32_C(0x7800000)); + const float normalized_value = fp32_from_bits1((two_w >> 4) + exp_offset) * exp_scale; + const uint32_t magic_mask = UINT32_C(126) << 23; + const float magic_bias = 0.5f; + const float 
denormalized_value = fp32_from_bits1((two_w >> 17) | magic_mask) - magic_bias; + const uint32_t denormalized_cutoff = UINT32_C(1) << 27; + const uint32_t result = sign | + (two_w < denormalized_cutoff ? fp32_to_bits1(denormalized_value) : fp32_to_bits1(normalized_value)); + return(fp32_from_bits1(result)); +} + +static uint16_t convert_fp32_to_fp16(float f) +{ + const float scale_to_inf = fp32_from_bits1(UINT32_C(0x77800000)); + const float scale_to_zero = fp32_from_bits1(UINT32_C(0x08800000)); + float base = (fabsf(f) * scale_to_inf) * scale_to_zero; + + const uint32_t w = (uint32_t) fp32_to_bits1(f); + const uint32_t shl1_w = w + w; + const uint32_t sign = w & UINT32_C(0x80000000); + uint32_t bias = shl1_w & UINT32_C(0xFF000000); + if (bias < UINT32_C(0x71000000)) + { + bias = UINT32_C(0x71000000); + } + + base = fp32_from_bits1((bias >> 1) + UINT32_C(0x07800000)) + base; + const uint32_t bits = fp32_to_bits1(base); + const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00); + const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF); + const uint32_t nonsign = exp_bits + mantissa_bits; + return((sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? 
UINT16_C(0x7E00) : nonsign)); +} + +/**************************** xaiCast3D *****************************************/ +/* Description : General API for data casting */ +/* Inputs : inTile */ +/* Outputs : XAI Error Code */ +/* InOuts : outTile */ +/********************************************************************************/ +XAI_ERR_TYPE xaiCast3D(const xai_pTile3D inTile, + xai_pTile3D outTile) +{ + XAI_ERROR_CHECKS() + { + XAI_CHECK_POINTER(inTile); + XAI_CHECK_POINTER(outTile); + XAI_CHECK_TILE3D_SIZE_EQ(inTile, outTile); + XAI_CHECK_ERROR(XAI_TILE3D_GET_DATA_ORDER(inTile) == XAI_TILE3D_GET_DATA_ORDER(outTile), + XAI_ERR_BADARG, "\nInput Data Order %d and Output Data Order %d are not same", \ + XAI_TILE3D_GET_DATA_ORDER(inTile), XAI_TILE3D_GET_DATA_ORDER(outTile)); + } + + /* Get tile parameters */ + const int32_t dim1Size = XAI_TILE3D_GET_DIM1(inTile); + const int32_t dim2Size = XAI_TILE3D_GET_DIM2(inTile); + const int32_t dim3Size = XAI_TILE3D_GET_DIM3(inTile); + const int32_t inPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile); + const int32_t inPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile); + const int32_t outPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile); + const int32_t outPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile); + + /* Input data pointers */ + uint8_t *pIn_8bU = (uint8_t *) XAI_TILE3D_GET_DATA_PTR(inTile); + int8_t *pIn_8b = (int8_t *) XAI_TILE3D_GET_DATA_PTR(inTile); + uint16_t *pIn_16bU = (uint16_t *) XAI_TILE3D_GET_DATA_PTR(inTile); + int16_t *pIn_16b = (int16_t *) XAI_TILE3D_GET_DATA_PTR(inTile); + uint32_t *pIn_32bU = (uint32_t *) XAI_TILE3D_GET_DATA_PTR(inTile); + int32_t *pIn_32b = (int32_t *) XAI_TILE3D_GET_DATA_PTR(inTile); + uint64_t *pIn_64bU = (uint64_t *) XAI_TILE3D_GET_DATA_PTR(inTile); + int64_t *pIn_64b = (int64_t *) XAI_TILE3D_GET_DATA_PTR(inTile); +#if (XCHAL_HAVE_CONNX_B_HP_VFPU == 1) + xb_f16 *pIn_f16b = (xb_f16 *) XAI_TILE3D_GET_DATA_PTR(inTile); +#endif + float *pIn_f32b = (float *) XAI_TILE3D_GET_DATA_PTR(inTile); + + /* Output data 
pointers */ + uint8_t *pOut_8bU = (uint8_t *) XAI_TILE3D_GET_DATA_PTR(outTile); + int8_t *pOut_8b = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile); + uint16_t *pOut_16bU = (uint16_t *) XAI_TILE3D_GET_DATA_PTR(outTile); + int16_t *pOut_16b = (int16_t *) XAI_TILE3D_GET_DATA_PTR(outTile); + uint32_t *pOut_32bU = (uint32_t *) XAI_TILE3D_GET_DATA_PTR(outTile); + int32_t *pOut_32b = (int32_t *) XAI_TILE3D_GET_DATA_PTR(outTile); + uint64_t *pOut_64bU = (uint64_t *) XAI_TILE3D_GET_DATA_PTR(outTile); + int64_t *pOut_64b = (int64_t *) XAI_TILE3D_GET_DATA_PTR(outTile); +#if (XCHAL_HAVE_CONNX_B_HP_VFPU == 1) + xb_f16 *pOut_f16b = (xb_f16 *) XAI_TILE3D_GET_DATA_PTR(outTile); +#endif + float *pOut_f32b = (float *) XAI_TILE3D_GET_DATA_PTR(outTile); + + uint16_t temp; + int32_t x, y, z; + + for (z = 0; z < dim3Size; z++) /* along 3rd dimension */ + { + for (y = 0; y < dim2Size; y++) /* along 2nd dimension */ + { + for (x = 0; x < dim1Size; x++) /* along 1st dimension */ + { + // Conversions to U64 + if (XAI_TILE3D_CHECK_TYPE(outTile, XAI_U64)) + { + switch (XAI_TILE3D_GET_ELEMENT_TYPE(inTile)) + { + // U8 -> U64 + case XAI_U8: + pOut_64bU[z * outPitch2 + y * outPitch1 + x] = (uint64_t) pIn_8bU[z * inPitch2 + y * inPitch1 + x]; + break; + // S8 -> U64 + case XAI_S8: + pOut_64bU[z * outPitch2 + y * outPitch1 + x] = (uint64_t) pIn_8b[z * inPitch2 + y * inPitch1 + x]; + break; + // U16 -> U64 + case XAI_U16: + pOut_64bU[z * outPitch2 + y * outPitch1 + x] = (uint64_t) pIn_16bU[z * inPitch2 + y * inPitch1 + x]; + break; + // S16 -> U64 + case XAI_S16: + pOut_64bU[z * outPitch2 + y * outPitch1 + x] = (uint64_t) pIn_16b[z * inPitch2 + y * inPitch1 + x]; + break; + // U32 -> U64 + case XAI_U32: + pOut_64bU[z * outPitch2 + y * outPitch1 + x] = (uint64_t) pIn_32bU[z * inPitch2 + y * inPitch1 + x]; + break; + // S32 -> U64 + case XAI_S32: + pOut_64bU[z * outPitch2 + y * outPitch1 + x] = (uint64_t) pIn_32b[z * inPitch2 + y * inPitch1 + x]; + break; + // S64 -> U64 + case XAI_S64: + pOut_64bU[z * 
outPitch2 + y * outPitch1 + x] = (uint64_t) pIn_64b[z * inPitch2 + y * inPitch1 + x]; + break; + // F32 -> U64 + case XAI_F32: + pOut_64bU[z * outPitch2 + y * outPitch1 + x] = (uint64_t) pIn_f32b[z * inPitch2 + y * inPitch1 + x]; + break; +#if (XCHAL_HAVE_CONNX_B_HP_VFPU == 1) + // F16 -> U64 + case XAI_F16: + memcpy(&temp, &pIn_f16b[z * inPitch2 + y * inPitch1 + x], 2); // Strict Aliasing Rule, TENX-63685 + pOut_64bU[z * outPitch2 + y * outPitch1 + x] = (uint64_t) convert_fp16_to_fp32(temp); + break; +#endif + default: + return(XAI_ERR_NO_VARIANT); + break; + } + } + // Conversions to S64 + else if (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S64)) + { + switch (XAI_TILE3D_GET_ELEMENT_TYPE(inTile)) + { + // U8 -> S64 + case XAI_U8: + pOut_64b[z * outPitch2 + y * outPitch1 + x] = (int64_t) pIn_8bU[z * inPitch2 + y * inPitch1 + x]; + break; + // S8 -> S64 + case XAI_S8: + pOut_64b[z * outPitch2 + y * outPitch1 + x] = (int64_t) pIn_8b[z * inPitch2 + y * inPitch1 + x]; + break; + // U16 -> S64 + case XAI_U16: + pOut_64b[z * outPitch2 + y * outPitch1 + x] = (int64_t) pIn_16bU[z * inPitch2 + y * inPitch1 + x]; + break; + // S16 -> S64 + case XAI_S16: + pOut_64b[z * outPitch2 + y * outPitch1 + x] = (int64_t) pIn_16b[z * inPitch2 + y * inPitch1 + x]; + break; + // U32 -> S64 + case XAI_U32: + pOut_64b[z * outPitch2 + y * outPitch1 + x] = (int64_t) pIn_32bU[z * inPitch2 + y * inPitch1 + x]; + break; + // S32 -> S64 + case XAI_S32: + pOut_64b[z * outPitch2 + y * outPitch1 + x] = (int64_t) pIn_32b[z * inPitch2 + y * inPitch1 + x]; + break; + // U64 -> S64 + case XAI_U64: + pOut_64b[z * outPitch2 + y * outPitch1 + x] = (int64_t) pIn_64bU[z * inPitch2 + y * inPitch1 + x]; + break; + // F32 -> S64 + case XAI_F32: + pOut_64b[z * outPitch2 + y * outPitch1 + x] = (int64_t) pIn_f32b[z * inPitch2 + y * inPitch1 + x]; + break; +#if (XCHAL_HAVE_CONNX_B_HP_VFPU == 1) + // F16 -> S64 + case XAI_F16: + memcpy(&temp, &pIn_f16b[z * inPitch2 + y * inPitch1 + x], 2); + pOut_64b[z * outPitch2 + y *
outPitch1 + x] = (int64_t) convert_fp16_to_fp32(temp); + break; +#endif + default: + return(XAI_ERR_NO_VARIANT); + break; + } + } + // Conversions to U32 + else if (XAI_TILE3D_CHECK_TYPE(outTile, XAI_U32)) + { + switch (XAI_TILE3D_GET_ELEMENT_TYPE(inTile)) + { + // U8 -> U32 + case XAI_U8: + pOut_32bU[z * outPitch2 + y * outPitch1 + x] = (uint32_t) pIn_8bU[z * inPitch2 + y * inPitch1 + x]; + break; + // S8 -> U32 + case XAI_S8: + pOut_32bU[z * outPitch2 + y * outPitch1 + x] = (uint32_t) pIn_8b[z * inPitch2 + y * inPitch1 + x]; + break; + // U16 -> U32 + case XAI_U16: + pOut_32bU[z * outPitch2 + y * outPitch1 + x] = (uint32_t) pIn_16bU[z * inPitch2 + y * inPitch1 + x]; + break; + // S16 -> U32 + case XAI_S16: + pOut_32bU[z * outPitch2 + y * outPitch1 + x] = (uint32_t) pIn_16b[z * inPitch2 + y * inPitch1 + x]; + break; + // S32 -> U32 + case XAI_S32: + pOut_32bU[z * outPitch2 + y * outPitch1 + x] = (uint32_t) pIn_32b[z * inPitch2 + y * inPitch1 + x]; + break; + // U64 -> U32 + case XAI_U64: + pOut_32bU[z * outPitch2 + y * outPitch1 + x] = (uint32_t) pIn_64bU[z * inPitch2 + y * inPitch1 + x]; + break; + // S64 -> U32 + case XAI_S64: + pOut_32bU[z * outPitch2 + y * outPitch1 + x] = (uint32_t) pIn_64b[z * inPitch2 + y * inPitch1 + x]; + break; + // F32 -> U32 + case XAI_F32: + pOut_32bU[z * outPitch2 + y * outPitch1 + x] = (uint32_t) pIn_f32b[z * inPitch2 + y * inPitch1 + x]; + break; +#if (XCHAL_HAVE_CONNX_B_HP_VFPU == 1) + // F16 -> U32 + case XAI_F16: + memcpy(&temp, &pIn_f16b[z * inPitch2 + y * inPitch1 + x], 2); + pOut_32bU[z * outPitch2 + y * outPitch1 + x] = (uint32_t) convert_fp16_to_fp32(temp); + break; +#endif + default: + return(XAI_ERR_NO_VARIANT); + break; + } + } + // Conversions to S32 + else if (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S32)) + { + switch (XAI_TILE3D_GET_ELEMENT_TYPE(inTile)) + { + // U8 -> S32 + case XAI_U8: + pOut_32b[z * outPitch2 + y * outPitch1 + x] = (int32_t) pIn_8bU[z * inPitch2 + y * inPitch1 + x]; + break; + // S8 -> S32 + case 
XAI_S8: + pOut_32b[z * outPitch2 + y * outPitch1 + x] = (int32_t) pIn_8b[z * inPitch2 + y * inPitch1 + x]; + break; + // U16 -> S32 + case XAI_U16: + pOut_32b[z * outPitch2 + y * outPitch1 + x] = (int32_t) pIn_16bU[z * inPitch2 + y * inPitch1 + x]; + break; + // S16 -> S32 + case XAI_S16: + pOut_32b[z * outPitch2 + y * outPitch1 + x] = (int32_t) pIn_16b[z * inPitch2 + y * inPitch1 + x]; + break; + // U32 -> S32 + case XAI_U32: + pOut_32b[z * outPitch2 + y * outPitch1 + x] = (int32_t) pIn_32bU[z * inPitch2 + y * inPitch1 + x]; + break; + // U64 -> S32 + case XAI_U64: + pOut_32b[z * outPitch2 + y * outPitch1 + x] = (int32_t) pIn_64bU[z * inPitch2 + y * inPitch1 + x]; + break; + // S64 -> S32 + case XAI_S64: + pOut_32b[z * outPitch2 + y * outPitch1 + x] = (int32_t) pIn_64b[z * inPitch2 + y * inPitch1 + x]; + break; + // F32 -> S32 + case XAI_F32: + pOut_32b[z * outPitch2 + y * outPitch1 + x] = (int32_t) pIn_f32b[z * inPitch2 + y * inPitch1 + x]; + break; +#if (XCHAL_HAVE_CONNX_B_HP_VFPU == 1) + // F16 -> S32 + case XAI_F16: + memcpy(&temp, &pIn_f16b[z * inPitch2 + y * inPitch1 + x], 2); + pOut_32b[z * outPitch2 + y * outPitch1 + x] = (int32_t) convert_fp16_to_fp32(temp); + break; +#endif + default: + return(XAI_ERR_NO_VARIANT); + break; + } + } + // Conversions to U16 + else if (XAI_TILE3D_CHECK_TYPE(outTile, XAI_U16)) + { + switch (XAI_TILE3D_GET_ELEMENT_TYPE(inTile)) + { + // U8 -> U16 + case XAI_U8: + pOut_16bU[z * outPitch2 + y * outPitch1 + x] = (uint16_t) pIn_8bU[z * inPitch2 + y * inPitch1 + x]; + break; + // S8 -> U16 + case XAI_S8: + pOut_16bU[z * outPitch2 + y * outPitch1 + x] = (uint16_t) pIn_8b[z * inPitch2 + y * inPitch1 + x]; + break; + // S16 -> U16 + case XAI_S16: + pOut_16bU[z * outPitch2 + y * outPitch1 + x] = (uint16_t) pIn_16b[z * inPitch2 + y * inPitch1 + x]; + break; + // U32 -> U16 + case XAI_U32: + pOut_16bU[z * outPitch2 + y * outPitch1 + x] = (uint16_t) pIn_32bU[z * inPitch2 + y * inPitch1 + x]; + break; + // S32 -> U16 + case XAI_S32: + 
pOut_16bU[z * outPitch2 + y * outPitch1 + x] = (uint16_t) pIn_32b[z * inPitch2 + y * inPitch1 + x]; + break; + // U64 -> U16 + case XAI_U64: + pOut_16bU[z * outPitch2 + y * outPitch1 + x] = (uint16_t) pIn_64bU[z * inPitch2 + y * inPitch1 + x]; + break; + // S64 -> U16 + case XAI_S64: + pOut_16bU[z * outPitch2 + y * outPitch1 + x] = (uint16_t) pIn_64b[z * inPitch2 + y * inPitch1 + x]; + break; + // F32 -> U16 + case XAI_F32: + pOut_16bU[z * outPitch2 + y * outPitch1 + x] = (uint16_t) pIn_f32b[z * inPitch2 + y * inPitch1 + x]; + break; +#if (XCHAL_HAVE_CONNX_B_HP_VFPU == 1) + // F16 -> U16 + case XAI_F16: + memcpy(&temp, &pIn_f16b[z * inPitch2 + y * inPitch1 + x], 2); + pOut_16bU[z * outPitch2 + y * outPitch1 + x] = (uint16_t) convert_fp16_to_fp32(temp); + break; +#endif + default: + return(XAI_ERR_NO_VARIANT); + break; + } + } + // Conversions to S16 + else if (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16)) + { + switch (XAI_TILE3D_GET_ELEMENT_TYPE(inTile)) + { + // U8 -> S16 + case XAI_U8: + pOut_16b[z * outPitch2 + y * outPitch1 + x] = (int16_t) pIn_8bU[z * inPitch2 + y * inPitch1 + x]; + break; + // S8 -> S16 + case XAI_S8: + pOut_16b[z * outPitch2 + y * outPitch1 + x] = (int16_t) pIn_8b[z * inPitch2 + y * inPitch1 + x]; + break; + // U16 -> S16 + case XAI_U16: + pOut_16b[z * outPitch2 + y * outPitch1 + x] = (int16_t) pIn_16bU[z * inPitch2 + y * inPitch1 + x]; + break; + // U32 -> S16 + case XAI_U32: + pOut_16b[z * outPitch2 + y * outPitch1 + x] = (int16_t) pIn_32bU[z * inPitch2 + y * inPitch1 + x]; + break; + // S32 -> S16 + case XAI_S32: + pOut_16b[z * outPitch2 + y * outPitch1 + x] = (int16_t) pIn_32b[z * inPitch2 + y * inPitch1 + x]; + break; + // U64 -> S16 + case XAI_U64: + pOut_16b[z * outPitch2 + y * outPitch1 + x] = (int16_t) pIn_64bU[z * inPitch2 + y * inPitch1 + x]; + break; + // S64 -> S16 + case XAI_S64: + pOut_16b[z * outPitch2 + y * outPitch1 + x] = (int16_t) pIn_64b[z * inPitch2 + y * inPitch1 + x]; + break; + // F32 -> S16 + case XAI_F32: + pOut_16b[z
* outPitch2 + y * outPitch1 + x] = (int16_t) pIn_f32b[z * inPitch2 + y * inPitch1 + x]; + break; +#if (XCHAL_HAVE_CONNX_B_HP_VFPU == 1) + // F16 -> S16 + case XAI_F16: + memcpy(&temp, &pIn_f16b[z * inPitch2 + y * inPitch1 + x], 2); + pOut_16b[z * outPitch2 + y * outPitch1 + x] = (int16_t) convert_fp16_to_fp32(temp); + break; +#endif + default: + return(XAI_ERR_NO_VARIANT); + break; + } + } + // Conversions to U8 + else if (XAI_TILE3D_CHECK_TYPE(outTile, XAI_U8)) + { + switch (XAI_TILE3D_GET_ELEMENT_TYPE(inTile)) + { + // S8 -> U8 + case XAI_S8: + pOut_8bU[z * outPitch2 + y * outPitch1 + x] = (uint8_t) pIn_8b[z * inPitch2 + y * inPitch1 + x]; + break; + // U16 -> U8 + case XAI_U16: + pOut_8bU[z * outPitch2 + y * outPitch1 + x] = (uint8_t) pIn_16bU[z * inPitch2 + y * inPitch1 + x]; + break; + // S16 -> U8 + case XAI_S16: + pOut_8bU[z * outPitch2 + y * outPitch1 + x] = (uint8_t) pIn_16b[z * inPitch2 + y * inPitch1 + x]; + break; + // U32 -> U8 + case XAI_U32: + pOut_8bU[z * outPitch2 + y * outPitch1 + x] = (uint8_t) pIn_32bU[z * inPitch2 + y * inPitch1 + x]; + break; + // S32 -> U8 + case XAI_S32: + pOut_8bU[z * outPitch2 + y * outPitch1 + x] = (uint8_t) pIn_32b[z * inPitch2 + y * inPitch1 + x]; + break; + // U64 -> U8 + case XAI_U64: + pOut_8bU[z * outPitch2 + y * outPitch1 + x] = (uint8_t) pIn_64bU[z * inPitch2 + y * inPitch1 + x]; + break; + // S64 -> U8 + case XAI_S64: + pOut_8bU[z * outPitch2 + y * outPitch1 + x] = (uint8_t) pIn_64b[z * inPitch2 + y * inPitch1 + x]; + break; + // F32 -> U8 + case XAI_F32: + pOut_8bU[z * outPitch2 + y * outPitch1 + x] = (uint8_t) pIn_f32b[z * inPitch2 + y * inPitch1 + x]; + break; +#if (XCHAL_HAVE_CONNX_B_HP_VFPU == 1) + // F16 -> U8 + case XAI_F16: + memcpy(&temp, &pIn_f16b[z * inPitch2 + y * inPitch1 + x], 2); + pOut_8bU[z * outPitch2 + y * outPitch1 + x] = (uint8_t) convert_fp16_to_fp32(temp); + break; +#endif + default: + return(XAI_ERR_NO_VARIANT); + break; + } + } + // Conversions to S8 + else if
(XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8)) + { + switch (XAI_TILE3D_GET_ELEMENT_TYPE(inTile)) + { + // U8 -> S8 + case XAI_U8: + pOut_8b[z * outPitch2 + y * outPitch1 + x] = (int8_t) pIn_8bU[z * inPitch2 + y * inPitch1 + x]; + break; + // U16 -> S8 + case XAI_U16: + pOut_8b[z * outPitch2 + y * outPitch1 + x] = (int8_t) pIn_16bU[z * inPitch2 + y * inPitch1 + x]; + break; + // S16 -> S8 + case XAI_S16: + pOut_8b[z * outPitch2 + y * outPitch1 + x] = (int8_t) pIn_16b[z * inPitch2 + y * inPitch1 + x]; + break; + // U32 -> S8 + case XAI_U32: + pOut_8b[z * outPitch2 + y * outPitch1 + x] = (int8_t) pIn_32bU[z * inPitch2 + y * inPitch1 + x]; + break; + // S32 -> S8 + case XAI_S32: + pOut_8b[z * outPitch2 + y * outPitch1 + x] = (int8_t) pIn_32b[z * inPitch2 + y * inPitch1 + x]; + break; + // U64 -> S8 + case XAI_U64: + pOut_8b[z * outPitch2 + y * outPitch1 + x] = (int8_t) pIn_64bU[z * inPitch2 + y * inPitch1 + x]; + break; + // S64 -> S8 + case XAI_S64: + pOut_8b[z * outPitch2 + y * outPitch1 + x] = (int8_t) pIn_64b[z * inPitch2 + y * inPitch1 + x]; + break; + // F32 -> S8 + case XAI_F32: + pOut_8b[z * outPitch2 + y * outPitch1 + x] = (int8_t) pIn_f32b[z * inPitch2 + y * inPitch1 + x]; + break; +#if (XCHAL_HAVE_CONNX_B_HP_VFPU == 1) + // F16 -> S8 + case XAI_F16: + memcpy(&temp, &pIn_f16b[z * inPitch2 + y * inPitch1 + x], 2); + pOut_8b[z * outPitch2 + y * outPitch1 + x] = (int8_t) convert_fp16_to_fp32(temp); + break; +#endif + default: + return(XAI_ERR_NO_VARIANT); + break; + } + } + // Conversions to F32 + else if (XAI_TILE3D_CHECK_TYPE(outTile, XAI_F32)) + { + switch (XAI_TILE3D_GET_ELEMENT_TYPE(inTile)) + { + // U8 -> F32 + case XAI_U8: + pOut_f32b[z * outPitch2 + y * outPitch1 + x] = (float) pIn_8bU[z * inPitch2 + y * inPitch1 + x]; + break; + // S8 -> F32 + case XAI_S8: + pOut_f32b[z * outPitch2 + y * outPitch1 + x] = (float) pIn_8b[z * inPitch2 + y * inPitch1 + x]; + break; + // U16 -> F32 + case XAI_U16: + pOut_f32b[z * outPitch2 + y * outPitch1 + x] = (float)
pIn_16bU[z * inPitch2 + y * inPitch1 + x]; + break; + // S16 -> F32 + case XAI_S16: + pOut_f32b[z * outPitch2 + y * outPitch1 + x] = (float) pIn_16b[z * inPitch2 + y * inPitch1 + x]; + break; + // U32 -> F32 + case XAI_U32: + pOut_f32b[z * outPitch2 + y * outPitch1 + x] = (float) pIn_32bU[z * inPitch2 + y * inPitch1 + x]; + break; + // S32 -> F32 + case XAI_S32: + pOut_f32b[z * outPitch2 + y * outPitch1 + x] = (float) pIn_32b[z * inPitch2 + y * inPitch1 + x]; + break; + // U64 -> F32 + case XAI_U64: + pOut_f32b[z * outPitch2 + y * outPitch1 + x] = (float) pIn_64bU[z * inPitch2 + y * inPitch1 + x]; + break; + // S64 -> F32 + case XAI_S64: + pOut_f32b[z * outPitch2 + y * outPitch1 + x] = (float) pIn_64b[z * inPitch2 + y * inPitch1 + x]; + break; +#if (XCHAL_HAVE_CONNX_B_HP_VFPU == 1) + // F16 -> F32 + case XAI_F16: + memcpy(&temp, &pIn_f16b[z * inPitch2 + y * inPitch1 + x], 2); + pOut_f32b[z * outPitch2 + y * outPitch1 + x] = (float) convert_fp16_to_fp32(temp); + break; +#endif + default: + return(XAI_ERR_NO_VARIANT); + break; + } + } + // Conversions to F16 +#if (XCHAL_HAVE_CONNX_B_HP_VFPU == 1) + else if (XAI_TILE3D_CHECK_TYPE(outTile, XAI_F16)) + { + switch (XAI_TILE3D_GET_ELEMENT_TYPE(inTile)) + { + // U8 -> F16 + case XAI_U8: + temp = convert_fp32_to_fp16((float) pIn_8bU[z * inPitch2 + y * inPitch1 + x]); + memcpy(&pOut_f16b[z * outPitch2 + y * outPitch1 + x], &temp, 2); // Strict Aliasing Rule, TENX-63685 + break; + // S8 -> F16 + case XAI_S8: + temp = convert_fp32_to_fp16((float) pIn_8b[z * inPitch2 + y * inPitch1 + x]); + memcpy(&pOut_f16b[z * outPitch2 + y * outPitch1 + x], &temp, 2); + break; + // U16 -> F16 + case XAI_U16: + temp = convert_fp32_to_fp16((float) pIn_16bU[z * inPitch2 + y * inPitch1 + x]); + memcpy(&pOut_f16b[z * outPitch2 + y * outPitch1 + x], &temp, 2); + break; + // S16 -> F16 + case XAI_S16: + temp = convert_fp32_to_fp16((float) pIn_16b[z * inPitch2 + y * inPitch1 + x]); + memcpy(&pOut_f16b[z * outPitch2 + y * outPitch1 + x], &temp, 2); + 
break; + // U32 -> F16 + case XAI_U32: + temp = convert_fp32_to_fp16((float) pIn_32bU[z * inPitch2 + y * inPitch1 + x]); + memcpy(&pOut_f16b[z * outPitch2 + y * outPitch1 + x], &temp, 2); + break; + // S32 -> F16 + case XAI_S32: + temp = convert_fp32_to_fp16((float) pIn_32b[z * inPitch2 + y * inPitch1 + x]); + memcpy(&pOut_f16b[z * outPitch2 + y * outPitch1 + x], &temp, 2); + break; + // U64 -> F16 + case XAI_U64: + temp = convert_fp32_to_fp16((float) pIn_64bU[z * inPitch2 + y * inPitch1 + x]); + memcpy(&pOut_f16b[z * outPitch2 + y * outPitch1 + x], &temp, 2); + break; + // S64 -> F16 + case XAI_S64: + temp = convert_fp32_to_fp16((float) pIn_64b[z * inPitch2 + y * inPitch1 + x]); + memcpy(&pOut_f16b[z * outPitch2 + y * outPitch1 + x], &temp, 2); + break; + // F32 -> F16 + case XAI_F32: + temp = convert_fp32_to_fp16((float) pIn_f32b[z * inPitch2 + y * inPitch1 + x]); + memcpy(&pOut_f16b[z * outPitch2 + y * outPitch1 + x], &temp, 2); + break; + default: + return(XAI_ERR_NO_VARIANT); + break; + } + } +#endif + } /* end (x = 0; x < dim1Size; x++) loop */ + } /* end (y = 0; y < dim2Size; y++) loop */ + } /* end (z = 0; z < dim3Size; z++) loop */ + + return(XAI_ERROR_STATUS()); +} +#endif // #if (((XCHAL_HAVE_VISION_HP_VFPU == 1) || (XCHAL_HAVE_BBENEP_HP_VFPU == 1)) || ((XCHAL_HAVE_VISION_SP_VFPU == 1) || (XCHAL_HAVE_BBENEP_SP_VFPU == 1))) +/* ----------------------------------------------------------------------------------------------------------------------- */ +#endif // #if XCHAL_HAVE_VISION +/* ----------------------------------------------------------------------------------------------------------------------- */ diff --git a/backends/cadence/vision/third-party/libxai_common/src/cnn_cast.h b/backends/cadence/vision/third-party/libxai_common/src/cnn_cast.h new file mode 100644 index 00000000000..22b20bdec10 --- /dev/null +++ b/backends/cadence/vision/third-party/libxai_common/src/cnn_cast.h @@ -0,0 +1,1890 @@ +/* + * Copyright (c) 2025 by Cadence Design Systems, 
Inc. ALL RIGHTS RESERVED. + * These coded instructions, statements, and computer programs are the + * copyrighted works and confidential proprietary information of + * Cadence Design Systems Inc. They may be adapted and modified by bona fide + * purchasers for internal use, but neither the original nor any adapted + * or modified version may be disclosed or distributed to third parties + * in any manner, medium, or form, in whole or in part, without the prior + * written consent of Cadence Design Systems Inc. This software and its + * derivatives are to be executed solely on products incorporating a Cadence + * Design Systems processor. + */ + +#include "xai_cnn_common.h" + +#ifndef IVP_UNPKU2NX8U_L +#define IVP_UNPKU2NX8U_L(vecIn) xb_vecNx16_rtor_xb_vecNx16U(IVP_MOVNX16_FROM2NX8U(IVP_SEL2NX8UI(0, vecIn, IVP_SELI_8B_INTERLEAVE_1_LO))) +#endif + +#ifndef IVP_UNPKU2NX8_L +#define IVP_UNPKU2NX8_L(vecIn) IVP_MOVNX16_FROM2NX8U(IVP_SEL2NX8UI(0, vecIn, IVP_SELI_8B_INTERLEAVE_1_LO)) +#endif + +#ifndef IVP_UNPKS2NX8_L +#define IVP_UNPKS2NX8_L(vecIn) IVP_MOVNX16_FROM2NX8(IVP_SEL2NX8I(IVP_SRAI2NX8(vecIn, 7), vecIn, IVP_SELI_8B_INTERLEAVE_1_LO)) +#endif + +#ifndef IVP_UNPKUNX16U_L +#define IVP_UNPKUNX16U_L(vecIn) IVP_MOVN_2X32U_FROMNX16(IVP_SELNX16UI(0, vecIn, IVP_SELI_16B_INTERLEAVE_1_LO)) +#endif + +#ifndef IVP_UNPKSNX16_L +#define IVP_UNPKSNX16_L(vecIn) IVP_MOVN_2X32_FROMNX16(IVP_SELNX16I(IVP_SRAINX16(vecIn, 15), vecIn, IVP_SELI_16B_INTERLEAVE_1_LO)) +#endif + +#define UNPKSNX8_L(vecIn) IVP_MOVNX16_FROMN_2X32(IVP_SELN_2X32I(IVP_SRAIN_2X32(IVP_UNPKSNX16_L(IVP_UNPKS2NX8_L(vecIn)), 31), IVP_UNPKSNX16_L(IVP_UNPKS2NX8_L(vecIn)), IVP_SELI_32B_INTERLEAVE_1_LO)) +#define UNPKSNX16_L(vecIn) IVP_MOVNX16_FROMN_2X32(IVP_SELN_2X32I(IVP_SRAIN_2X32(IVP_UNPKSNX16_L(vecIn), 31), IVP_UNPKSNX16_L(vecIn), IVP_SELI_32B_INTERLEAVE_1_LO)) +#define UNPKSNX32_L(vecIn) IVP_MOVNX16_FROMN_2X32(IVP_SELN_2X32I(IVP_SRAIN_2X32(vecIn, 31), vecIn, IVP_SELI_32B_INTERLEAVE_1_LO)) + +#if IN_DATA_TYPE == 
UNSIGNED8BIT +#undef MORPH_IDT_SCALAR +#undef MORPH_IDT_VECTOR +#undef MORPH_IP_PRIME +#undef MORPH_IP_VAR_LOAD + +#define MORPH_IDT_SCALAR uint8_t +#define MORPH_IDT_VECTOR xb_vec2Nx8U +#define MORPH_IP_PRIME IVP_LA2NX8U_PP +#define MORPH_IP_VAR_LOAD IVP_LAV2NX8U_XP + +#if OUT_DATA_TYPE == SIGNED8BIT +#undef MAKE_NAME +#undef MORPH_VECTORIZATIONWIDTH +#undef MORPH_ODT_SCALAR +#undef MORPH_ODT_VECTOR +#undef MORPH_IDT_CAST +#undef MORPH_OP_VAR_STORE +#undef MORPH_OP_FLUSH + +#define MAKE_NAME(name) name ## FromU8ToS8 +#define MORPH_VECTORIZATIONWIDTH 2 * XCHAL_IVPN_SIMD_WIDTH +#define MORPH_ODT_SCALAR int8_t +#define MORPH_ODT_VECTOR xb_vec2Nx8 +#define MORPH_IDT_CAST(vecIn, vecOut) vecOut = xb_vec2Nx8U_rtor_xb_vec2Nx8(vecIn); +#define MORPH_OP_VAR_STORE IVP_SAV2NX8_XP +#define MORPH_OP_FLUSH IVP_SAPOS2NX8_FP + +#elif OUT_DATA_TYPE == UNSIGNED16BIT +#undef MAKE_NAME +#undef MORPH_VECTORIZATIONWIDTH +#undef MORPH_ODT_SCALAR +#undef MORPH_ODT_VECTOR +#undef MORPH_IDT_CAST +#undef MORPH_OP_VAR_STORE +#undef MORPH_OP_FLUSH + +#define MAKE_NAME(name) name ## FromU8ToU16 +#define MORPH_VECTORIZATIONWIDTH XCHAL_IVPN_SIMD_WIDTH +#define MORPH_ODT_SCALAR uint16_t +#define MORPH_ODT_VECTOR xb_vecNx16U +#define MORPH_IDT_CAST(vecIn, vecOut) vecOut = IVP_UNPKU2NX8U_L(vecIn) +#define MORPH_OP_VAR_STORE IVP_SAVNX16U_XP +#define MORPH_OP_FLUSH IVP_SAPOSNX16U_FP + +#elif OUT_DATA_TYPE == SIGNED16BIT +#undef MAKE_NAME +#undef MORPH_VECTORIZATIONWIDTH +#undef MORPH_ODT_SCALAR +#undef MORPH_ODT_VECTOR +#undef MORPH_IDT_CAST +#undef MORPH_OP_VAR_STORE +#undef MORPH_OP_FLUSH + +#define MAKE_NAME(name) name ## FromU8ToS16 +#define MORPH_VECTORIZATIONWIDTH XCHAL_IVPN_SIMD_WIDTH +#define MORPH_ODT_SCALAR int16_t +#define MORPH_ODT_VECTOR xb_vecNx16 +#define MORPH_IDT_CAST(vecIn, vecOut) vecOut = IVP_UNPKU2NX8_L(vecIn) +#define MORPH_OP_VAR_STORE IVP_SAVNX16_XP +#define MORPH_OP_FLUSH IVP_SAPOSNX16_FP + +#elif OUT_DATA_TYPE == UNSIGNED32BIT +#undef MAKE_NAME +#undef 
MORPH_VECTORIZATIONWIDTH +#undef MORPH_ODT_SCALAR +#undef MORPH_ODT_VECTOR +#undef MORPH_IDT_CAST +#undef MORPH_OP_VAR_STORE +#undef MORPH_OP_FLUSH + +#define MAKE_NAME(name) name ## FromU8ToU32 +#define MORPH_VECTORIZATIONWIDTH XCHAL_IVPN_SIMD_WIDTH / 2 +#define MORPH_ODT_SCALAR uint32_t +#define MORPH_ODT_VECTOR xb_vecN_2x32Uv +#define MORPH_IDT_CAST(vecIn, vecOut) vecOut = IVP_UNPKUNX16U_L(IVP_UNPKU2NX8U_L(vecIn)) +#define MORPH_OP_VAR_STORE IVP_SAVN_2X32U_XP +#define MORPH_OP_FLUSH IVP_SAPOSN_2X32U_FP + +#elif OUT_DATA_TYPE == SIGNED32BIT +#undef MAKE_NAME +#undef MORPH_VECTORIZATIONWIDTH +#undef MORPH_ODT_SCALAR +#undef MORPH_ODT_VECTOR +#undef MORPH_IDT_CAST +#undef MORPH_OP_VAR_STORE +#undef MORPH_OP_FLUSH + +#define MAKE_NAME(name) name ## FromU8ToS32 +#define MORPH_VECTORIZATIONWIDTH XCHAL_IVPN_SIMD_WIDTH / 2 +#define MORPH_ODT_SCALAR int32_t +#define MORPH_ODT_VECTOR xb_vecN_2x32v +#define MORPH_IDT_CAST(vecIn, vecOut) vecOut = IVP_UNPKUNX16U_L(IVP_UNPKU2NX8U_L(vecIn)) +#define MORPH_OP_VAR_STORE IVP_SAVN_2X32_XP +#define MORPH_OP_FLUSH IVP_SAPOSN_2X32_FP + +#elif OUT_DATA_TYPE == UNSIGNED64BIT +#ifdef IVP_LAVN_4X64U_XP +#undef MAKE_NAME +#undef MORPH_VECTORIZATIONWIDTH +#undef MORPH_ODT_SCALAR +#undef MORPH_ODT_VECTOR +#undef MORPH_IDT_CAST +#undef MORPH_OP_VAR_STORE +#undef MORPH_OP_FLUSH + +#define MAKE_NAME(name) name ## FromU8ToU64 +#define MORPH_VECTORIZATIONWIDTH XCHAL_IVPN_SIMD_WIDTH / 4 +#define MORPH_ODT_SCALAR uint64_t +#define MORPH_ODT_VECTOR xb_vecN_4x64U +#define MORPH_IDT_CAST(vecIn, vecOut) vecOut = IVP_MOVN_4X64U_FROMNX16(IVP_MOVNX16_FROM2NX8U(IVP_SEL2NX8UI(0, IVP_SEL2NX8UI(0, IVP_SEL2NX8UI(0, vecIn, IVP_SELI_8B_INTERLEAVE_1_LO), IVP_SELI_8B_INTERLEAVE_1_LO), IVP_SELI_8B_INTERLEAVE_1_LO))); +#define MORPH_OP_VAR_STORE IVP_SAVN_4X64U_XP +#define MORPH_OP_FLUSH IVP_SAPOSN_4X64U_FP +#endif //#ifdef IVP_LAVN_4X64U_XP + +#elif OUT_DATA_TYPE == SIGNED64BIT +#ifdef IVP_LAVN_4X64U_XP +#undef MAKE_NAME +#undef MORPH_VECTORIZATIONWIDTH +#undef 
MORPH_ODT_SCALAR +#undef MORPH_ODT_VECTOR +#undef MORPH_IDT_CAST +#undef MORPH_OP_VAR_STORE +#undef MORPH_OP_FLUSH + +#define MAKE_NAME(name) name ## FromU8ToS64 +#define MORPH_VECTORIZATIONWIDTH XCHAL_IVPN_SIMD_WIDTH / 4 +#define MORPH_ODT_SCALAR int64_t +#define MORPH_ODT_VECTOR xb_vecN_4x64 +#define MORPH_IDT_CAST(vecIn, vecOut) vecOut = IVP_MOVN_4X64_FROMNX16(IVP_MOVNX16_FROM2NX8U(IVP_SEL2NX8UI(0, IVP_SEL2NX8UI(0, IVP_SEL2NX8UI(0, vecIn, IVP_SELI_8B_INTERLEAVE_1_LO), IVP_SELI_8B_INTERLEAVE_1_LO), IVP_SELI_8B_INTERLEAVE_1_LO))); +#define MORPH_OP_VAR_STORE IVP_SAVN_4X64_XP +#define MORPH_OP_FLUSH IVP_SAPOSN_4X64_FP +#endif //#ifdef IVP_LAVN_4X64U_XP + +#elif OUT_DATA_TYPE == FLOAT16BIT +#if ((XCHAL_HAVE_VISION_HP_VFPU == 1) || (XCHAL_HAVE_BBENEP_HP_VFPU == 1)) +#undef MAKE_NAME +#undef MORPH_VECTORIZATIONWIDTH +#undef MORPH_ODT_SCALAR +#undef MORPH_ODT_VECTOR +#undef MORPH_IDT_CAST +#undef MORPH_OP_VAR_STORE +#undef MORPH_OP_FLUSH + +#define MAKE_NAME(name) name ## FromU8ToF16 +#define MORPH_VECTORIZATIONWIDTH XCHAL_IVPN_SIMD_WIDTH / 2 +#define MORPH_ODT_SCALAR xb_f16 +#define MORPH_ODT_VECTOR xb_vecNxf16 +#define MORPH_IDT_CAST(vecIn, vecOut) \ + xb_vecN_2x32Uv temp = IVP_UNPKUNX16U_L(IVP_UNPKU2NX8U_L(vecIn)); \ + xb_vecNxf16 temp1 = IVP_CVTF16N_2XF32_0(xb_vecN_2x32Uv_rtor_xb_vecN_2xf32(temp)); \ + vecOut = IVP_SELNXF16I(0, temp1, IVP_SELI_16B_DEINTERLEAVE_1_EVEN); +#define MORPH_OP_VAR_STORE IVP_SAVNXF16_XP +#define MORPH_OP_FLUSH IVP_SAPOSNXF16_FP +#endif + +#elif OUT_DATA_TYPE == FLOAT32BIT +#if ((XCHAL_HAVE_VISION_SP_VFPU == 1) || (XCHAL_HAVE_BBENEP_SP_VFPU == 1)) +#undef MAKE_NAME +#undef MORPH_VECTORIZATIONWIDTH +#undef MORPH_ODT_SCALAR +#undef MORPH_ODT_VECTOR +#undef MORPH_IDT_CAST +#undef MORPH_OP_VAR_STORE +#undef MORPH_OP_FLUSH + +#define MAKE_NAME(name) name ## FromU8ToF32 +#define MORPH_VECTORIZATIONWIDTH XCHAL_IVPN_SIMD_WIDTH / 2 +#define MORPH_ODT_SCALAR float +#define MORPH_ODT_VECTOR xb_vecN_2xf32 +#define MORPH_IDT_CAST(vecIn, vecOut) vecOut = 
xb_vecN_2x32Uv_rtor_xb_vecN_2xf32(IVP_UNPKUNX16U_L(IVP_UNPKU2NX8U_L(vecIn))); +#define MORPH_OP_VAR_STORE IVP_SAVN_2XF32_XP +#define MORPH_OP_FLUSH IVP_SAPOSN_2XF32_FP +#endif +#endif + + +#elif IN_DATA_TYPE == SIGNED8BIT +#undef MORPH_IDT_SCALAR +#undef MORPH_IDT_VECTOR +#undef MORPH_IP_PRIME +#undef MORPH_IP_VAR_LOAD + +#define MORPH_IDT_SCALAR int8_t +#define MORPH_IDT_VECTOR xb_vec2Nx8 +#define MORPH_IP_PRIME IVP_LA2NX8_PP +#define MORPH_IP_VAR_LOAD IVP_LAV2NX8_XP + +#if OUT_DATA_TYPE == UNSIGNED8BIT +#undef MAKE_NAME +#undef MORPH_VECTORIZATIONWIDTH +#undef MORPH_ODT_SCALAR +#undef MORPH_ODT_VECTOR +#undef MORPH_IDT_CAST +#undef MORPH_OP_VAR_STORE +#undef MORPH_OP_FLUSH + +#define MAKE_NAME(name) name ## FromS8ToU8 +#define MORPH_VECTORIZATIONWIDTH 2 * XCHAL_IVPN_SIMD_WIDTH +#define MORPH_ODT_SCALAR uint8_t +#define MORPH_ODT_VECTOR xb_vec2Nx8U +#define MORPH_IDT_CAST(vecIn, vecOut) vecOut = xb_vec2Nx8_rtor_xb_vec2Nx8U(vecIn); +#define MORPH_OP_VAR_STORE IVP_SAV2NX8U_XP +#define MORPH_OP_FLUSH IVP_SAPOS2NX8U_FP + +#elif OUT_DATA_TYPE == UNSIGNED16BIT +#undef MAKE_NAME +#undef MORPH_VECTORIZATIONWIDTH +#undef MORPH_ODT_SCALAR +#undef MORPH_ODT_VECTOR +#undef MORPH_IDT_CAST +#undef MORPH_OP_VAR_STORE +#undef MORPH_OP_FLUSH + +#define MAKE_NAME(name) name ## FromS8ToU16 +#define MORPH_VECTORIZATIONWIDTH XCHAL_IVPN_SIMD_WIDTH +#define MORPH_ODT_SCALAR uint16_t +#define MORPH_ODT_VECTOR xb_vecNx16U +#define MORPH_IDT_CAST(vecIn, vecOut) vecOut = xb_vecNx16_rtor_xb_vecNx16U(IVP_UNPKS2NX8_L(vecIn)); +#define MORPH_OP_VAR_STORE IVP_SAVNX16U_XP +#define MORPH_OP_FLUSH IVP_SAPOSNX16U_FP + +#elif OUT_DATA_TYPE == SIGNED16BIT +#undef MAKE_NAME +#undef MORPH_VECTORIZATIONWIDTH +#undef MORPH_ODT_SCALAR +#undef MORPH_ODT_VECTOR +#undef MORPH_IDT_CAST +#undef MORPH_OP_VAR_STORE +#undef MORPH_OP_FLUSH + +#define MAKE_NAME(name) name ## FromS8ToS16 +#define MORPH_VECTORIZATIONWIDTH XCHAL_IVPN_SIMD_WIDTH +#define MORPH_ODT_SCALAR int16_t +#define MORPH_ODT_VECTOR xb_vecNx16 
+#define MORPH_IDT_CAST(vecIn, vecOut) vecOut = IVP_UNPKS2NX8_L(vecIn); +#define MORPH_OP_VAR_STORE IVP_SAVNX16_XP +#define MORPH_OP_FLUSH IVP_SAPOSNX16_FP + +#elif OUT_DATA_TYPE == UNSIGNED32BIT +#undef MAKE_NAME +#undef MORPH_VECTORIZATIONWIDTH +#undef MORPH_ODT_SCALAR +#undef MORPH_ODT_VECTOR +#undef MORPH_IDT_CAST +#undef MORPH_OP_VAR_STORE +#undef MORPH_OP_FLUSH + +#define MAKE_NAME(name) name ## FromS8ToU32 +#define MORPH_VECTORIZATIONWIDTH XCHAL_IVPN_SIMD_WIDTH / 2 +#define MORPH_ODT_SCALAR uint32_t +#define MORPH_ODT_VECTOR xb_vecN_2x32Uv +#define MORPH_IDT_CAST(vecIn, vecOut) vecOut = xb_vecN_2x32v_rtor_xb_vecN_2x32Uv(IVP_UNPKSNX16_L((IVP_UNPKS2NX8_L(vecIn)))); +#define MORPH_OP_VAR_STORE IVP_SAVN_2X32U_XP +#define MORPH_OP_FLUSH IVP_SAPOSN_2X32U_FP + +#elif OUT_DATA_TYPE == SIGNED32BIT +#undef MAKE_NAME +#undef MORPH_VECTORIZATIONWIDTH +#undef MORPH_ODT_SCALAR +#undef MORPH_ODT_VECTOR +#undef MORPH_IDT_CAST +#undef MORPH_OP_VAR_STORE +#undef MORPH_OP_FLUSH + +#define MAKE_NAME(name) name ## FromS8ToS32 +#define MORPH_VECTORIZATIONWIDTH XCHAL_IVPN_SIMD_WIDTH / 2 +#define MORPH_ODT_SCALAR int32_t +#define MORPH_ODT_VECTOR xb_vecN_2x32v +#define MORPH_IDT_CAST(vecIn, vecOut) vecOut = IVP_UNPKSNX16_L((IVP_UNPKS2NX8_L(vecIn))); +#define MORPH_OP_VAR_STORE IVP_SAVN_2X32_XP +#define MORPH_OP_FLUSH IVP_SAPOSN_2X32_FP + +#elif OUT_DATA_TYPE == UNSIGNED64BIT +#ifdef IVP_LAVN_4X64U_XP +#undef MAKE_NAME +#undef MORPH_VECTORIZATIONWIDTH +#undef MORPH_ODT_SCALAR +#undef MORPH_ODT_VECTOR +#undef MORPH_IDT_CAST +#undef MORPH_OP_VAR_STORE +#undef MORPH_OP_FLUSH + +#define MAKE_NAME(name) name ## FromS8ToU64 +#define MORPH_VECTORIZATIONWIDTH XCHAL_IVPN_SIMD_WIDTH / 4 +#define MORPH_ODT_SCALAR uint64_t +#define MORPH_ODT_VECTOR xb_vecN_4x64U +#define MORPH_IDT_CAST(vecIn, vecOut) vecOut = IVP_MOVN_4X64U_FROMNX16(UNPKSNX8_L(vecIn)); +#define MORPH_OP_VAR_STORE IVP_SAVN_4X64U_XP +#define MORPH_OP_FLUSH IVP_SAPOSN_4X64U_FP +#endif //#ifdef IVP_LAVN_4X64U_XP + +#elif 
OUT_DATA_TYPE == SIGNED64BIT /* int8_t input -> int64_t output; body compiled only when IVP_LAVN_4X64U_XP is defined */ +#ifdef IVP_LAVN_4X64U_XP +#undef MAKE_NAME +#undef MORPH_VECTORIZATIONWIDTH +#undef MORPH_ODT_SCALAR +#undef MORPH_ODT_VECTOR +#undef MORPH_IDT_CAST +#undef MORPH_OP_VAR_STORE +#undef MORPH_OP_FLUSH + +#define MAKE_NAME(name) name ## FromS8ToS64 +#define MORPH_VECTORIZATIONWIDTH XCHAL_IVPN_SIMD_WIDTH / 4 /* NOTE(review): body is an unparenthesized expression; confirm no use site places it beside a multiplication, division or modulo */ +#define MORPH_ODT_SCALAR int64_t +#define MORPH_ODT_VECTOR xb_vecN_4x64 +#define MORPH_IDT_CAST(vecIn, vecOut) vecOut = IVP_MOVN_4X64_FROMNX16(UNPKSNX8_L(vecIn)); /* NOTE(review): UNPKSNX8_L lacks the IVP_ prefix used by every other intrinsic here (the FromS8ToF16 cast below calls IVP_UNPKS2NX8_L); this branch is only compiled when IVP_LAVN_4X64U_XP is defined, so confirm the name resolves on such a target */ +#define MORPH_OP_VAR_STORE IVP_SAVN_4X64_XP +#define MORPH_OP_FLUSH IVP_SAPOSN_4X64_FP +#endif //#ifdef IVP_LAVN_4X64U_XP + +#elif OUT_DATA_TYPE == FLOAT16BIT /* int8_t input -> xb_f16 output; body compiled only when one of the two HP_VFPU macros equals 1 */ +#if ((XCHAL_HAVE_VISION_HP_VFPU == 1) || (XCHAL_HAVE_BBENEP_HP_VFPU == 1)) +#undef MAKE_NAME +#undef MORPH_VECTORIZATIONWIDTH +#undef MORPH_ODT_SCALAR +#undef MORPH_ODT_VECTOR +#undef MORPH_IDT_CAST +#undef MORPH_OP_VAR_STORE +#undef MORPH_OP_FLUSH + +#define MAKE_NAME(name) name ## FromS8ToF16 +#define MORPH_VECTORIZATIONWIDTH XCHAL_IVPN_SIMD_WIDTH / 2 +#define MORPH_ODT_SCALAR xb_f16 +#define MORPH_ODT_VECTOR xb_vecNxf16 +#define MORPH_IDT_CAST(vecIn, vecOut) \ + xb_vecN_2x32v temp = IVP_UNPKSNX16_L(IVP_UNPKS2NX8_L(vecIn)); \ + xb_vecNxf16 temp1 = IVP_CVTF16N_2XF32_0(xb_vecN_2x32v_rtor_xb_vecN_2xf32(temp)); \ + vecOut = IVP_SELNXF16I(0, temp1, IVP_SELI_16B_DEINTERLEAVE_1_EVEN); /* expands to three statements and declares temp and temp1 in the expanding scope, so it can be expanded at most once per scope */ +#define MORPH_OP_VAR_STORE IVP_SAVNXF16_XP +#define MORPH_OP_FLUSH IVP_SAPOSNXF16_FP +#endif + +#elif OUT_DATA_TYPE == FLOAT32BIT /* int8_t input -> float output; body compiled only when one of the two SP_VFPU macros equals 1 */ +#if ((XCHAL_HAVE_VISION_SP_VFPU == 1) || (XCHAL_HAVE_BBENEP_SP_VFPU == 1)) +#undef MAKE_NAME +#undef MORPH_VECTORIZATIONWIDTH +#undef MORPH_ODT_SCALAR +#undef MORPH_ODT_VECTOR +#undef MORPH_IDT_CAST +#undef MORPH_OP_VAR_STORE +#undef MORPH_OP_FLUSH + +#define MAKE_NAME(name) name ## FromS8ToF32 +#define MORPH_VECTORIZATIONWIDTH XCHAL_IVPN_SIMD_WIDTH / 2 +#define MORPH_ODT_SCALAR float +#define MORPH_ODT_VECTOR xb_vecN_2xf32 +#define MORPH_IDT_CAST(vecIn, vecOut) vecOut =
xb_vecN_2x32v_rtor_xb_vecN_2xf32(IVP_UNPKSNX16_L(IVP_UNPKS2NX8_L(vecIn))); +#define MORPH_OP_VAR_STORE IVP_SAVN_2XF32_XP +#define MORPH_OP_FLUSH IVP_SAPOSN_2XF32_FP +#endif +#endif + + +#elif IN_DATA_TYPE == UNSIGNED16BIT +#undef MORPH_IDT_SCALAR +#undef MORPH_IDT_VECTOR +#undef MORPH_IP_PRIME +#undef MORPH_IP_VAR_LOAD + +#define MORPH_IDT_SCALAR uint16_t +#define MORPH_IDT_VECTOR xb_vecNx16U +#define MORPH_IP_PRIME IVP_LANX16U_PP +#define MORPH_IP_VAR_LOAD IVP_LAVNX16U_XP + +#if OUT_DATA_TYPE == UNSIGNED8BIT +#undef MAKE_NAME +#undef MORPH_VECTORIZATIONWIDTH +#undef MORPH_ODT_SCALAR +#undef MORPH_ODT_VECTOR +#undef MORPH_IDT_CAST +#undef MORPH_OP_VAR_STORE +#undef MORPH_OP_FLUSH + +#define MAKE_NAME(name) name ## FromU16ToU8 +#define MORPH_VECTORIZATIONWIDTH XCHAL_IVPN_SIMD_WIDTH +#define MORPH_ODT_SCALAR uint8_t +#define MORPH_ODT_VECTOR xb_vec2Nx8U +#define MORPH_IDT_CAST(vecIn, vecOut) vecOut = IVP_MOV2NX8U_FROMNX16(IVP_SELNX16UI(0, vecIn, IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0)); +#define MORPH_OP_VAR_STORE IVP_SAV2NX8U_XP +#define MORPH_OP_FLUSH IVP_SAPOS2NX8U_FP + +#elif OUT_DATA_TYPE == SIGNED8BIT +#undef MAKE_NAME +#undef MORPH_VECTORIZATIONWIDTH +#undef MORPH_ODT_SCALAR +#undef MORPH_ODT_VECTOR +#undef MORPH_IDT_CAST +#undef MORPH_OP_VAR_STORE +#undef MORPH_OP_FLUSH + +#define MAKE_NAME(name) name ## FromU16ToS8 +#define MORPH_VECTORIZATIONWIDTH XCHAL_IVPN_SIMD_WIDTH +#define MORPH_ODT_SCALAR int8_t +#define MORPH_ODT_VECTOR xb_vec2Nx8 +#define MORPH_IDT_CAST(vecIn, vecOut) vecOut = IVP_MOV2NX8_FROMNX16(IVP_SELNX16UI(0, vecIn, IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0)); +#define MORPH_OP_VAR_STORE IVP_SAV2NX8_XP +#define MORPH_OP_FLUSH IVP_SAPOS2NX8_FP + +#elif OUT_DATA_TYPE == SIGNED16BIT +#undef MAKE_NAME +#undef MORPH_VECTORIZATIONWIDTH +#undef MORPH_ODT_SCALAR +#undef MORPH_ODT_VECTOR +#undef MORPH_IDT_CAST +#undef MORPH_OP_VAR_STORE +#undef MORPH_OP_FLUSH + +#define MAKE_NAME(name) name ## FromU16ToS16 +#define MORPH_VECTORIZATIONWIDTH XCHAL_IVPN_SIMD_WIDTH 
+#define MORPH_ODT_SCALAR int16_t +#define MORPH_ODT_VECTOR xb_vecNx16 +#define MORPH_IDT_CAST(vecIn, vecOut) vecOut = xb_vecNx16U_rtor_xb_vecNx16(vecIn); +#define MORPH_OP_VAR_STORE IVP_SAVNX16_XP +#define MORPH_OP_FLUSH IVP_SAPOSNX16_FP + +#elif OUT_DATA_TYPE == UNSIGNED32BIT +#undef MAKE_NAME +#undef MORPH_VECTORIZATIONWIDTH +#undef MORPH_ODT_SCALAR +#undef MORPH_ODT_VECTOR +#undef MORPH_IDT_CAST +#undef MORPH_OP_VAR_STORE +#undef MORPH_OP_FLUSH + +#define MAKE_NAME(name) name ## FromU16ToU32 +#define MORPH_VECTORIZATIONWIDTH XCHAL_IVPN_SIMD_WIDTH / 2 +#define MORPH_ODT_SCALAR uint32_t +#define MORPH_ODT_VECTOR xb_vecN_2x32Uv +#define MORPH_IDT_CAST(vecIn, vecOut) vecOut = IVP_UNPKUNX16U_L(vecIn); +#define MORPH_OP_VAR_STORE IVP_SAVN_2X32U_XP +#define MORPH_OP_FLUSH IVP_SAPOSN_2X32U_FP + +#elif OUT_DATA_TYPE == SIGNED32BIT +#undef MAKE_NAME +#undef MORPH_VECTORIZATIONWIDTH +#undef MORPH_ODT_SCALAR +#undef MORPH_ODT_VECTOR +#undef MORPH_IDT_CAST +#undef MORPH_OP_VAR_STORE +#undef MORPH_OP_FLUSH + +#define MAKE_NAME(name) name ## FromU16ToS32 +#define MORPH_VECTORIZATIONWIDTH XCHAL_IVPN_SIMD_WIDTH / 2 +#define MORPH_ODT_SCALAR int32_t +#define MORPH_ODT_VECTOR xb_vecN_2x32v +#define MORPH_IDT_CAST(vecIn, vecOut) vecOut = xb_vecN_2x32Uv_rtor_xb_vecN_2x32v(IVP_UNPKUNX16U_L(vecIn)); +#define MORPH_OP_VAR_STORE IVP_SAVN_2X32_XP +#define MORPH_OP_FLUSH IVP_SAPOSN_2X32_FP + +#elif OUT_DATA_TYPE == UNSIGNED64BIT +#ifdef IVP_LAVN_4X64U_XP +#undef MAKE_NAME +#undef MORPH_VECTORIZATIONWIDTH +#undef MORPH_ODT_SCALAR +#undef MORPH_ODT_VECTOR +#undef MORPH_IDT_CAST +#undef MORPH_OP_VAR_STORE +#undef MORPH_OP_FLUSH + +#define MAKE_NAME(name) name ## FromU16ToU64 +#define MORPH_VECTORIZATIONWIDTH XCHAL_IVPN_SIMD_WIDTH / 4 +#define MORPH_ODT_SCALAR uint64_t +#define MORPH_ODT_VECTOR xb_vecN_4x64U +#define MORPH_IDT_CAST(vecIn, vecOut) vecOut = IVP_MOVN_4X64U_FROMNX16(IVP_SELNX16UI(0, IVP_SELNX16UI(0, vecIn, IVP_SELI_16B_INTERLEAVE_1_LO), IVP_SELI_16B_INTERLEAVE_1_LO)); +#define 
MORPH_OP_VAR_STORE IVP_SAVN_4X64U_XP +#define MORPH_OP_FLUSH IVP_SAPOSN_4X64U_FP +#endif //#ifdef IVP_LAVN_4X64U_XP + +#elif OUT_DATA_TYPE == SIGNED64BIT +#ifdef IVP_LAVN_4X64U_XP +#undef MAKE_NAME +#undef MORPH_VECTORIZATIONWIDTH +#undef MORPH_ODT_SCALAR +#undef MORPH_ODT_VECTOR +#undef MORPH_IDT_CAST +#undef MORPH_OP_VAR_STORE +#undef MORPH_OP_FLUSH + +#define MAKE_NAME(name) name ## FromU16ToS64 +#define MORPH_VECTORIZATIONWIDTH XCHAL_IVPN_SIMD_WIDTH / 4 +#define MORPH_ODT_SCALAR int64_t +#define MORPH_ODT_VECTOR xb_vecN_4x64 +#define MORPH_IDT_CAST(vecIn, vecOut) vecOut = IVP_MOVN_4X64_FROMNX16(IVP_SELNX16UI(0, IVP_SELNX16UI(0, vecIn, IVP_SELI_16B_INTERLEAVE_1_LO), IVP_SELI_16B_INTERLEAVE_1_LO)); +#define MORPH_OP_VAR_STORE IVP_SAVN_4X64_XP +#define MORPH_OP_FLUSH IVP_SAPOSN_4X64_FP +#endif //#ifdef IVP_LAVN_4X64U_XP + +#elif OUT_DATA_TYPE == FLOAT16BIT +#if ((XCHAL_HAVE_VISION_HP_VFPU == 1) || (XCHAL_HAVE_BBENEP_HP_VFPU == 1)) +#undef MAKE_NAME +#undef MORPH_VECTORIZATIONWIDTH +#undef MORPH_ODT_SCALAR +#undef MORPH_ODT_VECTOR +#undef MORPH_IDT_CAST +#undef MORPH_OP_VAR_STORE +#undef MORPH_OP_FLUSH + +#define MAKE_NAME(name) name ## FromU16ToF16 +#define MORPH_VECTORIZATIONWIDTH XCHAL_IVPN_SIMD_WIDTH / 2 +#define MORPH_ODT_SCALAR xb_f16 +#define MORPH_ODT_VECTOR xb_vecNxf16 +#define MORPH_IDT_CAST(vecIn, vecOut) \ + xb_vecNxf16 temp = IVP_CVTF16N_2XF32_0(xb_vecN_2x32Uv_rtor_xb_vecN_2xf32(IVP_UNPKUNX16U_L(vecIn))); \ + vecOut = IVP_SELNXF16I(0, temp, IVP_SELI_16B_DEINTERLEAVE_1_EVEN); +#define MORPH_OP_VAR_STORE IVP_SAVNXF16_XP +#define MORPH_OP_FLUSH IVP_SAPOSNXF16_FP +#endif + +#elif OUT_DATA_TYPE == FLOAT32BIT +#if ((XCHAL_HAVE_VISION_SP_VFPU == 1) || (XCHAL_HAVE_BBENEP_SP_VFPU == 1)) +#undef MAKE_NAME +#undef MORPH_VECTORIZATIONWIDTH +#undef MORPH_ODT_SCALAR +#undef MORPH_ODT_VECTOR +#undef MORPH_IDT_CAST +#undef MORPH_OP_VAR_STORE +#undef MORPH_OP_FLUSH + +#define MAKE_NAME(name) name ## FromU16ToF32 +#define MORPH_VECTORIZATIONWIDTH XCHAL_IVPN_SIMD_WIDTH / 
2 +#define MORPH_ODT_SCALAR float +#define MORPH_ODT_VECTOR xb_vecN_2xf32 +#define MORPH_IDT_CAST(vecIn, vecOut) vecOut = xb_vecN_2x32Uv_rtor_xb_vecN_2xf32(IVP_UNPKUNX16U_L(vecIn)); +#define MORPH_OP_VAR_STORE IVP_SAVN_2XF32_XP +#define MORPH_OP_FLUSH IVP_SAPOSN_2XF32_FP +#endif +#endif + + +#elif IN_DATA_TYPE == SIGNED16BIT +#undef MORPH_IDT_SCALAR +#undef MORPH_IDT_VECTOR +#undef MORPH_IP_PRIME +#undef MORPH_IP_VAR_LOAD + +#define MORPH_IDT_SCALAR int16_t +#define MORPH_IDT_VECTOR xb_vecNx16 +#define MORPH_IP_PRIME IVP_LANX16_PP +#define MORPH_IP_VAR_LOAD IVP_LAVNX16_XP + +#if OUT_DATA_TYPE == UNSIGNED8BIT +#undef MAKE_NAME +#undef MORPH_VECTORIZATIONWIDTH +#undef MORPH_ODT_SCALAR +#undef MORPH_ODT_VECTOR +#undef MORPH_IDT_CAST +#undef MORPH_OP_VAR_STORE +#undef MORPH_OP_FLUSH + +#define MAKE_NAME(name) name ## FromS16ToU8 +#define MORPH_VECTORIZATIONWIDTH XCHAL_IVPN_SIMD_WIDTH +#define MORPH_ODT_SCALAR uint8_t +#define MORPH_ODT_VECTOR xb_vec2Nx8U +#define MORPH_IDT_CAST(vecIn, vecOut) vecOut = IVP_MOV2NX8U_FROMNX16(IVP_SELNX16I(0, vecIn, IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0)); +#define MORPH_OP_VAR_STORE IVP_SAV2NX8U_XP +#define MORPH_OP_FLUSH IVP_SAPOS2NX8U_FP + +#elif OUT_DATA_TYPE == SIGNED8BIT +#undef MAKE_NAME +#undef MORPH_VECTORIZATIONWIDTH +#undef MORPH_ODT_SCALAR +#undef MORPH_ODT_VECTOR +#undef MORPH_IDT_CAST +#undef MORPH_OP_VAR_STORE +#undef MORPH_OP_FLUSH + +#define MAKE_NAME(name) name ## FromS16ToS8 +#define MORPH_VECTORIZATIONWIDTH XCHAL_IVPN_SIMD_WIDTH +#define MORPH_ODT_SCALAR int8_t +#define MORPH_ODT_VECTOR xb_vec2Nx8 +#define MORPH_IDT_CAST(vecIn, vecOut) vecOut = IVP_MOV2NX8_FROMNX16(IVP_SELNX16I(0, vecIn, IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0)); +#define MORPH_OP_VAR_STORE IVP_SAV2NX8_XP +#define MORPH_OP_FLUSH IVP_SAPOS2NX8_FP + +#elif OUT_DATA_TYPE == UNSIGNED16BIT +#undef MAKE_NAME +#undef MORPH_VECTORIZATIONWIDTH +#undef MORPH_ODT_SCALAR +#undef MORPH_ODT_VECTOR +#undef MORPH_IDT_CAST +#undef MORPH_OP_VAR_STORE +#undef MORPH_OP_FLUSH + 
+#define MAKE_NAME(name) name ## FromS16ToU16 +#define MORPH_VECTORIZATIONWIDTH XCHAL_IVPN_SIMD_WIDTH +#define MORPH_ODT_SCALAR uint16_t +#define MORPH_ODT_VECTOR xb_vecNx16U +#define MORPH_IDT_CAST(vecIn, vecOut) vecOut = xb_vecNx16_rtor_xb_vecNx16U(vecIn); +#define MORPH_OP_VAR_STORE IVP_SAVNX16U_XP +#define MORPH_OP_FLUSH IVP_SAPOSNX16U_FP + +#elif OUT_DATA_TYPE == UNSIGNED32BIT +#undef MAKE_NAME +#undef MORPH_VECTORIZATIONWIDTH +#undef MORPH_ODT_SCALAR +#undef MORPH_ODT_VECTOR +#undef MORPH_IDT_CAST +#undef MORPH_OP_VAR_STORE +#undef MORPH_OP_FLUSH + +#define MAKE_NAME(name) name ## FromS16ToU32 +#define MORPH_VECTORIZATIONWIDTH XCHAL_IVPN_SIMD_WIDTH / 2 +#define MORPH_ODT_SCALAR uint32_t +#define MORPH_ODT_VECTOR xb_vecN_2x32Uv +#define MORPH_IDT_CAST(vecIn, vecOut) vecOut = xb_vecN_2x32v_rtor_xb_vecN_2x32Uv(IVP_UNPKSNX16_L(vecIn)); +#define MORPH_OP_VAR_STORE IVP_SAVN_2X32U_XP +#define MORPH_OP_FLUSH IVP_SAPOSN_2X32U_FP + +#elif OUT_DATA_TYPE == SIGNED32BIT +#undef MAKE_NAME +#undef MORPH_VECTORIZATIONWIDTH +#undef MORPH_ODT_SCALAR +#undef MORPH_ODT_VECTOR +#undef MORPH_IDT_CAST +#undef MORPH_OP_VAR_STORE +#undef MORPH_OP_FLUSH + +#define MAKE_NAME(name) name ## FromS16ToS32 +#define MORPH_VECTORIZATIONWIDTH XCHAL_IVPN_SIMD_WIDTH / 2 +#define MORPH_ODT_SCALAR int32_t +#define MORPH_ODT_VECTOR xb_vecN_2x32v +#define MORPH_IDT_CAST(vecIn, vecOut) vecOut = IVP_UNPKSNX16_L(vecIn); +#define MORPH_OP_VAR_STORE IVP_SAVN_2X32_XP +#define MORPH_OP_FLUSH IVP_SAPOSN_2X32_FP + +#elif OUT_DATA_TYPE == UNSIGNED64BIT +#ifdef IVP_LAVN_4X64U_XP +#undef MAKE_NAME +#undef MORPH_VECTORIZATIONWIDTH +#undef MORPH_ODT_SCALAR +#undef MORPH_ODT_VECTOR +#undef MORPH_IDT_CAST +#undef MORPH_OP_VAR_STORE +#undef MORPH_OP_FLUSH + +#define MAKE_NAME(name) name ## FromS16ToU64 +#define MORPH_VECTORIZATIONWIDTH XCHAL_IVPN_SIMD_WIDTH / 4 +#define MORPH_ODT_SCALAR uint64_t +#define MORPH_ODT_VECTOR xb_vecN_4x64U +#define MORPH_IDT_CAST(vecIn, vecOut) vecOut = 
IVP_MOVN_4X64U_FROMNX16(UNPKSNX16_L(vecIn)); /* NOTE(review): UNPKSNX16_L lacks the IVP_ prefix, while the FromS16ToF16 cast below calls IVP_UNPKSNX16_L; this FromS16ToU64 branch is only compiled when IVP_LAVN_4X64U_XP is defined, so confirm the name resolves on such a target */ +#define MORPH_OP_VAR_STORE IVP_SAVN_4X64U_XP +#define MORPH_OP_FLUSH IVP_SAPOSN_4X64U_FP +#endif //#ifdef IVP_LAVN_4X64U_XP + +#elif OUT_DATA_TYPE == SIGNED64BIT /* int16_t input -> int64_t output; body compiled only when IVP_LAVN_4X64U_XP is defined */ +#ifdef IVP_LAVN_4X64U_XP +#undef MAKE_NAME +#undef MORPH_VECTORIZATIONWIDTH +#undef MORPH_ODT_SCALAR +#undef MORPH_ODT_VECTOR +#undef MORPH_IDT_CAST +#undef MORPH_OP_VAR_STORE +#undef MORPH_OP_FLUSH + +#define MAKE_NAME(name) name ## FromS16ToS64 +#define MORPH_VECTORIZATIONWIDTH XCHAL_IVPN_SIMD_WIDTH / 4 /* NOTE(review): body is an unparenthesized expression; confirm no use site places it beside a multiplication, division or modulo */ +#define MORPH_ODT_SCALAR int64_t +#define MORPH_ODT_VECTOR xb_vecN_4x64 +#define MORPH_IDT_CAST(vecIn, vecOut) vecOut = IVP_MOVN_4X64_FROMNX16(UNPKSNX16_L(vecIn)); /* NOTE(review): same unprefixed UNPKSNX16_L as the FromS16ToU64 cast above - confirm it resolves when IVP_LAVN_4X64U_XP is defined */ +#define MORPH_OP_VAR_STORE IVP_SAVN_4X64_XP +#define MORPH_OP_FLUSH IVP_SAPOSN_4X64_FP +#endif //#ifdef IVP_LAVN_4X64U_XP + +#elif OUT_DATA_TYPE == FLOAT16BIT /* int16_t input -> xb_f16 output; body compiled only when one of the two HP_VFPU macros equals 1 */ +#if ((XCHAL_HAVE_VISION_HP_VFPU == 1) || (XCHAL_HAVE_BBENEP_HP_VFPU == 1)) +#undef MAKE_NAME +#undef MORPH_VECTORIZATIONWIDTH +#undef MORPH_ODT_SCALAR +#undef MORPH_ODT_VECTOR +#undef MORPH_IDT_CAST +#undef MORPH_OP_VAR_STORE +#undef MORPH_OP_FLUSH + +#define MAKE_NAME(name) name ## FromS16ToF16 +#define MORPH_VECTORIZATIONWIDTH XCHAL_IVPN_SIMD_WIDTH / 2 +#define MORPH_ODT_SCALAR xb_f16 +#define MORPH_ODT_VECTOR xb_vecNxf16 +#define MORPH_IDT_CAST(vecIn, vecOut) \ + xb_vecNxf16 temp = IVP_CVTF16N_2XF32_0(xb_vecN_2x32v_rtor_xb_vecN_2xf32(IVP_UNPKSNX16_L(vecIn))); \ + vecOut = IVP_SELNXF16I(0, temp, IVP_SELI_16B_DEINTERLEAVE_1_EVEN); /* expands to two statements and declares temp in the expanding scope, so it can be expanded at most once per scope */ +#define MORPH_OP_VAR_STORE IVP_SAVNXF16_XP +#define MORPH_OP_FLUSH IVP_SAPOSNXF16_FP +#endif + +#elif OUT_DATA_TYPE == FLOAT32BIT /* int16_t input -> float output; body compiled only when one of the two SP_VFPU macros equals 1 */ +#if ((XCHAL_HAVE_VISION_SP_VFPU == 1) || (XCHAL_HAVE_BBENEP_SP_VFPU == 1)) +#undef MAKE_NAME +#undef MORPH_VECTORIZATIONWIDTH +#undef MORPH_ODT_SCALAR +#undef MORPH_ODT_VECTOR +#undef MORPH_IDT_CAST +#undef MORPH_OP_VAR_STORE +#undef MORPH_OP_FLUSH + +#define MAKE_NAME(name) name ## FromS16ToF32 +#define MORPH_VECTORIZATIONWIDTH XCHAL_IVPN_SIMD_WIDTH / 2 +#define MORPH_ODT_SCALAR
float +#define MORPH_ODT_VECTOR xb_vecN_2xf32 +#define MORPH_IDT_CAST(vecIn, vecOut) vecOut = xb_vecN_2x32v_rtor_xb_vecN_2xf32(IVP_UNPKSNX16_L(vecIn)); +#define MORPH_OP_VAR_STORE IVP_SAVN_2XF32_XP +#define MORPH_OP_FLUSH IVP_SAPOSN_2XF32_FP +#endif +#endif + + +#elif IN_DATA_TYPE == UNSIGNED32BIT +#undef MORPH_IDT_SCALAR +#undef MORPH_IDT_VECTOR +#undef MORPH_IP_PRIME +#undef MORPH_IP_VAR_LOAD + +#define MORPH_IDT_SCALAR uint32_t +#define MORPH_IDT_VECTOR xb_vecN_2x32Uv +#define MORPH_IP_PRIME IVP_LAN_2X32U_PP +#define MORPH_IP_VAR_LOAD IVP_LAVN_2X32U_XP + +#if OUT_DATA_TYPE == UNSIGNED8BIT +#undef MAKE_NAME +#undef MORPH_VECTORIZATIONWIDTH +#undef MORPH_ODT_SCALAR +#undef MORPH_ODT_VECTOR +#undef MORPH_IDT_CAST +#undef MORPH_OP_VAR_STORE +#undef MORPH_OP_FLUSH + +#define MAKE_NAME(name) name ## FromU32ToU8 +#define MORPH_VECTORIZATIONWIDTH XCHAL_IVPN_SIMD_WIDTH / 2 +#define MORPH_ODT_SCALAR uint8_t +#define MORPH_ODT_VECTOR xb_vec2Nx8U +#define MORPH_IDT_CAST(vecIn, vecOut) vecOut = IVP_SEL2NX8UI(0, IVP_SEL2NX8UI(0, IVP_MOV2NX8U_FROMNX16(IVP_MOVNX16_FROMN_2X32U(vecIn)), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0); +#define MORPH_OP_VAR_STORE IVP_SAV2NX8U_XP +#define MORPH_OP_FLUSH IVP_SAPOS2NX8U_FP + +#elif OUT_DATA_TYPE == SIGNED8BIT +#undef MAKE_NAME +#undef MORPH_VECTORIZATIONWIDTH +#undef MORPH_ODT_SCALAR +#undef MORPH_ODT_VECTOR +#undef MORPH_IDT_CAST +#undef MORPH_OP_VAR_STORE +#undef MORPH_OP_FLUSH + +#define MAKE_NAME(name) name ## FromU32ToS8 +#define MORPH_VECTORIZATIONWIDTH XCHAL_IVPN_SIMD_WIDTH / 2 +#define MORPH_ODT_SCALAR int8_t +#define MORPH_ODT_VECTOR xb_vec2Nx8 +#define MORPH_IDT_CAST(vecIn, vecOut) vecOut = IVP_SEL2NX8I(0, IVP_SEL2NX8I(0, IVP_MOV2NX8_FROMNX16(IVP_MOVNX16_FROMN_2X32U(vecIn)), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0); +#define MORPH_OP_VAR_STORE IVP_SAV2NX8_XP +#define MORPH_OP_FLUSH IVP_SAPOS2NX8_FP + +#elif OUT_DATA_TYPE == UNSIGNED16BIT +#undef MAKE_NAME +#undef 
MORPH_VECTORIZATIONWIDTH +#undef MORPH_ODT_SCALAR +#undef MORPH_ODT_VECTOR +#undef MORPH_IDT_CAST +#undef MORPH_OP_VAR_STORE +#undef MORPH_OP_FLUSH + +#define MAKE_NAME(name) name ## FromU32ToU16 +#define MORPH_VECTORIZATIONWIDTH XCHAL_IVPN_SIMD_WIDTH / 2 +#define MORPH_ODT_SCALAR uint16_t +#define MORPH_ODT_VECTOR xb_vecNx16U +#define MORPH_IDT_CAST(vecIn, vecOut) vecOut = IVP_SELNX16UI(0, IVP_MOVNX16_FROMN_2X32U(vecIn), IVP_SELI_16B_DEINTERLEAVE_1_EVEN); +#define MORPH_OP_VAR_STORE IVP_SAVNX16U_XP +#define MORPH_OP_FLUSH IVP_SAPOSNX16U_FP + +#elif OUT_DATA_TYPE == SIGNED16BIT +#undef MAKE_NAME +#undef MORPH_VECTORIZATIONWIDTH +#undef MORPH_ODT_SCALAR +#undef MORPH_ODT_VECTOR +#undef MORPH_IDT_CAST +#undef MORPH_OP_VAR_STORE +#undef MORPH_OP_FLUSH + +#define MAKE_NAME(name) name ## FromU32ToS16 +#define MORPH_VECTORIZATIONWIDTH XCHAL_IVPN_SIMD_WIDTH / 2 +#define MORPH_ODT_SCALAR int16_t +#define MORPH_ODT_VECTOR xb_vecNx16 +#define MORPH_IDT_CAST(vecIn, vecOut) vecOut = IVP_SELNX16I(0, IVP_MOVNX16_FROMN_2X32U(vecIn), IVP_SELI_16B_DEINTERLEAVE_1_EVEN); +#define MORPH_OP_VAR_STORE IVP_SAVNX16_XP +#define MORPH_OP_FLUSH IVP_SAPOSNX16_FP + +#elif OUT_DATA_TYPE == SIGNED32BIT +#undef MAKE_NAME +#undef MORPH_VECTORIZATIONWIDTH +#undef MORPH_ODT_SCALAR +#undef MORPH_ODT_VECTOR +#undef MORPH_IDT_CAST +#undef MORPH_OP_VAR_STORE +#undef MORPH_OP_FLUSH + +#define MAKE_NAME(name) name ## FromU32ToS32 +#define MORPH_VECTORIZATIONWIDTH XCHAL_IVPN_SIMD_WIDTH / 2 +#define MORPH_ODT_SCALAR int32_t +#define MORPH_ODT_VECTOR xb_vecN_2x32v +#define MORPH_IDT_CAST(vecIn, vecOut) vecOut = xb_vecN_2x32Uv_rtor_xb_vecN_2x32v(vecIn); +#define MORPH_OP_VAR_STORE IVP_SAVN_2X32_XP +#define MORPH_OP_FLUSH IVP_SAPOSN_2X32_FP + +#elif OUT_DATA_TYPE == UNSIGNED64BIT +#ifdef IVP_LAVN_4X64U_XP +#undef MAKE_NAME +#undef MORPH_VECTORIZATIONWIDTH +#undef MORPH_ODT_SCALAR +#undef MORPH_ODT_VECTOR +#undef MORPH_IDT_CAST +#undef MORPH_OP_VAR_STORE +#undef MORPH_OP_FLUSH + +#define MAKE_NAME(name) name ## 
FromU32ToU64 +#define MORPH_VECTORIZATIONWIDTH XCHAL_IVPN_SIMD_WIDTH / 4 +#define MORPH_ODT_SCALAR uint64_t +#define MORPH_ODT_VECTOR xb_vecN_4x64U +#define MORPH_IDT_CAST(vecIn, vecOut) vecOut = IVP_MOVN_4X64U_FROMNX16(IVP_MOVNX16_FROMN_2X32U(IVP_SELN_2X32UI(0, vecIn, IVP_SELI_32B_INTERLEAVE_1_LO))); +#define MORPH_OP_VAR_STORE IVP_SAVN_4X64U_XP +#define MORPH_OP_FLUSH IVP_SAPOSN_4X64U_FP +#endif //#ifdef IVP_LAVN_4X64U_XP + +#elif OUT_DATA_TYPE == SIGNED64BIT +#ifdef IVP_LAVN_4X64U_XP +#undef MAKE_NAME +#undef MORPH_VECTORIZATIONWIDTH +#undef MORPH_ODT_SCALAR +#undef MORPH_ODT_VECTOR +#undef MORPH_IDT_CAST +#undef MORPH_OP_VAR_STORE +#undef MORPH_OP_FLUSH + +#define MAKE_NAME(name) name ## FromU32ToS64 +#define MORPH_VECTORIZATIONWIDTH XCHAL_IVPN_SIMD_WIDTH / 4 +#define MORPH_ODT_SCALAR int64_t +#define MORPH_ODT_VECTOR xb_vecN_4x64 +#define MORPH_IDT_CAST(vecIn, vecOut) vecOut = IVP_MOVN_4X64_FROMNX16(IVP_MOVNX16_FROMN_2X32(IVP_SELN_2X32I(0, vecIn, IVP_SELI_32B_INTERLEAVE_1_LO))); +#define MORPH_OP_VAR_STORE IVP_SAVN_4X64_XP +#define MORPH_OP_FLUSH IVP_SAPOSN_4X64_FP +#endif //#ifdef IVP_LAVN_4X64U_XP + +#elif OUT_DATA_TYPE == FLOAT16BIT +#if ((XCHAL_HAVE_VISION_HP_VFPU == 1) || (XCHAL_HAVE_BBENEP_HP_VFPU == 1)) +#undef MAKE_NAME +#undef MORPH_VECTORIZATIONWIDTH +#undef MORPH_ODT_SCALAR +#undef MORPH_ODT_VECTOR +#undef MORPH_IDT_CAST +#undef MORPH_OP_VAR_STORE +#undef MORPH_OP_FLUSH + +#define MAKE_NAME(name) name ## FromU32ToF16 +#define MORPH_VECTORIZATIONWIDTH XCHAL_IVPN_SIMD_WIDTH / 2 +#define MORPH_ODT_SCALAR xb_f16 +#define MORPH_ODT_VECTOR xb_vecNxf16 +#define MORPH_IDT_CAST(vecIn, vecOut) \ + xb_vecNxf16 temp = IVP_CVTF16N_2XF32_0(xb_vecN_2x32Uv_rtor_xb_vecN_2xf32(vecIn)); \ + vecOut = IVP_SELNXF16I(0, temp, IVP_SELI_16B_DEINTERLEAVE_1_EVEN); +#define MORPH_OP_VAR_STORE IVP_SAVNXF16_XP +#define MORPH_OP_FLUSH IVP_SAPOSNXF16_FP +#endif + +#elif OUT_DATA_TYPE == FLOAT32BIT +#if ((XCHAL_HAVE_VISION_SP_VFPU == 1) || (XCHAL_HAVE_BBENEP_SP_VFPU == 1)) +#undef 
MAKE_NAME +#undef MORPH_VECTORIZATIONWIDTH +#undef MORPH_ODT_SCALAR +#undef MORPH_ODT_VECTOR +#undef MORPH_IDT_CAST +#undef MORPH_OP_VAR_STORE +#undef MORPH_OP_FLUSH + +#define MAKE_NAME(name) name ## FromU32ToF32 +#define MORPH_VECTORIZATIONWIDTH XCHAL_IVPN_SIMD_WIDTH / 2 +#define MORPH_ODT_SCALAR float +#define MORPH_ODT_VECTOR xb_vecN_2xf32 +#define MORPH_IDT_CAST(vecIn, vecOut) vecOut = xb_vecN_2x32Uv_rtor_xb_vecN_2xf32(vecIn); +#define MORPH_OP_VAR_STORE IVP_SAVN_2XF32_XP +#define MORPH_OP_FLUSH IVP_SAPOSN_2XF32_FP +#endif +#endif + + +#elif IN_DATA_TYPE == SIGNED32BIT +#undef MORPH_IDT_SCALAR +#undef MORPH_IDT_VECTOR +#undef MORPH_IP_PRIME +#undef MORPH_IP_VAR_LOAD + +#define MORPH_IDT_SCALAR int32_t +#define MORPH_IDT_VECTOR xb_vecN_2x32v +#define MORPH_IP_PRIME IVP_LAN_2X32_PP +#define MORPH_IP_VAR_LOAD IVP_LAVN_2X32_XP + +#if OUT_DATA_TYPE == UNSIGNED8BIT +#undef MAKE_NAME +#undef MORPH_VECTORIZATIONWIDTH +#undef MORPH_ODT_SCALAR +#undef MORPH_ODT_VECTOR +#undef MORPH_IDT_CAST +#undef MORPH_OP_VAR_STORE +#undef MORPH_OP_FLUSH + +#define MAKE_NAME(name) name ## FromS32ToU8 +#define MORPH_VECTORIZATIONWIDTH XCHAL_IVPN_SIMD_WIDTH / 2 +#define MORPH_ODT_SCALAR uint8_t +#define MORPH_ODT_VECTOR xb_vec2Nx8U +#define MORPH_IDT_CAST(vecIn, vecOut) vecOut = IVP_SEL2NX8UI(0, IVP_SEL2NX8UI(0, IVP_MOV2NX8U_FROMNX16(IVP_MOVNX16_FROMN_2X32(vecIn)), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0); +#define MORPH_OP_VAR_STORE IVP_SAV2NX8U_XP +#define MORPH_OP_FLUSH IVP_SAPOS2NX8U_FP + +#elif OUT_DATA_TYPE == SIGNED8BIT +#undef MAKE_NAME +#undef MORPH_VECTORIZATIONWIDTH +#undef MORPH_ODT_SCALAR +#undef MORPH_ODT_VECTOR +#undef MORPH_IDT_CAST +#undef MORPH_OP_VAR_STORE +#undef MORPH_OP_FLUSH + +#define MAKE_NAME(name) name ## FromS32ToS8 +#define MORPH_VECTORIZATIONWIDTH XCHAL_IVPN_SIMD_WIDTH / 2 +#define MORPH_ODT_SCALAR int8_t +#define MORPH_ODT_VECTOR xb_vec2Nx8 +#define MORPH_IDT_CAST(vecIn, vecOut) vecOut = IVP_SEL2NX8I(0, IVP_SEL2NX8I(0, 
IVP_MOV2NX8_FROMNX16(IVP_MOVNX16_FROMN_2X32(vecIn)), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0); +#define MORPH_OP_VAR_STORE IVP_SAV2NX8_XP +#define MORPH_OP_FLUSH IVP_SAPOS2NX8_FP + +#elif OUT_DATA_TYPE == UNSIGNED16BIT +#undef MAKE_NAME +#undef MORPH_VECTORIZATIONWIDTH +#undef MORPH_ODT_SCALAR +#undef MORPH_ODT_VECTOR +#undef MORPH_IDT_CAST +#undef MORPH_OP_VAR_STORE +#undef MORPH_OP_FLUSH + +#define MAKE_NAME(name) name ## FromS32ToU16 +#define MORPH_VECTORIZATIONWIDTH XCHAL_IVPN_SIMD_WIDTH / 2 +#define MORPH_ODT_SCALAR uint16_t +#define MORPH_ODT_VECTOR xb_vecNx16U +#define MORPH_IDT_CAST(vecIn, vecOut) vecOut = IVP_SELNX16UI(0, IVP_MOVNX16_FROMN_2X32(vecIn), IVP_SELI_16B_DEINTERLEAVE_1_EVEN); +#define MORPH_OP_VAR_STORE IVP_SAVNX16U_XP +#define MORPH_OP_FLUSH IVP_SAPOSNX16U_FP + +#elif OUT_DATA_TYPE == SIGNED16BIT +#undef MAKE_NAME +#undef MORPH_VECTORIZATIONWIDTH +#undef MORPH_ODT_SCALAR +#undef MORPH_ODT_VECTOR +#undef MORPH_IDT_CAST +#undef MORPH_OP_VAR_STORE +#undef MORPH_OP_FLUSH + +#define MAKE_NAME(name) name ## FromS32ToS16 +#define MORPH_VECTORIZATIONWIDTH XCHAL_IVPN_SIMD_WIDTH / 2 +#define MORPH_ODT_SCALAR int16_t +#define MORPH_ODT_VECTOR xb_vecNx16 +#define MORPH_IDT_CAST(vecIn, vecOut) vecOut = IVP_SELNX16I(0, IVP_MOVNX16_FROMN_2X32(vecIn), IVP_SELI_16B_DEINTERLEAVE_1_EVEN); +#define MORPH_OP_VAR_STORE IVP_SAVNX16_XP +#define MORPH_OP_FLUSH IVP_SAPOSNX16_FP + +#elif OUT_DATA_TYPE == UNSIGNED32BIT +#undef MAKE_NAME +#undef MORPH_VECTORIZATIONWIDTH +#undef MORPH_ODT_SCALAR +#undef MORPH_ODT_VECTOR +#undef MORPH_IDT_CAST +#undef MORPH_OP_VAR_STORE +#undef MORPH_OP_FLUSH + +#define MAKE_NAME(name) name ## FromS32ToU32 +#define MORPH_VECTORIZATIONWIDTH XCHAL_IVPN_SIMD_WIDTH / 2 +#define MORPH_ODT_SCALAR uint32_t +#define MORPH_ODT_VECTOR xb_vecN_2x32Uv +#define MORPH_IDT_CAST(vecIn, vecOut) vecOut = xb_vecN_2x32v_rtor_xb_vecN_2x32Uv(vecIn); +#define MORPH_OP_VAR_STORE IVP_SAVN_2X32U_XP +#define MORPH_OP_FLUSH 
IVP_SAPOSN_2X32U_FP + +#elif OUT_DATA_TYPE == UNSIGNED64BIT /* int32_t input -> uint64_t output; body compiled only when IVP_LAVN_4X64U_XP is defined */ +#ifdef IVP_LAVN_4X64U_XP +#undef MAKE_NAME +#undef MORPH_VECTORIZATIONWIDTH +#undef MORPH_ODT_SCALAR +#undef MORPH_ODT_VECTOR +#undef MORPH_IDT_CAST +#undef MORPH_OP_VAR_STORE +#undef MORPH_OP_FLUSH + +#define MAKE_NAME(name) name ## FromS32ToU64 +#define MORPH_VECTORIZATIONWIDTH XCHAL_IVPN_SIMD_WIDTH / 4 /* NOTE(review): body is an unparenthesized expression; confirm no use site places it beside a multiplication, division or modulo */ +#define MORPH_ODT_SCALAR uint64_t +#define MORPH_ODT_VECTOR xb_vecN_4x64U +#define MORPH_IDT_CAST(vecIn, vecOut) vecOut = IVP_MOVN_4X64U_FROMNX16(UNPKSNX32_L(vecIn)); /* NOTE(review): UNPKSNX32_L lacks the IVP_ prefix carried by every other intrinsic in this table; this branch is only compiled when IVP_LAVN_4X64U_XP is defined, so confirm the name resolves on such a target */ +#define MORPH_OP_VAR_STORE IVP_SAVN_4X64U_XP +#define MORPH_OP_FLUSH IVP_SAPOSN_4X64U_FP +#endif //#ifdef IVP_LAVN_4X64U_XP + +#elif OUT_DATA_TYPE == SIGNED64BIT /* int32_t input -> int64_t output; body compiled only when IVP_LAVN_4X64U_XP is defined */ +#ifdef IVP_LAVN_4X64U_XP +#undef MAKE_NAME +#undef MORPH_VECTORIZATIONWIDTH +#undef MORPH_ODT_SCALAR +#undef MORPH_ODT_VECTOR +#undef MORPH_IDT_CAST +#undef MORPH_OP_VAR_STORE +#undef MORPH_OP_FLUSH + +#define MAKE_NAME(name) name ## FromS32ToS64 +#define MORPH_VECTORIZATIONWIDTH XCHAL_IVPN_SIMD_WIDTH / 4 +#define MORPH_ODT_SCALAR int64_t +#define MORPH_ODT_VECTOR xb_vecN_4x64 +#define MORPH_IDT_CAST(vecIn, vecOut) vecOut = IVP_MOVN_4X64_FROMNX16(UNPKSNX32_L(vecIn)); /* NOTE(review): same unprefixed UNPKSNX32_L as the FromS32ToU64 cast above - confirm it resolves when IVP_LAVN_4X64U_XP is defined */ +#define MORPH_OP_VAR_STORE IVP_SAVN_4X64_XP +#define MORPH_OP_FLUSH IVP_SAPOSN_4X64_FP +#endif //#ifdef IVP_LAVN_4X64U_XP + +#elif OUT_DATA_TYPE == FLOAT16BIT /* int32_t input -> xb_f16 output; body compiled only when one of the two HP_VFPU macros equals 1 */ +#if ((XCHAL_HAVE_VISION_HP_VFPU == 1) || (XCHAL_HAVE_BBENEP_HP_VFPU == 1)) +#undef MAKE_NAME +#undef MORPH_VECTORIZATIONWIDTH +#undef MORPH_ODT_SCALAR +#undef MORPH_ODT_VECTOR +#undef MORPH_IDT_CAST +#undef MORPH_OP_VAR_STORE +#undef MORPH_OP_FLUSH + +#define MAKE_NAME(name) name ## FromS32ToF16 +#define MORPH_VECTORIZATIONWIDTH XCHAL_IVPN_SIMD_WIDTH / 2 +#define MORPH_ODT_SCALAR xb_f16 +#define MORPH_ODT_VECTOR xb_vecNxf16 +#define MORPH_IDT_CAST(vecIn, vecOut) \ + xb_vecNxf16 temp = IVP_CVTF16N_2XF32_0(xb_vecN_2x32v_rtor_xb_vecN_2xf32(vecIn)); \ + vecOut = IVP_SELNXF16I(0, temp, IVP_SELI_16B_DEINTERLEAVE_1_EVEN); /* expands to two statements and declares temp in the expanding scope, so it can be expanded at most once per scope */ +#define MORPH_OP_VAR_STORE IVP_SAVNXF16_XP
+#define MORPH_OP_FLUSH IVP_SAPOSNXF16_FP +#endif + +#elif OUT_DATA_TYPE == FLOAT32BIT +#if ((XCHAL_HAVE_VISION_SP_VFPU == 1) || (XCHAL_HAVE_BBENEP_SP_VFPU == 1)) +#undef MAKE_NAME +#undef MORPH_VECTORIZATIONWIDTH +#undef MORPH_ODT_SCALAR +#undef MORPH_ODT_VECTOR +#undef MORPH_IDT_CAST +#undef MORPH_OP_VAR_STORE +#undef MORPH_OP_FLUSH + +#define MAKE_NAME(name) name ## FromS32ToF32 +#define MORPH_VECTORIZATIONWIDTH XCHAL_IVPN_SIMD_WIDTH / 2 +#define MORPH_ODT_SCALAR float +#define MORPH_ODT_VECTOR xb_vecN_2xf32 +#define MORPH_IDT_CAST(vecIn, vecOut) vecOut = xb_vecN_2x32v_rtor_xb_vecN_2xf32(vecIn); +#define MORPH_OP_VAR_STORE IVP_SAVN_2XF32_XP +#define MORPH_OP_FLUSH IVP_SAPOSN_2XF32_FP +#endif +#endif + + +#elif IN_DATA_TYPE == UNSIGNED64BIT +#ifdef IVP_LAVN_4X64U_XP +#undef MORPH_IDT_SCALAR +#undef MORPH_IDT_VECTOR +#undef MORPH_IP_PRIME +#undef MORPH_IP_VAR_LOAD + +#define MORPH_IDT_SCALAR uint64_t +#define MORPH_IDT_VECTOR xb_vecN_4x64U +#define MORPH_IP_PRIME IVP_LAN_4X64U_PP +#define MORPH_IP_VAR_LOAD IVP_LAVN_4X64U_XP + +#if OUT_DATA_TYPE == UNSIGNED8BIT +#undef MAKE_NAME +#undef MORPH_VECTORIZATIONWIDTH +#undef MORPH_ODT_SCALAR +#undef MORPH_ODT_VECTOR +#undef MORPH_IDT_CAST +#undef MORPH_OP_VAR_STORE +#undef MORPH_OP_FLUSH + +#define MAKE_NAME(name) name ## FromU64ToU8 +#define MORPH_VECTORIZATIONWIDTH XCHAL_IVPN_SIMD_WIDTH / 4 +#define MORPH_ODT_SCALAR uint8_t +#define MORPH_ODT_VECTOR xb_vec2Nx8U +#define MORPH_IDT_CAST(vecIn, vecOut) vecOut = IVP_SEL2NX8UI(0, IVP_SEL2NX8UI(0, IVP_SEL2NX8UI(0, IVP_MOV2NX8U_FROMNX16(IVP_MOVNX16_FROMN_4X64U(vecIn)), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0); +#define MORPH_OP_VAR_STORE IVP_SAV2NX8U_XP +#define MORPH_OP_FLUSH IVP_SAPOS2NX8U_FP + +#elif OUT_DATA_TYPE == SIGNED8BIT +#undef MAKE_NAME +#undef MORPH_VECTORIZATIONWIDTH +#undef MORPH_ODT_SCALAR +#undef MORPH_ODT_VECTOR +#undef MORPH_IDT_CAST +#undef MORPH_OP_VAR_STORE +#undef MORPH_OP_FLUSH + +#define 
MAKE_NAME(name) name ## FromU64ToS8 +#define MORPH_VECTORIZATIONWIDTH XCHAL_IVPN_SIMD_WIDTH / 4 +#define MORPH_ODT_SCALAR int8_t +#define MORPH_ODT_VECTOR xb_vec2Nx8 +#define MORPH_IDT_CAST(vecIn, vecOut) vecOut = IVP_SEL2NX8I(0, IVP_SEL2NX8I(0, IVP_SEL2NX8I(0, IVP_MOV2NX8_FROMNX16(IVP_MOVNX16_FROMN_4X64U(vecIn)), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0); +#define MORPH_OP_VAR_STORE IVP_SAV2NX8_XP +#define MORPH_OP_FLUSH IVP_SAPOS2NX8_FP + +#elif OUT_DATA_TYPE == UNSIGNED16BIT +#undef MAKE_NAME +#undef MORPH_VECTORIZATIONWIDTH +#undef MORPH_ODT_SCALAR +#undef MORPH_ODT_VECTOR +#undef MORPH_IDT_CAST +#undef MORPH_OP_VAR_STORE +#undef MORPH_OP_FLUSH + +#define MAKE_NAME(name) name ## FromU64ToU16 +#define MORPH_VECTORIZATIONWIDTH XCHAL_IVPN_SIMD_WIDTH / 4 +#define MORPH_ODT_SCALAR uint16_t +#define MORPH_ODT_VECTOR xb_vecNx16U +#define MORPH_IDT_CAST(vecIn, vecOut) vecOut = IVP_SELNX16UI(0, IVP_SELNX16UI(0, IVP_MOVNX16_FROMN_4X64U(vecIn), IVP_SELI_16B_DEINTERLEAVE_1_EVEN), IVP_SELI_16B_DEINTERLEAVE_1_EVEN); +#define MORPH_OP_VAR_STORE IVP_SAVNX16U_XP +#define MORPH_OP_FLUSH IVP_SAPOSNX16U_FP + +#elif OUT_DATA_TYPE == SIGNED16BIT +#undef MAKE_NAME +#undef MORPH_VECTORIZATIONWIDTH +#undef MORPH_ODT_SCALAR +#undef MORPH_ODT_VECTOR +#undef MORPH_IDT_CAST +#undef MORPH_OP_VAR_STORE +#undef MORPH_OP_FLUSH + +#define MAKE_NAME(name) name ## FromU64ToS16 +#define MORPH_VECTORIZATIONWIDTH XCHAL_IVPN_SIMD_WIDTH / 4 +#define MORPH_ODT_SCALAR int16_t +#define MORPH_ODT_VECTOR xb_vecNx16 +#define MORPH_IDT_CAST(vecIn, vecOut) vecOut = IVP_SELNX16I(0, IVP_SELNX16I(0, IVP_MOVNX16_FROMN_4X64U(vecIn), IVP_SELI_16B_DEINTERLEAVE_1_EVEN), IVP_SELI_16B_DEINTERLEAVE_1_EVEN); +#define MORPH_OP_VAR_STORE IVP_SAVNX16_XP +#define MORPH_OP_FLUSH IVP_SAPOSNX16_FP + +#elif OUT_DATA_TYPE == UNSIGNED32BIT +#undef MAKE_NAME +#undef MORPH_VECTORIZATIONWIDTH +#undef MORPH_ODT_SCALAR +#undef MORPH_ODT_VECTOR +#undef MORPH_IDT_CAST +#undef 
MORPH_OP_VAR_STORE +#undef MORPH_OP_FLUSH + +#define MAKE_NAME(name) name ## FromU64ToU32 +#define MORPH_VECTORIZATIONWIDTH XCHAL_IVPN_SIMD_WIDTH / 4 +#define MORPH_ODT_SCALAR uint32_t +#define MORPH_ODT_VECTOR xb_vecN_2x32Uv +#define MORPH_IDT_CAST(vecIn, vecOut) vecOut = IVP_SELN_2X32UI(0, IVP_MOVN_2X32U_FROMNX16(IVP_MOVNX16_FROMN_4X64U(vecIn)), IVP_SELI_32B_DEINTERLEAVE_1_EVEN); +#define MORPH_OP_VAR_STORE IVP_SAVN_2X32U_XP +#define MORPH_OP_FLUSH IVP_SAPOSN_2X32U_FP + +#elif OUT_DATA_TYPE == SIGNED32BIT +#undef MAKE_NAME +#undef MORPH_VECTORIZATIONWIDTH +#undef MORPH_ODT_SCALAR +#undef MORPH_ODT_VECTOR +#undef MORPH_IDT_CAST +#undef MORPH_OP_VAR_STORE +#undef MORPH_OP_FLUSH + +#define MAKE_NAME(name) name ## FromU64ToS32 +#define MORPH_VECTORIZATIONWIDTH XCHAL_IVPN_SIMD_WIDTH / 4 +#define MORPH_ODT_SCALAR int32_t +#define MORPH_ODT_VECTOR xb_vecN_2x32v +#define MORPH_IDT_CAST(vecIn, vecOut) vecOut = IVP_SELN_2X32I(0, IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROMN_4X64U(vecIn)), IVP_SELI_32B_DEINTERLEAVE_1_EVEN); +#define MORPH_OP_VAR_STORE IVP_SAVN_2X32_XP +#define MORPH_OP_FLUSH IVP_SAPOSN_2X32_FP + +#elif OUT_DATA_TYPE == SIGNED64BIT +#undef MAKE_NAME +#undef MORPH_VECTORIZATIONWIDTH +#undef MORPH_ODT_SCALAR +#undef MORPH_ODT_VECTOR +#undef MORPH_IDT_CAST +#undef MORPH_OP_VAR_STORE +#undef MORPH_OP_FLUSH + +#define MAKE_NAME(name) name ## FromU64ToS64 +#define MORPH_VECTORIZATIONWIDTH XCHAL_IVPN_SIMD_WIDTH / 4 +#define MORPH_ODT_SCALAR int64_t +#define MORPH_ODT_VECTOR xb_vecN_4x64 +#define MORPH_IDT_CAST(vecIn, vecOut) vecOut = xb_vecN_4x64U_rtor_xb_vecN_4x64(vecIn); +#define MORPH_OP_VAR_STORE IVP_SAVN_4X64_XP +#define MORPH_OP_FLUSH IVP_SAPOSN_4X64_FP + +#elif OUT_DATA_TYPE == FLOAT16BIT +#if ((XCHAL_HAVE_VISION_HP_VFPU == 1) || (XCHAL_HAVE_BBENEP_HP_VFPU == 1)) +#undef MAKE_NAME +#undef MORPH_VECTORIZATIONWIDTH +#undef MORPH_ODT_SCALAR +#undef MORPH_ODT_VECTOR +#undef MORPH_IDT_CAST +#undef MORPH_OP_VAR_STORE +#undef MORPH_OP_FLUSH + +#define MAKE_NAME(name) 
name ## FromU64ToF16 +#define MORPH_VECTORIZATIONWIDTH XCHAL_IVPN_SIMD_WIDTH / 4 /* NOTE(review): body is an unparenthesized expression; confirm no use site places it beside a multiplication, division or modulo */ +#define MORPH_ODT_SCALAR xb_f16 +#define MORPH_ODT_VECTOR xb_vecNxf16 +#define MORPH_IDT_CAST(vecIn, vecOut) \ + xb_vecNxf16 temp = IVP_CVTF16N_2XF32_0(IVP_SELN_2X32I(0, IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROMN_4X64U(vecIn)), IVP_SELI_32B_DEINTERLEAVE_1_EVEN)); \ + vecOut = IVP_SELNXF16I(0, temp, IVP_SELI_16B_DEINTERLEAVE_1_EVEN); /* NOTE(review): the IVP_SELN_2X32I result is handed to IVP_CVTF16N_2XF32_0 directly, whereas the FromU64ToF32 cast below wraps the same expression in xb_vecN_2x32v_rtor_xb_vecN_2xf32 - confirm the omission is intended; the macro also declares temp in the expanding scope */ +#define MORPH_OP_VAR_STORE IVP_SAVNXF16_XP +#define MORPH_OP_FLUSH IVP_SAPOSNXF16_FP +#endif + +#elif OUT_DATA_TYPE == FLOAT32BIT /* uint64_t input -> float output; body compiled only when one of the two SP_VFPU macros equals 1 */ +#if ((XCHAL_HAVE_VISION_SP_VFPU == 1) || (XCHAL_HAVE_BBENEP_SP_VFPU == 1)) +#undef MAKE_NAME +#undef MORPH_VECTORIZATIONWIDTH +#undef MORPH_ODT_SCALAR +#undef MORPH_ODT_VECTOR +#undef MORPH_IDT_CAST +#undef MORPH_OP_VAR_STORE +#undef MORPH_OP_FLUSH + +#define MAKE_NAME(name) name ## FromU64ToF32 +#define MORPH_VECTORIZATIONWIDTH XCHAL_IVPN_SIMD_WIDTH / 4 +#define MORPH_ODT_SCALAR float +#define MORPH_ODT_VECTOR xb_vecN_2xf32 +#define MORPH_IDT_CAST(vecIn, vecOut) vecOut = xb_vecN_2x32v_rtor_xb_vecN_2xf32(IVP_SELN_2X32I(0, IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROMN_4X64U(vecIn)), IVP_SELI_32B_DEINTERLEAVE_1_EVEN)); +#define MORPH_OP_VAR_STORE IVP_SAVN_2XF32_XP +#define MORPH_OP_FLUSH IVP_SAPOSN_2XF32_FP +#endif +#endif // closes the OUT_DATA_TYPE chain of the IN_DATA_TYPE == UNSIGNED64BIT section +#endif //#ifdef IVP_LAVN_4X64U_XP (opened right after #elif IN_DATA_TYPE == UNSIGNED64BIT) + + +#elif IN_DATA_TYPE == SIGNED64BIT /* int64_t input; the whole section is compiled only when IVP_LAVN_4X64U_XP is defined */ +#ifdef IVP_LAVN_4X64U_XP +#undef MORPH_IDT_SCALAR +#undef MORPH_IDT_VECTOR +#undef MORPH_IP_PRIME +#undef MORPH_IP_VAR_LOAD + +#define MORPH_IDT_SCALAR int64_t +#define MORPH_IDT_VECTOR xb_vecN_4x64 +#define MORPH_IP_PRIME IVP_LAN_4X64_PP +#define MORPH_IP_VAR_LOAD IVP_LAVN_4X64_XP + +#if OUT_DATA_TYPE == UNSIGNED8BIT /* int64_t input -> uint8_t output */ +#undef MAKE_NAME +#undef MORPH_VECTORIZATIONWIDTH +#undef MORPH_ODT_SCALAR +#undef MORPH_ODT_VECTOR +#undef MORPH_IDT_CAST +#undef MORPH_OP_VAR_STORE +#undef MORPH_OP_FLUSH + +#define MAKE_NAME(name) name ## FromS64ToU8 +#define MORPH_VECTORIZATIONWIDTH XCHAL_IVPN_SIMD_WIDTH / 4 +#define MORPH_ODT_SCALAR uint8_t +#define MORPH_ODT_VECTOR xb_vec2Nx8U +#define
MORPH_ODT_VECTOR xb_vec2Nx8U +#define MORPH_IDT_CAST(vecIn, vecOut) vecOut = IVP_SEL2NX8UI(0, IVP_SEL2NX8UI(0, IVP_SEL2NX8UI(0, IVP_MOV2NX8U_FROMNX16(IVP_MOVNX16_FROMN_4X64(vecIn)), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0); +#define MORPH_OP_VAR_STORE IVP_SAV2NX8U_XP +#define MORPH_OP_FLUSH IVP_SAPOS2NX8U_FP + +#elif OUT_DATA_TYPE == SIGNED8BIT +#undef MAKE_NAME +#undef MORPH_VECTORIZATIONWIDTH +#undef MORPH_ODT_SCALAR +#undef MORPH_ODT_VECTOR +#undef MORPH_IDT_CAST +#undef MORPH_OP_VAR_STORE +#undef MORPH_OP_FLUSH + +#define MAKE_NAME(name) name ## FromS64ToS8 +#define MORPH_VECTORIZATIONWIDTH XCHAL_IVPN_SIMD_WIDTH / 4 +#define MORPH_ODT_SCALAR int8_t +#define MORPH_ODT_VECTOR xb_vec2Nx8 +#define MORPH_IDT_CAST(vecIn, vecOut) vecOut = IVP_SEL2NX8I(0, IVP_SEL2NX8I(0, IVP_SEL2NX8I(0, IVP_MOV2NX8_FROMNX16(IVP_MOVNX16_FROMN_4X64(vecIn)), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0); +#define MORPH_OP_VAR_STORE IVP_SAV2NX8_XP +#define MORPH_OP_FLUSH IVP_SAPOS2NX8_FP + +#elif OUT_DATA_TYPE == UNSIGNED16BIT +#undef MAKE_NAME +#undef MORPH_VECTORIZATIONWIDTH +#undef MORPH_ODT_SCALAR +#undef MORPH_ODT_VECTOR +#undef MORPH_IDT_CAST +#undef MORPH_OP_VAR_STORE +#undef MORPH_OP_FLUSH + +#define MAKE_NAME(name) name ## FromS64ToU16 +#define MORPH_VECTORIZATIONWIDTH XCHAL_IVPN_SIMD_WIDTH / 4 +#define MORPH_ODT_SCALAR uint16_t +#define MORPH_ODT_VECTOR xb_vecNx16U +#define MORPH_IDT_CAST(vecIn, vecOut) vecOut = IVP_SELNX16UI(0, IVP_SELNX16UI(0, IVP_MOVNX16_FROMN_4X64(vecIn), IVP_SELI_16B_DEINTERLEAVE_1_EVEN), IVP_SELI_16B_DEINTERLEAVE_1_EVEN); +#define MORPH_OP_VAR_STORE IVP_SAVNX16U_XP +#define MORPH_OP_FLUSH IVP_SAPOSNX16U_FP + +#elif OUT_DATA_TYPE == SIGNED16BIT +#undef MAKE_NAME +#undef MORPH_VECTORIZATIONWIDTH +#undef MORPH_ODT_SCALAR +#undef MORPH_ODT_VECTOR +#undef MORPH_IDT_CAST +#undef MORPH_OP_VAR_STORE +#undef MORPH_OP_FLUSH + +#define 
MAKE_NAME(name) name ## FromS64ToS16 +#define MORPH_VECTORIZATIONWIDTH XCHAL_IVPN_SIMD_WIDTH / 4 +#define MORPH_ODT_SCALAR int16_t +#define MORPH_ODT_VECTOR xb_vecNx16 +#define MORPH_IDT_CAST(vecIn, vecOut) vecOut = IVP_SELNX16I(0, IVP_SELNX16I(0, IVP_MOVNX16_FROMN_4X64(vecIn), IVP_SELI_16B_DEINTERLEAVE_1_EVEN), IVP_SELI_16B_DEINTERLEAVE_1_EVEN); +#define MORPH_OP_VAR_STORE IVP_SAVNX16_XP +#define MORPH_OP_FLUSH IVP_SAPOSNX16_FP + +#elif OUT_DATA_TYPE == UNSIGNED32BIT +#undef MAKE_NAME +#undef MORPH_VECTORIZATIONWIDTH +#undef MORPH_ODT_SCALAR +#undef MORPH_ODT_VECTOR +#undef MORPH_IDT_CAST +#undef MORPH_OP_VAR_STORE +#undef MORPH_OP_FLUSH + +#define MAKE_NAME(name) name ## FromS64ToU32 +#define MORPH_VECTORIZATIONWIDTH XCHAL_IVPN_SIMD_WIDTH / 4 +#define MORPH_ODT_SCALAR uint32_t +#define MORPH_ODT_VECTOR xb_vecN_2x32Uv +#define MORPH_IDT_CAST(vecIn, vecOut) vecOut = IVP_SELN_2X32UI(0, IVP_MOVN_2X32U_FROMNX16(IVP_MOVNX16_FROMN_4X64(vecIn)), IVP_SELI_32B_DEINTERLEAVE_1_EVEN); +#define MORPH_OP_VAR_STORE IVP_SAVN_2X32U_XP +#define MORPH_OP_FLUSH IVP_SAPOSN_2X32U_FP + +#elif OUT_DATA_TYPE == SIGNED32BIT +#undef MAKE_NAME +#undef MORPH_VECTORIZATIONWIDTH +#undef MORPH_ODT_SCALAR +#undef MORPH_ODT_VECTOR +#undef MORPH_IDT_CAST +#undef MORPH_OP_VAR_STORE +#undef MORPH_OP_FLUSH + +#define MAKE_NAME(name) name ## FromS64ToS32 +#define MORPH_VECTORIZATIONWIDTH XCHAL_IVPN_SIMD_WIDTH / 4 +#define MORPH_ODT_SCALAR int32_t +#define MORPH_ODT_VECTOR xb_vecN_2x32v +#define MORPH_IDT_CAST(vecIn, vecOut) vecOut = IVP_SELN_2X32I(0, IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROMN_4X64(vecIn)), IVP_SELI_32B_DEINTERLEAVE_1_EVEN); +#define MORPH_OP_VAR_STORE IVP_SAVN_2X32_XP +#define MORPH_OP_FLUSH IVP_SAPOSN_2X32_FP + +#elif OUT_DATA_TYPE == UNSIGNED64BIT +#undef MAKE_NAME +#undef MORPH_VECTORIZATIONWIDTH +#undef MORPH_ODT_SCALAR +#undef MORPH_ODT_VECTOR +#undef MORPH_IDT_CAST +#undef MORPH_OP_VAR_STORE +#undef MORPH_OP_FLUSH + +#define MAKE_NAME(name) name ## FromS64ToU64 +#define 
MORPH_VECTORIZATIONWIDTH XCHAL_IVPN_SIMD_WIDTH / 4 +#define MORPH_ODT_SCALAR uint64_t +#define MORPH_ODT_VECTOR xb_vecN_4x64U +#define MORPH_IDT_CAST(vecIn, vecOut) vecOut = xb_vecN_4x64_rtor_xb_vecN_4x64U(vecIn); +#define MORPH_OP_VAR_STORE IVP_SAVN_4X64U_XP +#define MORPH_OP_FLUSH IVP_SAPOSN_4X64U_FP + +#elif OUT_DATA_TYPE == FLOAT16BIT +#if ((XCHAL_HAVE_VISION_HP_VFPU == 1) || (XCHAL_HAVE_BBENEP_HP_VFPU == 1)) +#undef MAKE_NAME +#undef MORPH_VECTORIZATIONWIDTH +#undef MORPH_ODT_SCALAR +#undef MORPH_ODT_VECTOR +#undef MORPH_IDT_CAST +#undef MORPH_OP_VAR_STORE +#undef MORPH_OP_FLUSH + +#define MAKE_NAME(name) name ## FromS64ToF16 +#define MORPH_VECTORIZATIONWIDTH XCHAL_IVPN_SIMD_WIDTH / 4 +#define MORPH_ODT_SCALAR xb_f16 +#define MORPH_ODT_VECTOR xb_vecNxf16 +#define MORPH_IDT_CAST(vecIn, vecOut) \ + xb_vecNxf16 temp = IVP_CVTF16N_2XF32_0(IVP_SELN_2X32I(0, IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROMN_4X64(vecIn)), IVP_SELI_32B_DEINTERLEAVE_1_EVEN)); \ + vecOut = IVP_SELNXF16I(0, temp, IVP_SELI_16B_DEINTERLEAVE_1_EVEN); +#define MORPH_OP_VAR_STORE IVP_SAVNXF16_XP +#define MORPH_OP_FLUSH IVP_SAPOSNXF16_FP +#endif + +#elif OUT_DATA_TYPE == FLOAT32BIT +#if ((XCHAL_HAVE_VISION_SP_VFPU == 1) || (XCHAL_HAVE_BBENEP_SP_VFPU == 1)) +#undef MAKE_NAME +#undef MORPH_VECTORIZATIONWIDTH +#undef MORPH_ODT_SCALAR +#undef MORPH_ODT_VECTOR +#undef MORPH_IDT_CAST +#undef MORPH_OP_VAR_STORE +#undef MORPH_OP_FLUSH + +#define MAKE_NAME(name) name ## FromS64ToF32 +#define MORPH_VECTORIZATIONWIDTH XCHAL_IVPN_SIMD_WIDTH / 4 +#define MORPH_ODT_SCALAR float +#define MORPH_ODT_VECTOR xb_vecN_2xf32 +#define MORPH_IDT_CAST(vecIn, vecOut) vecOut = xb_vecN_2x32v_rtor_xb_vecN_2xf32(IVP_SELN_2X32I(0, IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROMN_4X64(vecIn)), IVP_SELI_32B_DEINTERLEAVE_1_EVEN)); +#define MORPH_OP_VAR_STORE IVP_SAVN_2XF32_XP +#define MORPH_OP_FLUSH IVP_SAPOSN_2XF32_FP +#endif +#endif //#ifdef IVP_LAVN_4X64U_XP +#endif + + +#elif IN_DATA_TYPE == FLOAT16BIT +#if ((XCHAL_HAVE_VISION_HP_VFPU == 
1) || (XCHAL_HAVE_BBENEP_HP_VFPU == 1)) +#undef MORPH_IDT_SCALAR +#undef MORPH_IDT_VECTOR +#undef MORPH_IP_PRIME +#undef MORPH_IP_VAR_LOAD + +#define MORPH_IDT_SCALAR xb_f16 +#define MORPH_IDT_VECTOR xb_vecNxf16 +#define MORPH_IP_PRIME IVP_LANXF16_PP +#define MORPH_IP_VAR_LOAD IVP_LAVNXF16_XP + +#if OUT_DATA_TYPE == UNSIGNED8BIT +#undef MAKE_NAME +#undef MORPH_VECTORIZATIONWIDTH +#undef MORPH_ODT_SCALAR +#undef MORPH_ODT_VECTOR +#undef MORPH_IDT_CAST +#undef MORPH_OP_VAR_STORE +#undef MORPH_OP_FLUSH + +#define MAKE_NAME(name) name ## FromF16ToU8 +#define MORPH_VECTORIZATIONWIDTH XCHAL_IVPN_SIMD_WIDTH / 2 +#define MORPH_ODT_SCALAR uint8_t +#define MORPH_ODT_VECTOR xb_vec2Nx8U +#define MORPH_IDT_CAST(vecIn, vecOut) \ + xb_vecN_2xf32 temp = IVP_CVTF32NXF16_0(IVP_SELNXF16I(0, vecIn, IVP_SELI_16B_INTERLEAVE_1_LO)); \ + vecOut = IVP_SEL2NX8UI(0, IVP_SEL2NX8UI(0, IVP_MOV2NX8U_FROMNX16(IVP_MOVNX16_FROMN_2X32(xb_vecN_2xf32_rtor_xb_vecN_2x32v(temp))), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0); +#define MORPH_OP_VAR_STORE IVP_SAV2NX8U_XP +#define MORPH_OP_FLUSH IVP_SAPOS2NX8U_FP + +#elif OUT_DATA_TYPE == SIGNED8BIT +#undef MAKE_NAME +#undef MORPH_VECTORIZATIONWIDTH +#undef MORPH_ODT_SCALAR +#undef MORPH_ODT_VECTOR +#undef MORPH_IDT_CAST +#undef MORPH_OP_VAR_STORE +#undef MORPH_OP_FLUSH + +#define MAKE_NAME(name) name ## FromF16ToS8 +#define MORPH_VECTORIZATIONWIDTH XCHAL_IVPN_SIMD_WIDTH / 2 +#define MORPH_ODT_SCALAR int8_t +#define MORPH_ODT_VECTOR xb_vec2Nx8 +#define MORPH_IDT_CAST(vecIn, vecOut) \ + xb_vecN_2xf32 temp = IVP_CVTF32NXF16_0(IVP_SELNXF16I(0, vecIn, IVP_SELI_16B_INTERLEAVE_1_LO)); \ + vecOut = IVP_SEL2NX8I(0, IVP_SEL2NX8I(0, IVP_MOV2NX8_FROMNX16(IVP_MOVNX16_FROMN_2X32(xb_vecN_2xf32_rtor_xb_vecN_2x32v(temp))), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0); +#define MORPH_OP_VAR_STORE IVP_SAV2NX8_XP +#define MORPH_OP_FLUSH IVP_SAPOS2NX8_FP + +#elif OUT_DATA_TYPE == UNSIGNED16BIT +#undef MAKE_NAME +#undef 
MORPH_VECTORIZATIONWIDTH +#undef MORPH_ODT_SCALAR +#undef MORPH_ODT_VECTOR +#undef MORPH_IDT_CAST +#undef MORPH_OP_VAR_STORE +#undef MORPH_OP_FLUSH + +#define MAKE_NAME(name) name ## FromF16ToU16 +#define MORPH_VECTORIZATIONWIDTH XCHAL_IVPN_SIMD_WIDTH / 2 +#define MORPH_ODT_SCALAR uint16_t +#define MORPH_ODT_VECTOR xb_vecNx16U +#define MORPH_IDT_CAST(vecIn, vecOut) \ + xb_vecN_2xf32 temp = IVP_CVTF32NXF16_0(IVP_SELNXF16I(0, vecIn, IVP_SELI_16B_INTERLEAVE_1_LO)); \ + vecOut = IVP_SELNX16UI(0, IVP_MOVNX16_FROMN_2X32(xb_vecN_2xf32_rtor_xb_vecN_2x32v(temp)), IVP_SELI_16B_DEINTERLEAVE_1_EVEN); +#define MORPH_OP_VAR_STORE IVP_SAVNX16U_XP +#define MORPH_OP_FLUSH IVP_SAPOSNX16U_FP + +#elif OUT_DATA_TYPE == SIGNED16BIT +#undef MAKE_NAME +#undef MORPH_VECTORIZATIONWIDTH +#undef MORPH_ODT_SCALAR +#undef MORPH_ODT_VECTOR +#undef MORPH_IDT_CAST +#undef MORPH_OP_VAR_STORE +#undef MORPH_OP_FLUSH + +#define MAKE_NAME(name) name ## FromF16ToS16 +#define MORPH_VECTORIZATIONWIDTH XCHAL_IVPN_SIMD_WIDTH / 2 +#define MORPH_ODT_SCALAR int16_t +#define MORPH_ODT_VECTOR xb_vecNx16 +#define MORPH_IDT_CAST(vecIn, vecOut) \ + xb_vecN_2xf32 temp = IVP_CVTF32NXF16_0(IVP_SELNXF16I(0, vecIn, IVP_SELI_16B_INTERLEAVE_1_LO)); \ + vecOut = IVP_SELNX16I(0, IVP_MOVNX16_FROMN_2X32(xb_vecN_2xf32_rtor_xb_vecN_2x32v(temp)), IVP_SELI_16B_DEINTERLEAVE_1_EVEN); +#define MORPH_OP_VAR_STORE IVP_SAVNX16_XP +#define MORPH_OP_FLUSH IVP_SAPOSNX16_FP + +#elif OUT_DATA_TYPE == UNSIGNED32BIT +#undef MAKE_NAME +#undef MORPH_VECTORIZATIONWIDTH +#undef MORPH_ODT_SCALAR +#undef MORPH_ODT_VECTOR +#undef MORPH_IDT_CAST +#undef MORPH_OP_VAR_STORE +#undef MORPH_OP_FLUSH + +#define MAKE_NAME(name) name ## FromF16ToU32 +#define MORPH_VECTORIZATIONWIDTH XCHAL_IVPN_SIMD_WIDTH / 2 +#define MORPH_ODT_SCALAR uint32_t +#define MORPH_ODT_VECTOR xb_vecN_2x32Uv +#define MORPH_IDT_CAST(vecIn, vecOut) vecOut = xb_vecN_2xf32_rtor_xb_vecN_2x32Uv(IVP_CVTF32NXF16_0(IVP_SELNXF16I(0, vecIn, IVP_SELI_16B_INTERLEAVE_1_LO))); +#define 
MORPH_OP_VAR_STORE IVP_SAVN_2X32U_XP +#define MORPH_OP_FLUSH IVP_SAPOSN_2X32U_FP + +#elif OUT_DATA_TYPE == SIGNED32BIT +#undef MAKE_NAME +#undef MORPH_VECTORIZATIONWIDTH +#undef MORPH_ODT_SCALAR +#undef MORPH_ODT_VECTOR +#undef MORPH_IDT_CAST +#undef MORPH_OP_VAR_STORE +#undef MORPH_OP_FLUSH + +#define MAKE_NAME(name) name ## FromF16ToS32 +#define MORPH_VECTORIZATIONWIDTH XCHAL_IVPN_SIMD_WIDTH / 2 +#define MORPH_ODT_SCALAR int32_t +#define MORPH_ODT_VECTOR xb_vecN_2x32v +#define MORPH_IDT_CAST(vecIn, vecOut) vecOut = xb_vecN_2xf32_rtor_xb_vecN_2x32v(IVP_CVTF32NXF16_0(IVP_SELNXF16I(0, vecIn, IVP_SELI_16B_INTERLEAVE_1_LO))); +#define MORPH_OP_VAR_STORE IVP_SAVN_2X32_XP +#define MORPH_OP_FLUSH IVP_SAPOSN_2X32_FP + +#elif OUT_DATA_TYPE == UNSIGNED64BIT +#ifdef IVP_LAVN_4X64U_XP +#undef MAKE_NAME +#undef MORPH_VECTORIZATIONWIDTH +#undef MORPH_ODT_SCALAR +#undef MORPH_ODT_VECTOR +#undef MORPH_IDT_CAST +#undef MORPH_OP_VAR_STORE +#undef MORPH_OP_FLUSH + +#define MAKE_NAME(name) name ## FromF16ToU64 +#define MORPH_VECTORIZATIONWIDTH XCHAL_IVPN_SIMD_WIDTH / 4 +#define MORPH_ODT_SCALAR uint64_t +#define MORPH_ODT_VECTOR xb_vecN_4x64U +#define MORPH_IDT_CAST(vecIn, vecOut) \ + xb_vecN_2x32v vecInp = xb_vecN_2xf32_rtor_xb_vecN_2x32v(IVP_CVTF32NXF16_0(IVP_SELNXF16I(0, vecIn, IVP_SELI_16B_INTERLEAVE_1_LO))); \ + xb_vecN_2x32v sign = IVP_SRAIN_2X32(vecInp, 31); \ + vecOut = IVP_MOVN_4X64U_FROMNX16(IVP_MOVNX16_FROMN_2X32(IVP_SELN_2X32UI(sign, vecInp, IVP_SELI_32B_INTERLEAVE_1_LO))); +#define MORPH_OP_VAR_STORE IVP_SAVN_4X64U_XP +#define MORPH_OP_FLUSH IVP_SAPOSN_4X64U_FP +#endif //#ifdef IVP_LAVN_4X64U_XP + +#elif OUT_DATA_TYPE == SIGNED64BIT +#ifdef IVP_LAVN_4X64U_XP +#undef MAKE_NAME +#undef MORPH_VECTORIZATIONWIDTH +#undef MORPH_ODT_SCALAR +#undef MORPH_ODT_VECTOR +#undef MORPH_IDT_CAST +#undef MORPH_OP_VAR_STORE +#undef MORPH_OP_FLUSH + +#define MAKE_NAME(name) name ## FromF16ToS64 +#define MORPH_VECTORIZATIONWIDTH XCHAL_IVPN_SIMD_WIDTH / 4 +#define MORPH_ODT_SCALAR int64_t 
+#define MORPH_ODT_VECTOR xb_vecN_4x64 +#define MORPH_IDT_CAST(vecIn, vecOut) \ + xb_vecN_2x32v vecInp = xb_vecN_2xf32_rtor_xb_vecN_2x32v(IVP_CVTF32NXF16_0(IVP_SELNXF16I(0, vecIn, IVP_SELI_16B_INTERLEAVE_1_LO))); \ + xb_vecN_2x32v sign = IVP_SRAIN_2X32(vecInp, 31); \ + vecOut = IVP_MOVN_4X64_FROMNX16(IVP_MOVNX16_FROMN_2X32(IVP_SELN_2X32UI(sign, vecInp, IVP_SELI_32B_INTERLEAVE_1_LO))); +#define MORPH_OP_VAR_STORE IVP_SAVN_4X64_XP +#define MORPH_OP_FLUSH IVP_SAPOSN_4X64_FP +#endif //#ifdef IVP_LAVN_4X64U_XP + +#elif OUT_DATA_TYPE == FLOAT32BIT +#if ((XCHAL_HAVE_VISION_SP_VFPU == 1) || (XCHAL_HAVE_BBENEP_SP_VFPU == 1)) +#undef MAKE_NAME +#undef MORPH_VECTORIZATIONWIDTH +#undef MORPH_ODT_SCALAR +#undef MORPH_ODT_VECTOR +#undef MORPH_IDT_CAST +#undef MORPH_OP_VAR_STORE +#undef MORPH_OP_FLUSH + +#define MAKE_NAME(name) name ## FromF16ToF32 +#define MORPH_VECTORIZATIONWIDTH XCHAL_IVPN_SIMD_WIDTH / 2 +#define MORPH_ODT_SCALAR float +#define MORPH_ODT_VECTOR xb_vecN_2xf32 +#define MORPH_IDT_CAST(vecIn, vecOut) vecOut = IVP_CVTF32NXF16_0(IVP_SELNXF16I(0, vecIn, IVP_SELI_16B_INTERLEAVE_1_LO)); +#define MORPH_OP_VAR_STORE IVP_SAVN_2XF32_XP +#define MORPH_OP_FLUSH IVP_SAPOSN_2XF32_FP +#endif +#endif +#endif + + +#elif IN_DATA_TYPE == FLOAT32BIT +#if ((XCHAL_HAVE_VISION_SP_VFPU == 1) || (XCHAL_HAVE_BBENEP_SP_VFPU == 1)) +#undef MORPH_IDT_SCALAR +#undef MORPH_IDT_VECTOR +#undef MORPH_IP_PRIME +#undef MORPH_IP_VAR_LOAD + +#define MORPH_IDT_SCALAR float +#define MORPH_IDT_VECTOR xb_vecN_2xf32 +#define MORPH_IP_PRIME IVP_LAN_2XF32_PP +#define MORPH_IP_VAR_LOAD IVP_LAVN_2XF32_XP + +#if OUT_DATA_TYPE == UNSIGNED8BIT +#undef MAKE_NAME +#undef MORPH_VECTORIZATIONWIDTH +#undef MORPH_ODT_SCALAR +#undef MORPH_ODT_VECTOR +#undef MORPH_IDT_CAST +#undef MORPH_OP_VAR_STORE +#undef MORPH_OP_FLUSH + +#define MAKE_NAME(name) name ## FromF32ToU8 +#define MORPH_VECTORIZATIONWIDTH XCHAL_IVPN_SIMD_WIDTH / 2 +#define MORPH_ODT_SCALAR uint8_t +#define MORPH_ODT_VECTOR xb_vec2Nx8U +#define 
MORPH_IDT_CAST(vecIn, vecOut) vecOut = IVP_SEL2NX8UI(0, IVP_SEL2NX8UI(0, IVP_MOV2NX8U_FROMNX16(IVP_MOVNX16_FROMN_2X32(xb_vecN_2xf32_rtor_xb_vecN_2x32v(vecIn))), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0); +#define MORPH_OP_VAR_STORE IVP_SAV2NX8U_XP +#define MORPH_OP_FLUSH IVP_SAPOS2NX8U_FP + +#elif OUT_DATA_TYPE == SIGNED8BIT +#undef MAKE_NAME +#undef MORPH_VECTORIZATIONWIDTH +#undef MORPH_ODT_SCALAR +#undef MORPH_ODT_VECTOR +#undef MORPH_IDT_CAST +#undef MORPH_OP_VAR_STORE +#undef MORPH_OP_FLUSH + +#define MAKE_NAME(name) name ## FromF32ToS8 +#define MORPH_VECTORIZATIONWIDTH XCHAL_IVPN_SIMD_WIDTH / 2 +#define MORPH_ODT_SCALAR int8_t +#define MORPH_ODT_VECTOR xb_vec2Nx8 +#define MORPH_IDT_CAST(vecIn, vecOut) vecOut = IVP_SEL2NX8I(0, IVP_SEL2NX8I(0, IVP_MOV2NX8_FROMNX16(IVP_MOVNX16_FROMN_2X32(xb_vecN_2xf32_rtor_xb_vecN_2x32v(vecIn))), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0); +#define MORPH_OP_VAR_STORE IVP_SAV2NX8_XP +#define MORPH_OP_FLUSH IVP_SAPOS2NX8_FP + +#elif OUT_DATA_TYPE == UNSIGNED16BIT +#undef MAKE_NAME +#undef MORPH_VECTORIZATIONWIDTH +#undef MORPH_ODT_SCALAR +#undef MORPH_ODT_VECTOR +#undef MORPH_IDT_CAST +#undef MORPH_OP_VAR_STORE +#undef MORPH_OP_FLUSH + +#define MAKE_NAME(name) name ## FromF32ToU16 +#define MORPH_VECTORIZATIONWIDTH XCHAL_IVPN_SIMD_WIDTH / 2 +#define MORPH_ODT_SCALAR uint16_t +#define MORPH_ODT_VECTOR xb_vecNx16U +#define MORPH_IDT_CAST(vecIn, vecOut) vecOut = IVP_SELNX16UI(0, IVP_MOVNX16_FROMN_2X32(xb_vecN_2xf32_rtor_xb_vecN_2x32v(vecIn)), IVP_SELI_16B_DEINTERLEAVE_1_EVEN); +#define MORPH_OP_VAR_STORE IVP_SAVNX16U_XP +#define MORPH_OP_FLUSH IVP_SAPOSNX16U_FP + +#elif OUT_DATA_TYPE == SIGNED16BIT +#undef MAKE_NAME +#undef MORPH_VECTORIZATIONWIDTH +#undef MORPH_ODT_SCALAR +#undef MORPH_ODT_VECTOR +#undef MORPH_IDT_CAST +#undef MORPH_OP_VAR_STORE +#undef MORPH_OP_FLUSH + +#define MAKE_NAME(name) name ## FromF32ToS16 +#define MORPH_VECTORIZATIONWIDTH XCHAL_IVPN_SIMD_WIDTH / 2 
+#define MORPH_ODT_SCALAR int16_t +#define MORPH_ODT_VECTOR xb_vecNx16 +#define MORPH_IDT_CAST(vecIn, vecOut) vecOut = IVP_SELNX16I(0, IVP_MOVNX16_FROMN_2X32(xb_vecN_2xf32_rtor_xb_vecN_2x32v(vecIn)), IVP_SELI_16B_DEINTERLEAVE_1_EVEN); +#define MORPH_OP_VAR_STORE IVP_SAVNX16_XP +#define MORPH_OP_FLUSH IVP_SAPOSNX16_FP + +#elif OUT_DATA_TYPE == UNSIGNED32BIT +#undef MAKE_NAME +#undef MORPH_VECTORIZATIONWIDTH +#undef MORPH_ODT_SCALAR +#undef MORPH_ODT_VECTOR +#undef MORPH_IDT_CAST +#undef MORPH_OP_VAR_STORE +#undef MORPH_OP_FLUSH + +#define MAKE_NAME(name) name ## FromF32ToU32 +#define MORPH_VECTORIZATIONWIDTH XCHAL_IVPN_SIMD_WIDTH / 2 +#define MORPH_ODT_SCALAR uint32_t +#define MORPH_ODT_VECTOR xb_vecN_2x32Uv +#define MORPH_IDT_CAST(vecIn, vecOut) vecOut = xb_vecN_2xf32_rtor_xb_vecN_2x32Uv(vecIn); +#define MORPH_OP_VAR_STORE IVP_SAVN_2X32U_XP +#define MORPH_OP_FLUSH IVP_SAPOSN_2X32U_FP + +#elif OUT_DATA_TYPE == SIGNED32BIT +#undef MAKE_NAME +#undef MORPH_VECTORIZATIONWIDTH +#undef MORPH_ODT_SCALAR +#undef MORPH_ODT_VECTOR +#undef MORPH_IDT_CAST +#undef MORPH_OP_VAR_STORE +#undef MORPH_OP_FLUSH + +#define MAKE_NAME(name) name ## FromF32ToS32 +#define MORPH_VECTORIZATIONWIDTH XCHAL_IVPN_SIMD_WIDTH / 2 +#define MORPH_ODT_SCALAR int32_t +#define MORPH_ODT_VECTOR xb_vecN_2x32v +#define MORPH_IDT_CAST(vecIn, vecOut) vecOut = xb_vecN_2xf32_rtor_xb_vecN_2x32v(vecIn); +#define MORPH_OP_VAR_STORE IVP_SAVN_2X32_XP +#define MORPH_OP_FLUSH IVP_SAPOSN_2X32_FP + +#elif OUT_DATA_TYPE == UNSIGNED64BIT +#ifdef IVP_LAVN_4X64U_XP +#undef MAKE_NAME +#undef MORPH_VECTORIZATIONWIDTH +#undef MORPH_ODT_SCALAR +#undef MORPH_ODT_VECTOR +#undef MORPH_IDT_CAST +#undef MORPH_OP_VAR_STORE +#undef MORPH_OP_FLUSH + +#define MAKE_NAME(name) name ## FromF32ToU64 +#define MORPH_VECTORIZATIONWIDTH XCHAL_IVPN_SIMD_WIDTH / 4 +#define MORPH_ODT_SCALAR uint64_t +#define MORPH_ODT_VECTOR xb_vecN_4x64U +#define MORPH_IDT_CAST(vecIn, vecOut) \ + xb_vecN_2x32v vecInp = xb_vecN_2xf32_rtor_xb_vecN_2x32Uv(vecIn); 
\ + xb_vecN_2x32v sign = IVP_SRAIN_2X32(vecInp, 31); \ + vecOut = IVP_MOVN_4X64U_FROMNX16(IVP_MOVNX16_FROMN_2X32U(IVP_SELN_2X32UI(sign, vecInp, IVP_SELI_32B_INTERLEAVE_1_LO))); +#define MORPH_OP_VAR_STORE IVP_SAVN_4X64U_XP +#define MORPH_OP_FLUSH IVP_SAPOSN_4X64U_FP +#endif //#ifdef IVP_LAVN_4X64U_XP + +#elif OUT_DATA_TYPE == SIGNED64BIT +#ifdef IVP_LAVN_4X64U_XP +#undef MAKE_NAME +#undef MORPH_VECTORIZATIONWIDTH +#undef MORPH_ODT_SCALAR +#undef MORPH_ODT_VECTOR +#undef MORPH_IDT_CAST +#undef MORPH_OP_VAR_STORE +#undef MORPH_OP_FLUSH + +#define MAKE_NAME(name) name ## FromF32ToS64 +#define MORPH_VECTORIZATIONWIDTH XCHAL_IVPN_SIMD_WIDTH / 4 +#define MORPH_ODT_SCALAR int64_t +#define MORPH_ODT_VECTOR xb_vecN_4x64 +#define MORPH_IDT_CAST(vecIn, vecOut) \ + xb_vecN_2x32v vecInp = xb_vecN_2xf32_rtor_xb_vecN_2x32v(vecIn); \ + xb_vecN_2x32v sign = IVP_SRAIN_2X32(vecInp, 31); \ + vecOut = IVP_MOVN_4X64_FROMNX16(IVP_MOVNX16_FROMN_2X32(IVP_SELN_2X32UI(sign, vecInp, IVP_SELI_32B_INTERLEAVE_1_LO))); +#define MORPH_OP_VAR_STORE IVP_SAVN_4X64_XP +#define MORPH_OP_FLUSH IVP_SAPOSN_4X64_FP +#endif //#ifdef IVP_LAVN_4X64U_XP + +#elif OUT_DATA_TYPE == FLOAT16BIT +#if ((XCHAL_HAVE_VISION_HP_VFPU == 1) || (XCHAL_HAVE_BBENEP_HP_VFPU == 1)) +#undef MAKE_NAME +#undef MORPH_VECTORIZATIONWIDTH +#undef MORPH_ODT_SCALAR +#undef MORPH_ODT_VECTOR +#undef MORPH_IDT_CAST +#undef MORPH_OP_VAR_STORE +#undef MORPH_OP_FLUSH + +#define MAKE_NAME(name) name ## FromF32ToF16 +#define MORPH_VECTORIZATIONWIDTH XCHAL_IVPN_SIMD_WIDTH / 2 +#define MORPH_ODT_SCALAR xb_f16 +#define MORPH_ODT_VECTOR xb_vecNxf16 +#define MORPH_IDT_CAST(vecIn, vecOut) \ + xb_vecNxf16 temp = IVP_CVTF16N_2XF32_0(vecIn); \ + vecOut = IVP_SELNXF16I(0, temp, IVP_SELI_16B_DEINTERLEAVE_1_EVEN); +#define MORPH_OP_VAR_STORE IVP_SAVNXF16_XP +#define MORPH_OP_FLUSH IVP_SAPOSNXF16_FP +#endif +#endif +#endif +#endif + +/**************************** xaiCast3D *****************************************/ +/* Description : Data casting 
implementation for input and output data Type */
+/* of S8, U8, S16, U16, S32, U32, U64, S64, F16 and F32 */
+/* Inputs : inTile */
+/* Outputs : void */
+/* InOuts : outTile */
+/********************************************************************************/
+
+void MAKE_NAME (xaiCast3D)(const xai_pTile3D inTile, xai_pTile3D outTile) /* MAKE_NAME appends the From<In>To<Out> suffix chosen by the macro table above */
+{
+  /* Get Tile Parameters: extents are read from inTile only; outTile contributes just its pitches */
+  const int32_t dim1Size = XAI_TILE3D_GET_DIM1(inTile);
+  const int32_t dim2Size = XAI_TILE3D_GET_DIM2(inTile);
+  const int32_t dim3Size = XAI_TILE3D_GET_DIM3(inTile);
+  const int32_t inPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile);
+  const int32_t inPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile);
+  const int32_t outPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+
+  MORPH_IDT_SCALAR* pInput = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  MORPH_ODT_SCALAR* pOutput = (MORPH_ODT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(outTile);
+
+  MORPH_IDT_VECTOR *__restrict pdvecIn;
+  MORPH_ODT_VECTOR *__restrict pdvecOut;
+
+  valign vaOutData = IVP_ZALIGN(); /* output alignment state, passed to every MORPH_OP_VAR_STORE / MORPH_OP_FLUSH below */
+  MORPH_IDT_VECTOR vecInData;
+  MORPH_ODT_VECTOR vecOutData;
+
+  if ((inPitch2 == (dim1Size * dim2Size)) && (outPitch2 == inPitch2)) /* both dim2 pitches equal dim1Size * dim2Size: process the tile as one flat run of elements */
+  {
+    int dimsCount = dim1Size * dim2Size * dim3Size; /* total number of elements to convert */
+    int x;
+    pdvecIn = (MORPH_IDT_VECTOR *) pInput;
+    pdvecOut = (MORPH_ODT_VECTOR *) pOutput;
+    valign vaInData = MORPH_IP_PRIME(pdvecIn); /* input alignment state is primed once for the whole run */
+
+    for (x = 0; x < dimsCount; x += MORPH_VECTORIZATIONWIDTH)
+    {
+      int remLen = MIN2(dimsCount - x, MORPH_VECTORIZATIONWIDTH); /* elements handled in this pass, capped at MORPH_VECTORIZATIONWIDTH */
+      MORPH_IP_VAR_LOAD(vecInData, vaInData, pdvecIn, sizeof(MORPH_IDT_SCALAR) * remLen); /* last argument is a byte count of input elements */
+      MORPH_IDT_CAST(vecInData, vecOutData);
+      MORPH_OP_VAR_STORE(vecOutData, vaOutData, pdvecOut, sizeof(MORPH_ODT_SCALAR) * remLen); /* last argument is a byte count of output elements */
+    }
+    MORPH_OP_FLUSH(vaOutData, pdvecOut); /* flushed once, after the flat run */
+  }
+  else
+  {
+    int x, y, z;
+    for (z = 0; z < dim3Size; z++)
+    {
+      for (y = 0; y < dim2Size; y++)
+      {
+        MORPH_IDT_SCALAR* pIn = pInput + z * inPitch2 + y * inPitch1; /* start of input row (y, z), addressed through the input pitches */
+        MORPH_ODT_SCALAR* pOut = pOutput + z * outPitch2 + y * outPitch1; /* start of output row (y, z), addressed through the output pitches */
+
+        pdvecIn = (MORPH_IDT_VECTOR *) pIn;
+        pdvecOut = (MORPH_ODT_VECTOR *) pOut;
+        valign vaInData = MORPH_IP_PRIME(pdvecIn); /* input alignment state is re-primed for every row */
+
+        for (x = 0; x < dim1Size; x += MORPH_VECTORIZATIONWIDTH)
+        {
+          int remLen = MIN2(dim1Size - x, MORPH_VECTORIZATIONWIDTH); /* elements handled in this pass of the row */
+          MORPH_IP_VAR_LOAD(vecInData, vaInData, pdvecIn, sizeof(MORPH_IDT_SCALAR) * remLen);
+          MORPH_IDT_CAST(vecInData, vecOutData);
+          MORPH_OP_VAR_STORE(vecOutData, vaOutData, pdvecOut, sizeof(MORPH_ODT_SCALAR) * remLen);
+        }
+        MORPH_OP_FLUSH(vaOutData, pdvecOut); /* flushed at the end of every row, before pdvecOut is re-pointed */
+      } // end of for(y = 0; y < dim2Size; y++)
+    } // end of for(z = 0; z < dim3Size; z++)
+  } // end of if((inPitch2 == (dim1Size * dim2Size)) && (outPitch2 == inPitch2))
+
+  return;
+}
+
diff --git a/backends/cadence/vision/third-party/libxai_common/src/cnn_cast_scalar.h b/backends/cadence/vision/third-party/libxai_common/src/cnn_cast_scalar.h
new file mode 100644
index 00000000000..39ed7230bc8
--- /dev/null
+++ b/backends/cadence/vision/third-party/libxai_common/src/cnn_cast_scalar.h
@@ -0,0 +1,308 @@
+/*
+ * Copyright (c) 2025 by Cadence Design Systems, Inc. ALL RIGHTS RESERVED.
+ * These coded instructions, statements, and computer programs are the
+ * copyrighted works and confidential proprietary information of
+ * Cadence Design Systems Inc. They may be adapted and modified by bona fide
+ * purchasers for internal use, but neither the original nor any adapted
+ * or modified version may be disclosed or distributed to third parties
+ * in any manner, medium, or form, in whole or in part, without the prior
+ * written consent of Cadence Design Systems Inc. This software and its
+ * derivatives are to be executed solely on products incorporating a Cadence
+ * Design Systems processor.
+ */
+
+#include "xai_cnn_common.h"
+
+/**************************** xaiCast3DScalar_I64 *******************************/
+/* Description : Data casting scalar implementation for the case when */
+/* input or output data Type are U64 or S64 */
+/* Inputs : inTile */
+/* Outputs : void */
+/* InOuts : outTile */
+/********************************************************************************/
+
+void xaiCast3DScalar_I64(const xai_pTile3D inTile,
+                         xai_pTile3D outTile)
+{
+  /* Get Tile Parameters: extents are read from inTile only; outTile contributes just its pitches */
+  const int32_t dim1Size = XAI_TILE3D_GET_DIM1(inTile);
+  const int32_t dim2Size = XAI_TILE3D_GET_DIM2(inTile);
+  const int32_t dim3Size = XAI_TILE3D_GET_DIM3(inTile);
+  const int32_t inPitch1 = XAI_TILE3D_GET_DIM1_PITCH(inTile);
+  const int32_t inPitch2 = XAI_TILE3D_GET_DIM2_PITCH(inTile);
+  const int32_t outPitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outPitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+
+  /* Get Data Pointers: one typed view of the same input / output buffer per supported element type */
+  uint8_t *pIn_8bU = (uint8_t *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  int8_t *pIn_8b = (int8_t *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  uint16_t *pIn_16bU = (uint16_t *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  int16_t *pIn_16b = (int16_t *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  uint32_t *pIn_32bU = (uint32_t *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  int32_t *pIn_32b = (int32_t *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  uint64_t *pIn_64bU = (uint64_t *) XAI_TILE3D_GET_DATA_PTR(inTile);
+  int64_t *pIn_64b = (int64_t *) XAI_TILE3D_GET_DATA_PTR(inTile);
+#if ((XCHAL_HAVE_VISION_HP_VFPU == 1) || (XCHAL_HAVE_CONNX_B_HP_VFPU == 1))
+  xb_f16 *pIn_f16b = (xb_f16 *) XAI_TILE3D_GET_DATA_PTR(inTile);
+#endif
+  float *pIn_f32b = (float *) XAI_TILE3D_GET_DATA_PTR(inTile);
+
+  uint8_t *pout_8bU = (uint8_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  int8_t *pout_8b = (int8_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  uint16_t *pout_16bU = (uint16_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  int16_t *pout_16b = (int16_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  uint32_t *pout_32bU = (uint32_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  int32_t *pout_32b = (int32_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  uint64_t *pout_64bU = (uint64_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+  int64_t *pout_64b = (int64_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+#if ((XCHAL_HAVE_VISION_HP_VFPU == 1) || (XCHAL_HAVE_CONNX_B_HP_VFPU == 1))
+  xb_f16 *pout_f16b = (xb_f16 *) XAI_TILE3D_GET_DATA_PTR(outTile);
+#endif
+  float *pout_f32b = (float *) XAI_TILE3D_GET_DATA_PTR(outTile);
+
+  int32_t x, y, z;
+
+  for (z = 0; z < dim3Size; z++) /* along 3rd dimension */
+  {
+    for (y = 0; y < dim2Size; y++) /* along 2nd dimension */
+    {
+      for (x = 0; x < dim1Size; x++) /* along 1st dimension */
+      {
+        // Conversions to U64 (the output/input type dispatch below is re-evaluated for every element)
+        if (XAI_TILE3D_CHECK_TYPE(outTile, XAI_U64))
+        {
+          switch (XAI_TYPE_ELEMENT_TYPE(XAI_TILE3D_GET_TYPE(inTile)))
+          {
+            // U8 -> U64
+            case XAI_U8:
+              pout_64bU[z * outPitch2 + y * outPitch1 + x] = (uint64_t) pIn_8bU[z * inPitch2 + y * inPitch1 + x];
+              break;
+            // S8 -> U64
+            case XAI_S8:
+              pout_64bU[z * outPitch2 + y * outPitch1 + x] = (uint64_t) pIn_8b[z * inPitch2 + y * inPitch1 + x];
+              break;
+            // U16 -> U64
+            case XAI_U16:
+              pout_64bU[z * outPitch2 + y * outPitch1 + x] = (uint64_t) pIn_16bU[z * inPitch2 + y * inPitch1 + x];
+              break;
+            // S16 -> U64
+            case XAI_S16:
+              pout_64bU[z * outPitch2 + y * outPitch1 + x] = (uint64_t) pIn_16b[z * inPitch2 + y * inPitch1 + x];
+              break;
+            // U32 -> U64
+            case XAI_U32:
+              pout_64bU[z * outPitch2 + y * outPitch1 + x] = (uint64_t) pIn_32bU[z * inPitch2 + y * inPitch1 + x];
+              break;
+            // S32 -> U64
+            case XAI_S32:
+              pout_64bU[z * outPitch2 + y * outPitch1 + x] = (uint64_t) pIn_32b[z * inPitch2 + y * inPitch1 + x];
+              break;
+            // S64 -> U64
+            case XAI_S64:
+              pout_64bU[z * outPitch2 + y * outPitch1 + x] = (uint64_t) pIn_64b[z * inPitch2 + y * inPitch1 + x];
+              break;
+            // F32 -> U64
+            case XAI_F32:
+              pout_64bU[z * outPitch2 + y * outPitch1 + x] = (uint64_t) pIn_f32b[z * inPitch2 + y * inPitch1 + x];
+              break;
+#if ((XCHAL_HAVE_VISION_HP_VFPU == 1) || (XCHAL_HAVE_CONNX_B_HP_VFPU == 1))
+            // F16 -> U64
+            case XAI_F16:
+              pout_64bU[z * outPitch2 + y * outPitch1 + x] = (uint64_t) IVP_CVTF32F16(pIn_f16b[z * inPitch2 + y * inPitch1 + x]);
+              break;
+#endif
+            default: // no case for this input type (e.g. U64 -> U64): the output element is left unwritten
+              break;
+          }
+        }
+        // Conversions to S64
+        else if (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S64))
+        {
+          switch (XAI_TYPE_ELEMENT_TYPE(XAI_TILE3D_GET_TYPE(inTile)))
+          {
+            // U8 -> S64
+            case XAI_U8:
+              pout_64b[z * outPitch2 + y * outPitch1 + x] = (int64_t) pIn_8bU[z * inPitch2 + y * inPitch1 + x];
+              break;
+            // S8 -> S64
+            case XAI_S8:
+              pout_64b[z * outPitch2 + y * outPitch1 + x] = (int64_t) pIn_8b[z * inPitch2 + y * inPitch1 + x];
+              break;
+            // U16 -> S64
+            case XAI_U16:
+              pout_64b[z * outPitch2 + y * outPitch1 + x] = (int64_t) pIn_16bU[z * inPitch2 + y * inPitch1 + x];
+              break;
+            // S16 -> S64
+            case XAI_S16:
+              pout_64b[z * outPitch2 + y * outPitch1 + x] = (int64_t) pIn_16b[z * inPitch2 + y * inPitch1 + x];
+              break;
+            // U32 -> S64
+            case XAI_U32:
+              pout_64b[z * outPitch2 + y * outPitch1 + x] = (int64_t) pIn_32bU[z * inPitch2 + y * inPitch1 + x];
+              break;
+            // S32 -> S64
+            case XAI_S32:
+              pout_64b[z * outPitch2 + y * outPitch1 + x] = (int64_t) pIn_32b[z * inPitch2 + y * inPitch1 + x];
+              break;
+            // U64 -> S64
+            case XAI_U64:
+              pout_64b[z * outPitch2 + y * outPitch1 + x] = (int64_t) pIn_64bU[z * inPitch2 + y * inPitch1 + x];
+              break;
+            // F32 -> S64
+            case XAI_F32:
+              pout_64b[z * outPitch2 + y * outPitch1 + x] = (int64_t) pIn_f32b[z * inPitch2 + y * inPitch1 + x];
+              break;
+#if ((XCHAL_HAVE_VISION_HP_VFPU == 1) || (XCHAL_HAVE_CONNX_B_HP_VFPU == 1))
+            // F16 -> S64
+            case XAI_F16:
+              pout_64b[z * outPitch2 + y * outPitch1 + x] = (int64_t) IVP_CVTF32F16(pIn_f16b[z * inPitch2 + y * inPitch1 + x]);
+              break;
+#endif
+            default:
+              break;
+          }
+        }
+        // Conversions to U32 (from here on only U64 / S64 inputs have a case)
+        else if (XAI_TILE3D_CHECK_TYPE(outTile, XAI_U32))
+        {
+          switch (XAI_TYPE_ELEMENT_TYPE(XAI_TILE3D_GET_TYPE(inTile)))
+          {
+            // U64 -> U32
+            case XAI_U64:
+              pout_32bU[z * outPitch2 + y * outPitch1 + x] = (uint32_t) pIn_64bU[z * inPitch2 + y * inPitch1 + x];
+              break;
+            // S64 -> U32
+            case XAI_S64:
+              pout_32bU[z * outPitch2 + y * outPitch1 + x] = (uint32_t) pIn_64b[z * inPitch2 + y * inPitch1 + x];
+              break;
+            default:
+              break;
+          }
+        }
+        // Conversions to S32
+        else if (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S32))
+        {
+          switch (XAI_TYPE_ELEMENT_TYPE(XAI_TILE3D_GET_TYPE(inTile)))
+          {
+            // U64 -> S32
+            case XAI_U64:
+              pout_32b[z * outPitch2 + y * outPitch1 + x] = (int32_t) pIn_64bU[z * inPitch2 + y * inPitch1 + x];
+              break;
+            // S64 -> S32
+            case XAI_S64:
+              pout_32b[z * outPitch2 + y * outPitch1 + x] = (int32_t) pIn_64b[z * inPitch2 + y * inPitch1 + x];
+              break;
+            default:
+              break;
+          }
+        }
+        // Conversions to U16
+        else if (XAI_TILE3D_CHECK_TYPE(outTile, XAI_U16))
+        {
+          switch (XAI_TYPE_ELEMENT_TYPE(XAI_TILE3D_GET_TYPE(inTile)))
+          {
+            // U64 -> U16
+            case XAI_U64:
+              pout_16bU[z * outPitch2 + y * outPitch1 + x] = (uint16_t) pIn_64bU[z * inPitch2 + y * inPitch1 + x];
+              break;
+            // S64 -> U16
+            case XAI_S64:
+              pout_16bU[z * outPitch2 + y * outPitch1 + x] = (uint16_t) pIn_64b[z * inPitch2 + y * inPitch1 + x];
+              break;
+            default:
+              break;
+          }
+        }
+        // Conversions to S16
+        else if (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S16))
+        {
+          switch (XAI_TYPE_ELEMENT_TYPE(XAI_TILE3D_GET_TYPE(inTile)))
+          {
+            // U64 -> S16
+            case XAI_U64:
+              pout_16b[z * outPitch2 + y * outPitch1 + x] = (int16_t) pIn_64bU[z * inPitch2 + y * inPitch1 + x];
+              break;
+            // S64 -> S16
+            case XAI_S64:
+              pout_16b[z * outPitch2 + y * outPitch1 + x] = (int16_t) pIn_64b[z * inPitch2 + y * inPitch1 + x];
+              break;
+            default:
+              break;
+          }
+        }
+        // Conversions to U8
+        else if (XAI_TILE3D_CHECK_TYPE(outTile, XAI_U8))
+        {
+          switch (XAI_TYPE_ELEMENT_TYPE(XAI_TILE3D_GET_TYPE(inTile)))
+          {
+            // U64 -> U8
+            case XAI_U64:
+              pout_8bU[z * outPitch2 + y * outPitch1 + x] = (uint8_t) pIn_64bU[z * inPitch2 + y * inPitch1 + x];
+              break;
+            // S64 -> U8
+            case XAI_S64:
+              pout_8bU[z * outPitch2 + y * outPitch1 + x] = (uint8_t) pIn_64b[z * inPitch2 + y * inPitch1 + x];
+              break;
+            default:
+              break;
+          }
+        }
+        // Conversions to S8
+        else if (XAI_TILE3D_CHECK_TYPE(outTile, XAI_S8))
+        {
+          switch (XAI_TYPE_ELEMENT_TYPE(XAI_TILE3D_GET_TYPE(inTile)))
+          {
+            // U64 -> S8
+            case XAI_U64:
+              pout_8b[z * outPitch2 + y * outPitch1 + x] = (int8_t) pIn_64bU[z * inPitch2 + y * inPitch1 + x];
+              break;
+            // S64 -> S8
+            case XAI_S64:
+              pout_8b[z * outPitch2 + y * outPitch1 + x] = (int8_t) pIn_64b[z * inPitch2 + y * inPitch1 + x];
+              break;
+            default:
+              break;
+          }
+        }
+        // Conversions to F32
+        else if (XAI_TILE3D_CHECK_TYPE(outTile, XAI_F32))
+        {
+          switch (XAI_TYPE_ELEMENT_TYPE(XAI_TILE3D_GET_TYPE(inTile)))
+          {
+            // U64 -> F32
+            case XAI_U64:
+              pout_f32b[z * outPitch2 + y * outPitch1 + x] = (float) pIn_64bU[z * inPitch2 + y * inPitch1 + x];
+              break;
+            // S64 -> F32
+            case XAI_S64:
+              pout_f32b[z * outPitch2 + y * outPitch1 + x] = (float) pIn_64b[z * inPitch2 + y * inPitch1 + x];
+              break;
+            default:
+              break;
+          }
+        }
+        // Conversions to F16 (branch exists only when the half-precision VFPU guard below is true)
+#if ((XCHAL_HAVE_VISION_HP_VFPU == 1) || (XCHAL_HAVE_CONNX_B_HP_VFPU == 1))
+        else if (XAI_TILE3D_CHECK_TYPE(outTile, XAI_F16))
+        {
+          switch (XAI_TYPE_ELEMENT_TYPE(XAI_TILE3D_GET_TYPE(inTile)))
+          {
+            // U64 -> F16
+            case XAI_U64:
+              pout_f16b[z * outPitch2 + y * outPitch1 + x] = IVP_CVTF16F32((float) pIn_64bU[z * inPitch2 + y * inPitch1 + x]);
+              break;
+            // S64 -> F16
+            case XAI_S64:
+              pout_f16b[z * outPitch2 + y * outPitch1 + x] = IVP_CVTF16F32((float) pIn_64b[z * inPitch2 + y * inPitch1 + x]);
+              break;
+            default:
+              break;
+          }
+        }
+#endif
+      } /* end (x = 0; x < dim1Size; x++) loop */
+    } /* end (y = 0; y < dim2Size; y++) loop */
+  } /* end (z = 0; z < dim3Size; z++) loop */
+  return;
+}
+
diff --git a/backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_add.c b/backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_add.c
new file mode 100644
index 00000000000..9c540ab97eb
--- /dev/null
+++
b/backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_add.c @@ -0,0 +1,111 @@ +/* + * Copyright (c) 2018 by Cadence Design Systems, Inc. ALL RIGHTS RESERVED. + * These coded instructions, statements, and computer programs are the + * copyrighted works and confidential proprietary information of + * Cadence Design Systems Inc. They may be adapted and modified by bona fide + * purchasers for internal use, but neither the original nor any adapted + * or modified version may be disclosed or distributed to third parties + * in any manner, medium, or form, in whole or in part, without the prior + * written consent of Cadence Design Systems Inc. This software and its + * derivatives are to be executed solely on products incorporating a Cadence + * Design Systems processor. + */ + +#include "xai_cnn_common.h" + + +#define ELTADD_DATA_TYPE SIGNED8BIT +#include "cnn_eltwise_add.h" +#undef ELTADD_DATA_TYPE + +#define ELTADD_DATA_TYPE UNSIGNED8BIT +#include "cnn_eltwise_add.h" +#undef ELTADD_DATA_TYPE + +#define ELTADD_DATA_TYPE SIGNED16BIT +#include "cnn_eltwise_add.h" +#undef ELTADD_DATA_TYPE + +#define ELTADD_DATA_TYPE UNSIGNED16BIT +#include "cnn_eltwise_add.h" +#undef ELTADD_DATA_TYPE + +#define ELTADD_DATA_TYPE SIGNED32BIT +#include "cnn_eltwise_add.h" +#undef ELTADD_DATA_TYPE + +#define ELTADD_DATA_TYPE UNSIGNED32BIT +#include "cnn_eltwise_add.h" +#undef ELTADD_DATA_TYPE + +#if XCHAL_HAVE_VISION_HP_VFPU == 1 +#define ELTADD_DATA_TYPE FLOAT16BIT +#include "cnn_eltwise_add.h" +#undef ELTADD_DATA_TYPE +#endif + +#if XCHAL_HAVE_VISION_SP_VFPU == 1 +#define ELTADD_DATA_TYPE FLOAT32BIT +#include "cnn_eltwise_add.h" +#undef ELTADD_DATA_TYPE +#endif + +/**************************** xaiEltwiseAdd3D_AV *****************************************/ +/* Description : General API for auto-vectorizable Broadcast element-wise addition */ +/* Calls one of the xaiEltwiseAdd3D_AV functions based on the data type */ +/* Inputs : inTile1, inTile2 */ +/* Outputs : XI Error Code 
*/ +/* InOuts : outTile */ +/*****************************************************************************************/ + +XAI_ERR_TYPE xaiEltwiseAdd3D_AV(const xai_pTile3D inTile1, + const xai_pTile3D inTile2, + xai_pTile3D outTile) +{ + if (!inTile1 || !inTile2 || !outTile) + { + return(XAI_ERR_NULLARG); + } + + if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_S8)) + { + return(xaiEltwiseAdd3D_S8_AV(inTile1, inTile2, outTile)); + } + else if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_U8)) + { + return(xaiEltwiseAdd3D_U8_AV(inTile1, inTile2, outTile)); + } + else if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_S16)) + { + return(xaiEltwiseAdd3D_S16_AV(inTile1, inTile2, outTile)); + } + else if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_U16)) + { + return(xaiEltwiseAdd3D_U16_AV(inTile1, inTile2, outTile)); + } + else if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_S32)) + { + return(xaiEltwiseAdd3D_S32_AV(inTile1, inTile2, outTile)); + } + else if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_U32)) + { + return(xaiEltwiseAdd3D_U32_AV(inTile1, inTile2, outTile)); + } + +#if XCHAL_HAVE_VISION_HP_VFPU == 1 + else if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_F16)) + { + return(xaiEltwiseAdd3D_F16_AV(inTile1, inTile2, outTile)); + } +#endif + +#if XCHAL_HAVE_VISION_SP_VFPU == 1 + else if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_F32)) + { + return(xaiEltwiseAdd3D_F32_AV(inTile1, inTile2, outTile)); + } +#endif + + return(XAI_ERR_OK); +} + diff --git a/backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_add.h b/backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_add.h new file mode 100644 index 00000000000..5157aa80360 --- /dev/null +++ b/backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_add.h @@ -0,0 +1,224 @@ +/* + * Copyright (c) 2018 by Cadence Design Systems, Inc. ALL RIGHTS RESERVED. + * These coded instructions, statements, and computer programs are the + * copyrighted works and confidential proprietary information of + * Cadence Design Systems Inc. 
They may be adapted and modified by bona fide + * purchasers for internal use, but neither the original nor any adapted + * or modified version may be disclosed or distributed to third parties + * in any manner, medium, or form, in whole or in part, without the prior + * written consent of Cadence Design Systems Inc. This software and its + * derivatives are to be executed solely on products incorporating a Cadence + * Design Systems processor. + */ +#include "xai_cnn_common.h" + + +#ifndef SIGNED8BIT +#define SIGNED8BIT 1 +#define UNSIGNED8BIT 2 +#define SIGNED16BIT 3 +#define UNSIGNED16BIT 4 +#define SIGNED32BIT 5 +#define UNSIGNED32BIT 6 +#define FLOAT16BIT 7 +#define FLOAT32BIT 8 +#endif + +#if ELTADD_DATA_TYPE == SIGNED8BIT +#undef MAKE_NAME +#undef MORPH_IDT_CHECK +#undef MORPH_IDT_SCALAR +#define MAKE_NAME(name) name ## _S8_AV +#define MORPH_IDT_CHECK XAI_CHECK_TILE3D_S8 +#define MORPH_IDT_SCALAR int8_t + +#elif ELTADD_DATA_TYPE == UNSIGNED8BIT +#undef MAKE_NAME +#undef MORPH_IDT_CHECK +#undef MORPH_IDT_SCALAR +#define MAKE_NAME(name) name ## _U8_AV +#define MORPH_IDT_CHECK XAI_CHECK_TILE3D_U8 +#define MORPH_IDT_SCALAR uint8_t + +#elif ELTADD_DATA_TYPE == SIGNED16BIT +#undef MAKE_NAME +#undef MORPH_IDT_CHECK +#undef MORPH_IDT_SCALAR +#define MAKE_NAME(name) name ## _S16_AV +#define MORPH_IDT_CHECK XAI_CHECK_TILE3D_S16 +#define MORPH_IDT_SCALAR int16_t + +#elif ELTADD_DATA_TYPE == UNSIGNED16BIT +#undef MAKE_NAME +#undef MORPH_IDT_CHECK +#undef MORPH_IDT_SCALAR +#define MAKE_NAME(name) name ## _U16_AV +#define MORPH_IDT_CHECK XAI_CHECK_TILE3D_U16 +#define MORPH_IDT_SCALAR uint16_t + +#elif ELTADD_DATA_TYPE == SIGNED32BIT +#undef MAKE_NAME +#undef MORPH_IDT_CHECK +#undef MORPH_IDT_SCALAR +#define MAKE_NAME(name) name ## _S32_AV +#define MORPH_IDT_CHECK XAI_CHECK_TILE3D_S32 +#define MORPH_IDT_SCALAR int32_t + +#elif ELTADD_DATA_TYPE == UNSIGNED32BIT +#undef MAKE_NAME +#undef MORPH_IDT_CHECK +#undef MORPH_IDT_SCALAR +#define MAKE_NAME(name) name ## _U32_AV 
+#define MORPH_IDT_CHECK XAI_CHECK_TILE3D_U32 +#define MORPH_IDT_SCALAR uint32_t + +#elif ELTADD_DATA_TYPE == FLOAT16BIT +#if XCHAL_HAVE_VISION_SP_VFPU == 1 +#undef MAKE_NAME +#undef MORPH_IDT_CHECK +#undef MORPH_IDT_SCALAR +#define MAKE_NAME(name) name ## _F16_AV +#define MORPH_IDT_CHECK XAI_CHECK_TILE3D_F16 +#define MORPH_IDT_SCALAR xb_f16 +#endif + +#elif ELTADD_DATA_TYPE == FLOAT32BIT +#if XCHAL_HAVE_VISION_SP_VFPU == 1 +#undef MAKE_NAME +#undef MORPH_IDT_CHECK +#undef MORPH_IDT_SCALAR +#define MAKE_NAME(name) name ## _F32_AV +#define MORPH_IDT_CHECK XAI_CHECK_TILE3D_F32 +#define MORPH_IDT_SCALAR float +#endif +#endif + +#define ADD2(a, b) a + b + + +/**************************** xaiEltwiseAdd3D ********************************************/ +/* Description : auto-vectorizable implementation of Broadcast element-wise addition */ +/* Based on MORPH implementation eight variants are generated for */ +/* S8, U8, S16, U16, S32, U32, F16 and F32 data types */ +/* Inputs : inTile1, inTile2 */ +/* Outputs : XI Error Code */ +/* InOuts : outTile */ +/*****************************************************************************************/ + +XAI_ERR_TYPE MAKE_NAME (xaiEltwiseAdd3D)(const xai_pTile3D inTile1, const xai_pTile3D inTile2, xai_pTile3D outTile) +{ + /* Error Checks */ + XAI_ERROR_CHECKS() + { + MORPH_IDT_CHECK(inTile1); + MORPH_IDT_CHECK(inTile2); + MORPH_IDT_CHECK(outTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile1); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile2); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile); + XAI_CHECK_ERROR(XAI_TILE3D_GET_DATA_ORDER(inTile1) == XAI_TILE3D_GET_DATA_ORDER(inTile2), + XAI_ERR_BADARG, "\nData Order of InputTile1 = %d and InputTile2 = %d\nData Order of InputTile1 and InputTile2 should be same", \ + XAI_TILE3D_GET_DATA_ORDER(inTile1), XAI_TILE3D_GET_DATA_ORDER(inTile2)); + XAI_CHECK_ERROR(XAI_TILE3D_GET_DATA_ORDER(inTile1) == XAI_TILE3D_GET_DATA_ORDER(outTile), + XAI_ERR_BADARG, "\nData Order of InputTile = %d and OutputTile = 
%d\nData Order of InputTile and OutputTile should be same", \ + XAI_TILE3D_GET_DATA_ORDER(inTile1), XAI_TILE3D_GET_DATA_ORDER(outTile)); + XAI_CHECK_TILE3D_BCAST_DIMENSIONS(inTile1, inTile2, outTile, 1, 1); + } + + /* Get Tile Parameters */ + const int32_t dim1SizeOut = XAI_TILE3D_GET_DIM1(outTile); + const int32_t dim2SizeOut = XAI_TILE3D_GET_DIM2(outTile); + const int32_t dim3SizeOut = XAI_TILE3D_GET_DIM3(outTile); + const int32_t outTilePitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile); + const int32_t outTilePitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile); + + /* Get Data Pointers */ + MORPH_IDT_SCALAR *pInput1 = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(inTile1); + MORPH_IDT_SCALAR *pInput2 = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(inTile2); + MORPH_IDT_SCALAR *pOutput = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(outTile); + + MORPH_IDT_SCALAR *__restrict pIn1; + MORPH_IDT_SCALAR *__restrict pIn2; + MORPH_IDT_SCALAR *__restrict pOut; + + /* Get Pitch appropriate for elementwise broadcast operations */ + XAI_TILE3D_GET_BCAST123_PITCH(inTile1, inTile2, inTile1Pitch0, inTile2Pitch0, inTile1Pitch1, \ + inTile2Pitch1, inTile1Pitch2, inTile2Pitch2); + + /* no Broadcast */ + if (inTile1Pitch2 == inTile2Pitch2 && inTile1Pitch2 == outTilePitch2) + { + int dimsCount = dim1SizeOut * dim2SizeOut * dim3SizeOut; + pIn1 = pInput1; + pIn2 = pInput2; + pOut = pOutput; + + for (int i = 0; i < dimsCount; i++) + { + pOut[i] = ADD2(pIn1[i], pIn2[i]); + } + } + else + { + /* + inTile1Pitch0 == 0 : Tile1 Dimension 1 broadcasting + inTile1Pitch1 == 0 : Tile1 Dimension 2 broadcasting + inTile1Pitch2 == 0 : Tile1 Dimension 3 broadcasting + inTile2Pitch0 == 0 : Tile2 Dimension 1 broadcasting + inTile2Pitch1 == 0 : Tile2 Dimension 2 broadcasting + inTile2Pitch2 == 0 : Tile2 Dimension 3 broadcasting + */ + int32_t y, z, idx; + + for (z = 0; z < dim3SizeOut; z++) + { + MORPH_IDT_SCALAR* temp1 = pInput1 + z * inTile1Pitch2; + MORPH_IDT_SCALAR* temp2 = pInput2 + z * inTile2Pitch2; + 
MORPH_IDT_SCALAR* temp3 = pOutput + z * outTilePitch2; + + for (y = 0; y < dim2SizeOut; y++) + { + pIn1 = (temp1 + y * inTile1Pitch1); + pIn2 = (temp2 + y * inTile2Pitch1); + pOut = (temp3 + y * outTilePitch1); + + MORPH_IDT_SCALAR InData1, InData2; + /* Tile1 Dimension 1 broadcasting */ + if (inTile1Pitch0 == 0) + { + InData1 = pIn1[0]; + /* reduced one load from core loop */ + for (idx = 0; idx < dim1SizeOut; idx++) + { + InData2 = pIn2[idx]; + pOut[idx] = ADD2(InData1, InData2); + } + } + /* Tile2 Dimension 1 broadcasting */ + else if (inTile2Pitch0 == 0) + { + InData2 = pIn2[0]; + /* reduced one load from core loop */ + for (idx = 0; idx < dim1SizeOut; idx++) + { + InData1 = pIn1[idx]; + pOut[idx] = ADD2(InData1, InData2); + } + } + else + { + /* broadcast in dims 1 or 2 in Tile1 or TIle2 */ + for (idx = 0; idx < dim1SizeOut; idx++) + { + InData1 = pIn1[idx]; + InData2 = pIn2[idx]; + pOut[idx] = ADD2(InData1, InData2); + } + } + } + } + } + + return(XAI_ERROR_STATUS()); +} + diff --git a/backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_and.c b/backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_and.c new file mode 100644 index 00000000000..ee6c1fe3a3c --- /dev/null +++ b/backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_and.c @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2018 by Cadence Design Systems, Inc. ALL RIGHTS RESERVED. + * These coded instructions, statements, and computer programs are the + * copyrighted works and confidential proprietary information of + * Cadence Design Systems Inc. They may be adapted and modified by bona fide + * purchasers for internal use, but neither the original nor any adapted + * or modified version may be disclosed or distributed to third parties + * in any manner, medium, or form, in whole or in part, without the prior + * written consent of Cadence Design Systems Inc. 
This software and its + * derivatives are to be executed solely on products incorporating a Cadence + * Design Systems processor. + */ + +#include "xai_cnn_common.h" + +#define ELTAND_DATA_TYPE SIGNED8BIT +#include "cnn_eltwise_and.h" +#undef ELTAND_DATA_TYPE + +#define ELTAND_DATA_TYPE UNSIGNED8BIT +#include "cnn_eltwise_and.h" +#undef ELTAND_DATA_TYPE + +#define ELTAND_DATA_TYPE SIGNED16BIT +#include "cnn_eltwise_and.h" +#undef ELTAND_DATA_TYPE + +#define ELTAND_DATA_TYPE UNSIGNED16BIT +#include "cnn_eltwise_and.h" +#undef ELTAND_DATA_TYPE + +#define ELTAND_DATA_TYPE SIGNED32BIT +#include "cnn_eltwise_and.h" +#undef ELTAND_DATA_TYPE + +#define ELTAND_DATA_TYPE UNSIGNED32BIT +#include "cnn_eltwise_and.h" +#undef ELTAND_DATA_TYPE + + +/**************************** xaiEltwiseAnd3D_AV *****************************************/ +/* Description : General API for auto-vectorizable Broadcast element-wise and */ +/* bitwise AND operator */ +/* Calls one of the xaiEltwiseAnd3D_AV functions based on the data type */ +/* Inputs : inTile1, inTile2 */ +/* Outputs : XI Error Code */ +/* InOuts : outTile */ +/*****************************************************************************************/ + +XAI_ERR_TYPE xaiEltwiseAnd3D_AV(const xai_pTile3D inTile1, + const xai_pTile3D inTile2, + xai_pTile3D outTile) +{ + if (!inTile1 || !inTile2 || !outTile) + { + return(XAI_ERR_NULLARG); + } + + if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_S8)) + { + return(xaiEltwiseAnd3D_S8_AV(inTile1, inTile2, outTile)); + } + else if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_U8)) + { + return(xaiEltwiseAnd3D_U8_AV(inTile1, inTile2, outTile)); + } + else if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_S16)) + { + return(xaiEltwiseAnd3D_S16_AV(inTile1, inTile2, outTile)); + } + else if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_U16)) + { + return(xaiEltwiseAnd3D_U16_AV(inTile1, inTile2, outTile)); + } + else if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_S32)) + { + return(xaiEltwiseAnd3D_S32_AV(inTile1, inTile2, outTile)); + } + 
else if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_U32)) + { + return(xaiEltwiseAnd3D_U32_AV(inTile1, inTile2, outTile)); + } + + return(XAI_ERR_OK); +} + diff --git a/backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_and.h b/backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_and.h new file mode 100644 index 00000000000..de51db0c785 --- /dev/null +++ b/backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_and.h @@ -0,0 +1,202 @@ +/* + * Copyright (c) 2018 by Cadence Design Systems, Inc. ALL RIGHTS RESERVED. + * These coded instructions, statements, and computer programs are the + * copyrighted works and confidential proprietary information of + * Cadence Design Systems Inc. They may be adapted and modified by bona fide + * purchasers for internal use, but neither the original nor any adapted + * or modified version may be disclosed or distributed to third parties + * in any manner, medium, or form, in whole or in part, without the prior + * written consent of Cadence Design Systems Inc. This software and its + * derivatives are to be executed solely on products incorporating a Cadence + * Design Systems processor. 
+ */ +#include "xai_cnn_common.h" + + +#ifndef SIGNED8BIT +#define SIGNED8BIT 1 +#define UNSIGNED8BIT 2 +#define SIGNED16BIT 3 +#define UNSIGNED16BIT 4 +#define SIGNED32BIT 5 +#define UNSIGNED32BIT 6 +#endif + +#if ELTAND_DATA_TYPE == SIGNED8BIT +#undef MAKE_NAME +#undef MORPH_IDT_CHECK +#undef MORPH_IDT_SCALAR +#define MAKE_NAME(name) name ## _S8_AV +#define MORPH_IDT_CHECK XAI_CHECK_TILE3D_S8 +#define MORPH_IDT_SCALAR int8_t + +#elif ELTAND_DATA_TYPE == UNSIGNED8BIT +#undef MAKE_NAME +#undef MORPH_IDT_CHECK +#undef MORPH_IDT_SCALAR +#define MAKE_NAME(name) name ## _U8_AV +#define MORPH_IDT_CHECK XAI_CHECK_TILE3D_U8 +#define MORPH_IDT_SCALAR uint8_t + +#elif ELTAND_DATA_TYPE == SIGNED16BIT +#undef MAKE_NAME +#undef MORPH_IDT_CHECK +#undef MORPH_IDT_SCALAR +#define MAKE_NAME(name) name ## _S16_AV +#define MORPH_IDT_CHECK XAI_CHECK_TILE3D_S16 +#define MORPH_IDT_SCALAR int16_t + +#elif ELTAND_DATA_TYPE == UNSIGNED16BIT +#undef MAKE_NAME +#undef MORPH_IDT_CHECK +#undef MORPH_IDT_SCALAR +#define MAKE_NAME(name) name ## _U16_AV +#define MORPH_IDT_CHECK XAI_CHECK_TILE3D_U16 +#define MORPH_IDT_SCALAR uint16_t + +#elif ELTAND_DATA_TYPE == SIGNED32BIT +#undef MAKE_NAME +#undef MORPH_IDT_CHECK +#undef MORPH_IDT_SCALAR +#define MAKE_NAME(name) name ## _S32_AV +#define MORPH_IDT_CHECK XAI_CHECK_TILE3D_S32 +#define MORPH_IDT_SCALAR int32_t + +#elif ELTAND_DATA_TYPE == UNSIGNED32BIT +#undef MAKE_NAME +#undef MORPH_IDT_CHECK +#undef MORPH_IDT_SCALAR +#define MAKE_NAME(name) name ## _U32_AV +#define MORPH_IDT_CHECK XAI_CHECK_TILE3D_U32 +#define MORPH_IDT_SCALAR uint32_t +#endif + +#define AND2(a, b) a & b + + +/**************************** xaiEltwiseAnd3D *********************************************/ +/* Description : auto-vectorizable implementation of Broadcast elementWise and bitwise */ +/* AND operator, Based on MORPH implementation eight variants are */ +/* generated for S8, U8, S16, U16, S32, and U32 data types */ +/* Inputs : inTile1, inTile2 */ +/* Outputs : XI Error Code 
*/ +/* InOuts : outTile */ +/*****************************************************************************************/ + +XAI_ERR_TYPE MAKE_NAME (xaiEltwiseAnd3D)(const xai_pTile3D inTile1, const xai_pTile3D inTile2, xai_pTile3D outTile) +{ + /* Error Checks */ + XAI_ERROR_CHECKS() + { + MORPH_IDT_CHECK(inTile1); + MORPH_IDT_CHECK(inTile2); + MORPH_IDT_CHECK(outTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile1); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile2); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile); + XAI_CHECK_ERROR(XAI_TILE3D_GET_DATA_ORDER(inTile1) == XAI_TILE3D_GET_DATA_ORDER(inTile2), + XAI_ERR_BADARG, "\nData Order of InputTile1 = %d and InputTile2 = %d\nData Order of InputTile1 and InputTile2 should be same", \ + XAI_TILE3D_GET_DATA_ORDER(inTile1), XAI_TILE3D_GET_DATA_ORDER(inTile2)); + XAI_CHECK_ERROR(XAI_TILE3D_GET_DATA_ORDER(inTile1) == XAI_TILE3D_GET_DATA_ORDER(outTile), + XAI_ERR_BADARG, "\nData Order of InputTile = %d and OutputTile = %d\nData Order of InputTile and OutputTile should be same", \ + XAI_TILE3D_GET_DATA_ORDER(inTile1), XAI_TILE3D_GET_DATA_ORDER(outTile)); + XAI_CHECK_TILE3D_BCAST_DIMENSIONS(inTile1, inTile2, outTile, 1, 1); + } + + /* Get Tile Parameters */ + const int32_t dim1SizeOut = XAI_TILE3D_GET_DIM1(outTile); + const int32_t dim2SizeOut = XAI_TILE3D_GET_DIM2(outTile); + const int32_t dim3SizeOut = XAI_TILE3D_GET_DIM3(outTile); + const int32_t outTilePitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile); + const int32_t outTilePitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile); + + /* Get Data Pointers */ + MORPH_IDT_SCALAR *pInput1 = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(inTile1); + MORPH_IDT_SCALAR *pInput2 = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(inTile2); + MORPH_IDT_SCALAR *pOutput = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(outTile); + + MORPH_IDT_SCALAR *__restrict pIn1; + MORPH_IDT_SCALAR *__restrict pIn2; + MORPH_IDT_SCALAR *__restrict pOut; + + /* Get Pitch appropriate for elementwise broadcast operations */ + 
XAI_TILE3D_GET_BCAST123_PITCH(inTile1, inTile2, inTile1Pitch0, inTile2Pitch0, inTile1Pitch1, \ + inTile2Pitch1, inTile1Pitch2, inTile2Pitch2); + + /* no Broadcast */ + if (inTile1Pitch2 == inTile2Pitch2 && inTile1Pitch2 == outTilePitch2) + { + int dimsCount = dim1SizeOut * dim2SizeOut * dim3SizeOut; + pIn1 = pInput1; + pIn2 = pInput2; + pOut = pOutput; + + for (int i = 0; i < dimsCount; i++) + { + pOut[i] = AND2(pIn1[i], pIn2[i]); + } + } + else + { + /* + inTile1Pitch0 == 0 : Tile1 Dimension 1 broadcasting + inTile1Pitch1 == 0 : Tile1 Dimension 2 broadcasting + inTile1Pitch2 == 0 : Tile1 Dimension 3 broadcasting + inTile2Pitch0 == 0 : Tile2 Dimension 1 broadcasting + inTile2Pitch1 == 0 : Tile2 Dimension 2 broadcasting + inTile2Pitch2 == 0 : Tile2 Dimension 3 broadcasting + */ + int32_t y, z, idx; + + for (z = 0; z < dim3SizeOut; z++) + { + MORPH_IDT_SCALAR* temp1 = pInput1 + z * inTile1Pitch2; + MORPH_IDT_SCALAR* temp2 = pInput2 + z * inTile2Pitch2; + MORPH_IDT_SCALAR* temp3 = pOutput + z * outTilePitch2; + + for (y = 0; y < dim2SizeOut; y++) + { + pIn1 = (temp1 + y * inTile1Pitch1); + pIn2 = (temp2 + y * inTile2Pitch1); + pOut = (temp3 + y * outTilePitch1); + + MORPH_IDT_SCALAR InData1, InData2; + /* Tile1 Dimension 1 broadcasting */ + if (inTile1Pitch0 == 0) + { + InData1 = pIn1[0]; + /* reduced one load from core loop */ + for (idx = 0; idx < dim1SizeOut; idx++) + { + InData2 = pIn2[idx]; + pOut[idx] = AND2(InData1, InData2); + } + } + /* Tile2 Dimension 1 broadcasting */ + else if (inTile2Pitch0 == 0) + { + InData2 = pIn2[0]; + /* reduced one load from core loop */ + for (idx = 0; idx < dim1SizeOut; idx++) + { + InData1 = pIn1[idx]; + pOut[idx] = AND2(InData1, InData2); + } + } + else + { + /* broadcast in dims 1 or 2 in Tile1 or TIle2 */ + for (idx = 0; idx < dim1SizeOut; idx++) + { + InData1 = pIn1[idx]; + InData2 = pIn2[idx]; + pOut[idx] = AND2(InData1, InData2); + } + } + } + } + } + + return(XAI_ERROR_STATUS()); +} + diff --git 
a/backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_equal.c b/backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_equal.c new file mode 100644 index 00000000000..a8f18f62f1a --- /dev/null +++ b/backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_equal.c @@ -0,0 +1,112 @@ +/* + * Copyright (c) 2018 by Cadence Design Systems, Inc. ALL RIGHTS RESERVED. + * These coded instructions, statements, and computer programs are the + * copyrighted works and confidential proprietary information of + * Cadence Design Systems Inc. They may be adapted and modified by bona fide + * purchasers for internal use, but neither the original nor any adapted + * or modified version may be disclosed or distributed to third parties + * in any manner, medium, or form, in whole or in part, without the prior + * written consent of Cadence Design Systems Inc. This software and its + * derivatives are to be executed solely on products incorporating a Cadence + * Design Systems processor. 
+ */ + +#include "xai_cnn_common.h" + + +#define ELTEQUAL_DATA_TYPE SIGNED8BIT +#include "cnn_eltwise_equal.h" +#undef ELTEQUAL_DATA_TYPE + +#define ELTEQUAL_DATA_TYPE UNSIGNED8BIT +#include "cnn_eltwise_equal.h" +#undef ELTEQUAL_DATA_TYPE + +#define ELTEQUAL_DATA_TYPE SIGNED16BIT +#include "cnn_eltwise_equal.h" +#undef ELTEQUAL_DATA_TYPE + +#define ELTEQUAL_DATA_TYPE UNSIGNED16BIT +#include "cnn_eltwise_equal.h" +#undef ELTEQUAL_DATA_TYPE + +#define ELTEQUAL_DATA_TYPE SIGNED32BIT +#include "cnn_eltwise_equal.h" +#undef ELTEQUAL_DATA_TYPE + +#define ELTEQUAL_DATA_TYPE UNSIGNED32BIT +#include "cnn_eltwise_equal.h" +#undef ELTEQUAL_DATA_TYPE + +#if XCHAL_HAVE_VISION_HP_VFPU == 1 +#define ELTEQUAL_DATA_TYPE FLOAT16BIT +#include "cnn_eltwise_equal.h" +#undef ELTEQUAL_DATA_TYPE +#endif +#if XCHAL_HAVE_VISION_SP_VFPU == 1 +#define ELTEQUAL_DATA_TYPE FLOAT32BIT +#include "cnn_eltwise_equal.h" +#undef ELTEQUAL_DATA_TYPE +#endif + + +/**************************** xaiEltwiseEqual3D_AV ***************************************/ +/* Description : General API for auto-vectorizable Broadcast element-wise */ +/* EQUAL operator */ +/* Calls one of the xaiEltwiseEqual3D_AV functions based on the data type */ +/* Inputs : inTile1, inTile2 */ +/* Outputs : XI Error Code */ +/* InOuts : outTile */ +/*****************************************************************************************/ + +XAI_ERR_TYPE xaiEltwiseEqual3D_AV(const xai_pTile3D inTile1, + const xai_pTile3D inTile2, + xai_pTile3D outTile) +{ + if (!inTile1 || !inTile2 || !outTile) + { + return(XAI_ERR_NULLARG); + } + + if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_S8)) + { + return(xaiEltwiseEqual3D_S8_AV(inTile1, inTile2, outTile)); + } + else if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_U8)) + { + return(xaiEltwiseEqual3D_U8_AV(inTile1, inTile2, outTile)); + } + else if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_S16)) + { + return(xaiEltwiseEqual3D_S16_AV(inTile1, inTile2, outTile)); + } + else if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_U16)) + 
{ + return(xaiEltwiseEqual3D_U16_AV(inTile1, inTile2, outTile)); + } + else if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_S32)) + { + return(xaiEltwiseEqual3D_S32_AV(inTile1, inTile2, outTile)); + } + else if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_U32)) + { + return(xaiEltwiseEqual3D_U32_AV(inTile1, inTile2, outTile)); + } + +#if XCHAL_HAVE_VISION_HP_VFPU == 1 + else if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_F16)) + { + return(xaiEltwiseEqual3D_F16_AV(inTile1, inTile2, outTile)); + } +#endif + +#if XCHAL_HAVE_VISION_SP_VFPU == 1 + else if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_F32)) + { + return(xaiEltwiseEqual3D_F32_AV(inTile1, inTile2, outTile)); + } +#endif + + return(XAI_ERR_OK); +} + diff --git a/backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_equal.h b/backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_equal.h new file mode 100644 index 00000000000..f6f7efbdc85 --- /dev/null +++ b/backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_equal.h @@ -0,0 +1,244 @@ +/* + * Copyright (c) 2018 by Cadence Design Systems, Inc. ALL RIGHTS RESERVED. + * These coded instructions, statements, and computer programs are the + * copyrighted works and confidential proprietary information of + * Cadence Design Systems Inc. They may be adapted and modified by bona fide + * purchasers for internal use, but neither the original nor any adapted + * or modified version may be disclosed or distributed to third parties + * in any manner, medium, or form, in whole or in part, without the prior + * written consent of Cadence Design Systems Inc. This software and its + * derivatives are to be executed solely on products incorporating a Cadence + * Design Systems processor. 
+ */ +#include "xai_cnn_common.h" + + +#ifndef SIGNED8BIT +#define SIGNED8BIT 1 +#define UNSIGNED8BIT 2 +#define SIGNED16BIT 3 +#define UNSIGNED16BIT 4 +#define SIGNED32BIT 5 +#define UNSIGNED32BIT 6 +#define FLOAT16BIT 7 +#define FLOAT32BIT 8 +#endif + +#if ELTEQUAL_DATA_TYPE == SIGNED8BIT +#undef MAKE_NAME +#undef MORPH_IDT_CHECK +#undef MORPH_IDT_SCALAR +#define MAKE_NAME(name) name ## _S8_AV +#define MORPH_IDT_CHECK XAI_CHECK_TILE3D_S8 +#define MORPH_IDT_SCALAR int8_t + +#elif ELTEQUAL_DATA_TYPE == UNSIGNED8BIT +#undef MAKE_NAME +#undef MORPH_IDT_CHECK +#undef MORPH_IDT_SCALAR +#define MAKE_NAME(name) name ## _U8_AV +#define MORPH_IDT_CHECK XAI_CHECK_TILE3D_U8 +#define MORPH_IDT_SCALAR uint8_t + +#elif ELTEQUAL_DATA_TYPE == SIGNED16BIT +#undef MAKE_NAME +#undef MORPH_IDT_CHECK +#undef MORPH_IDT_SCALAR +#define MAKE_NAME(name) name ## _S16_AV +#define MORPH_IDT_CHECK XAI_CHECK_TILE3D_S16 +#define MORPH_IDT_SCALAR int16_t + +#elif ELTEQUAL_DATA_TYPE == UNSIGNED16BIT +#undef MAKE_NAME +#undef MORPH_IDT_CHECK +#undef MORPH_IDT_SCALAR +#define MAKE_NAME(name) name ## _U16_AV +#define MORPH_IDT_CHECK XAI_CHECK_TILE3D_U16 +#define MORPH_IDT_SCALAR uint16_t + +#elif ELTEQUAL_DATA_TYPE == SIGNED32BIT +#undef MAKE_NAME +#undef MORPH_IDT_CHECK +#undef MORPH_IDT_SCALAR +#define MAKE_NAME(name) name ## _S32_AV +#define MORPH_IDT_CHECK XAI_CHECK_TILE3D_S32 +#define MORPH_IDT_SCALAR int32_t + +#elif ELTEQUAL_DATA_TYPE == UNSIGNED32BIT +#undef MAKE_NAME +#undef MORPH_IDT_CHECK +#undef MORPH_IDT_SCALAR +#define MAKE_NAME(name) name ## _U32_AV +#define MORPH_IDT_CHECK XAI_CHECK_TILE3D_U32 +#define MORPH_IDT_SCALAR uint32_t + +#elif ELTEQUAL_DATA_TYPE == FLOAT16BIT +#if XCHAL_HAVE_VISION_HP_VFPU == 1 +#undef MAKE_NAME +#undef MORPH_IDT_CHECK +#undef MORPH_IDT_SCALAR +#define MAKE_NAME(name) name ## _F16_AV +#define MORPH_IDT_CHECK XAI_CHECK_TILE3D_F16 +#define MORPH_IDT_SCALAR xb_f16 +#endif + +#elif ELTEQUAL_DATA_TYPE == FLOAT32BIT +#if XCHAL_HAVE_VISION_SP_VFPU == 1 +#undef 
MAKE_NAME +#undef MORPH_IDT_CHECK +#undef MORPH_IDT_SCALAR +#define MAKE_NAME(name) name ## _F32_AV +#define MORPH_IDT_CHECK XAI_CHECK_TILE3D_F32 +#define MORPH_IDT_SCALAR float +#endif +#endif + +#define EQUAL(a, b) a == b + + +/**************************** xaiEltwiseEqual3D ******************************************/ +/* Description : auto-vectorizable implementation of Broadcast elementWise EQUAL */ +/* operator, Based on MORPH implementation eight variants are */ +/* generated for S8, U8, S16, U16, S32, U32, F16 and F32 data types */ +/* Inputs : inTile1, inTile2 */ +/* Outputs : XI Error Code */ +/* InOuts : outTile */ +/*****************************************************************************************/ + +XAI_ERR_TYPE MAKE_NAME (xaiEltwiseEqual3D)(const xai_pTile3D inTile1, const xai_pTile3D inTile2, xai_pTile3D outTile) +{ + /* Error Checks */ + XAI_ERROR_CHECKS() + { + MORPH_IDT_CHECK(inTile1); + MORPH_IDT_CHECK(inTile2); + MORPH_IDT_CHECK(outTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile1); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile2); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile); + XAI_CHECK_ERROR(XAI_TILE3D_GET_DATA_ORDER(inTile1) == XAI_TILE3D_GET_DATA_ORDER(inTile2), + XAI_ERR_BADARG, "\nData Order of InputTile1 = %d and InputTile2 = %d\nData Order of InputTile1 and InputTile2 should be same", \ + XAI_TILE3D_GET_DATA_ORDER(inTile1), XAI_TILE3D_GET_DATA_ORDER(inTile2)); + XAI_CHECK_ERROR(XAI_TILE3D_GET_DATA_ORDER(inTile1) == XAI_TILE3D_GET_DATA_ORDER(outTile), + XAI_ERR_BADARG, "\nData Order of InputTile = %d and OutputTile = %d\nData Order of InputTile and OutputTile should be same", \ + XAI_TILE3D_GET_DATA_ORDER(inTile1), XAI_TILE3D_GET_DATA_ORDER(outTile)); + XAI_CHECK_TILE3D_BCAST_DIMENSIONS(inTile1, inTile2, outTile, 1, 1); + } + + /* Get Tile Parameters */ + const int32_t dim1SizeOut = XAI_TILE3D_GET_DIM1(outTile); + const int32_t dim2SizeOut = XAI_TILE3D_GET_DIM2(outTile); + const int32_t dim3SizeOut = XAI_TILE3D_GET_DIM3(outTile); + 
const int32_t outTilePitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile); + const int32_t outTilePitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile); + + /* Get Data Pointers */ + MORPH_IDT_SCALAR *pInput1 = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(inTile1); + MORPH_IDT_SCALAR *pInput2 = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(inTile2); + MORPH_IDT_SCALAR *pOutput = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(outTile); + + MORPH_IDT_SCALAR *__restrict pIn1; + MORPH_IDT_SCALAR *__restrict pIn2; + MORPH_IDT_SCALAR *__restrict pOut; + + /* Get Pitch appropriate for elementwise broadcast operations */ + XAI_TILE3D_GET_BCAST123_PITCH(inTile1, inTile2, inTile1Pitch0, inTile2Pitch0, inTile1Pitch1, \ + inTile2Pitch1, inTile1Pitch2, inTile2Pitch2); + + /* no Broadcast */ + if (inTile1Pitch2 == inTile2Pitch2 && inTile1Pitch2 == outTilePitch2) + { + int dimsCount = dim1SizeOut * dim2SizeOut * dim3SizeOut; + pIn1 = pInput1; + pIn2 = pInput2; + pOut = pOutput; + + for (int i = 0; i < dimsCount; i++) + { +#if (ELTEQUAL_DATA_TYPE == FLOAT16BIT) + bool temp = EQUAL(pIn1[i], pIn2[i]); + pOut[i] = temp ? 
1 : 0; +#else + pOut[i] = EQUAL(pIn1[i], pIn2[i]); +#endif + } + } + else + { + /* + inTile1Pitch0 == 0 : Tile1 Dimension 1 broadcasting + inTile1Pitch1 == 0 : Tile1 Dimension 2 broadcasting + inTile1Pitch2 == 0 : Tile1 Dimension 3 broadcasting + inTile2Pitch0 == 0 : Tile2 Dimension 1 broadcasting + inTile2Pitch1 == 0 : Tile2 Dimension 2 broadcasting + inTile2Pitch2 == 0 : Tile2 Dimension 3 broadcasting + */ + int32_t y, z, idx; + + for (z = 0; z < dim3SizeOut; z++) + { + MORPH_IDT_SCALAR* temp1 = pInput1 + z * inTile1Pitch2; + MORPH_IDT_SCALAR* temp2 = pInput2 + z * inTile2Pitch2; + MORPH_IDT_SCALAR* temp3 = pOutput + z * outTilePitch2; + + for (y = 0; y < dim2SizeOut; y++) + { + pIn1 = (temp1 + y * inTile1Pitch1); + pIn2 = (temp2 + y * inTile2Pitch1); + pOut = (temp3 + y * outTilePitch1); + + MORPH_IDT_SCALAR InData1, InData2; + /* Tile1 Dimension 1 broadcasting */ + if (inTile1Pitch0 == 0) + { + InData1 = pIn1[0]; + /* reduced one load from core loop */ + for (idx = 0; idx < dim1SizeOut; idx++) + { + InData2 = pIn2[idx]; +#if (ELTEQUAL_DATA_TYPE == FLOAT16BIT) + bool temp = EQUAL(InData1, InData2); + pOut[idx] = temp ? 1 : 0; +#else + pOut[idx] = EQUAL(InData1, InData2); +#endif + } + } + /* Tile2 Dimension 1 broadcasting */ + else if (inTile2Pitch0 == 0) + { + InData2 = pIn2[0]; + /* reduced one load from core loop */ + for (idx = 0; idx < dim1SizeOut; idx++) + { + InData1 = pIn1[idx]; +#if (ELTEQUAL_DATA_TYPE == FLOAT16BIT) + bool temp = EQUAL(InData1, InData2); + pOut[idx] = temp ? 1 : 0; +#else + pOut[idx] = EQUAL(InData1, InData2); +#endif + } + } + else + { + /* broadcast in dims 1 or 2 in Tile1 or TIle2 */ + for (idx = 0; idx < dim1SizeOut; idx++) + { + InData1 = pIn1[idx]; + InData2 = pIn2[idx]; +#if (ELTEQUAL_DATA_TYPE == FLOAT16BIT) + bool temp = EQUAL(InData1, InData2); + pOut[idx] = temp ? 
1 : 0; +#else + pOut[idx] = EQUAL(InData1, InData2); +#endif + } + } + } + } + } + + return(XAI_ERROR_STATUS()); +} + diff --git a/backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_greaterthan.c b/backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_greaterthan.c new file mode 100644 index 00000000000..a7eedbf95bc --- /dev/null +++ b/backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_greaterthan.c @@ -0,0 +1,113 @@ +/* + * Copyright (c) 2018 by Cadence Design Systems, Inc. ALL RIGHTS RESERVED. + * These coded instructions, statements, and computer programs are the + * copyrighted works and confidential proprietary information of + * Cadence Design Systems Inc. They may be adapted and modified by bona fide + * purchasers for internal use, but neither the original nor any adapted + * or modified version may be disclosed or distributed to third parties + * in any manner, medium, or form, in whole or in part, without the prior + * written consent of Cadence Design Systems Inc. This software and its + * derivatives are to be executed solely on products incorporating a Cadence + * Design Systems processor. 
+ */ + +#include "xai_cnn_common.h" + + +#define ELT_GREATERTHAN_DATA_TYPE SIGNED8BIT +#include "cnn_eltwise_greaterthan.h" +#undef ELT_GREATERTHAN_DATA_TYPE + +#define ELT_GREATERTHAN_DATA_TYPE UNSIGNED8BIT +#include "cnn_eltwise_greaterthan.h" +#undef ELT_GREATERTHAN_DATA_TYPE + +#define ELT_GREATERTHAN_DATA_TYPE SIGNED16BIT +#include "cnn_eltwise_greaterthan.h" +#undef ELT_GREATERTHAN_DATA_TYPE + +#define ELT_GREATERTHAN_DATA_TYPE UNSIGNED16BIT +#include "cnn_eltwise_greaterthan.h" +#undef ELT_GREATERTHAN_DATA_TYPE + +#define ELT_GREATERTHAN_DATA_TYPE SIGNED32BIT +#include "cnn_eltwise_greaterthan.h" +#undef ELT_GREATERTHAN_DATA_TYPE + +#define ELT_GREATERTHAN_DATA_TYPE UNSIGNED32BIT +#include "cnn_eltwise_greaterthan.h" +#undef ELT_GREATERTHAN_DATA_TYPE + +#if XCHAL_HAVE_VISION_HP_VFPU == 1 +#define ELT_GREATERTHAN_DATA_TYPE FLOAT16BIT +#include "cnn_eltwise_greaterthan.h" +#undef ELT_GREATERTHAN_DATA_TYPE +#endif + +#if XCHAL_HAVE_VISION_SP_VFPU == 1 +#define ELT_GREATERTHAN_DATA_TYPE FLOAT32BIT +#include "cnn_eltwise_greaterthan.h" +#undef ELT_GREATERTHAN_DATA_TYPE +#endif + + +/**************************** xaiEltwiseGreaterThan3D_AV ***************************************/ +/* Description : General API for auto-vectorizable Broadcast element-wise */ +/* GREATER operator */ +/* Calls one of the xaiEltwiseGreaterThan3D_AV functions based on the data type */ +/* Inputs : inTile1, inTile2 */ +/* Outputs : XI Error Code */ +/* InOuts : outTile */ +/***********************************************************************************************/ + +XAI_ERR_TYPE xaiEltwiseGreaterThan3D_AV(const xai_pTile3D inTile1, + const xai_pTile3D inTile2, + xai_pTile3D outTile) +{ + if (!inTile1 || !inTile2 || !outTile) + { + return(XAI_ERR_NULLARG); + } + + if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_S8)) + { + return(xaiEltwiseGreaterThan3D_S8_AV(inTile1, inTile2, outTile)); + } + else if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_U8)) + { + return(xaiEltwiseGreaterThan3D_U8_AV(inTile1, 
inTile2, outTile)); + } + else if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_S16)) + { + return(xaiEltwiseGreaterThan3D_S16_AV(inTile1, inTile2, outTile)); + } + else if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_U16)) + { + return(xaiEltwiseGreaterThan3D_U16_AV(inTile1, inTile2, outTile)); + } + else if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_S32)) + { + return(xaiEltwiseGreaterThan3D_S32_AV(inTile1, inTile2, outTile)); + } + else if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_U32)) + { + return(xaiEltwiseGreaterThan3D_U32_AV(inTile1, inTile2, outTile)); + } + +#if XCHAL_HAVE_VISION_HP_VFPU == 1 + else if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_F16)) + { + return(xaiEltwiseGreaterThan3D_F16_AV(inTile1, inTile2, outTile)); + } +#endif + +#if XCHAL_HAVE_VISION_SP_VFPU == 1 + else if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_F32)) + { + return(xaiEltwiseGreaterThan3D_F32_AV(inTile1, inTile2, outTile)); + } +#endif + + return(XAI_ERR_OK); +} + diff --git a/backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_greaterthan.h b/backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_greaterthan.h new file mode 100644 index 00000000000..08be132a40c --- /dev/null +++ b/backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_greaterthan.h @@ -0,0 +1,244 @@ +/* + * Copyright (c) 2018 by Cadence Design Systems, Inc. ALL RIGHTS RESERVED. + * These coded instructions, statements, and computer programs are the + * copyrighted works and confidential proprietary information of + * Cadence Design Systems Inc. They may be adapted and modified by bona fide + * purchasers for internal use, but neither the original nor any adapted + * or modified version may be disclosed or distributed to third parties + * in any manner, medium, or form, in whole or in part, without the prior + * written consent of Cadence Design Systems Inc. This software and its + * derivatives are to be executed solely on products incorporating a Cadence + * Design Systems processor. 
+ */ +#include "xai_cnn_common.h" + + +#ifndef SIGNED8BIT +#define SIGNED8BIT 1 +#define UNSIGNED8BIT 2 +#define SIGNED16BIT 3 +#define UNSIGNED16BIT 4 +#define SIGNED32BIT 5 +#define UNSIGNED32BIT 6 +#define FLOAT16BIT 7 +#define FLOAT32BIT 8 +#endif + +#if ELT_GREATERTHAN_DATA_TYPE == SIGNED8BIT +#undef MAKE_NAME +#undef MORPH_IDT_CHECK +#undef MORPH_IDT_SCALAR +#define MAKE_NAME(name) name ## _S8_AV +#define MORPH_IDT_CHECK XAI_CHECK_TILE3D_S8 +#define MORPH_IDT_SCALAR int8_t + +#elif ELT_GREATERTHAN_DATA_TYPE == UNSIGNED8BIT +#undef MAKE_NAME +#undef MORPH_IDT_CHECK +#undef MORPH_IDT_SCALAR +#define MAKE_NAME(name) name ## _U8_AV +#define MORPH_IDT_CHECK XAI_CHECK_TILE3D_U8 +#define MORPH_IDT_SCALAR uint8_t + +#elif ELT_GREATERTHAN_DATA_TYPE == SIGNED16BIT +#undef MAKE_NAME +#undef MORPH_IDT_CHECK +#undef MORPH_IDT_SCALAR +#define MAKE_NAME(name) name ## _S16_AV +#define MORPH_IDT_CHECK XAI_CHECK_TILE3D_S16 +#define MORPH_IDT_SCALAR int16_t + +#elif ELT_GREATERTHAN_DATA_TYPE == UNSIGNED16BIT +#undef MAKE_NAME +#undef MORPH_IDT_CHECK +#undef MORPH_IDT_SCALAR +#define MAKE_NAME(name) name ## _U16_AV +#define MORPH_IDT_CHECK XAI_CHECK_TILE3D_U16 +#define MORPH_IDT_SCALAR uint16_t + +#elif ELT_GREATERTHAN_DATA_TYPE == SIGNED32BIT +#undef MAKE_NAME +#undef MORPH_IDT_CHECK +#undef MORPH_IDT_SCALAR +#define MAKE_NAME(name) name ## _S32_AV +#define MORPH_IDT_CHECK XAI_CHECK_TILE3D_S32 +#define MORPH_IDT_SCALAR int32_t + +#elif ELT_GREATERTHAN_DATA_TYPE == UNSIGNED32BIT +#undef MAKE_NAME +#undef MORPH_IDT_CHECK +#undef MORPH_IDT_SCALAR +#define MAKE_NAME(name) name ## _U32_AV +#define MORPH_IDT_CHECK XAI_CHECK_TILE3D_U32 +#define MORPH_IDT_SCALAR uint32_t + +#elif ELT_GREATERTHAN_DATA_TYPE == FLOAT16BIT +#if XCHAL_HAVE_VISION_HP_VFPU == 1 +#undef MAKE_NAME +#undef MORPH_IDT_CHECK +#undef MORPH_IDT_SCALAR +#define MAKE_NAME(name) name ## _F16_AV +#define MORPH_IDT_CHECK XAI_CHECK_TILE3D_F16 +#define MORPH_IDT_SCALAR xb_f16 +#endif + +#elif ELT_GREATERTHAN_DATA_TYPE == 
FLOAT32BIT +#if XCHAL_HAVE_VISION_SP_VFPU == 1 +#undef MAKE_NAME +#undef MORPH_IDT_CHECK +#undef MORPH_IDT_SCALAR +#define MAKE_NAME(name) name ## _F32_AV +#define MORPH_IDT_CHECK XAI_CHECK_TILE3D_F32 +#define MORPH_IDT_SCALAR float +#endif +#endif + +#define GREATER_THAN(a, b) a > b + + +/**************************** xaiEltwiseGreaterThan3D ************************************/ +/* Description : auto-vectorizable implementation of Broadcast elementWise GREATER */ +/* operator, Based on MORPH implementation eight variants are */ +/* generated for S8, U8, S16, U16, S32, U32, F16 and F32 data types */ +/* Inputs : inTile1, inTile2 */ +/* Outputs : XI Error Code */ +/* InOuts : outTile */ +/*****************************************************************************************/ + +XAI_ERR_TYPE MAKE_NAME (xaiEltwiseGreaterThan3D)(const xai_pTile3D inTile1, const xai_pTile3D inTile2, xai_pTile3D outTile) +{ + /* Error Checks */ + XAI_ERROR_CHECKS() + { + MORPH_IDT_CHECK(inTile1); + MORPH_IDT_CHECK(inTile2); + MORPH_IDT_CHECK(outTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile1); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile2); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile); + XAI_CHECK_ERROR(XAI_TILE3D_GET_DATA_ORDER(inTile1) == XAI_TILE3D_GET_DATA_ORDER(inTile2), + XAI_ERR_BADARG, "\nData Order of InputTile1 = %d and InputTile2 = %d\nData Order of InputTile1 and InputTile2 should be same", \ + XAI_TILE3D_GET_DATA_ORDER(inTile1), XAI_TILE3D_GET_DATA_ORDER(inTile2)); + XAI_CHECK_ERROR(XAI_TILE3D_GET_DATA_ORDER(inTile1) == XAI_TILE3D_GET_DATA_ORDER(outTile), + XAI_ERR_BADARG, "\nData Order of InputTile = %d and OutputTile = %d\nData Order of InputTile and OutputTile should be same", \ + XAI_TILE3D_GET_DATA_ORDER(inTile1), XAI_TILE3D_GET_DATA_ORDER(outTile)); + XAI_CHECK_TILE3D_BCAST_DIMENSIONS(inTile1, inTile2, outTile, 1, 1); + } + + /* Get Tile Parameters */ + const int32_t dim1SizeOut = XAI_TILE3D_GET_DIM1(outTile); + const int32_t dim2SizeOut = 
XAI_TILE3D_GET_DIM2(outTile); + const int32_t dim3SizeOut = XAI_TILE3D_GET_DIM3(outTile); + const int32_t outTilePitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile); + const int32_t outTilePitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile); + + /* Get Data Pointers */ + MORPH_IDT_SCALAR *pInput1 = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(inTile1); + MORPH_IDT_SCALAR *pInput2 = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(inTile2); + MORPH_IDT_SCALAR *pOutput = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(outTile); + + MORPH_IDT_SCALAR *__restrict pIn1; + MORPH_IDT_SCALAR *__restrict pIn2; + MORPH_IDT_SCALAR *__restrict pOut; + + /* Get Pitch appropriate for elementwise broadcast operations */ + XAI_TILE3D_GET_BCAST123_PITCH(inTile1, inTile2, inTile1Pitch0, inTile2Pitch0, inTile1Pitch1, \ + inTile2Pitch1, inTile1Pitch2, inTile2Pitch2); + + /* no Broadcast */ + if (inTile1Pitch2 == inTile2Pitch2 && inTile1Pitch2 == outTilePitch2) + { + int dimsCount = dim1SizeOut * dim2SizeOut * dim3SizeOut; + pIn1 = pInput1; + pIn2 = pInput2; + pOut = pOutput; + + for (int i = 0; i < dimsCount; i++) + { +#if (ELT_GREATERTHAN_DATA_TYPE == FLOAT16BIT) + bool temp = GREATER_THAN(pIn1[i], pIn2[i]); + pOut[i] = temp ? 
1 : 0; +#else + pOut[i] = GREATER_THAN(pIn1[i], pIn2[i]); +#endif + } + } + else + { + /* + inTile1Pitch0 == 0 : Tile1 Dimension 1 broadcasting + inTile1Pitch1 == 0 : Tile1 Dimension 2 broadcasting + inTile1Pitch2 == 0 : Tile1 Dimension 3 broadcasting + inTile2Pitch0 == 0 : Tile2 Dimension 1 broadcasting + inTile2Pitch1 == 0 : Tile2 Dimension 2 broadcasting + inTile2Pitch2 == 0 : Tile2 Dimension 3 broadcasting + */ + int32_t y, z, idx; + + for (z = 0; z < dim3SizeOut; z++) + { + MORPH_IDT_SCALAR* temp1 = pInput1 + z * inTile1Pitch2; + MORPH_IDT_SCALAR* temp2 = pInput2 + z * inTile2Pitch2; + MORPH_IDT_SCALAR* temp3 = pOutput + z * outTilePitch2; + + for (y = 0; y < dim2SizeOut; y++) + { + pIn1 = (temp1 + y * inTile1Pitch1); + pIn2 = (temp2 + y * inTile2Pitch1); + pOut = (temp3 + y * outTilePitch1); + + MORPH_IDT_SCALAR InData1, InData2; + /* Tile1 Dimension 1 broadcasting */ + if (inTile1Pitch0 == 0) + { + InData1 = pIn1[0]; + /* reduced one load from core loop */ + for (idx = 0; idx < dim1SizeOut; idx++) + { + InData2 = pIn2[idx]; +#if (ELT_GREATERTHAN_DATA_TYPE == FLOAT16BIT) + bool temp = GREATER_THAN(InData1, InData2); + pOut[idx] = temp ? 1 : 0; +#else + pOut[idx] = GREATER_THAN(InData1, InData2); +#endif + } + } + /* Tile2 Dimension 1 broadcasting */ + else if (inTile2Pitch0 == 0) + { + InData2 = pIn2[0]; + /* reduced one load from core loop */ + for (idx = 0; idx < dim1SizeOut; idx++) + { + InData1 = pIn1[idx]; +#if (ELT_GREATERTHAN_DATA_TYPE == FLOAT16BIT) + bool temp = GREATER_THAN(InData1, InData2); + pOut[idx] = temp ? 1 : 0; +#else + pOut[idx] = GREATER_THAN(InData1, InData2); +#endif + } + } + else + { + /* broadcast in dims 1 or 2 in Tile1 or TIle2 */ + for (idx = 0; idx < dim1SizeOut; idx++) + { + InData1 = pIn1[idx]; + InData2 = pIn2[idx]; +#if (ELT_GREATERTHAN_DATA_TYPE == FLOAT16BIT) + bool temp = GREATER_THAN(InData1, InData2); + pOut[idx] = temp ? 
1 : 0; +#else + pOut[idx] = GREATER_THAN(InData1, InData2); +#endif + } + } + } + } + } + + return(XAI_ERROR_STATUS()); +} + diff --git a/backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_lessthan.c b/backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_lessthan.c new file mode 100644 index 00000000000..da896ead3ca --- /dev/null +++ b/backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_lessthan.c @@ -0,0 +1,103 @@ +/* + * Copyright (c) 2018 by Cadence Design Systems, Inc. ALL RIGHTS RESERVED. + * These coded instructions, statements, and computer programs are the + * copyrighted works and confidential proprietary information of + * Cadence Design Systems Inc. They may be adapted and modified by bona fide + * purchasers for internal use, but neither the original nor any adapted + * or modified version may be disclosed or distributed to third parties + * in any manner, medium, or form, in whole or in part, without the prior + * written consent of Cadence Design Systems Inc. This software and its + * derivatives are to be executed solely on products incorporating a Cadence + * Design Systems processor. 
+ */ + +#include "xai_cnn_common.h" + + +#define ELT_LESSTHAN_DATA_TYPE SIGNED8BIT +#include "cnn_eltwise_lessthan.h" +#undef ELT_LESSTHAN_DATA_TYPE + +#define ELT_LESSTHAN_DATA_TYPE UNSIGNED8BIT +#include "cnn_eltwise_lessthan.h" +#undef ELT_LESSTHAN_DATA_TYPE + +#define ELT_LESSTHAN_DATA_TYPE SIGNED16BIT +#include "cnn_eltwise_lessthan.h" +#undef ELT_LESSTHAN_DATA_TYPE + +#define ELT_LESSTHAN_DATA_TYPE UNSIGNED16BIT +#include "cnn_eltwise_lessthan.h" +#undef ELT_LESSTHAN_DATA_TYPE + +#define ELT_LESSTHAN_DATA_TYPE SIGNED32BIT +#include "cnn_eltwise_lessthan.h" +#undef ELT_LESSTHAN_DATA_TYPE + +#define ELT_LESSTHAN_DATA_TYPE UNSIGNED32BIT +#include "cnn_eltwise_lessthan.h" +#undef ELT_LESSTHAN_DATA_TYPE + +#if XCHAL_HAVE_VISION_HP_VFPU == 1 +#define ELT_LESSTHAN_DATA_TYPE FLOAT16BIT +#include "cnn_eltwise_lessthan.h" +#undef ELT_LESSTHAN_DATA_TYPE +#endif + +#if XCHAL_HAVE_VISION_SP_VFPU == 1 +#define ELT_LESSTHAN_DATA_TYPE FLOAT32BIT +#include "cnn_eltwise_lessthan.h" +#undef ELT_LESSTHAN_DATA_TYPE +#endif + +XAI_ERR_TYPE xaiEltwiseLessThan3D_AV(const xai_pTile3D inTile1, + const xai_pTile3D inTile2, + xai_pTile3D outTile) +{ + if (!inTile1 || !inTile2 || !outTile) + { + return(XAI_ERR_NULLARG); + } + + if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_S8)) + { + return(xaiEltwiseLessThan3D_S8_AV(inTile1, inTile2, outTile)); + } + else if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_U8)) + { + return(xaiEltwiseLessThan3D_U8_AV(inTile1, inTile2, outTile)); + } + else if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_S16)) + { + return(xaiEltwiseLessThan3D_S16_AV(inTile1, inTile2, outTile)); + } + else if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_U16)) + { + return(xaiEltwiseLessThan3D_U16_AV(inTile1, inTile2, outTile)); + } + else if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_S32)) + { + return(xaiEltwiseLessThan3D_S32_AV(inTile1, inTile2, outTile)); + } + else if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_U32)) + { + return(xaiEltwiseLessThan3D_U32_AV(inTile1, inTile2, outTile)); + } + +#if 
XCHAL_HAVE_VISION_HP_VFPU == 1 + else if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_F16)) + { + return(xaiEltwiseLessThan3D_F16_AV(inTile1, inTile2, outTile)); + } +#endif + +#if XCHAL_HAVE_VISION_SP_VFPU == 1 + else if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_F32)) + { + return(xaiEltwiseLessThan3D_F32_AV(inTile1, inTile2, outTile)); + } +#endif + + return(XAI_ERR_OK); +} + diff --git a/backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_lessthan.h b/backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_lessthan.h new file mode 100644 index 00000000000..aab6c89d183 --- /dev/null +++ b/backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_lessthan.h @@ -0,0 +1,244 @@ +/* + * Copyright (c) 2018 by Cadence Design Systems, Inc. ALL RIGHTS RESERVED. + * These coded instructions, statements, and computer programs are the + * copyrighted works and confidential proprietary information of + * Cadence Design Systems Inc. They may be adapted and modified by bona fide + * purchasers for internal use, but neither the original nor any adapted + * or modified version may be disclosed or distributed to third parties + * in any manner, medium, or form, in whole or in part, without the prior + * written consent of Cadence Design Systems Inc. This software and its + * derivatives are to be executed solely on products incorporating a Cadence + * Design Systems processor. 
+ */ +#include "xai_cnn_common.h" + + +#ifndef SIGNED8BIT +#define SIGNED8BIT 1 +#define UNSIGNED8BIT 2 +#define SIGNED16BIT 3 +#define UNSIGNED16BIT 4 +#define SIGNED32BIT 5 +#define UNSIGNED32BIT 6 +#define FLOAT16BIT 7 +#define FLOAT32BIT 8 +#endif + +#if ELT_LESSTHAN_DATA_TYPE == SIGNED8BIT +#undef MAKE_NAME +#undef MORPH_IDT_CHECK +#undef MORPH_IDT_SCALAR +#define MAKE_NAME(name) name ## _S8_AV +#define MORPH_IDT_CHECK XAI_CHECK_TILE3D_S8 +#define MORPH_IDT_SCALAR int8_t + +#elif ELT_LESSTHAN_DATA_TYPE == UNSIGNED8BIT +#undef MAKE_NAME +#undef MORPH_IDT_CHECK +#undef MORPH_IDT_SCALAR +#define MAKE_NAME(name) name ## _U8_AV +#define MORPH_IDT_CHECK XAI_CHECK_TILE3D_U8 +#define MORPH_IDT_SCALAR uint8_t + +#elif ELT_LESSTHAN_DATA_TYPE == SIGNED16BIT +#undef MAKE_NAME +#undef MORPH_IDT_CHECK +#undef MORPH_IDT_SCALAR +#define MAKE_NAME(name) name ## _S16_AV +#define MORPH_IDT_CHECK XAI_CHECK_TILE3D_S16 +#define MORPH_IDT_SCALAR int16_t + +#elif ELT_LESSTHAN_DATA_TYPE == UNSIGNED16BIT +#undef MAKE_NAME +#undef MORPH_IDT_CHECK +#undef MORPH_IDT_SCALAR +#define MAKE_NAME(name) name ## _U16_AV +#define MORPH_IDT_CHECK XAI_CHECK_TILE3D_U16 +#define MORPH_IDT_SCALAR uint16_t + +#elif ELT_LESSTHAN_DATA_TYPE == SIGNED32BIT +#undef MAKE_NAME +#undef MORPH_IDT_CHECK +#undef MORPH_IDT_SCALAR +#define MAKE_NAME(name) name ## _S32_AV +#define MORPH_IDT_CHECK XAI_CHECK_TILE3D_S32 +#define MORPH_IDT_SCALAR int32_t + +#elif ELT_LESSTHAN_DATA_TYPE == UNSIGNED32BIT +#undef MAKE_NAME +#undef MORPH_IDT_CHECK +#undef MORPH_IDT_SCALAR +#define MAKE_NAME(name) name ## _U32_AV +#define MORPH_IDT_CHECK XAI_CHECK_TILE3D_U32 +#define MORPH_IDT_SCALAR uint32_t + +#elif ELT_LESSTHAN_DATA_TYPE == FLOAT16BIT +#if XCHAL_HAVE_VISION_HP_VFPU == 1 +#undef MAKE_NAME +#undef MORPH_IDT_CHECK +#undef MORPH_IDT_SCALAR +#define MAKE_NAME(name) name ## _F16_AV +#define MORPH_IDT_CHECK XAI_CHECK_TILE3D_F16 +#define MORPH_IDT_SCALAR xb_f16 +#endif + +#elif ELT_LESSTHAN_DATA_TYPE == FLOAT32BIT +#if 
XCHAL_HAVE_VISION_SP_VFPU == 1 +#undef MAKE_NAME +#undef MORPH_IDT_CHECK +#undef MORPH_IDT_SCALAR +#define MAKE_NAME(name) name ## _F32_AV +#define MORPH_IDT_CHECK XAI_CHECK_TILE3D_F32 +#define MORPH_IDT_SCALAR float +#endif +#endif + +#define LESS_THAN(a, b) a < b + + +/**************************** xaiEltwiseLessThan3D ***************************************/ +/* Description : auto-vectorizable implementation of Broadcast elementWise LESS */ +/* operator, Based on MORPH implementation eight variants are */ +/* generated for S8, U8, S16, U16, S32, U32, F16 and F32 data types */ +/* Inputs : inTile1, inTile2 */ +/* Outputs : XI Error Code */ +/* InOuts : outTile */ +/*****************************************************************************************/ + +XAI_ERR_TYPE MAKE_NAME (xaiEltwiseLessThan3D)(const xai_pTile3D inTile1, const xai_pTile3D inTile2, xai_pTile3D outTile) +{ + /* Error Checks */ + XAI_ERROR_CHECKS() + { + MORPH_IDT_CHECK(inTile1); + MORPH_IDT_CHECK(inTile2); + MORPH_IDT_CHECK(outTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile1); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile2); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile); + XAI_CHECK_ERROR(XAI_TILE3D_GET_DATA_ORDER(inTile1) == XAI_TILE3D_GET_DATA_ORDER(inTile2), + XAI_ERR_BADARG, "\nData Order of InputTile1 = %d and InputTile2 = %d\nData Order of InputTile1 and InputTile2 should be same", \ + XAI_TILE3D_GET_DATA_ORDER(inTile1), XAI_TILE3D_GET_DATA_ORDER(inTile2)); + XAI_CHECK_ERROR(XAI_TILE3D_GET_DATA_ORDER(inTile1) == XAI_TILE3D_GET_DATA_ORDER(outTile), + XAI_ERR_BADARG, "\nData Order of InputTile = %d and OutputTile = %d\nData Order of InputTile and OutputTile should be same", \ + XAI_TILE3D_GET_DATA_ORDER(inTile1), XAI_TILE3D_GET_DATA_ORDER(outTile)); + XAI_CHECK_TILE3D_BCAST_DIMENSIONS(inTile1, inTile2, outTile, 1, 1); + } + + /* Get Tile Parameters */ + const int32_t dim1SizeOut = XAI_TILE3D_GET_DIM1(outTile); + const int32_t dim2SizeOut = XAI_TILE3D_GET_DIM2(outTile); + const int32_t 
dim3SizeOut = XAI_TILE3D_GET_DIM3(outTile); + const int32_t outTilePitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile); + const int32_t outTilePitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile); + + /* Get Data Pointers */ + MORPH_IDT_SCALAR *pInput1 = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(inTile1); + MORPH_IDT_SCALAR *pInput2 = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(inTile2); + MORPH_IDT_SCALAR *pOutput = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(outTile); + + MORPH_IDT_SCALAR *__restrict pIn1; + MORPH_IDT_SCALAR *__restrict pIn2; + MORPH_IDT_SCALAR *__restrict pOut; + + /* Get Pitch appropriate for elementwise broadcast operations */ + XAI_TILE3D_GET_BCAST123_PITCH(inTile1, inTile2, inTile1Pitch0, inTile2Pitch0, inTile1Pitch1, \ + inTile2Pitch1, inTile1Pitch2, inTile2Pitch2); + + /* no Broadcast */ + if (inTile1Pitch2 == inTile2Pitch2 && inTile1Pitch2 == outTilePitch2) + { + int dimsCount = dim1SizeOut * dim2SizeOut * dim3SizeOut; + pIn1 = pInput1; + pIn2 = pInput2; + pOut = pOutput; + + for (int i = 0; i < dimsCount; i++) + { +#if (ELT_LESSTHAN_DATA_TYPE == FLOAT16BIT) + bool temp = LESS_THAN(pIn1[i], pIn2[i]); + pOut[i] = temp ? 
1 : 0; +#else + pOut[i] = LESS_THAN(pIn1[i], pIn2[i]); +#endif + } + } + else + { + /* + inTile1Pitch0 == 0 : Tile1 Dimension 1 broadcasting + inTile1Pitch1 == 0 : Tile1 Dimension 2 broadcasting + inTile1Pitch2 == 0 : Tile1 Dimension 3 broadcasting + inTile2Pitch0 == 0 : Tile2 Dimension 1 broadcasting + inTile2Pitch1 == 0 : Tile2 Dimension 2 broadcasting + inTile2Pitch2 == 0 : Tile2 Dimension 3 broadcasting + */ + int32_t y, z, idx; + + for (z = 0; z < dim3SizeOut; z++) + { + MORPH_IDT_SCALAR* temp1 = pInput1 + z * inTile1Pitch2; + MORPH_IDT_SCALAR* temp2 = pInput2 + z * inTile2Pitch2; + MORPH_IDT_SCALAR* temp3 = pOutput + z * outTilePitch2; + + for (y = 0; y < dim2SizeOut; y++) + { + pIn1 = (temp1 + y * inTile1Pitch1); + pIn2 = (temp2 + y * inTile2Pitch1); + pOut = (temp3 + y * outTilePitch1); + + MORPH_IDT_SCALAR InData1, InData2; + /* Tile1 Dimension 1 broadcasting */ + if (inTile1Pitch0 == 0) + { + InData1 = pIn1[0]; + /* reduced one load from core loop */ + for (idx = 0; idx < dim1SizeOut; idx++) + { + InData2 = pIn2[idx]; +#if (ELT_LESSTHAN_DATA_TYPE == FLOAT16BIT) + bool temp = LESS_THAN(InData1, InData2); + pOut[idx] = temp ? 1 : 0; +#else + pOut[idx] = LESS_THAN(InData1, InData2); +#endif + } + } + /* Tile2 Dimension 1 broadcasting */ + else if (inTile2Pitch0 == 0) + { + InData2 = pIn2[0]; + /* reduced one load from core loop */ + for (idx = 0; idx < dim1SizeOut; idx++) + { + InData1 = pIn1[idx]; +#if (ELT_LESSTHAN_DATA_TYPE == FLOAT16BIT) + bool temp = LESS_THAN(InData1, InData2); + pOut[idx] = temp ? 1 : 0; +#else + pOut[idx] = LESS_THAN(InData1, InData2); +#endif + } + } + else + { + /* broadcast in dims 1 or 2 in Tile1 or TIle2 */ + for (idx = 0; idx < dim1SizeOut; idx++) + { + InData1 = pIn1[idx]; + InData2 = pIn2[idx]; +#if (ELT_LESSTHAN_DATA_TYPE == FLOAT16BIT) + bool temp = LESS_THAN(InData1, InData2); + pOut[idx] = temp ? 
1 : 0; +#else + pOut[idx] = LESS_THAN(InData1, InData2); +#endif + } + } + } + } + } + + return(XAI_ERROR_STATUS()); +} + diff --git a/backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_max.c b/backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_max.c new file mode 100644 index 00000000000..aa62692b567 --- /dev/null +++ b/backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_max.c @@ -0,0 +1,113 @@ +/* + * Copyright (c) 2018 by Cadence Design Systems, Inc. ALL RIGHTS RESERVED. + * These coded instructions, statements, and computer programs are the + * copyrighted works and confidential proprietary information of + * Cadence Design Systems Inc. They may be adapted and modified by bona fide + * purchasers for internal use, but neither the original nor any adapted + * or modified version may be disclosed or distributed to third parties + * in any manner, medium, or form, in whole or in part, without the prior + * written consent of Cadence Design Systems Inc. This software and its + * derivatives are to be executed solely on products incorporating a Cadence + * Design Systems processor. 
+ */ + +#include "xai_cnn_common.h" + + +#define ELTMAX_DATA_TYPE SIGNED8BIT +#include "cnn_eltwise_max.h" +#undef ELTMAX_DATA_TYPE + +#define ELTMAX_DATA_TYPE UNSIGNED8BIT +#include "cnn_eltwise_max.h" +#undef ELTMAX_DATA_TYPE + +#define ELTMAX_DATA_TYPE SIGNED16BIT +#include "cnn_eltwise_max.h" +#undef ELTMAX_DATA_TYPE + +#define ELTMAX_DATA_TYPE UNSIGNED16BIT +#include "cnn_eltwise_max.h" +#undef ELTMAX_DATA_TYPE + +#define ELTMAX_DATA_TYPE SIGNED32BIT +#include "cnn_eltwise_max.h" +#undef ELTMAX_DATA_TYPE + +#define ELTMAX_DATA_TYPE UNSIGNED32BIT +#include "cnn_eltwise_max.h" +#undef ELTMAX_DATA_TYPE + +#if XCHAL_HAVE_VISION_HP_VFPU == 1 +#define ELTMAX_DATA_TYPE FLOAT16BIT +#include "cnn_eltwise_max.h" +#undef ELTMAX_DATA_TYPE +#endif + +#if XCHAL_HAVE_VISION_SP_VFPU == 1 +#define ELTMAX_DATA_TYPE FLOAT32BIT +#include "cnn_eltwise_max.h" +#undef ELTMAX_DATA_TYPE +#endif + + +/**************************** xaiEltwiseMax3D_AV ***************************************/ +/* Description : General API for auto-vectorizable Broadcast element-wise */ +/* MAX operator */ +/* Calls one of the xaiEltwiseMax3D_AV functions based on the data type */ +/* Inputs : inTile1, inTile2 */ +/* Outputs : XI Error Code */ +/* InOuts : outTile */ +/***************************************************************************************/ + +XAI_ERR_TYPE xaiEltwiseMax3D_AV(const xai_pTile3D inTile1, + const xai_pTile3D inTile2, + xai_pTile3D outTile) +{ + if (!inTile1 || !inTile2 || !outTile) + { + return(XAI_ERR_NULLARG); + } + + if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_S8)) + { + return(xaiEltwiseMax3D_S8_AV(inTile1, inTile2, outTile)); + } + else if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_U8)) + { + return(xaiEltwiseMax3D_U8_AV(inTile1, inTile2, outTile)); + } + else if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_S16)) + { + return(xaiEltwiseMax3D_S16_AV(inTile1, inTile2, outTile)); + } + else if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_U16)) + { + return(xaiEltwiseMax3D_U16_AV(inTile1, inTile2, 
outTile)); + } + else if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_S32)) + { + return(xaiEltwiseMax3D_S32_AV(inTile1, inTile2, outTile)); + } + else if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_U32)) + { + return(xaiEltwiseMax3D_U32_AV(inTile1, inTile2, outTile)); + } + +#if XCHAL_HAVE_VISION_HP_VFPU == 1 + else if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_F16)) + { + return(xaiEltwiseMax3D_F16_AV(inTile1, inTile2, outTile)); + } +#endif + +#if XCHAL_HAVE_VISION_SP_VFPU == 1 + else if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_F32)) + { + return(xaiEltwiseMax3D_F32_AV(inTile1, inTile2, outTile)); + } +#endif + + return(XAI_ERR_OK); +} + diff --git a/backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_max.h b/backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_max.h new file mode 100644 index 00000000000..f87622c40e3 --- /dev/null +++ b/backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_max.h @@ -0,0 +1,222 @@ +/* + * Copyright (c) 2018 by Cadence Design Systems, Inc. ALL RIGHTS RESERVED. + * These coded instructions, statements, and computer programs are the + * copyrighted works and confidential proprietary information of + * Cadence Design Systems Inc. They may be adapted and modified by bona fide + * purchasers for internal use, but neither the original nor any adapted + * or modified version may be disclosed or distributed to third parties + * in any manner, medium, or form, in whole or in part, without the prior + * written consent of Cadence Design Systems Inc. This software and its + * derivatives are to be executed solely on products incorporating a Cadence + * Design Systems processor. 
+ */
+#include "xai_cnn_common.h"
+
+
+#ifndef SIGNED8BIT /* data-type selectors; guarded so that repeated inclusion of this header does not redefine them */
+#define SIGNED8BIT     1
+#define UNSIGNED8BIT   2
+#define SIGNED16BIT    3
+#define UNSIGNED16BIT  4
+#define SIGNED32BIT    5
+#define UNSIGNED32BIT  6
+#define FLOAT16BIT     7
+#define FLOAT32BIT     8
+#endif
+
+#if ELTMAX_DATA_TYPE == SIGNED8BIT /* each branch sets the name suffix, tile type check and scalar type of one variant */
+#undef MAKE_NAME
+#undef MORPH_IDT_CHECK
+#undef MORPH_IDT_SCALAR
+#define MAKE_NAME(name)   name ## _S8_AV
+#define MORPH_IDT_CHECK   XAI_CHECK_TILE3D_S8
+#define MORPH_IDT_SCALAR  int8_t
+
+#elif ELTMAX_DATA_TYPE == UNSIGNED8BIT
+#undef MAKE_NAME
+#undef MORPH_IDT_CHECK
+#undef MORPH_IDT_SCALAR
+#define MAKE_NAME(name)   name ## _U8_AV
+#define MORPH_IDT_CHECK   XAI_CHECK_TILE3D_U8
+#define MORPH_IDT_SCALAR  uint8_t
+
+#elif ELTMAX_DATA_TYPE == SIGNED16BIT
+#undef MAKE_NAME
+#undef MORPH_IDT_CHECK
+#undef MORPH_IDT_SCALAR
+#define MAKE_NAME(name)   name ## _S16_AV
+#define MORPH_IDT_CHECK   XAI_CHECK_TILE3D_S16
+#define MORPH_IDT_SCALAR  int16_t
+
+#elif ELTMAX_DATA_TYPE == UNSIGNED16BIT
+#undef MAKE_NAME
+#undef MORPH_IDT_CHECK
+#undef MORPH_IDT_SCALAR
+#define MAKE_NAME(name)   name ## _U16_AV
+#define MORPH_IDT_CHECK   XAI_CHECK_TILE3D_U16
+#define MORPH_IDT_SCALAR  uint16_t
+
+#elif ELTMAX_DATA_TYPE == SIGNED32BIT
+#undef MAKE_NAME
+#undef MORPH_IDT_CHECK
+#undef MORPH_IDT_SCALAR
+#define MAKE_NAME(name)   name ## _S32_AV
+#define MORPH_IDT_CHECK   XAI_CHECK_TILE3D_S32
+#define MORPH_IDT_SCALAR  int32_t
+
+#elif ELTMAX_DATA_TYPE == UNSIGNED32BIT
+#undef MAKE_NAME
+#undef MORPH_IDT_CHECK
+#undef MORPH_IDT_SCALAR
+#define MAKE_NAME(name)   name ## _U32_AV
+#define MORPH_IDT_CHECK   XAI_CHECK_TILE3D_U32
+#define MORPH_IDT_SCALAR  uint32_t
+
+#elif ELTMAX_DATA_TYPE == FLOAT16BIT
+#if XCHAL_HAVE_VISION_HP_VFPU == 1
+#undef MAKE_NAME
+#undef MORPH_IDT_CHECK
+#undef MORPH_IDT_SCALAR
+#define MAKE_NAME(name)   name ## _F16_AV
+#define MORPH_IDT_CHECK   XAI_CHECK_TILE3D_F16
+#define MORPH_IDT_SCALAR  xb_f16
+#endif
+
+#elif ELTMAX_DATA_TYPE == FLOAT32BIT
+#if XCHAL_HAVE_VISION_SP_VFPU == 1
+#undef MAKE_NAME
+#undef MORPH_IDT_CHECK
+#undef MORPH_IDT_SCALAR
+#define MAKE_NAME(name)   name ## _F32_AV
+#define MORPH_IDT_CHECK   XAI_CHECK_TILE3D_F32
+#define MORPH_IDT_SCALAR  float
+#endif
+#endif
+
+
+/**************************** xaiEltwiseMax3D ********************************************/
+/* Description : auto-vectorizable broadcast element-wise MAX. This header is a template: */
+/*               ELTMAX_DATA_TYPE selects the scalar type and MAKE_NAME appends the type  */
+/*               suffix, giving the S8, U8, S16, U16, S32, U32, F16 and F32 variants      */
+/* Inputs      : inTile1, inTile2 (same element type and data order as outTile)           */
+/* Outputs     : XAI error code, as reported by XAI_ERROR_STATUS()                        */
+/* InOuts      : outTile (receives MAX2 of the two inputs, element by element)            */
+/*****************************************************************************************/
+
+XAI_ERR_TYPE MAKE_NAME (xaiEltwiseMax3D)(const xai_pTile3D inTile1, const xai_pTile3D inTile2, xai_pTile3D outTile)
+{
+  /* Error Checks */
+  XAI_ERROR_CHECKS()
+  {
+    MORPH_IDT_CHECK(inTile1);
+    MORPH_IDT_CHECK(inTile2);
+    MORPH_IDT_CHECK(outTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile1);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile2);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile);
+    XAI_CHECK_ERROR(XAI_TILE3D_GET_DATA_ORDER(inTile1) == XAI_TILE3D_GET_DATA_ORDER(inTile2),
+                    XAI_ERR_BADARG, "\nData Order of InputTile1 = %d and InputTile2 = %d\nData Order of InputTile1 and InputTile2 should be same", \
+                    XAI_TILE3D_GET_DATA_ORDER(inTile1), XAI_TILE3D_GET_DATA_ORDER(inTile2));
+    XAI_CHECK_ERROR(XAI_TILE3D_GET_DATA_ORDER(inTile1) == XAI_TILE3D_GET_DATA_ORDER(outTile),
+                    XAI_ERR_BADARG, "\nData Order of InputTile = %d and OutputTile = %d\nData Order of InputTile and OutputTile should be same", \
+                    XAI_TILE3D_GET_DATA_ORDER(inTile1), XAI_TILE3D_GET_DATA_ORDER(outTile));
+    XAI_CHECK_TILE3D_BCAST_DIMENSIONS(inTile1, inTile2, outTile, 1, 1);
+  }
+
+  /* Get Tile Parameters */
+  const int32_t dim1SizeOut   = XAI_TILE3D_GET_DIM1(outTile);
+  const int32_t dim2SizeOut   = XAI_TILE3D_GET_DIM2(outTile);
+  const int32_t dim3SizeOut   = XAI_TILE3D_GET_DIM3(outTile);
+  const int32_t outTilePitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outTilePitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+
+  /* Get Data Pointers */
+  MORPH_IDT_SCALAR *pInput1 = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(inTile1);
+  MORPH_IDT_SCALAR *pInput2 = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(inTile2);
+  MORPH_IDT_SCALAR *pOutput = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(outTile);
+
+  MORPH_IDT_SCALAR *__restrict pIn1;
+  MORPH_IDT_SCALAR *__restrict pIn2;
+  MORPH_IDT_SCALAR *__restrict pOut;
+
+  /* Get Pitch appropriate for elementwise broadcast operations */
+  XAI_TILE3D_GET_BCAST123_PITCH(inTile1, inTile2, inTile1Pitch0, inTile2Pitch0, inTile1Pitch1, \
+                                inTile2Pitch1, inTile1Pitch2, inTile2Pitch2);
+
+  /* Flat fast path: valid only when no dimension is broadcast (all pitches agree) and outTile has no row/plane padding */
+  if (inTile1Pitch0 == inTile2Pitch0 && inTile1Pitch1 == inTile2Pitch1 && inTile1Pitch2 == inTile2Pitch2 &&
+      inTile1Pitch1 == outTilePitch1 && inTile1Pitch2 == outTilePitch2 &&
+      outTilePitch1 == dim1SizeOut && outTilePitch2 == dim1SizeOut * dim2SizeOut)
+  {
+    int dimsCount = dim1SizeOut * dim2SizeOut * dim3SizeOut;
+    pIn1 = pInput1;
+    pIn2 = pInput2;
+    pOut = pOutput;
+
+    for (int i = 0; i < dimsCount; i++)
+    {
+      pOut[i] = MAX2(pIn1[i], pIn2[i]);
+    }
+  }
+  else
+  {
+    /* inTile1Pitch0 == 0 : Tile1 Dimension 1 broadcasting
+       inTile1Pitch1 == 0 : Tile1 Dimension 2 broadcasting
+       inTile1Pitch2 == 0 : Tile1 Dimension 3 broadcasting
+       inTile2Pitch0 == 0 : Tile2 Dimension 1 broadcasting
+       inTile2Pitch1 == 0 : Tile2 Dimension 2 broadcasting
+       inTile2Pitch2 == 0 : Tile2 Dimension 3 broadcasting */
+    int32_t y, z, idx;
+
+    for (z = 0; z < dim3SizeOut; z++)
+    {
+      MORPH_IDT_SCALAR* temp1 = pInput1 + z * inTile1Pitch2;
+      MORPH_IDT_SCALAR* temp2 = pInput2 + z * inTile2Pitch2;
+      MORPH_IDT_SCALAR* temp3 = pOutput + z * outTilePitch2;
+
+      for (y = 0; y < dim2SizeOut; y++)
+      {
+        pIn1 = (temp1 + y * inTile1Pitch1);
+        pIn2 = (temp2 + y * inTile2Pitch1);
+        pOut = (temp3 + y * outTilePitch1);
+
+        MORPH_IDT_SCALAR InData1, InData2;
+        /* Tile1 Dimension 1 broadcasting */
+        if (inTile1Pitch0 == 0)
+        {
+          InData1 = pIn1[0];
+          /* reduced one load from core loop */
+          for (idx = 0; idx < dim1SizeOut; idx++)
+          {
+            InData2   = pIn2[idx];
+            pOut[idx] = MAX2(InData1, InData2);
+          }
+        }
+        /* Tile2 Dimension 1 broadcasting */
+        else if (inTile2Pitch0 == 0)
+        {
+          InData2 = pIn2[0];
+          /* reduced one load from core loop */
+          for (idx = 0; idx < dim1SizeOut; idx++)
+          {
+            InData1   = pIn1[idx];
+            pOut[idx] = MAX2(InData1, InData2);
+          }
+        }
+        else
+        {
+          /* no broadcast along dimension 1; the row pointers above already carry the dimension 2/3 pitches of Tile1 and Tile2 */
+          for (idx = 0; idx < dim1SizeOut; idx++)
+          {
+            InData1   = pIn1[idx];
+            InData2   = pIn2[idx];
+            pOut[idx] = MAX2(InData1, InData2);
+          }
+        }
+      }
+    }
+  }
+
+  return(XAI_ERROR_STATUS());
+}
+
diff --git a/backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_min.c b/backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_min.c
new file mode 100644
index 00000000000..60b4d1e0523
--- /dev/null
+++ b/backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_min.c
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2018 by Cadence Design Systems, Inc. ALL RIGHTS RESERVED.
+ * These coded instructions, statements, and computer programs are the
+ * copyrighted works and confidential proprietary information of
+ * Cadence Design Systems Inc. They may be adapted and modified by bona fide
+ * purchasers for internal use, but neither the original nor any adapted
+ * or modified version may be disclosed or distributed to third parties
+ * in any manner, medium, or form, in whole or in part, without the prior
+ * written consent of Cadence Design Systems Inc. This software and its
+ * derivatives are to be executed solely on products incorporating a Cadence
+ * Design Systems processor.
+ */
+
+#include "xai_cnn_common.h"
+
+
+#define ELTMIN_DATA_TYPE  SIGNED8BIT
+#include "cnn_eltwise_min.h"
+#undef ELTMIN_DATA_TYPE
+
+#define ELTMIN_DATA_TYPE  UNSIGNED8BIT
+#include "cnn_eltwise_min.h"
+#undef ELTMIN_DATA_TYPE
+
+#define ELTMIN_DATA_TYPE  SIGNED16BIT
+#include "cnn_eltwise_min.h"
+#undef ELTMIN_DATA_TYPE
+
+#define ELTMIN_DATA_TYPE  UNSIGNED16BIT
+#include "cnn_eltwise_min.h"
+#undef ELTMIN_DATA_TYPE
+
+#define ELTMIN_DATA_TYPE  SIGNED32BIT
+#include "cnn_eltwise_min.h"
+#undef ELTMIN_DATA_TYPE
+
+#define ELTMIN_DATA_TYPE  UNSIGNED32BIT
+#include "cnn_eltwise_min.h"
+#undef ELTMIN_DATA_TYPE
+
+#if XCHAL_HAVE_VISION_HP_VFPU == 1
+#define ELTMIN_DATA_TYPE  FLOAT16BIT
+#include "cnn_eltwise_min.h"
+#undef ELTMIN_DATA_TYPE
+#endif
+
+#if XCHAL_HAVE_VISION_SP_VFPU == 1
+#define ELTMIN_DATA_TYPE  FLOAT32BIT
+#include "cnn_eltwise_min.h"
+#undef ELTMIN_DATA_TYPE
+#endif
+
+
+/**************************** xaiEltwiseMin3D_AV *****************************************/
+/* Description : Generic entry point for the auto-vectorizable broadcast element-wise   */
+/*               MIN operator. Dispatches on the data type of inTile1 to the matching   */
+/*               xaiEltwiseMin3D_<type>_AV variant instantiated above                   */
+/* Inputs      : inTile1, inTile2                                                       */
+/* Outputs     : XAI_ERR_NULLARG for a NULL tile, XAI_ERR_BADARG for an unhandled data type, else the variant's error code */
+/* InOuts      : outTile                                                                */
+/*****************************************************************************************/
+
+XAI_ERR_TYPE xaiEltwiseMin3D_AV(const xai_pTile3D inTile1,
+                                const xai_pTile3D inTile2,
+                                xai_pTile3D outTile)
+{
+  if (!inTile1 || !inTile2 || !outTile)
+  {
+    return(XAI_ERR_NULLARG);
+  }
+
+  if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_S8))
+  {
+    return(xaiEltwiseMin3D_S8_AV(inTile1, inTile2, outTile));
+  }
+  else if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_U8))
+  {
+    return(xaiEltwiseMin3D_U8_AV(inTile1, inTile2, outTile));
+  }
+  else if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_S16))
+  {
+    return(xaiEltwiseMin3D_S16_AV(inTile1, inTile2, outTile));
+  }
+  else if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_U16))
+  {
+    return(xaiEltwiseMin3D_U16_AV(inTile1, inTile2, outTile));
+  }
+  else if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_S32))
+  {
+    return(xaiEltwiseMin3D_S32_AV(inTile1, inTile2, outTile));
+  }
+  else if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_U32))
+  {
+    return(xaiEltwiseMin3D_U32_AV(inTile1, inTile2, outTile));
+  }
+
+#if XCHAL_HAVE_VISION_HP_VFPU == 1
+  else if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_F16))
+  {
+    return(xaiEltwiseMin3D_F16_AV(inTile1, inTile2, outTile));
+  }
+#endif
+
+#if XCHAL_HAVE_VISION_SP_VFPU == 1
+  else if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_F32))
+  {
+    return(xaiEltwiseMin3D_F32_AV(inTile1, inTile2, outTile));
+  }
+#endif
+
+  return(XAI_ERR_BADARG); /* no variant handles this data type in this build: report it instead of claiming success with outTile unwritten */
+}
+
diff --git a/backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_min.h b/backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_min.h
new file mode 100644
index 00000000000..520272cce9d
--- /dev/null
+++ b/backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_min.h
@@ -0,0 +1,222 @@
+/*
+ * Copyright (c) 2018 by Cadence Design Systems, Inc. ALL RIGHTS RESERVED.
+ * These coded instructions, statements, and computer programs are the
+ * copyrighted works and confidential proprietary information of
+ * Cadence Design Systems Inc. They may be adapted and modified by bona fide
+ * purchasers for internal use, but neither the original nor any adapted
+ * or modified version may be disclosed or distributed to third parties
+ * in any manner, medium, or form, in whole or in part, without the prior
+ * written consent of Cadence Design Systems Inc. This software and its
+ * derivatives are to be executed solely on products incorporating a Cadence
+ * Design Systems processor.
+ */
+#include "xai_cnn_common.h"
+
+
+#ifndef SIGNED8BIT /* data-type selectors; guarded so that repeated inclusion of this header does not redefine them */
+#define SIGNED8BIT     1
+#define UNSIGNED8BIT   2
+#define SIGNED16BIT    3
+#define UNSIGNED16BIT  4
+#define SIGNED32BIT    5
+#define UNSIGNED32BIT  6
+#define FLOAT16BIT     7
+#define FLOAT32BIT     8
+#endif
+
+#if ELTMIN_DATA_TYPE == SIGNED8BIT /* each branch sets the name suffix, tile type check and scalar type of one variant */
+#undef MAKE_NAME
+#undef MORPH_IDT_CHECK
+#undef MORPH_IDT_SCALAR
+#define MAKE_NAME(name)   name ## _S8_AV
+#define MORPH_IDT_CHECK   XAI_CHECK_TILE3D_S8
+#define MORPH_IDT_SCALAR  int8_t
+
+#elif ELTMIN_DATA_TYPE == UNSIGNED8BIT
+#undef MAKE_NAME
+#undef MORPH_IDT_CHECK
+#undef MORPH_IDT_SCALAR
+#define MAKE_NAME(name)   name ## _U8_AV
+#define MORPH_IDT_CHECK   XAI_CHECK_TILE3D_U8
+#define MORPH_IDT_SCALAR  uint8_t
+
+#elif ELTMIN_DATA_TYPE == SIGNED16BIT
+#undef MAKE_NAME
+#undef MORPH_IDT_CHECK
+#undef MORPH_IDT_SCALAR
+#define MAKE_NAME(name)   name ## _S16_AV
+#define MORPH_IDT_CHECK   XAI_CHECK_TILE3D_S16
+#define MORPH_IDT_SCALAR  int16_t
+
+#elif ELTMIN_DATA_TYPE == UNSIGNED16BIT
+#undef MAKE_NAME
+#undef MORPH_IDT_CHECK
+#undef MORPH_IDT_SCALAR
+#define MAKE_NAME(name)   name ## _U16_AV
+#define MORPH_IDT_CHECK   XAI_CHECK_TILE3D_U16
+#define MORPH_IDT_SCALAR  uint16_t
+
+#elif ELTMIN_DATA_TYPE == SIGNED32BIT
+#undef MAKE_NAME
+#undef MORPH_IDT_CHECK
+#undef MORPH_IDT_SCALAR
+#define MAKE_NAME(name)   name ## _S32_AV
+#define MORPH_IDT_CHECK   XAI_CHECK_TILE3D_S32
+#define MORPH_IDT_SCALAR  int32_t
+
+#elif ELTMIN_DATA_TYPE == UNSIGNED32BIT
+#undef MAKE_NAME
+#undef MORPH_IDT_CHECK
+#undef MORPH_IDT_SCALAR
+#define MAKE_NAME(name)   name ## _U32_AV
+#define MORPH_IDT_CHECK   XAI_CHECK_TILE3D_U32
+#define MORPH_IDT_SCALAR  uint32_t
+
+#elif ELTMIN_DATA_TYPE == FLOAT16BIT
+#if XCHAL_HAVE_VISION_HP_VFPU == 1
+#undef MAKE_NAME
+#undef MORPH_IDT_CHECK
+#undef MORPH_IDT_SCALAR
+#define MAKE_NAME(name)   name ## _F16_AV
+#define MORPH_IDT_CHECK   XAI_CHECK_TILE3D_F16
+#define MORPH_IDT_SCALAR  xb_f16
+#endif
+
+#elif ELTMIN_DATA_TYPE == FLOAT32BIT
+#if XCHAL_HAVE_VISION_SP_VFPU == 1
+#undef MAKE_NAME
+#undef MORPH_IDT_CHECK
+#undef MORPH_IDT_SCALAR
+#define MAKE_NAME(name)   name ## _F32_AV
+#define MORPH_IDT_CHECK   XAI_CHECK_TILE3D_F32
+#define MORPH_IDT_SCALAR  float
+#endif
+#endif
+
+
+/**************************** xaiEltwiseMin3D ********************************************/
+/* Description : auto-vectorizable broadcast element-wise MIN. This header is a template: */
+/*               ELTMIN_DATA_TYPE selects the scalar type and MAKE_NAME appends the type  */
+/*               suffix, giving the S8, U8, S16, U16, S32, U32, F16 and F32 variants      */
+/* Inputs      : inTile1, inTile2 (same element type and data order as outTile)           */
+/* Outputs     : XAI error code, as reported by XAI_ERROR_STATUS()                        */
+/* InOuts      : outTile (receives MIN2 of the two inputs, element by element)            */
+/*****************************************************************************************/
+
+XAI_ERR_TYPE MAKE_NAME (xaiEltwiseMin3D)(const xai_pTile3D inTile1, const xai_pTile3D inTile2, xai_pTile3D outTile)
+{
+  /* Error Checks */
+  XAI_ERROR_CHECKS()
+  {
+    MORPH_IDT_CHECK(inTile1);
+    MORPH_IDT_CHECK(inTile2);
+    MORPH_IDT_CHECK(outTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile1);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile2);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile);
+    XAI_CHECK_ERROR(XAI_TILE3D_GET_DATA_ORDER(inTile1) == XAI_TILE3D_GET_DATA_ORDER(inTile2),
+                    XAI_ERR_BADARG, "\nData Order of InputTile1 = %d and InputTile2 = %d\nData Order of InputTile1 and InputTile2 should be same", \
+                    XAI_TILE3D_GET_DATA_ORDER(inTile1), XAI_TILE3D_GET_DATA_ORDER(inTile2));
+    XAI_CHECK_ERROR(XAI_TILE3D_GET_DATA_ORDER(inTile1) == XAI_TILE3D_GET_DATA_ORDER(outTile),
+                    XAI_ERR_BADARG, "\nData Order of InputTile = %d and OutputTile = %d\nData Order of InputTile and OutputTile should be same", \
+                    XAI_TILE3D_GET_DATA_ORDER(inTile1), XAI_TILE3D_GET_DATA_ORDER(outTile));
+    XAI_CHECK_TILE3D_BCAST_DIMENSIONS(inTile1, inTile2, outTile, 1, 1);
+  }
+
+  /* Get Tile Parameters */
+  const int32_t dim1SizeOut   = XAI_TILE3D_GET_DIM1(outTile);
+  const int32_t dim2SizeOut   = XAI_TILE3D_GET_DIM2(outTile);
+  const int32_t dim3SizeOut   = XAI_TILE3D_GET_DIM3(outTile);
+  const int32_t outTilePitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outTilePitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+
+  /* Get Data Pointers */
+  MORPH_IDT_SCALAR *pInput1 = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(inTile1);
+  MORPH_IDT_SCALAR *pInput2 = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(inTile2);
+  MORPH_IDT_SCALAR *pOutput = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(outTile);
+
+  MORPH_IDT_SCALAR *__restrict pIn1;
+  MORPH_IDT_SCALAR *__restrict pIn2;
+  MORPH_IDT_SCALAR *__restrict pOut;
+
+  /* Get Pitch appropriate for elementwise broadcast operations */
+  XAI_TILE3D_GET_BCAST123_PITCH(inTile1, inTile2, inTile1Pitch0, inTile2Pitch0, inTile1Pitch1, \
+                                inTile2Pitch1, inTile1Pitch2, inTile2Pitch2);
+
+  /* Flat fast path: valid only when no dimension is broadcast (all pitches agree) and outTile has no row/plane padding */
+  if (inTile1Pitch0 == inTile2Pitch0 && inTile1Pitch1 == inTile2Pitch1 && inTile1Pitch2 == inTile2Pitch2 &&
+      inTile1Pitch1 == outTilePitch1 && inTile1Pitch2 == outTilePitch2 &&
+      outTilePitch1 == dim1SizeOut && outTilePitch2 == dim1SizeOut * dim2SizeOut)
+  {
+    int dimsCount = dim1SizeOut * dim2SizeOut * dim3SizeOut;
+    pIn1 = pInput1;
+    pIn2 = pInput2;
+    pOut = pOutput;
+
+    for (int i = 0; i < dimsCount; i++)
+    {
+      pOut[i] = MIN2(pIn1[i], pIn2[i]);
+    }
+  }
+  else
+  {
+    /* inTile1Pitch0 == 0 : Tile1 Dimension 1 broadcasting
+       inTile1Pitch1 == 0 : Tile1 Dimension 2 broadcasting
+       inTile1Pitch2 == 0 : Tile1 Dimension 3 broadcasting
+       inTile2Pitch0 == 0 : Tile2 Dimension 1 broadcasting
+       inTile2Pitch1 == 0 : Tile2 Dimension 2 broadcasting
+       inTile2Pitch2 == 0 : Tile2 Dimension 3 broadcasting */
+    int32_t y, z, idx;
+
+    for (z = 0; z < dim3SizeOut; z++)
+    {
+      MORPH_IDT_SCALAR* temp1 = pInput1 + z * inTile1Pitch2;
+      MORPH_IDT_SCALAR* temp2 = pInput2 + z * inTile2Pitch2;
+      MORPH_IDT_SCALAR* temp3 = pOutput + z * outTilePitch2;
+
+      for (y = 0; y < dim2SizeOut; y++)
+      {
+        pIn1 = (temp1 + y * inTile1Pitch1);
+        pIn2 = (temp2 + y * inTile2Pitch1);
+        pOut = (temp3 + y * outTilePitch1);
+
+        MORPH_IDT_SCALAR InData1, InData2;
+        /* Tile1 Dimension 1 broadcasting */
+        if (inTile1Pitch0 == 0)
+        {
+          InData1 = pIn1[0];
+          /* reduced one load from core loop */
+          for (idx = 0; idx < dim1SizeOut; idx++)
+          {
+            InData2   = pIn2[idx];
+            pOut[idx] = MIN2(InData1, InData2);
+          }
+        }
+        /* Tile2 Dimension 1 broadcasting */
+        else if (inTile2Pitch0 == 0)
+        {
+          InData2 = pIn2[0];
+          /* reduced one load from core loop */
+          for (idx = 0; idx < dim1SizeOut; idx++)
+          {
+            InData1   = pIn1[idx];
+            pOut[idx] = MIN2(InData1, InData2);
+          }
+        }
+        else
+        {
+          /* no broadcast along dimension 1; the row pointers above already carry the dimension 2/3 pitches of Tile1 and Tile2 */
+          for (idx = 0; idx < dim1SizeOut; idx++)
+          {
+            InData1   = pIn1[idx];
+            InData2   = pIn2[idx];
+            pOut[idx] = MIN2(InData1, InData2);
+          }
+        }
+      }
+    }
+  }
+
+  return(XAI_ERROR_STATUS());
+}
+
diff --git a/backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_mul_S32.c b/backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_mul_S32.c
new file mode 100644
index 00000000000..432f4b80f96
--- /dev/null
+++ b/backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_mul_S32.c
@@ -0,0 +1,570 @@
+/*
+ * Copyright (c) 2018 by Cadence Design Systems, Inc. ALL RIGHTS RESERVED.
+ * These coded instructions, statements, and computer programs are the
+ * copyrighted works and confidential proprietary information of
+ * Cadence Design Systems Inc. They may be adapted and modified by bona fide
+ * purchasers for internal use, but neither the original nor any adapted
+ * or modified version may be disclosed or distributed to third parties
+ * in any manner, medium, or form, in whole or in part, without the prior
+ * written consent of Cadence Design Systems Inc. This software and its
+ * derivatives are to be executed solely on products incorporating a Cadence
+ * Design Systems processor.
+ */
+#include "xai_cnn_common.h"
+#if XCHAL_HAVE_VISION //build only on VISION dsps
+/******************************** eltwiseMul_BroadCastDims1_j1 ********************************/
+/* Description : Optimized implementation of Broadcast Elementwise Multiplication             */
+/*               functionality across first dimension.
*/
+/* Inputs      : inTile1, inTile2 and their six broadcast pitch values; the tile whose Pitch0 is 0 is broadcast along dimension 1 */
+/* Outputs     : none (the function returns void)                                             */
+/* InOuts      : outTile; both input tiles and outTile hold signed 32-bit data                */
+/* Assumptions : While performing element wise multiplication of two input tiles, edge        */
+/*               data is ignored                                                              */
+/**********************************************************************************************/
+static _XAI_INLINE_ void eltwiseMulS32_BroadCastDims1_AV(const xai_pTile3D inTile1,
+                                                         const xai_pTile3D inTile2,
+                                                         xai_pTile3D outTile,
+                                                         int32_t inTile1Pitch0,
+                                                         int32_t inTile2Pitch0,
+                                                         int32_t inTile1Pitch1,
+                                                         int32_t inTile2Pitch1,
+                                                         int32_t inTile1Pitch2,
+                                                         int32_t inTile2Pitch2)
+{
+  /* Get Tile Parameters */
+  const int32_t dim1SizeOut   = XAI_TILE3D_GET_DIM1(outTile);
+  const int32_t dim2SizeOut   = XAI_TILE3D_GET_DIM2(outTile);
+  const int32_t dim3SizeOut   = XAI_TILE3D_GET_DIM3(outTile);
+  const int32_t outTilePitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outTilePitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+
+  /* Get Data Pointers */
+  int32_t *pInput1 = (int32_t *) XAI_TILE3D_GET_DATA_PTR(inTile1);
+  int32_t *pInput2 = (int32_t *) XAI_TILE3D_GET_DATA_PTR(inTile2);
+  int32_t *pOutput = (int32_t *) XAI_TILE3D_GET_DATA_PTR(outTile);
+
+  /* loop variables */
+  int32_t x, y, z;
+
+  /* input and output pointers */
+  int32_t *restrict outPtr1;
+  int32_t *restrict inp1Ptr;
+  int32_t *restrict inp2Ptr;
+
+  int32_t *restrict outPtr_z;
+  int32_t *restrict inp1Ptr_z;
+  int32_t *restrict inp2Ptr_z;
+
+  // Outer Most Loop Pitch Variables (default: step along dimension 3)
+  int32_t oOutPitch = outTilePitch2;
+  int32_t oIn1Pitch = inTile1Pitch2;
+  int32_t oIn2Pitch = inTile2Pitch2;
+
+  // Middle Loop Pitch Variables (default: step along dimension 2)
+  int32_t mOutPitch = outTilePitch1;
+  int32_t mIn1Pitch = inTile1Pitch1;
+  int32_t mIn2Pitch = inTile2Pitch1;
+
+  int32_t innerMostLoopCnt = dim1SizeOut;
+  int32_t middleLoopCnt    = dim2SizeOut;
+  int32_t outerMostLoopCnt = dim3SizeOut;
+
+  if (((inTile2Pitch1 == 0) && (inTile2Pitch2 == 0) && \
+       (dim2SizeOut * inTile1Pitch1 == inTile1Pitch2) && (dim1SizeOut == inTile1Pitch1) && \
+       (dim2SizeOut * outTilePitch1 == outTilePitch2) && (dim1SizeOut == outTilePitch1)) || \
+      ((inTile1Pitch1 == 0) && (inTile1Pitch2 == 0) && \
+       (dim2SizeOut * inTile2Pitch1 == inTile2Pitch2) && (dim1SizeOut == inTile2Pitch1) && \
+       (dim2SizeOut * outTilePitch1 == outTilePitch2) && (dim1SizeOut == outTilePitch1)))
+  { /* one tile has zero dimension-2 and dimension-3 pitch while the other tile and outTile are densely packed: fold all three dimensions into one flat inner loop */
+    innerMostLoopCnt = dim1SizeOut * dim2SizeOut * dim3SizeOut;
+    middleLoopCnt    = 1;
+    outerMostLoopCnt = 1;
+
+    /* Middle Loop Pitch Variables */
+    mIn1Pitch = 0;
+    mIn2Pitch = 0;
+    mOutPitch = 0;
+
+    /* Outer Most Loop Pitch Variables */
+    oOutPitch = 0;
+    oIn1Pitch = 0;
+    oIn2Pitch = 0;
+  }
+  else if ((inTile2Pitch1 == 0 && dim1SizeOut == inTile1Pitch1 && dim1SizeOut == outTilePitch1) || \
+           (inTile1Pitch1 == 0 && dim1SizeOut == inTile2Pitch1 && dim1SizeOut == outTilePitch1))
+  { /* one tile has zero dimension-2 pitch and the other tile's and outTile's rows are densely packed: fold dimensions 1 and 2 into the inner loop, iterate dimension 3 in the middle loop */
+    innerMostLoopCnt = dim1SizeOut * dim2SizeOut;
+    middleLoopCnt    = dim3SizeOut;
+    outerMostLoopCnt = 1;
+
+    /* Middle Loop Pitch Variables */
+    mOutPitch = outTilePitch2;
+    mIn1Pitch = inTile1Pitch2;
+    mIn2Pitch = inTile2Pitch2;
+
+    /* Outer Most Loop Pitch Variables */
+    oOutPitch = 0;
+    oIn1Pitch = 0;
+    oIn2Pitch = 0;
+  }
+
+#if defined(IVP_MULN_2X32) || (XCHAL_HAVE_HIFI1 || XCHAL_HAVE_HIFI3Z || XCHAL_HAVE_HIFI4 || XCHAL_HAVE_HIFI5 || (XCHAL_HAVE_BBENEP == 1)) /* Plain C loops below rely on auto-vectorization, which is done only if an S32 mul ISA is available */
+/* KQ8 is conditionalized here as well: it has no direct or indirect S32 mul support, so the auto-vectorization attempt fails there and the plain scalar C code is used. */
+  /* Tile1 Dimension 1 broadcasting */
+  if (inTile1Pitch0 == 0)
+  {
+    // These loops process dim1, dim2, dim3 in that order, starting from the innermost loop
+    for (z = 0; z < outerMostLoopCnt; z++)
+    {
+      outPtr_z  = (int32_t *) (pOutput + z * oOutPitch);
+      inp1Ptr_z = (int32_t *) (pInput1 + z * oIn1Pitch);
+      inp2Ptr_z = (int32_t *) (pInput2 + z * oIn2Pitch);
+
+      for (y = 0; y < middleLoopCnt; y++)
+      {
+        outPtr1 = (int32_t *) (outPtr_z + y * mOutPitch);
+        inp1Ptr = (int32_t *) (inp1Ptr_z + y * mIn1Pitch);
+        inp2Ptr = (int32_t *) (inp2Ptr_z + y * mIn2Pitch);
+
+        /* Load Input 1 once per row: it is the broadcast operand of the inner loop */
+        int32_t InData1 = inp1Ptr[0];
+
+        for (x = 0; x < innerMostLoopCnt; x++)
+        {
+          int32_t InData2 = inp2Ptr[x];
+          outPtr1[x] = InData1 * InData2;
+        }
+      }
+    }
+  }
+  /* Tile2 Dimension 1 broadcasting */
+  else if (inTile2Pitch0 == 0)
+  {
+    // These loops process dim1, dim2, dim3 in that order, starting from the innermost loop
+    for (z = 0; z < outerMostLoopCnt; z++)
+    {
+      outPtr_z  = (int32_t *) (pOutput + z * oOutPitch);
+      inp1Ptr_z = (int32_t *) (pInput1 + z * oIn1Pitch);
+      inp2Ptr_z = (int32_t *) (pInput2 + z * oIn2Pitch);
+
+      for (y = 0; y < middleLoopCnt; y++)
+      {
+        outPtr1 = (int32_t *) (outPtr_z + y * mOutPitch);
+        inp1Ptr = (int32_t *) (inp1Ptr_z + y * mIn1Pitch);
+        inp2Ptr = (int32_t *) (inp2Ptr_z + y * mIn2Pitch);
+
+        /* Load Input 2 once per row: it is the broadcast operand of the inner loop */
+        int32_t InData2 = inp2Ptr[0];
+
+        for (x = 0; x < innerMostLoopCnt; x++)
+        {
+          int32_t InData1 = inp1Ptr[x];
+          outPtr1[x] = InData1 * InData2;
+        }
+      }
+    }
+  }
+#else
+  xb_vecN_2x32v * restrict pvecIn1;
+  xb_vecN_2x32v * restrict pvecIn2;
+  xb_vecN_2x32v * restrict pvecOut;
+
+  xb_vecN_2x32v vecInData1; /* 1st input tile */
+  xb_vecN_2x32v vecInData2; /* 2nd input tile */
+
+  const int32_t vectorizationWidth = XCHAL_IVPN_SIMD_WIDTH >> 1;
+  valign vaOutData = IVP_ZALIGN();
+
+  /* Tile1 Dimension 1 broadcasting */
+  if (inTile1Pitch0 == 0)
+  {
+    // These loops process dim1, dim2, dim3 in that order, starting from the innermost loop
+    for (z = 0; z < outerMostLoopCnt; z++)
+    {
+      outPtr_z  = (int32_t *) (pOutput + z * oOutPitch);
+      inp1Ptr_z = (int32_t *) (pInput1 + z * oIn1Pitch);
+      inp2Ptr_z = (int32_t *) (pInput2 + z * oIn2Pitch);
+
+      for (y = 0; y < middleLoopCnt; y++)
+      {
+        outPtr1 = (int32_t *) (outPtr_z + y * mOutPitch);
+        inp1Ptr = (int32_t *) (inp1Ptr_z + y * mIn1Pitch);
+        inp2Ptr = (int32_t *) (inp2Ptr_z + y * mIn2Pitch);
+
+        /* Vector pointers for Input 2 (loads) and output (stores), plus the load alignment state for Input 2 */
+        pvecIn2 = (xb_vecN_2x32v *) (inp2Ptr);
+        valign vaInData2 = IVP_LAN_2X32_PP(pvecIn2);
+
+        pvecOut = (xb_vecN_2x32v *) (outPtr1);
+
+        /* Load Input 1: the broadcast operand, taken from the first element of the row */
+        vecInData1 = (xb_vecN_2x32v) (inp1Ptr[0]);
+        for (x = 0; x < innerMostLoopCnt; x += vectorizationWidth)
+        {
+          /* load input data from 2nd tile; the remaining byte count (innerMostLoopCnt - x) * 4 is passed as the variable length */
+          IVP_LAVN_2X32_XP(vecInData2, vaInData2, pvecIn2, (innerMostLoopCnt - x) * 4);
+
+          /* populate wide vectors with product of inputs */
+          xb_vecN_2x64w wvecAcc;
+          wvecAcc = IVP_MULUSN_2X16X32_0(IVP_MOVNX16U_FROMNX16(IVP_MOVNX16_FROMN_2X32(vecInData2)), vecInData1);
+          IVP_MULAHN_2X16X32_1(wvecAcc, IVP_MOVNX16_FROMN_2X32(vecInData2), vecInData1);
+
+          /* truncate the multiply result in wide vector into 32 bit format */
+          xb_vecN_2x32v vecOutData = IVP_PACKLN_2X64W(wvecAcc);
+
+          IVP_SAVN_2X32_XP(vecOutData, vaOutData, pvecOut, (innerMostLoopCnt - x) * 4);
+        }
+        IVP_SAPOSN_2X32_FP(vaOutData, pvecOut);
+      }
+    }
+  }
+  else if (inTile2Pitch0 == 0)
+  {
+    // These loops process dim1, dim2, dim3 in that order, starting from the innermost loop
+    for (z = 0; z < outerMostLoopCnt; z++)
+    {
+      outPtr_z  = (int32_t *) (pOutput + z * oOutPitch);
+      inp1Ptr_z = (int32_t *) (pInput1 + z * oIn1Pitch);
+      inp2Ptr_z = (int32_t *) (pInput2 + z * oIn2Pitch);
+
+      for (y = 0; y < middleLoopCnt; y++)
+      {
+        outPtr1 = (int32_t *) (outPtr_z + y * mOutPitch);
+        inp1Ptr = (int32_t *) (inp1Ptr_z + y * mIn1Pitch);
+        inp2Ptr = (int32_t *) (inp2Ptr_z + y * mIn2Pitch);
+
+        /* Vector pointers for Input 1 (loads) and output (stores), plus the load alignment state for Input 1 */
+        pvecIn1 = (xb_vecN_2x32v *) (inp1Ptr);
+        valign vaInData1 = IVP_LAN_2X32_PP(pvecIn1);
+
+        pvecOut = (xb_vecN_2x32v *) (outPtr1);
+
+        /* Load Input 2: the broadcast operand, taken from the first element of the row */
+        vecInData2 = (xb_vecN_2x32v) (inp2Ptr[0]);
+        for (x = 0; x < innerMostLoopCnt; x += vectorizationWidth)
+        {
+          /* load input data from 1st tile, input data pointer is post incremented by varlen by the load instruction */
+          IVP_LAVN_2X32_XP(vecInData1, vaInData1, pvecIn1, (innerMostLoopCnt - x) * 4);
+
+          /* populate wide vectors with product of inputs */
+          xb_vecN_2x64w wvecAcc;
+          wvecAcc = IVP_MULUSN_2X16X32_0(IVP_MOVNX16U_FROMNX16(IVP_MOVNX16_FROMN_2X32(vecInData2)), vecInData1);
+          IVP_MULAHN_2X16X32_1(wvecAcc, IVP_MOVNX16_FROMN_2X32(vecInData2), vecInData1);
+
+          /* truncate the multiply result in wide vector into 32 bit format */
+          xb_vecN_2x32v vecOutData = IVP_PACKLN_2X64W(wvecAcc);
+
+          IVP_SAVN_2X32_XP(vecOutData, vaOutData, pvecOut, (innerMostLoopCnt - x) * 4);
+        }
+        IVP_SAPOSN_2X32_FP(vaOutData, pvecOut);
+      }
+    }
+  }
+#endif
+}
+
+/**************************** xaiEltwiseMul3D ********************************************/
+/* Description : auto-vectorizable implementation of element-wise S32 multiplication     */
+/* Inputs      : inTile1, inTile2                                                        */
+/* Outputs     : XI Error Code                                                           */
+/* InOuts      : outTile                                                                 */
+/*****************************************************************************************/
+
+XAI_ERR_TYPE xaiEltwiseMul3D_S32_AV(const xai_pTile3D inTile1, const xai_pTile3D inTile2, xai_pTile3D outTile)
+{
+  /* Error Checks */
+  XAI_ERROR_CHECKS()
+  {
+    XAI_CHECK_TILE3D_S32(inTile1);
+    XAI_CHECK_TILE3D_S32(inTile2);
+    XAI_CHECK_TILE3D_S32(outTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile1);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile2);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile);
+    XAI_CHECK_ERROR(XAI_TILE3D_GET_DATA_ORDER(inTile1) == XAI_TILE3D_GET_DATA_ORDER(inTile2),
+                    XAI_ERR_BADARG, "\nData Order of InputTile1 = %d and InputTile2 = %d\nData Order of InputTile1 and
InputTile2 should be same", \ + XAI_TILE3D_GET_DATA_ORDER(inTile1), XAI_TILE3D_GET_DATA_ORDER(inTile2)); + XAI_CHECK_ERROR(XAI_TILE3D_GET_DATA_ORDER(inTile1) == XAI_TILE3D_GET_DATA_ORDER(outTile), + XAI_ERR_BADARG, "\nData Order of InputTile = %d and OutputTile = %d\nData Order of InputTile and OutputTile should be same", \ + XAI_TILE3D_GET_DATA_ORDER(inTile1), XAI_TILE3D_GET_DATA_ORDER(outTile)); + XAI_CHECK_TILE3D_BCAST_DIMENSIONS(inTile1, inTile2, outTile, 1, 1); + } + + /* Get Tile Parameters */ + const int32_t inTile1dim1Size = XAI_TILE3D_GET_DIM1(inTile1); + const int32_t inTile2dim1Size = XAI_TILE3D_GET_DIM1(inTile2); + const int32_t inTile1dim2Size = XAI_TILE3D_GET_DIM2(inTile1); + const int32_t inTile2dim2Size = XAI_TILE3D_GET_DIM2(inTile2); + const int32_t inTile1dim3Size = XAI_TILE3D_GET_DIM3(inTile1); + const int32_t inTile2dim3Size = XAI_TILE3D_GET_DIM3(inTile2); + const int32_t dim1SizeOut = XAI_TILE3D_GET_DIM1(outTile); + const int32_t dim2SizeOut = XAI_TILE3D_GET_DIM2(outTile); + const int32_t dim3SizeOut = XAI_TILE3D_GET_DIM3(outTile); + const int32_t outTilePitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile); + const int32_t outTilePitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile); + + /* Get Data Pointers */ + int32_t *pInput1 = (int32_t *) XAI_TILE3D_GET_DATA_PTR(inTile1); + int32_t *pInput2 = (int32_t *) XAI_TILE3D_GET_DATA_PTR(inTile2); + int32_t *pOutput = (int32_t *) XAI_TILE3D_GET_DATA_PTR(outTile); + + /* broadcast flag is set in case of dimension sizes mismatch of inTile1 and inTile2 */ + /* If broadcast flag is set, only the generalized variant is used, even if edges are absent */ + int32_t bcastFlag = 0; + if (!((inTile1dim1Size == inTile2dim1Size) && (inTile1dim2Size == inTile2dim2Size) && (inTile1dim3Size == inTile2dim3Size))) + { + bcastFlag = 1; + } + + int32_t is_2D = ((outTilePitch1 == dim1SizeOut) && (XAI_TILE3D_GET_DIM1_PITCH(inTile1) == dim1SizeOut) && (XAI_TILE3D_GET_DIM1_PITCH(inTile2) == dim1SizeOut)) ? 
1 : 0; + int32_t is_1D = ((outTilePitch2 == (dim1SizeOut * dim2SizeOut)) && (XAI_TILE3D_GET_DIM2_PITCH(inTile1) == (dim1SizeOut * dim2SizeOut)) && (XAI_TILE3D_GET_DIM2_PITCH(inTile2) == (dim1SizeOut * dim2SizeOut))) ? 1 : 0; + + /* Get Pitch appropriate for elementwise broadcast operations */ + XAI_TILE3D_GET_BCAST123_PITCH(inTile1, inTile2, inTile1Pitch0, inTile2Pitch0, inTile1Pitch1, \ + inTile2Pitch1, inTile1Pitch2, inTile2Pitch2); + + if ((inTile1dim1Size == 1 || inTile2dim1Size == 1) && (!(inTile1dim1Size == inTile2dim1Size))) + { + eltwiseMulS32_BroadCastDims1_AV(inTile1, inTile2, outTile, inTile1Pitch0, inTile2Pitch0, inTile1Pitch1, \ + inTile2Pitch1, inTile1Pitch2, inTile2Pitch2); + } + else + { +#if defined(IVP_MULN_2X32) || (XCHAL_HAVE_HIFI1 || XCHAL_HAVE_HIFI3Z || XCHAL_HAVE_HIFI4 || XCHAL_HAVE_HIFI5 || (XCHAL_HAVE_BBENEP == 1)) /*Auto vectorization is done only if S32 mul ISA is available*/ +/*Adding KQ8 conditionalization also, as KQ8 doesn't have S32 mul support direct or indirect. Therefore, for KQ8 Auto vec attempt shall fail and plain scalar C code shall be used.*/ + int32_t *__restrict pIn1; + int32_t *__restrict pIn2; + int32_t *__restrict pOut; + + /* Overall design approach is split in 2 sections depending on the optimal + * tile sizes. When the edge length along dimension1 is zero, loops across + * dimension1 and dimension2 can be merged. 
+ */ + + /* check for optimal tile size i.e edge length along dimension1 is zero */ + if (is_2D && (!bcastFlag)) + { + /******************************************************************************/ + /* Data exist in contiguous memory location with respect to first dimension */ + /******************************************************************************/ + + /* Initialize max loop counter */ + int32_t dim3MaxLoopCount = dim3SizeOut; + int32_t maxLoopCount = dim1SizeOut * dim2SizeOut; + + /* Updated Loop count based on tile dimension configuration */ + if (is_1D) + { + /**********************************************************************/ + /* Data exist in contiguous memory location with respect to first and */ + /* second dimension */ + /**********************************************************************/ + dim3MaxLoopCount = 1; /* Update max loop counter */ + maxLoopCount *= dim3SizeOut; + } + for (int j = 0; j < dim3MaxLoopCount; j++) + { + pIn1 = pInput1 + j * inTile1Pitch2; + pIn2 = pInput2 + j * inTile2Pitch2; + pOut = pOutput + j * outTilePitch2; + for(int i = 0; i < maxLoopCount; i++) + { + pOut[i] = (int32_t)(pIn1[i] * pIn2[i]); + } + } + } + else + { + int32_t y, z, idx; + + for (z = 0; z < dim3SizeOut; z++) + { + int32_t* temp1 = pInput1 + z * inTile1Pitch2; + int32_t* temp2 = pInput2 + z * inTile2Pitch2; + int32_t* temp3 = pOutput + z * outTilePitch2; + + for (y = 0; y < dim2SizeOut; y++) + { + pIn1 = (temp1 + y * inTile1Pitch1); + pIn2 = (temp2 + y * inTile2Pitch1); + pOut = (temp3 + y * outTilePitch1); + + int32_t InData1, InData2; + for (idx = 0; idx < dim1SizeOut; idx++) + { + InData1 = pIn1[idx]; + InData2 = pIn2[idx]; + pOut[idx] = (int32_t) (InData1 * InData2); + } + } + } + } +#else + /* Following code is written for P6/P1 as they don't support S32 MUL. As mentioned, it is used when 32b MUL ISA is not available. */ + /* However, P1 has a proto defined for S32 MUL, internally using 32x16 MUL only. 
*/ + /* Therefore for P1, the above scalar code shall be used which shall be not auto vectorized, */ + /* as compiler cannot find a direct 32b MUL ISA in P1. */ + /* Therefore, P1 shall give a low performance for this API. */ + + /* input and output pointers */ + xb_vecN_2x32v * restrict pvecIn1; + xb_vecN_2x32v * restrict pvecIn2; + xb_vecN_2x32v * restrict pdvecOut; + + /* loop variables */ + int32_t x, y, z; + + int32_t vectorizationWidth = XCHAL_IVPN_SIMD_WIDTH >> 1; + + valign vaOutData = IVP_ZALIGN(); + + /* Overall design approach is split in 2 sections depending on the optimal + * tile sizes. When the edge length along dimension1 is zero, loops across + * dimension1 and dimension2 can be merged. + */ + + /* check for optimal tile size i.e edge length along dimension1 is zero */ + if (is_2D && (!bcastFlag)) + { + /******************************************************************************/ + /* Data exist in contiguous memory location with respect to first dimension */ + /******************************************************************************/ + + /* Initialize max loop counter */ + int32_t dim3MaxLoopCount = dim3SizeOut; + int32_t maxLoopCount = dim1SizeOut * dim2SizeOut; + + /* Updated Loop count based on tile dimension configuration */ + if (is_1D) + { + /**********************************************************************/ + /* Data exist in contiguous memory location with respect to first and */ + /* second dimension */ + /**********************************************************************/ + dim3MaxLoopCount = 1; /* Update max loop counter */ + maxLoopCount *= dim3SizeOut; + } + for (z = 0; z < dim3MaxLoopCount; z++) + { + pvecIn1 = (xb_vecN_2x32v *) &pInput1[z * inTile1Pitch2]; + valign vaInData1 = IVP_LAN_2X32_PP (pvecIn1); + + pvecIn2 = (xb_vecN_2x32v *) &pInput2[z * inTile2Pitch2]; + valign vaInData2 = IVP_LAN_2X32_PP (pvecIn2); + + pdvecOut = (xb_vecN_2x32v *) &pOutput[z * outTilePitch2]; + + /* loop across dimension1, dimension2 
and dimension3 is combined */ + for (x = 0; x <= maxLoopCount - vectorizationWidth; x += vectorizationWidth) + { + /* input data vectors */ + xb_vecN_2x32v vecInData1; /* first input tile */ + xb_vecN_2x32v vecInData2; /* 2nd input tile*/ + + /* load input data from 1st tile, input data pointer is post incremented + * implicitly by SIMD/2 by the load instruction */ + IVP_LAN_2X32_IP(vecInData1, vaInData1, pvecIn1); + + /* load input data from 2nd tile, input data pointer is post incremented + * implicitly by SIMD/2 by the load instruction */ + IVP_LAN_2X32_IP(vecInData2, vaInData2, pvecIn2); + + /* populate wide vectors with product of inputs */ + xb_vecN_2x64w acc1; + acc1 = IVP_MULUSN_2X16X32_0(IVP_MOVNX16U_FROMNX16(IVP_MOVNX16_FROMN_2X32(vecInData2)), vecInData1); + IVP_MULAHN_2X16X32_1(acc1, IVP_MOVNX16_FROMN_2X32(vecInData2), vecInData1); + + /* truncate the multiply result in wide vector into 32 bit format*/ + xb_vecN_2x32v vecOutData = IVP_PACKLN_2X64W(acc1); + + IVP_SAVN_2X32_XP(vecOutData, vaOutData, pdvecOut, vectorizationWidth * 4); + } /* end of for (x = 0; x <= maxLoopCount - vectorizationWidth; x += vectorizationWidth) */ + + if (x < maxLoopCount) + { + /* input data vectors */ + xb_vecN_2x32v vecInData1; /* 1st input tile */ + xb_vecN_2x32v vecInData2; /* 2nd input tile*/ + + /* variable store count for output */ + int32_t varLen = (maxLoopCount - x) * 4; + + /* load input data from 1st tile, input data pointer is post incremented by varLen, by the load instruction */ + IVP_LAVN_2X32_XP(vecInData1, vaInData1, pvecIn1, varLen); + + /* load input data from 2nd tile, input data pointer is post incremented by varLen, by the load instruction */ + IVP_LAVN_2X32_XP(vecInData2, vaInData2, pvecIn2, varLen); + + /* populate wide vectors with product of inputs */ + xb_vecN_2x64w acc1; + acc1 = IVP_MULUSN_2X16X32_0(IVP_MOVNX16U_FROMNX16(IVP_MOVNX16_FROMN_2X32(vecInData2)), vecInData1); + IVP_MULAHN_2X16X32_1(acc1, IVP_MOVNX16_FROMN_2X32(vecInData2), vecInData1); 
+ + /* truncate the multiply result in wide vector into 32 bit format*/ + xb_vecN_2x32v vecOutData = IVP_PACKLN_2X64W(acc1); + IVP_SAVN_2X32_XP(vecOutData, vaOutData, pdvecOut, varLen); + } /*end of if (x < maxLoopCount)*/ + IVP_SAPOSN_2X32_FP(vaOutData, pdvecOut); + } /* end of for (z = 0; z < dim3MaxLoopCount; z++) */ + } /* end of if ((inTile1Pitch1 == dim1SizeOut) && (inTile2Pitch1 == dim1SizeOut) && (outTilePitch1 == dim1SizeOut)) */ + /* Handle cases with edges and/or broadcast along dim2/3 */ + else + { + for (x = 0; x < dim1SizeOut; x += vectorizationWidth) /* along 1st dimension */ + { + /* variable store count for output */ + int32_t varLen = (dim1SizeOut - x) * 4; + + for (z = 0; z < dim3SizeOut; z++) /* along 3rd dimension */ + { + int32_t * pIn1 = &pInput1[z * inTile1Pitch2 + x]; + + int32_t * pIn2 = &pInput2[z * inTile2Pitch2 + x]; + /* pointer for 1st tile */ + pvecIn1 = (xb_vecN_2x32v *) pIn1; + + /* pointer for 2nd tile */ + pvecIn2 = (xb_vecN_2x32v *) pIn2; + + int32_t * pOut = &pOutput[z * outTilePitch2 + x]; + + for (y = 0; y < dim2SizeOut; y++) /* along 2nd dimension */ + { + /* input data vectors */ + /* 1st input tile */ + xb_vecN_2x32v vecInData1; + + /* 2nd input tile */ + xb_vecN_2x32v vecInData2; + + /* load input data from 1st tile */ + valign vaInData1 = IVP_LAN_2X32_PP(pvecIn1); + + IVP_LAN_2X32_XP(vecInData1, vaInData1, pvecIn1, inTile1Pitch1 * 4); + + /* load input data from 2nd tile */ + valign vaInData2 = IVP_LAN_2X32_PP (pvecIn2); + + IVP_LAN_2X32_XP(vecInData2, vaInData2, pvecIn2, inTile2Pitch1 * 4); + + /* populate wide vectors with product of inputs */ + xb_vecN_2x64w acc1; + acc1 = IVP_MULUSN_2X16X32_0(IVP_MOVNX16U_FROMNX16(IVP_MOVNX16_FROMN_2X32(vecInData2)), vecInData1); + IVP_MULAHN_2X16X32_1(acc1, IVP_MOVNX16_FROMN_2X32(vecInData2), vecInData1); + + pdvecOut = (xb_vecN_2x32v *) pOut; + /* truncate the multiply result in wide vector into 32 bit format*/ + xb_vecN_2x32v vecOutData = IVP_PACKLN_2X64W(acc1); + 
IVP_SAVN_2X32_XP(vecOutData, vaOutData, pdvecOut, varLen); + + IVP_SAPOSN_2X32_FP(vaOutData, pdvecOut); + pOut += outTilePitch1; + } /* end of for (y = 0; y < dim2SizeOut; y++) loop */ + } /* end of for (z = 0; z < dim3SizeOut; z++) loop */ + } /* end of for (x = 0; x < dim1SizeOut; x += vectorizationWidth) loop */ + } /* end of else */ +#endif + } + return(XAI_ERROR_STATUS()); +} +#endif //#if XCHAL_HAVE_VISION diff --git a/backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_or.c b/backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_or.c new file mode 100644 index 00000000000..511ee3b4c77 --- /dev/null +++ b/backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_or.c @@ -0,0 +1,87 @@ +/* + * Copyright (c) 2018 by Cadence Design Systems, Inc. ALL RIGHTS RESERVED. + * These coded instructions, statements, and computer programs are the + * copyrighted works and confidential proprietary information of + * Cadence Design Systems Inc. They may be adapted and modified by bona fide + * purchasers for internal use, but neither the original nor any adapted + * or modified version may be disclosed or distributed to third parties + * in any manner, medium, or form, in whole or in part, without the prior + * written consent of Cadence Design Systems Inc. This software and its + * derivatives are to be executed solely on products incorporating a Cadence + * Design Systems processor. 
+ */
+
+#include "xai_cnn_common.h"
+
+
+#define ELTOR_DATA_TYPE  SIGNED8BIT
+#include "cnn_eltwise_or.h"
+#undef ELTOR_DATA_TYPE
+
+#define ELTOR_DATA_TYPE  UNSIGNED8BIT
+#include "cnn_eltwise_or.h"
+#undef ELTOR_DATA_TYPE
+
+#define ELTOR_DATA_TYPE  SIGNED16BIT
+#include "cnn_eltwise_or.h"
+#undef ELTOR_DATA_TYPE
+
+#define ELTOR_DATA_TYPE  UNSIGNED16BIT
+#include "cnn_eltwise_or.h"
+#undef ELTOR_DATA_TYPE
+
+#define ELTOR_DATA_TYPE  SIGNED32BIT
+#include "cnn_eltwise_or.h"
+#undef ELTOR_DATA_TYPE
+
+#define ELTOR_DATA_TYPE  UNSIGNED32BIT
+#include "cnn_eltwise_or.h"
+#undef ELTOR_DATA_TYPE
+
+
+/**************************** xaiEltwiseOr3D_AV ******************************************/
+/* Description : General API for auto-vectorizable Broadcast element-wise and            */
+/*               bitwise OR operator. Dispatches on the data type of inTile1 to the      */
+/*               matching typed variant (S8, U8, S16, U16, S32 or U32).                  */
+/* Inputs      : inTile1, inTile2                                                        */
+/* Outputs     : XAI error code (NULLARG: NULL tile, BADARG: inTile1 type unsupported)   */
+/* InOuts      : outTile                                                                 */
+/*****************************************************************************************/
+
+XAI_ERR_TYPE xaiEltwiseOr3D_AV(const xai_pTile3D inTile1,
+                               const xai_pTile3D inTile2,
+                               xai_pTile3D outTile)
+{
+  if (!inTile1 || !inTile2 || !outTile)
+  {
+    return(XAI_ERR_NULLARG);
+  }
+
+  if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_S8))
+  {
+    return(xaiEltwiseOr3D_S8_AV(inTile1, inTile2, outTile));
+  }
+  else if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_U8))
+  {
+    return(xaiEltwiseOr3D_U8_AV(inTile1, inTile2, outTile));
+  }
+  else if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_S16))
+  {
+    return(xaiEltwiseOr3D_S16_AV(inTile1, inTile2, outTile));
+  }
+  else if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_U16))
+  {
+    return(xaiEltwiseOr3D_U16_AV(inTile1, inTile2, outTile));
+  }
+  else if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_S32))
+  {
+    return(xaiEltwiseOr3D_S32_AV(inTile1, inTile2, outTile));
+  }
+  else if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_U32))
+  {
+    return(xaiEltwiseOr3D_U32_AV(inTile1, inTile2, outTile));
+  }
+  /* No typed variant matched: outTile was not written, so do not report success. */
+  return(XAI_ERR_BADARG);
+}
+
diff --git
a/backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_or.h b/backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_or.h new file mode 100644 index 00000000000..a88b0bc7fb8 --- /dev/null +++ b/backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_or.h @@ -0,0 +1,202 @@ +/* + * Copyright (c) 2018 by Cadence Design Systems, Inc. ALL RIGHTS RESERVED. + * These coded instructions, statements, and computer programs are the + * copyrighted works and confidential proprietary information of + * Cadence Design Systems Inc. They may be adapted and modified by bona fide + * purchasers for internal use, but neither the original nor any adapted + * or modified version may be disclosed or distributed to third parties + * in any manner, medium, or form, in whole or in part, without the prior + * written consent of Cadence Design Systems Inc. This software and its + * derivatives are to be executed solely on products incorporating a Cadence + * Design Systems processor. 
+ */
+#include "xai_cnn_common.h"
+
+
+#ifndef SIGNED8BIT
+#define SIGNED8BIT     1
+#define UNSIGNED8BIT   2
+#define SIGNED16BIT    3
+#define UNSIGNED16BIT  4
+#define SIGNED32BIT    5
+#define UNSIGNED32BIT  6
+#endif
+
+#if ELTOR_DATA_TYPE == SIGNED8BIT
+#undef MAKE_NAME
+#undef MORPH_IDT_CHECK
+#undef MORPH_IDT_SCALAR
+#define MAKE_NAME(name)   name ## _S8_AV
+#define MORPH_IDT_CHECK   XAI_CHECK_TILE3D_S8
+#define MORPH_IDT_SCALAR  int8_t
+
+#elif ELTOR_DATA_TYPE == UNSIGNED8BIT
+#undef MAKE_NAME
+#undef MORPH_IDT_CHECK
+#undef MORPH_IDT_SCALAR
+#define MAKE_NAME(name)   name ## _U8_AV
+#define MORPH_IDT_CHECK   XAI_CHECK_TILE3D_U8
+#define MORPH_IDT_SCALAR  uint8_t
+
+#elif ELTOR_DATA_TYPE == SIGNED16BIT
+#undef MAKE_NAME
+#undef MORPH_IDT_CHECK
+#undef MORPH_IDT_SCALAR
+#define MAKE_NAME(name)   name ## _S16_AV
+#define MORPH_IDT_CHECK   XAI_CHECK_TILE3D_S16
+#define MORPH_IDT_SCALAR  int16_t
+
+#elif ELTOR_DATA_TYPE == UNSIGNED16BIT
+#undef MAKE_NAME
+#undef MORPH_IDT_CHECK
+#undef MORPH_IDT_SCALAR
+#define MAKE_NAME(name)   name ## _U16_AV
+#define MORPH_IDT_CHECK   XAI_CHECK_TILE3D_U16
+#define MORPH_IDT_SCALAR  uint16_t
+
+#elif ELTOR_DATA_TYPE == SIGNED32BIT
+#undef MAKE_NAME
+#undef MORPH_IDT_CHECK
+#undef MORPH_IDT_SCALAR
+#define MAKE_NAME(name)   name ## _S32_AV
+#define MORPH_IDT_CHECK   XAI_CHECK_TILE3D_S32
+#define MORPH_IDT_SCALAR  int32_t
+
+#elif ELTOR_DATA_TYPE == UNSIGNED32BIT
+#undef MAKE_NAME
+#undef MORPH_IDT_CHECK
+#undef MORPH_IDT_SCALAR
+#define MAKE_NAME(name)   name ## _U32_AV
+#define MORPH_IDT_CHECK   XAI_CHECK_TILE3D_U32
+#define MORPH_IDT_SCALAR  uint32_t
+#endif
+
+#define OR2(a, b)  ((a) | (b))  /* fully parenthesized so it is safe inside any expression */
+
+
+/**************************** xaiEltwiseOr3D *********************************************/
+/* Description : auto-vectorizable implementation of Broadcast elementWise and bitwise   */
+/*               OR operator, Based on MORPH implementation six variants are             */
+/*               generated for S8, U8, S16, U16, S32, and U32 data types                 */
+/* Inputs      : inTile1, inTile2                                                        */
+/* Outputs     : XAI Error Code (value of XAI_ERROR_STATUS())                            */
+/* InOuts      : outTile                                                                 */
+/*****************************************************************************************/
+
+XAI_ERR_TYPE MAKE_NAME (xaiEltwiseOr3D)(const xai_pTile3D inTile1, const xai_pTile3D inTile2, xai_pTile3D outTile)
+{
+  /* Error Checks */
+  XAI_ERROR_CHECKS()
+  {
+    MORPH_IDT_CHECK(inTile1);
+    MORPH_IDT_CHECK(inTile2);
+    MORPH_IDT_CHECK(outTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile1);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile2);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile);
+    XAI_CHECK_ERROR(XAI_TILE3D_GET_DATA_ORDER(inTile1) == XAI_TILE3D_GET_DATA_ORDER(inTile2),
+                    XAI_ERR_BADARG, "\nData Order of InputTile1 = %d and InputTile2 = %d\nData Order of InputTile1 and InputTile2 should be same", \
+                    XAI_TILE3D_GET_DATA_ORDER(inTile1), XAI_TILE3D_GET_DATA_ORDER(inTile2));
+    XAI_CHECK_ERROR(XAI_TILE3D_GET_DATA_ORDER(inTile1) == XAI_TILE3D_GET_DATA_ORDER(outTile),
+                    XAI_ERR_BADARG, "\nData Order of InputTile = %d and OutputTile = %d\nData Order of InputTile and OutputTile should be same", \
+                    XAI_TILE3D_GET_DATA_ORDER(inTile1), XAI_TILE3D_GET_DATA_ORDER(outTile));
+    XAI_CHECK_TILE3D_BCAST_DIMENSIONS(inTile1, inTile2, outTile, 1, 1);
+  }
+
+  /* Get Tile Parameters */
+  const int32_t dim1SizeOut   = XAI_TILE3D_GET_DIM1(outTile);
+  const int32_t dim2SizeOut   = XAI_TILE3D_GET_DIM2(outTile);
+  const int32_t dim3SizeOut   = XAI_TILE3D_GET_DIM3(outTile);
+  const int32_t outTilePitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outTilePitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+
+  /* Get Data Pointers */
+  MORPH_IDT_SCALAR *pInput1 = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(inTile1);
+  MORPH_IDT_SCALAR *pInput2 = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(inTile2);
+  MORPH_IDT_SCALAR *pOutput = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(outTile);
+
+  MORPH_IDT_SCALAR *__restrict pIn1;
+  MORPH_IDT_SCALAR *__restrict pIn2;
+  MORPH_IDT_SCALAR *__restrict pOut;
+
+  /* Get Pitch appropriate for elementwise broadcast operations (supplies the inTile*Pitch* locals used below) */
+  XAI_TILE3D_GET_BCAST123_PITCH(inTile1, inTile2, inTile1Pitch0, inTile2Pitch0, inTile1Pitch1, \
+                                inTile2Pitch1, inTile1Pitch2, inTile2Pitch2);
+
+  /* No broadcast and no row/plane padding in any tile: the whole tile is one flat run of elements */
+  if ((dim1SizeOut == 1 || (inTile1Pitch0 != 0 && inTile2Pitch0 != 0)) &&
+      (dim2SizeOut == 1 || (inTile1Pitch1 == dim1SizeOut && inTile2Pitch1 == dim1SizeOut && outTilePitch1 == dim1SizeOut)) &&
+      (dim3SizeOut == 1 || (inTile1Pitch2 == dim1SizeOut * dim2SizeOut && inTile2Pitch2 == inTile1Pitch2 && outTilePitch2 == inTile1Pitch2)))
+  {
+    int dimsCount = dim1SizeOut * dim2SizeOut * dim3SizeOut;
+    pIn1 = pInput1; pIn2 = pInput2; pOut = pOutput;
+
+    for (int i = 0; i < dimsCount; i++)
+    {
+      pOut[i] = OR2(pIn1[i], pIn2[i]);
+    }
+  }
+  else
+  {
+    /*
+       inTile1Pitch0 == 0 : Tile1 Dimension 1 broadcasting
+       inTile1Pitch1 == 0 : Tile1 Dimension 2 broadcasting
+       inTile1Pitch2 == 0 : Tile1 Dimension 3 broadcasting
+       inTile2Pitch0 == 0 : Tile2 Dimension 1 broadcasting
+       inTile2Pitch1 == 0 : Tile2 Dimension 2 broadcasting
+       inTile2Pitch2 == 0 : Tile2 Dimension 3 broadcasting
+     */
+    int32_t y, z, idx;
+
+    for (z = 0; z < dim3SizeOut; z++)
+    {
+      MORPH_IDT_SCALAR* temp1 = pInput1 + z * inTile1Pitch2;
+      MORPH_IDT_SCALAR* temp2 = pInput2 + z * inTile2Pitch2;
+      MORPH_IDT_SCALAR* temp3 = pOutput + z * outTilePitch2;
+
+      for (y = 0; y < dim2SizeOut; y++)
+      {
+        pIn1 = (temp1 + y * inTile1Pitch1);
+        pIn2 = (temp2 + y * inTile2Pitch1);
+        pOut = (temp3 + y * outTilePitch1);
+
+        MORPH_IDT_SCALAR InData1, InData2;
+        /* Tile1 Dimension 1 broadcasting */
+        if (inTile1Pitch0 == 0)
+        {
+          InData1 = pIn1[0];
+          /* reduced one load from core loop */
+          for (idx = 0; idx < dim1SizeOut; idx++)
+          {
+            InData2   = pIn2[idx];
+            pOut[idx] = OR2(InData1, InData2);
+          }
+        }
+        /* Tile2 Dimension 1 broadcasting */
+        else if (inTile2Pitch0 == 0)
+        {
+          InData2 = pIn2[0];
+          /* reduced one load from core loop */
+          for (idx = 0; idx < dim1SizeOut; idx++)
+          {
+            InData1   = pIn1[idx];
+            pOut[idx] = OR2(InData1, InData2);
+          }
+        }
+        else
+        {
+          /* no dimension 1 broadcast; any dimension 2/3 broadcast is already applied through the zero pitches */
+          for (idx = 0; idx < dim1SizeOut; idx++)
+          {
+            InData1   = pIn1[idx];
+            InData2   = pIn2[idx];
+            pOut[idx] = OR2(InData1, InData2);
+          }
+        }
+      }
+    }
+  }
+
+  return(XAI_ERROR_STATUS());
+}
+
diff --git
a/backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_sub.c b/backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_sub.c new file mode 100644 index 00000000000..e2049e66f4f --- /dev/null +++ b/backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_sub.c @@ -0,0 +1,112 @@ +/* + * Copyright (c) 2018 by Cadence Design Systems, Inc. ALL RIGHTS RESERVED. + * These coded instructions, statements, and computer programs are the + * copyrighted works and confidential proprietary information of + * Cadence Design Systems Inc. They may be adapted and modified by bona fide + * purchasers for internal use, but neither the original nor any adapted + * or modified version may be disclosed or distributed to third parties + * in any manner, medium, or form, in whole or in part, without the prior + * written consent of Cadence Design Systems Inc. This software and its + * derivatives are to be executed solely on products incorporating a Cadence + * Design Systems processor. 
+ */
+
+#include "xai_cnn_common.h"
+
+
+#define ELTSUB_DATA_TYPE  SIGNED8BIT
+#include "cnn_eltwise_sub.h"
+#undef ELTSUB_DATA_TYPE
+
+#define ELTSUB_DATA_TYPE  UNSIGNED8BIT
+#include "cnn_eltwise_sub.h"
+#undef ELTSUB_DATA_TYPE
+
+#define ELTSUB_DATA_TYPE  SIGNED16BIT
+#include "cnn_eltwise_sub.h"
+#undef ELTSUB_DATA_TYPE
+
+#define ELTSUB_DATA_TYPE  UNSIGNED16BIT
+#include "cnn_eltwise_sub.h"
+#undef ELTSUB_DATA_TYPE
+
+#define ELTSUB_DATA_TYPE  SIGNED32BIT
+#include "cnn_eltwise_sub.h"
+#undef ELTSUB_DATA_TYPE
+
+#define ELTSUB_DATA_TYPE  UNSIGNED32BIT
+#include "cnn_eltwise_sub.h"
+#undef ELTSUB_DATA_TYPE
+
+#if XCHAL_HAVE_VISION_HP_VFPU == 1
+#define ELTSUB_DATA_TYPE  FLOAT16BIT
+#include "cnn_eltwise_sub.h"
+#undef ELTSUB_DATA_TYPE
+#endif
+
+#if XCHAL_HAVE_VISION_SP_VFPU == 1
+#define ELTSUB_DATA_TYPE  FLOAT32BIT
+#include "cnn_eltwise_sub.h"
+#undef ELTSUB_DATA_TYPE
+#endif
+
+
+/**************************** xaiEltwiseSub3D_AV *****************************************/
+/* Description : General API for auto-vectorizable Broadcast element-wise subtraction.   */
+/*               Dispatches on the data type of inTile1 to the matching typed variant.   */
+/* Inputs      : inTile1, inTile2                                                        */
+/* Outputs     : XAI error code (NULLARG: NULL tile, BADARG: inTile1 type unsupported)   */
+/* InOuts      : outTile                                                                 */
+/*****************************************************************************************/
+
+XAI_ERR_TYPE xaiEltwiseSub3D_AV(const xai_pTile3D inTile1,
+                                const xai_pTile3D inTile2,
+                                xai_pTile3D outTile)
+{
+  if (!inTile1 || !inTile2 || !outTile)
+  {
+    return(XAI_ERR_NULLARG);
+  }
+
+  if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_S8))
+  {
+    return(xaiEltwiseSub3D_S8_AV(inTile1, inTile2, outTile));
+  }
+  else if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_U8))
+  {
+    return(xaiEltwiseSub3D_U8_AV(inTile1, inTile2, outTile));
+  }
+  else if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_S16))
+  {
+    return(xaiEltwiseSub3D_S16_AV(inTile1, inTile2, outTile));
+  }
+  else if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_U16))
+  {
+    return(xaiEltwiseSub3D_U16_AV(inTile1, inTile2, outTile));
+  }
+  else if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_S32))
+  {
+    return(xaiEltwiseSub3D_S32_AV(inTile1, inTile2, outTile));
+  }
+  else if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_U32))
+  {
+    return(xaiEltwiseSub3D_U32_AV(inTile1, inTile2, outTile));
+  }
+
+#if XCHAL_HAVE_VISION_HP_VFPU == 1
+  else if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_F16))
+  {
+    return(xaiEltwiseSub3D_F16_AV(inTile1, inTile2, outTile));
+  }
+#endif
+
+#if XCHAL_HAVE_VISION_SP_VFPU == 1
+  else if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_F32))
+  {
+    return(xaiEltwiseSub3D_F32_AV(inTile1, inTile2, outTile));
+  }
+#endif
+  /* No typed variant matched (or its VFPU option is not configured): outTile was not written. */
+  return(XAI_ERR_BADARG);
+}
+
diff --git a/backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_sub.h b/backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_sub.h
new file mode 100644
index 00000000000..26aca7d8b13
--- /dev/null
+++ b/backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_sub.h
@@ -0,0 +1,224 @@
+/*
+ * Copyright (c) 2018 by Cadence Design Systems, Inc. ALL RIGHTS RESERVED.
+ * These coded instructions, statements, and computer programs are the
+ * copyrighted works and confidential proprietary information of
+ * Cadence Design Systems Inc. They may be adapted and modified by bona fide
+ * purchasers for internal use, but neither the original nor any adapted
+ * or modified version may be disclosed or distributed to third parties
+ * in any manner, medium, or form, in whole or in part, without the prior
+ * written consent of Cadence Design Systems Inc. This software and its
+ * derivatives are to be executed solely on products incorporating a Cadence
+ * Design Systems processor.
+ */
+#include "xai_cnn_common.h"
+
+
+#ifndef SIGNED8BIT
+#define SIGNED8BIT     1
+#define UNSIGNED8BIT   2
+#define SIGNED16BIT    3
+#define UNSIGNED16BIT  4
+#define SIGNED32BIT    5
+#define UNSIGNED32BIT  6
+#define FLOAT16BIT     7
+#define FLOAT32BIT     8
+#endif
+
+#if ELTSUB_DATA_TYPE == SIGNED8BIT
+#undef MAKE_NAME
+#undef MORPH_IDT_CHECK
+#undef MORPH_IDT_SCALAR
+#define MAKE_NAME(name)   name ## _S8_AV
+#define MORPH_IDT_CHECK   XAI_CHECK_TILE3D_S8
+#define MORPH_IDT_SCALAR  int8_t
+
+#elif ELTSUB_DATA_TYPE == UNSIGNED8BIT
+#undef MAKE_NAME
+#undef MORPH_IDT_CHECK
+#undef MORPH_IDT_SCALAR
+#define MAKE_NAME(name)   name ## _U8_AV
+#define MORPH_IDT_CHECK   XAI_CHECK_TILE3D_U8
+#define MORPH_IDT_SCALAR  uint8_t
+
+#elif ELTSUB_DATA_TYPE == SIGNED16BIT
+#undef MAKE_NAME
+#undef MORPH_IDT_CHECK
+#undef MORPH_IDT_SCALAR
+#define MAKE_NAME(name)   name ## _S16_AV
+#define MORPH_IDT_CHECK   XAI_CHECK_TILE3D_S16
+#define MORPH_IDT_SCALAR  int16_t
+
+#elif ELTSUB_DATA_TYPE == UNSIGNED16BIT
+#undef MAKE_NAME
+#undef MORPH_IDT_CHECK
+#undef MORPH_IDT_SCALAR
+#define MAKE_NAME(name)   name ## _U16_AV
+#define MORPH_IDT_CHECK   XAI_CHECK_TILE3D_U16
+#define MORPH_IDT_SCALAR  uint16_t
+
+#elif ELTSUB_DATA_TYPE == SIGNED32BIT
+#undef MAKE_NAME
+#undef MORPH_IDT_CHECK
+#undef MORPH_IDT_SCALAR
+#define MAKE_NAME(name)   name ## _S32_AV
+#define MORPH_IDT_CHECK   XAI_CHECK_TILE3D_S32
+#define MORPH_IDT_SCALAR  int32_t
+
+#elif ELTSUB_DATA_TYPE == UNSIGNED32BIT
+#undef MAKE_NAME
+#undef MORPH_IDT_CHECK
+#undef MORPH_IDT_SCALAR
+#define MAKE_NAME(name)   name ## _U32_AV
+#define MORPH_IDT_CHECK   XAI_CHECK_TILE3D_U32
+#define MORPH_IDT_SCALAR  uint32_t
+
+#elif ELTSUB_DATA_TYPE == FLOAT16BIT
+#if XCHAL_HAVE_VISION_HP_VFPU == 1
+#undef MAKE_NAME
+#undef MORPH_IDT_CHECK
+#undef MORPH_IDT_SCALAR
+#define MAKE_NAME(name)   name ## _F16_AV
+#define MORPH_IDT_CHECK   XAI_CHECK_TILE3D_F16
+#define MORPH_IDT_SCALAR  xb_f16
+#endif
+
+#elif ELTSUB_DATA_TYPE == FLOAT32BIT
+#if XCHAL_HAVE_VISION_SP_VFPU == 1
+#undef MAKE_NAME
+#undef MORPH_IDT_CHECK
+#undef MORPH_IDT_SCALAR
+#define MAKE_NAME(name)   name ## _F32_AV
+#define MORPH_IDT_CHECK   XAI_CHECK_TILE3D_F32
+#define MORPH_IDT_SCALAR  float
+#endif
+#endif
+
+#define SUB2(a, b)  ((a) - (b))  /* fully parenthesized so it is safe inside any expression */
+
+
+/**************************** xaiEltwiseSub3D ********************************************/
+/* Description : auto-vectorizable implementation of Broadcast element-wise subtraction  */
+/*               Based on MORPH implementation eight variants are generated for          */
+/*               S8, U8, S16, U16, S32, U32, F16 and F32 data types                      */
+/* Inputs      : inTile1, inTile2                                                        */
+/* Outputs     : XAI Error Code (value of XAI_ERROR_STATUS())                            */
+/* InOuts      : outTile                                                                 */
+/*****************************************************************************************/
+
+XAI_ERR_TYPE MAKE_NAME (xaiEltwiseSub3D)(const xai_pTile3D inTile1, const xai_pTile3D inTile2, xai_pTile3D outTile)
+{
+  /* Error Checks */
+  XAI_ERROR_CHECKS()
+  {
+    MORPH_IDT_CHECK(inTile1);
+    MORPH_IDT_CHECK(inTile2);
+    MORPH_IDT_CHECK(outTile);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile1);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile2);
+    XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile);
+    XAI_CHECK_ERROR(XAI_TILE3D_GET_DATA_ORDER(inTile1) == XAI_TILE3D_GET_DATA_ORDER(inTile2),
+                    XAI_ERR_BADARG, "\nData Order of InputTile1 = %d and InputTile2 = %d\nData Order of InputTile1 and InputTile2 should be same", \
+                    XAI_TILE3D_GET_DATA_ORDER(inTile1), XAI_TILE3D_GET_DATA_ORDER(inTile2));
+    XAI_CHECK_ERROR(XAI_TILE3D_GET_DATA_ORDER(inTile1) == XAI_TILE3D_GET_DATA_ORDER(outTile),
+                    XAI_ERR_BADARG, "\nData Order of InputTile = %d and OutputTile = %d\nData Order of InputTile and OutputTile should be same", \
+                    XAI_TILE3D_GET_DATA_ORDER(inTile1), XAI_TILE3D_GET_DATA_ORDER(outTile));
+    XAI_CHECK_TILE3D_BCAST_DIMENSIONS(inTile1, inTile2, outTile, 1, 1);
+  }
+
+  /* Get Tile Parameters */
+  const int32_t dim1SizeOut   = XAI_TILE3D_GET_DIM1(outTile);
+  const int32_t dim2SizeOut   = XAI_TILE3D_GET_DIM2(outTile);
+  const int32_t dim3SizeOut   = XAI_TILE3D_GET_DIM3(outTile);
+  const int32_t outTilePitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile);
+  const int32_t outTilePitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile);
+
+  /* Get Data Pointers */
+  MORPH_IDT_SCALAR *pInput1 = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(inTile1);
+  MORPH_IDT_SCALAR *pInput2 = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(inTile2);
+  MORPH_IDT_SCALAR *pOutput = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(outTile);
+
+  MORPH_IDT_SCALAR *__restrict pIn1;
+  MORPH_IDT_SCALAR *__restrict pIn2;
+  MORPH_IDT_SCALAR *__restrict pOut;
+
+  /* Get Pitch appropriate for elementwise broadcast operations (supplies the inTile*Pitch* locals used below) */
+  XAI_TILE3D_GET_BCAST123_PITCH(inTile1, inTile2, inTile1Pitch0, inTile2Pitch0, inTile1Pitch1, \
+                                inTile2Pitch1, inTile1Pitch2, inTile2Pitch2);
+
+  /* No broadcast and no row/plane padding in any tile: the whole tile is one flat run of elements */
+  if ((dim1SizeOut == 1 || (inTile1Pitch0 != 0 && inTile2Pitch0 != 0)) &&
+      (dim2SizeOut == 1 || (inTile1Pitch1 == dim1SizeOut && inTile2Pitch1 == dim1SizeOut && outTilePitch1 == dim1SizeOut)) &&
+      (dim3SizeOut == 1 || (inTile1Pitch2 == dim1SizeOut * dim2SizeOut && inTile2Pitch2 == inTile1Pitch2 && outTilePitch2 == inTile1Pitch2)))
+  {
+    int dimsCount = dim1SizeOut * dim2SizeOut * dim3SizeOut;
+    pIn1 = pInput1; pIn2 = pInput2; pOut = pOutput;
+
+    for (int i = 0; i < dimsCount; i++)
+    {
+      pOut[i] = SUB2(pIn1[i], pIn2[i]);
+    }
+  }
+  else
+  {
+    /*
+       inTile1Pitch0 == 0 : Tile1 Dimension 1 broadcasting
+       inTile1Pitch1 == 0 : Tile1 Dimension 2 broadcasting
+       inTile1Pitch2 == 0 : Tile1 Dimension 3 broadcasting
+       inTile2Pitch0 == 0 : Tile2 Dimension 1 broadcasting
+       inTile2Pitch1 == 0 : Tile2 Dimension 2 broadcasting
+       inTile2Pitch2 == 0 : Tile2 Dimension 3 broadcasting
+     */
+    int32_t y, z, idx;
+
+    for (z = 0; z < dim3SizeOut; z++)
+    {
+      MORPH_IDT_SCALAR* temp1 = pInput1 + z * inTile1Pitch2;
+      MORPH_IDT_SCALAR* temp2 = pInput2 + z * inTile2Pitch2;
+      MORPH_IDT_SCALAR* temp3 = pOutput + z * outTilePitch2;
+
+      for (y = 0; y < dim2SizeOut; y++)
+      {
+        pIn1 = (temp1 + y * inTile1Pitch1);
+        pIn2 = (temp2 + y * inTile2Pitch1);
+        pOut = (temp3 + y * outTilePitch1);
+
+        MORPH_IDT_SCALAR InData1, InData2;
+        /* Tile1 Dimension 1 broadcasting */
+        if (inTile1Pitch0 == 0)
+        {
+          InData1 = pIn1[0];
+          /* reduced one load from core loop */
+          for (idx = 0; idx < dim1SizeOut; idx++)
+          {
+            InData2   = pIn2[idx];
+            pOut[idx] = SUB2(InData1, InData2);
+          }
+        }
+        /* Tile2 Dimension 1 broadcasting */
+        else if (inTile2Pitch0 == 0)
+        {
+          InData2 = pIn2[0];
+          /* reduced one load from core loop */
+          for (idx = 0; idx < dim1SizeOut; idx++)
+          {
+            InData1   = pIn1[idx];
+            pOut[idx] = SUB2(InData1, InData2);
+          }
+        }
+        else
+        {
+          /* no dimension 1 broadcast; any dimension 2/3 broadcast is already applied through the zero pitches */
+          for (idx = 0; idx < dim1SizeOut; idx++)
+          {
+            InData1   = pIn1[idx];
+            InData2   = pIn2[idx];
+            pOut[idx] = SUB2(InData1, InData2);
+          }
+        }
+      }
+    }
+  }
+
+  return(XAI_ERROR_STATUS());
+}
+
diff --git a/backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_xor.c b/backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_xor.c
new file mode 100644
index 00000000000..f1e97a03418
--- /dev/null
+++ b/backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_xor.c
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2018 by Cadence Design Systems, Inc. ALL RIGHTS RESERVED.
+ * These coded instructions, statements, and computer programs are the
+ * copyrighted works and confidential proprietary information of
+ * Cadence Design Systems Inc. They may be adapted and modified by bona fide
+ * purchasers for internal use, but neither the original nor any adapted
+ * or modified version may be disclosed or distributed to third parties
+ * in any manner, medium, or form, in whole or in part, without the prior
+ * written consent of Cadence Design Systems Inc. This software and its
+ * derivatives are to be executed solely on products incorporating a Cadence
+ * Design Systems processor.
+ */
+
+#include "xai_cnn_common.h"
+
+
+#define ELTXOR_DATA_TYPE  SIGNED8BIT
+#include "cnn_eltwise_xor.h"
+#undef ELTXOR_DATA_TYPE
+
+#define ELTXOR_DATA_TYPE  UNSIGNED8BIT
+#include "cnn_eltwise_xor.h"
+#undef ELTXOR_DATA_TYPE
+
+#define ELTXOR_DATA_TYPE  SIGNED16BIT
+#include "cnn_eltwise_xor.h"
+#undef ELTXOR_DATA_TYPE
+
+#define ELTXOR_DATA_TYPE  UNSIGNED16BIT
+#include "cnn_eltwise_xor.h"
+#undef ELTXOR_DATA_TYPE
+
+#define ELTXOR_DATA_TYPE  SIGNED32BIT
+#include "cnn_eltwise_xor.h"
+#undef ELTXOR_DATA_TYPE
+
+#define ELTXOR_DATA_TYPE  UNSIGNED32BIT
+#include "cnn_eltwise_xor.h"
+#undef ELTXOR_DATA_TYPE
+
+
+/**************************** xaiEltwiseXor3D_AV *****************************************/
+/* Description : General API for auto-vectorizable Broadcast element-wise and            */
+/*               bitwise XOR operator. Dispatches on the data type of inTile1 to the     */
+/*               matching typed variant (S8, U8, S16, U16, S32 or U32).                  */
+/* Inputs      : inTile1, inTile2                                                        */
+/* Outputs     : XAI error code (NULLARG: NULL tile, BADARG: inTile1 type unsupported)   */
+/* InOuts      : outTile                                                                 */
+/*****************************************************************************************/
+
+XAI_ERR_TYPE xaiEltwiseXor3D_AV(const xai_pTile3D inTile1,
+                                const xai_pTile3D inTile2,
+                                xai_pTile3D outTile)
+{
+  if (!inTile1 || !inTile2 || !outTile)
+  {
+    return(XAI_ERR_NULLARG);
+  }
+
+  if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_S8))
+  {
+    return(xaiEltwiseXor3D_S8_AV(inTile1, inTile2, outTile));
+  }
+  else if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_U8))
+  {
+    return(xaiEltwiseXor3D_U8_AV(inTile1, inTile2, outTile));
+  }
+  else if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_S16))
+  {
+    return(xaiEltwiseXor3D_S16_AV(inTile1, inTile2, outTile));
+  }
+  else if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_U16))
+  {
+    return(xaiEltwiseXor3D_U16_AV(inTile1, inTile2, outTile));
+  }
+  else if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_S32))
+  {
+    return(xaiEltwiseXor3D_S32_AV(inTile1, inTile2, outTile));
+  }
+  else if (XAI_TILE3D_CHECK_TYPE(inTile1, XAI_U32))
+  {
+    return(xaiEltwiseXor3D_U32_AV(inTile1, inTile2, outTile));
+  }
+  /* No typed variant matched: outTile was not written, so do not report success. */
+  return(XAI_ERR_BADARG);
+}
+
diff --git a/backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_xor.h b/backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_xor.h
new file mode 100644
index 00000000000..28cd63389f0
--- /dev/null
+++ b/backends/cadence/vision/third-party/libxai_common/src/cnn_eltwise_xor.h
@@ -0,0 +1,202 @@
+/*
+ * Copyright (c) 2018 by Cadence Design Systems, Inc. ALL RIGHTS RESERVED.
+ * These coded instructions, statements, and computer programs are the
+ * copyrighted works and confidential proprietary information of
+ * Cadence Design Systems Inc. They may be adapted and modified by bona fide
+ * purchasers for internal use, but neither the original nor any adapted
+ * or modified version may be disclosed or distributed to third parties
+ * in any manner, medium, or form, in whole or in part, without the prior
+ * written consent of Cadence Design Systems Inc. This software and its
+ * derivatives are to be executed solely on products incorporating a Cadence
+ * Design Systems processor.
+ */ +#include "xai_cnn_common.h" + + +#ifndef SIGNED8BIT +#define SIGNED8BIT 1 +#define UNSIGNED8BIT 2 +#define SIGNED16BIT 3 +#define UNSIGNED16BIT 4 +#define SIGNED32BIT 5 +#define UNSIGNED32BIT 6 +#endif + +#if ELTXOR_DATA_TYPE == SIGNED8BIT +#undef MAKE_NAME +#undef MORPH_IDT_CHECK +#undef MORPH_IDT_SCALAR +#define MAKE_NAME(name) name ## _S8_AV +#define MORPH_IDT_CHECK XAI_CHECK_TILE3D_S8 +#define MORPH_IDT_SCALAR int8_t + +#elif ELTXOR_DATA_TYPE == UNSIGNED8BIT +#undef MAKE_NAME +#undef MORPH_IDT_CHECK +#undef MORPH_IDT_SCALAR +#define MAKE_NAME(name) name ## _U8_AV +#define MORPH_IDT_CHECK XAI_CHECK_TILE3D_U8 +#define MORPH_IDT_SCALAR uint8_t + +#elif ELTXOR_DATA_TYPE == SIGNED16BIT +#undef MAKE_NAME +#undef MORPH_IDT_CHECK +#undef MORPH_IDT_SCALAR +#define MAKE_NAME(name) name ## _S16_AV +#define MORPH_IDT_CHECK XAI_CHECK_TILE3D_S16 +#define MORPH_IDT_SCALAR int16_t + +#elif ELTXOR_DATA_TYPE == UNSIGNED16BIT +#undef MAKE_NAME +#undef MORPH_IDT_CHECK +#undef MORPH_IDT_SCALAR +#define MAKE_NAME(name) name ## _U16_AV +#define MORPH_IDT_CHECK XAI_CHECK_TILE3D_U16 +#define MORPH_IDT_SCALAR uint16_t + +#elif ELTXOR_DATA_TYPE == SIGNED32BIT +#undef MAKE_NAME +#undef MORPH_IDT_CHECK +#undef MORPH_IDT_SCALAR +#define MAKE_NAME(name) name ## _S32_AV +#define MORPH_IDT_CHECK XAI_CHECK_TILE3D_S32 +#define MORPH_IDT_SCALAR int32_t + +#elif ELTXOR_DATA_TYPE == UNSIGNED32BIT +#undef MAKE_NAME +#undef MORPH_IDT_CHECK +#undef MORPH_IDT_SCALAR +#define MAKE_NAME(name) name ## _U32_AV +#define MORPH_IDT_CHECK XAI_CHECK_TILE3D_U32 +#define MORPH_IDT_SCALAR uint32_t +#endif + +#define XOR2(a, b) a ^ b + + +/**************************** xaiEltwiseXor3D *********************************************/ +/* Description : auto-vectorizable implementation of Broadcast elementWise and bitwise */ +/* XOR operator, Based on MORPH implementation eight variants are */ +/* generated for S8, U8, S16, U16, S32, and U32 data types */ +/* Inputs : inTile1, inTile2 */ +/* Outputs : XI Error Code 
*/ +/* InOuts : outTile */ +/*****************************************************************************************/ + +XAI_ERR_TYPE MAKE_NAME (xaiEltwiseXor3D)(const xai_pTile3D inTile1, const xai_pTile3D inTile2, xai_pTile3D outTile) +{ + /* Error Checks */ + XAI_ERROR_CHECKS() + { + MORPH_IDT_CHECK(inTile1); + MORPH_IDT_CHECK(inTile2); + MORPH_IDT_CHECK(outTile); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile1); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(inTile2); + XAI_CHECK_TILE3D_IN_DRAM_BOUNDARY(outTile); + XAI_CHECK_ERROR(XAI_TILE3D_GET_DATA_ORDER(inTile1) == XAI_TILE3D_GET_DATA_ORDER(inTile2), + XAI_ERR_BADARG, "\nData Order of InputTile1 = %d and InputTile2 = %d\nData Order of InputTile1 and InputTile2 should be same", \ + XAI_TILE3D_GET_DATA_ORDER(inTile1), XAI_TILE3D_GET_DATA_ORDER(inTile2)); + XAI_CHECK_ERROR(XAI_TILE3D_GET_DATA_ORDER(inTile1) == XAI_TILE3D_GET_DATA_ORDER(outTile), + XAI_ERR_BADARG, "\nData Order of InputTile = %d and OutputTile = %d\nData Order of InputTile and OutputTile should be same", \ + XAI_TILE3D_GET_DATA_ORDER(inTile1), XAI_TILE3D_GET_DATA_ORDER(outTile)); + XAI_CHECK_TILE3D_BCAST_DIMENSIONS(inTile1, inTile2, outTile, 1, 1); + } + + /* Get Tile Parameters */ + const int32_t dim1SizeOut = XAI_TILE3D_GET_DIM1(outTile); + const int32_t dim2SizeOut = XAI_TILE3D_GET_DIM2(outTile); + const int32_t dim3SizeOut = XAI_TILE3D_GET_DIM3(outTile); + const int32_t outTilePitch1 = XAI_TILE3D_GET_DIM1_PITCH(outTile); + const int32_t outTilePitch2 = XAI_TILE3D_GET_DIM2_PITCH(outTile); + + /* Get Data Pointers */ + MORPH_IDT_SCALAR *pInput1 = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(inTile1); + MORPH_IDT_SCALAR *pInput2 = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(inTile2); + MORPH_IDT_SCALAR *pOutput = (MORPH_IDT_SCALAR *) XAI_TILE3D_GET_DATA_PTR(outTile); + + MORPH_IDT_SCALAR *__restrict pIn1; + MORPH_IDT_SCALAR *__restrict pIn2; + MORPH_IDT_SCALAR *__restrict pOut; + + /* Get Pitch appropriate for elementwise broadcast operations */ + 
XAI_TILE3D_GET_BCAST123_PITCH(inTile1, inTile2, inTile1Pitch0, inTile2Pitch0, inTile1Pitch1, \ + inTile2Pitch1, inTile1Pitch2, inTile2Pitch2); + + /* no Broadcast */ + if (inTile1Pitch2 == inTile2Pitch2 && inTile1Pitch2 == outTilePitch2) + { + int dimsCount = dim1SizeOut * dim2SizeOut * dim3SizeOut; + pIn1 = pInput1; + pIn2 = pInput2; + pOut = pOutput; + + for (int i = 0; i < dimsCount; i++) + { + pOut[i] = XOR2(pIn1[i], pIn2[i]); + } + } + else + { + /* + inTile1Pitch0 == 0 : Tile1 Dimension 1 broadcasting + inTile1Pitch1 == 0 : Tile1 Dimension 2 broadcasting + inTile1Pitch2 == 0 : Tile1 Dimension 3 broadcasting + inTile2Pitch0 == 0 : Tile2 Dimension 1 broadcasting + inTile2Pitch1 == 0 : Tile2 Dimension 2 broadcasting + inTile2Pitch2 == 0 : Tile2 Dimension 3 broadcasting + */ + int32_t y, z, idx; + + for (z = 0; z < dim3SizeOut; z++) + { + MORPH_IDT_SCALAR* temp1 = pInput1 + z * inTile1Pitch2; + MORPH_IDT_SCALAR* temp2 = pInput2 + z * inTile2Pitch2; + MORPH_IDT_SCALAR* temp3 = pOutput + z * outTilePitch2; + + for (y = 0; y < dim2SizeOut; y++) + { + pIn1 = (temp1 + y * inTile1Pitch1); + pIn2 = (temp2 + y * inTile2Pitch1); + pOut = (temp3 + y * outTilePitch1); + + MORPH_IDT_SCALAR InData1, InData2; + /* Tile1 Dimension 1 broadcasting */ + if (inTile1Pitch0 == 0) + { + InData1 = pIn1[0]; + /* reduced one load from core loop */ + for (idx = 0; idx < dim1SizeOut; idx++) + { + InData2 = pIn2[idx]; + pOut[idx] = XOR2(InData1, InData2); + } + } + /* Tile2 Dimension 1 broadcasting */ + else if (inTile2Pitch0 == 0) + { + InData2 = pIn2[0]; + /* reduced one load from core loop */ + for (idx = 0; idx < dim1SizeOut; idx++) + { + InData1 = pIn1[idx]; + pOut[idx] = XOR2(InData1, InData2); + } + } + else + { + /* no Dimension 1 broadcasting; any broadcast is in Dimension 2 or 3 of Tile1 or Tile2 */ + for (idx = 0; idx < dim1SizeOut; idx++) + { + InData1 = pIn1[idx]; + InData2 = pIn2[idx]; + pOut[idx] = XOR2(InData1, InData2); + } + } + } + } + } + + return(XAI_ERROR_STATUS()); +} + diff --git 
a/backends/cadence/vision/third-party/libxai_common/src/xai_buildinfo.c b/backends/cadence/vision/third-party/libxai_common/src/xai_buildinfo.c new file mode 100644 index 00000000000..87665867c54 --- /dev/null +++ b/backends/cadence/vision/third-party/libxai_common/src/xai_buildinfo.c @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2021 by Cadence Design Systems, Inc. ALL RIGHTS RESERVED. + * These coded instructions, statements, and computer programs are the + * copyrighted works and confidential proprietary information of + * Cadence Design Systems Inc. They may be adapted and modified by bona fide + * purchasers for internal use, but neither the original nor any adapted + * or modified version may be disclosed or distributed to third parties + * in any manner, medium, or form, in whole or in part, without the prior + * written consent of Cadence Design Systems Inc. This software and its + * derivatives are to be executed solely on products incorporating a Cadence + * Design Systems processor. 
+ */ + +#include "xai_core.h" + +// XI library build configuration - Release/Debug or other +char XAI_BUILD_CONFIGURATION[] = XAI_AUX_STR(_XAI_BUILD_CONFIGURATION_); + +// Xtensa tools version +char XAI_BUILD_TOOLS_VERSION[] = XAI_AUX_STR(_XAI_BUILD_TOOLS_VERSION_); + +// target core name and hardware name +char XAI_BUILD_CORE_ID[] = +#if defined(XCHAL_CORE_ID) && defined(XCHAL_HW_VERSION_NAME) + XCHAL_CORE_ID " (" XCHAL_HW_VERSION_NAME ")" +#elif defined(XCHAL_CORE_ID) + XCHAL_CORE_ID +#else + "CSTUB (x86)" +#endif +; + +// error level +char XAI_BUILD_ERROR_LEVEL[] = +#if XAI_ERROR_LEVEL == XAI_ERROR_LEVEL_PRINT_AND_CONTINUE_ON_ERROR + "PRINT_AND_CONTINUE_ON_ERROR (" XAI_AUX_STR(XAI_ERROR_LEVEL_PRINT_AND_CONTINUE_ON_ERROR) ")" +#elif XAI_ERROR_LEVEL == XAI_ERROR_LEVEL_PRINT_ON_ERROR + "PRINT_ON_ERROR (" XAI_AUX_STR(XAI_ERROR_LEVEL_PRINT_ON_ERROR) ")" +#elif XAI_ERROR_LEVEL == XAI_ERROR_LEVEL_CONTINUE_ON_ERROR + "CONTINUE_ON_ERROR (" XAI_AUX_STR(XAI_ERROR_LEVEL_CONTINUE_ON_ERROR) ")" +#elif XAI_ERROR_LEVEL == XAI_ERROR_LEVEL_RETURN_ON_ERROR + "RETURN_ON_ERROR (" XAI_AUX_STR(XAI_ERROR_LEVEL_RETURN_ON_ERROR) ")" +#elif XAI_ERROR_LEVEL == XAI_ERROR_LEVEL_TERMINATE_ON_ERROR + "TERMINATE_ON_ERROR (" XAI_AUX_STR(XAI_ERROR_LEVEL_TERMINATE_ON_ERROR) ")" +#elif XAI_ERROR_LEVEL == XAI_ERROR_LEVEL_NO_ERROR + "NO_ERROR (" XAI_AUX_STR(XAI_ERROR_LEVEL_NO_ERROR) ")" +#else + XAI_AUX_STR(XAI_ERROR_LEVEL) +#endif +; + +// library features +char XAI_BUILD_FEATURES_STR[] = "" +#if __XTENSA__ && XAI_EMULATE_LOCAL_RAM && XAI_ERROR_LEVEL != XAI_ERROR_LEVEL_NO_ERROR + "DRAM_CHECK " +#endif +; diff --git a/backends/cadence/vision/third-party/libxai_common/src/xai_errstr.c b/backends/cadence/vision/third-party/libxai_common/src/xai_errstr.c new file mode 100644 index 00000000000..be08daac86f --- /dev/null +++ b/backends/cadence/vision/third-party/libxai_common/src/xai_errstr.c @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2021 by Cadence Design Systems, Inc. ALL RIGHTS RESERVED. 
+ * These coded instructions, statements, and computer programs are the + * copyrighted works and confidential proprietary information of + * Cadence Design Systems Inc. They may be adapted and modified by bona fide + * purchasers for internal use, but neither the original nor any adapted + * or modified version may be disclosed or distributed to third parties + * in any manner, medium, or form, in whole or in part, without the prior + * written consent of Cadence Design Systems Inc. This software and its + * derivatives are to be executed solely on products incorporating a Cadence + * Design Systems processor. + */ + +#include "xai_core.h" + +const char* xaiErrStr(XAI_ERR_TYPE code) +{ + switch (code) + { + case XAI_ERR_OK: return("No error"); + case XAI_ERR_IALIGNMENT: return("Input alignment requirements are not satisfied"); + case XAI_ERR_OALIGNMENT: return("Output alignment requirements are not satisfied"); + case XAI_ERR_MALIGNMENT: return("Same modulo alignment requirement is not satisfied"); + case XAI_ERR_BADARG: return("Function arguments are somehow invalid"); + case XAI_ERR_MEMLOCAL: return("Tile is not placed in local memory"); + case XAI_ERR_INPLACE: return("Inplace operation is not supported"); + case XAI_ERR_EDGE: return("Edge extension size is too small"); + case XAI_ERR_DATASIZE: return("Input/output tile size is too small or too big or otherwise inconsistent"); + case XAI_ERR_TMPSIZE: return("Temporary tile size is too small or otherwise inconsistent"); + case XAI_ERR_KSIZE: return("Filer kernel size is not supported"); + case XAI_ERR_NORM: return("Invalid normalization divisor or shift value"); + case XAI_ERR_COORD: return("Tile coordinates are invalid"); + case XAI_ERR_BADTRANSFORM: return("Transform is singular or otherwise invalid"); + case XAI_ERR_NULLARG: return("One of required arguments is NULL"); + case XAI_ERR_THRESH_INVALID: return("Threshold value is somehow invalid"); + case XAI_ERR_SCALE: return("Provided scale factor is not 
supported"); + case XAI_ERR_OVERFLOW: return("Tile size can lead to sum overflow"); + case XAI_ERR_NOTIMPLEMENTED: return("The requested functionality is absent in current version of XI Library"); + case XAI_ERR_CHANNEL_INVALID: return("Channel number is somehow invalid"); + case XAI_ERR_DATATYPE: return("Argument has invalid data type"); + case XAI_ERR_NO_VARIANT: return("No suitable variant of the function is available"); + case XAI_ERR_CUSTOMACC_PREPARE: return("Preparing custom acc hardware fails"); + case XAI_ERR_CUSTOMACC_EXECUTE: return("Executing ops on custom acc hardware fails"); + case XAI_ERR_CUSTOMACC_REMOVE: return("Removing a network for custom acc hardware fails"); + + case XAI_ERR_POOR_DECOMPOSITION: return("Computed transform decomposition can produce visual artifacts"); + case XAI_ERR_OUTOFTILE: return("The arguments or results are out of tile"); + case XAI_ERR_OBJECTLOST: return("Tracked object is lost"); + case XAI_ERR_RANSAC_NOTFOUND: return("Unable to find an appropriate model for RANSAC"); + case XAI_ERR_REPLAY: return("Repeated function call is required for completion"); + } + ; + return("Unknown error"); +} + From c2a48d2f5d04292d0a3f40c6c6023cdb2e750ad1 Mon Sep 17 00:00:00 2001 From: Suraj Raut Date: Fri, 29 May 2026 00:49:33 -0700 Subject: [PATCH 2/7] Reset non-Cadence files to upstream/main --- .ci/docker/common/install_linter.sh | 4 - .github/workflows/lint.yml | 46 --- .github/workflows/mlx.yml | 4 + 1 | 1 + backends/arm/tosa/partitioner.py | 5 +- backends/xnnpack/third-party/XNNPACK | 2 +- backends/xnnpack/third-party/cpuinfo | 2 +- backends/xnnpack/third-party/pthreadpool | 2 +- extension/llm/tokenizers | 2 +- kernels/portable/cpu/op__device_copy.cpp | 154 +++++++++ kernels/portable/functions.yaml | 10 + kernels/test/op__device_copy_test.cpp | 297 ++++++++++++++++++ kernels/test/targets.bzl | 14 +- resnet18_log1.log | 0 shim | 2 +- shim_et/xplat/executorch/codegen/codegen.bzl | 1 + .../kernels/portable/op_registration_util.bzl | 6 + 
third-party/ao | 2 +- third-party/pocketfft | 2 +- 19 files changed, 495 insertions(+), 61 deletions(-) create mode 100644 1 create mode 100644 kernels/portable/cpu/op__device_copy.cpp create mode 100644 kernels/test/op__device_copy_test.cpp create mode 100644 resnet18_log1.log diff --git a/.ci/docker/common/install_linter.sh b/.ci/docker/common/install_linter.sh index 52d2d262685..4a796a72d54 100755 --- a/.ci/docker/common/install_linter.sh +++ b/.ci/docker/common/install_linter.sh @@ -13,7 +13,3 @@ source "$(dirname "${BASH_SOURCE[0]}")/utils.sh" # NB: Install all linter dependencies, the caching of lintrunner init could be # done after Executorch becomes public pip_install -r requirements-lintrunner.txt - -# Install google-java-format -curl -L --retry 3 --retry-all-errors https://github.com/google/google-java-format/releases/download/v1.23.0/google-java-format_linux-x86-64 > /opt/google-java-format -chmod +x /opt/google-java-format diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index b26247d2333..b21cc527b8d 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -125,49 +125,3 @@ jobs: uses: ./.github/workflows/_link_check.yml with: ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - - android-java-format: - runs-on: ubuntu-latest - permissions: - contents: read - steps: - - uses: actions/checkout@v4 - with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - - - uses: actions/setup-java@v4 - with: - distribution: 'temurin' - java-version: '17' - - - name: Check Java formatting - run: | - GOOGLE_JAVA_FORMAT_VERSION="1.24.0" - curl -sSfL "https://github.com/google/google-java-format/releases/download/v${GOOGLE_JAVA_FORMAT_VERSION}/google-java-format-${GOOGLE_JAVA_FORMAT_VERSION}-all-deps.jar" \ - -o /tmp/google-java-format.jar - - FILES_NEEDS_FORMAT=$(find 
extension/android/executorch_android/src/main/java/org/pytorch/executorch \ - extension/android/executorch_android/src/main/java/org/pytorch/executorch/extension/llm \ - extension/android/executorch_android/src/main/java/org/pytorch/executorch/annotations \ - extension/android/executorch_android/src/androidTest/java/org/pytorch/executorch \ - extension/benchmark/android/benchmark/app/src/main/java/org/pytorch/minibench \ - extension/benchmark/android/benchmark/app/src/androidTest/java/org/pytorch/minibench \ - -type f -name "*.java" 2>/dev/null | \ - xargs -r java -jar /tmp/google-java-format.jar -n) - - if [ -n "$FILES_NEEDS_FORMAT" ]; then - echo "Warning: The following files need formatting:" - echo "$FILES_NEEDS_FORMAT" - echo "" - echo "Please use google-java-format from https://github.com/google/google-java-format/releases/" - echo "" - echo "To fix, run one of these commands:" - echo " # Using xargs (recommended):" - echo " find -type f -name '*.java' | xargs google-java-format -i" - echo "" - echo " # Or format specific files:" - echo "$FILES_NEEDS_FORMAT" | while IFS= read -r file; do - echo " google-java-format -i \"$file\"" - done - exit 1 - fi diff --git a/.github/workflows/mlx.yml b/.github/workflows/mlx.yml index c4be146f862..027101ba7f0 100644 --- a/.github/workflows/mlx.yml +++ b/.github/workflows/mlx.yml @@ -47,6 +47,10 @@ jobs: ${CONDA_RUN} pip list + echo "::group::Install Python test requirements" + ${CONDA_RUN} pip install gguf + echo "::endgroup::" + echo "::group::Build test runners" ${CONDA_RUN} cmake --build cmake-out --target op_test_runner multi_thread_test_runner -j$(( $(sysctl -n hw.ncpu) - 1 )) echo "::endgroup::" diff --git a/1 b/1 new file mode 100644 index 00000000000..8462b88277b --- /dev/null +++ b/1 @@ -0,0 +1 @@ +usage: list-sessions [-F format] diff --git a/backends/arm/tosa/partitioner.py b/backends/arm/tosa/partitioner.py index d93e212c314..37b9cd7cc2a 100644 --- a/backends/arm/tosa/partitioner.py +++ 
b/backends/arm/tosa/partitioner.py @@ -550,7 +550,10 @@ def partition(self, exported_program: ExportedProgram) -> PartitionResult: partition_tags = {tag: self.delegation_spec for tag in tags} tag_constant_data(exported_program) - if self.intermediate_path is not None and logger.level <= logging.INFO: + if ( + self.intermediate_path is not None + and logger.getEffectiveLevel() <= logging.INFO + ): intermediate_path = Path(self.intermediate_path) intermediate_path.mkdir(parents=True, exist_ok=True) file_handler = logging.FileHandler( diff --git a/backends/xnnpack/third-party/XNNPACK b/backends/xnnpack/third-party/XNNPACK index 1adaa7c709d..3131afead79 160000 --- a/backends/xnnpack/third-party/XNNPACK +++ b/backends/xnnpack/third-party/XNNPACK @@ -1 +1 @@ -Subproject commit 1adaa7c709d4839d29e1f219cb962b01c9e6a905 +Subproject commit 3131afead790c5c69a9aa12273dfc40399789ad7 diff --git a/backends/xnnpack/third-party/cpuinfo b/backends/xnnpack/third-party/cpuinfo index f9a03241f8c..8a9210069b5 160000 --- a/backends/xnnpack/third-party/cpuinfo +++ b/backends/xnnpack/third-party/cpuinfo @@ -1 +1 @@ -Subproject commit f9a03241f8c3d4ed0c9728f5d70bff873d43d4e0 +Subproject commit 8a9210069b5a37dd89ed118a783945502a30a4ae diff --git a/backends/xnnpack/third-party/pthreadpool b/backends/xnnpack/third-party/pthreadpool index a56dcd79c69..c2ba5c50bb5 160000 --- a/backends/xnnpack/third-party/pthreadpool +++ b/backends/xnnpack/third-party/pthreadpool @@ -1 +1 @@ -Subproject commit a56dcd79c699366e7ac6466792c3025883ff7704 +Subproject commit c2ba5c50bb58d1397b693740cf75fad836a0d1bf diff --git a/extension/llm/tokenizers b/extension/llm/tokenizers index b642403834a..3aada3fe28c 160000 --- a/extension/llm/tokenizers +++ b/extension/llm/tokenizers @@ -1 +1 @@ -Subproject commit b642403834a67c8ef14a7109dcd1bb5e5f3cb68a +Subproject commit 3aada3fe28c945d14d5ec62254eb56ccdf10eb11 diff --git a/kernels/portable/cpu/op__device_copy.cpp b/kernels/portable/cpu/op__device_copy.cpp new file mode 
100644 index 00000000000..5e1a51a83be --- /dev/null +++ b/kernels/portable/cpu/op__device_copy.cpp @@ -0,0 +1,154 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +/** + * Runtime kernels for et_copy._h2d_copy and et_copy._d2h_copy ops. + * + * These ops transfer tensor data between CPU and device memory using + * the DeviceAllocator interface. The device type is inferred from the + * tensor metadata (out.device_type() for H2D, self.device_type() for D2H), + * which was set during AOT serialization by PropagateDevicePass. + */ + +#include +#include +#include + +namespace torch { +namespace executor { +namespace native { + +using Tensor = executorch::aten::Tensor; +using DeviceAllocator = executorch::runtime::DeviceAllocator; +using Error = executorch::runtime::Error; + +/** + * Copies tensor data from host (CPU) memory to device memory. + * + * self: source tensor on CPU + * out: destination tensor on device (memory-planned by runtime) + * + * The device type and index are inferred from out's TensorImpl metadata. 
+ */ +Tensor& +_h2d_copy_out(KernelRuntimeContext& ctx, const Tensor& self, Tensor& out) { + auto device_type = out.unsafeGetTensorImpl()->device_type(); + auto device_index = out.unsafeGetTensorImpl()->device_index(); + + ET_KERNEL_CHECK_MSG( + ctx, + self.unsafeGetTensorImpl()->device_type() == + executorch::runtime::etensor::DeviceType::CPU, + InvalidArgument, + out, + "_h2d_copy: source tensor must be on CPU, got device_type=%d", + static_cast(self.unsafeGetTensorImpl()->device_type())); + + ET_KERNEL_CHECK_MSG( + ctx, + device_type != executorch::runtime::etensor::DeviceType::CPU, + InvalidArgument, + out, + "_h2d_copy: destination tensor must be on a non-CPU device"); + + auto nbytes = self.nbytes(); + ET_KERNEL_CHECK_MSG( + ctx, + nbytes == out.nbytes(), + InvalidArgument, + out, + "_h2d_copy: size mismatch: self.nbytes()=%zu, out.nbytes()=%zu", + nbytes, + out.nbytes()); + + DeviceAllocator* allocator = + executorch::runtime::get_device_allocator(device_type); + ET_KERNEL_CHECK_MSG( + ctx, + allocator != nullptr, + NotFound, + out, + "_h2d_copy: no device allocator registered for device_type=%d", + static_cast(device_type)); + + Error err = allocator->copy_host_to_device( + out.mutable_data_ptr(), self.const_data_ptr(), nbytes, device_index); + ET_KERNEL_CHECK_MSG( + ctx, + err == Error::Ok, + Internal, + out, + "_h2d_copy: copy_host_to_device failed"); + + return out; +} + +/** + * Copies tensor data from device memory to host (CPU) memory. + * + * self: source tensor on device + * out: destination tensor on CPU (memory-planned by runtime) + * + * The device type and index are inferred from self's TensorImpl metadata. 
+ */ +Tensor& +_d2h_copy_out(KernelRuntimeContext& ctx, const Tensor& self, Tensor& out) { + auto device_type = self.unsafeGetTensorImpl()->device_type(); + auto device_index = self.unsafeGetTensorImpl()->device_index(); + + ET_KERNEL_CHECK_MSG( + ctx, + device_type != executorch::runtime::etensor::DeviceType::CPU, + InvalidArgument, + out, + "_d2h_copy: source tensor must be on a non-CPU device"); + + ET_KERNEL_CHECK_MSG( + ctx, + out.unsafeGetTensorImpl()->device_type() == + executorch::runtime::etensor::DeviceType::CPU, + InvalidArgument, + out, + "_d2h_copy: destination tensor must be on CPU, got device_type=%d", + static_cast(out.unsafeGetTensorImpl()->device_type())); + + auto nbytes = self.nbytes(); + ET_KERNEL_CHECK_MSG( + ctx, + nbytes == out.nbytes(), + InvalidArgument, + out, + "_d2h_copy: size mismatch: self.nbytes()=%zu, out.nbytes()=%zu", + nbytes, + out.nbytes()); + + DeviceAllocator* allocator = + executorch::runtime::get_device_allocator(device_type); + ET_KERNEL_CHECK_MSG( + ctx, + allocator != nullptr, + NotFound, + out, + "_d2h_copy: no device allocator registered for device_type=%d", + static_cast(device_type)); + + Error err = allocator->copy_device_to_host( + out.mutable_data_ptr(), self.const_data_ptr(), nbytes, device_index); + ET_KERNEL_CHECK_MSG( + ctx, + err == Error::Ok, + Internal, + out, + "_d2h_copy: copy_device_to_host failed"); + + return out; +} + +} // namespace native +} // namespace executor +} // namespace torch diff --git a/kernels/portable/functions.yaml b/kernels/portable/functions.yaml index 620d97d050f..ecf62ee3606 100644 --- a/kernels/portable/functions.yaml +++ b/kernels/portable/functions.yaml @@ -1045,6 +1045,16 @@ - arg_meta: null kernel_name: torch::executor::zeros_out +- func: et_copy::_h2d_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + kernels: + - arg_meta: null + kernel_name: torch::executor::_h2d_copy_out + +- func: et_copy::_d2h_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) 
+ kernels: + - arg_meta: null + kernel_name: torch::executor::_d2h_copy_out + - func: dim_order_ops::_empty_dim_order.out(int[] size, *, int[]? dim_order=None, Tensor(a!) out) -> Tensor(a!) kernels: - arg_meta: null diff --git a/kernels/test/op__device_copy_test.cpp b/kernels/test/op__device_copy_test.cpp new file mode 100644 index 00000000000..d345642bd37 --- /dev/null +++ b/kernels/test/op__device_copy_test.cpp @@ -0,0 +1,297 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +/** + * Tests for et_copy._h2d_copy.out and et_copy._d2h_copy.out runtime kernels. + * + * Uses a MockDeviceAllocator to verify that the kernels correctly call + * copy_host_to_device / copy_device_to_host via the DeviceAllocator interface, + * and that device type is inferred from tensor metadata. + */ + +#include + +#include // Declares the operator +#include +#include +#include +#include +#include + +using executorch::aten::ScalarType; +using executorch::aten::Tensor; +using executorch::aten::TensorImpl; +using executorch::runtime::DeviceAllocator; +using executorch::runtime::Error; +using executorch::runtime::get_device_allocator; +using executorch::runtime::register_device_allocator; +using executorch::runtime::Result; +using executorch::runtime::etensor::DeviceIndex; +using executorch::runtime::etensor::DeviceType; + +using TensorShapeDynamism = executorch::runtime::TensorShapeDynamism; + +namespace { + +class MockDeviceAllocator : public DeviceAllocator { + public: + Result allocate( + size_t nbytes, + DeviceIndex index, + size_t alignment = kDefaultAlignment) override { + return Error::NotSupported; + } + + void deallocate(void* ptr, DeviceIndex index) override {} + + Error copy_host_to_device( + void* dst, + const void* src, + size_t nbytes, + DeviceIndex index) override { + h2d_call_count_++; + 
last_h2d_nbytes_ = nbytes; + last_h2d_device_index_ = index; + // Actually copy so we can verify data + std::memcpy(dst, src, nbytes); + return Error::Ok; + } + + Error copy_device_to_host( + void* dst, + const void* src, + size_t nbytes, + DeviceIndex index) override { + d2h_call_count_++; + last_d2h_nbytes_ = nbytes; + last_d2h_device_index_ = index; + std::memcpy(dst, src, nbytes); + return Error::Ok; + } + + DeviceType device_type() const override { + return DeviceType::CUDA; + } + + int h2d_call_count_ = 0; + int d2h_call_count_ = 0; + size_t last_h2d_nbytes_ = 0; + size_t last_d2h_nbytes_ = 0; + DeviceIndex last_h2d_device_index_ = -1; + DeviceIndex last_d2h_device_index_ = -1; +}; + +} // namespace + +static MockDeviceAllocator g_mock_cuda; + +class OpDeviceCopyTest : public OperatorTest { + protected: + Tensor& op_h2d_copy_out(const Tensor& self, Tensor& out) { + return torch::executor::et_copy::_h2d_copy_outf(context_, self, out); + } + + Tensor& op_d2h_copy_out(const Tensor& self, Tensor& out) { + return torch::executor::et_copy::_d2h_copy_outf(context_, self, out); + } + + static void SetUpTestSuite() { + executorch::runtime::runtime_init(); + if (get_device_allocator(DeviceType::CUDA) == nullptr) { + register_device_allocator(&g_mock_cuda); + } + } + + void SetUp() override { + OperatorTest::SetUp(); + g_mock_cuda.h2d_call_count_ = 0; + g_mock_cuda.d2h_call_count_ = 0; + g_mock_cuda.last_h2d_nbytes_ = 0; + g_mock_cuda.last_d2h_nbytes_ = 0; + g_mock_cuda.last_h2d_device_index_ = -1; + g_mock_cuda.last_d2h_device_index_ = -1; + } +}; + +TEST_F(OpDeviceCopyTest, H2dCopyCopiesDataAndCallsAllocator) { + // Set up a CPU source tensor with known data. 
+ float src_data[] = {1.0f, 2.0f, 3.0f, 4.0f}; + int32_t sizes[] = {4}; + uint8_t dim_order[] = {0}; + int32_t strides[] = {1}; + TensorImpl src_impl( + ScalarType::Float, + 1, + sizes, + src_data, + dim_order, + strides, + TensorShapeDynamism::STATIC, + DeviceType::CPU, + 0); + Tensor src(&src_impl); + + // Set up a CUDA destination tensor (simulated with host memory). + float dst_data[] = {0.0f, 0.0f, 0.0f, 0.0f}; + TensorImpl dst_impl( + ScalarType::Float, + 1, + sizes, + dst_data, + dim_order, + strides, + TensorShapeDynamism::STATIC, + DeviceType::CUDA, + 0); + Tensor dst(&dst_impl); + + Tensor& result = op_h2d_copy_out(src, dst); + + // Verify the allocator was called correctly. + EXPECT_EQ(g_mock_cuda.h2d_call_count_, 1); + EXPECT_EQ(g_mock_cuda.last_h2d_nbytes_, 4 * sizeof(float)); + EXPECT_EQ(g_mock_cuda.last_h2d_device_index_, 0); + + // Verify data was copied (mock does a real memcpy). + EXPECT_EQ(dst_data[0], 1.0f); + EXPECT_EQ(dst_data[1], 2.0f); + EXPECT_EQ(dst_data[2], 3.0f); + EXPECT_EQ(dst_data[3], 4.0f); + + // Verify return value is the out tensor. + EXPECT_EQ(&result, &dst); +} + +TEST_F(OpDeviceCopyTest, D2hCopyCopiesDataAndCallsAllocator) { + // Set up a CUDA source tensor with known data. + float src_data[] = {5.0f, 6.0f, 7.0f, 8.0f}; + int32_t sizes[] = {4}; + uint8_t dim_order[] = {0}; + int32_t strides[] = {1}; + TensorImpl src_impl( + ScalarType::Float, + 1, + sizes, + src_data, + dim_order, + strides, + TensorShapeDynamism::STATIC, + DeviceType::CUDA, + 0); + Tensor src(&src_impl); + + // Set up a CPU destination tensor. + float dst_data[] = {0.0f, 0.0f, 0.0f, 0.0f}; + TensorImpl dst_impl( + ScalarType::Float, + 1, + sizes, + dst_data, + dim_order, + strides, + TensorShapeDynamism::STATIC, + DeviceType::CPU, + 0); + Tensor dst(&dst_impl); + + Tensor& result = op_d2h_copy_out(src, dst); + + // Verify the allocator was called correctly. 
+ EXPECT_EQ(g_mock_cuda.d2h_call_count_, 1); + EXPECT_EQ(g_mock_cuda.last_d2h_nbytes_, 4 * sizeof(float)); + EXPECT_EQ(g_mock_cuda.last_d2h_device_index_, 0); + + // Verify data was copied. + EXPECT_EQ(dst_data[0], 5.0f); + EXPECT_EQ(dst_data[1], 6.0f); + EXPECT_EQ(dst_data[2], 7.0f); + EXPECT_EQ(dst_data[3], 8.0f); + + EXPECT_EQ(&result, &dst); +} + +TEST_F(OpDeviceCopyTest, H2dCopyWithDeviceIndex1) { + // Verify device_index is correctly forwarded to the allocator. + float src_data[] = {1.0f}; + float dst_data[] = {0.0f}; + int32_t sizes[] = {1}; + uint8_t dim_order[] = {0}; + int32_t strides[] = {1}; + + TensorImpl src_impl( + ScalarType::Float, + 1, + sizes, + src_data, + dim_order, + strides, + TensorShapeDynamism::STATIC, + DeviceType::CPU, + 0); + Tensor src(&src_impl); + + // Device index = 1 (e.g., cuda:1) + TensorImpl dst_impl( + ScalarType::Float, + 1, + sizes, + dst_data, + dim_order, + strides, + TensorShapeDynamism::STATIC, + DeviceType::CUDA, + 1); + Tensor dst(&dst_impl); + + op_h2d_copy_out(src, dst); + + EXPECT_EQ(g_mock_cuda.h2d_call_count_, 1); + EXPECT_EQ(g_mock_cuda.last_h2d_device_index_, 1); +} + +TEST_F(OpDeviceCopyTest, H2dCopyMultidimensionalTensor) { + // Test with a 2D tensor [2, 3]. 
+ float src_data[] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}; + float dst_data[] = {0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f}; + int32_t sizes[] = {2, 3}; + uint8_t dim_order[] = {0, 1}; + int32_t strides[] = {3, 1}; + + TensorImpl src_impl( + ScalarType::Float, + 2, + sizes, + src_data, + dim_order, + strides, + TensorShapeDynamism::STATIC, + DeviceType::CPU, + 0); + Tensor src(&src_impl); + + TensorImpl dst_impl( + ScalarType::Float, + 2, + sizes, + dst_data, + dim_order, + strides, + TensorShapeDynamism::STATIC, + DeviceType::CUDA, + 0); + Tensor dst(&dst_impl); + + op_h2d_copy_out(src, dst); + + EXPECT_EQ(g_mock_cuda.h2d_call_count_, 1); + EXPECT_EQ(g_mock_cuda.last_h2d_nbytes_, 6 * sizeof(float)); + + for (int i = 0; i < 6; ++i) { + EXPECT_EQ(dst_data[i], src_data[i]); + } +} diff --git a/kernels/test/targets.bzl b/kernels/test/targets.bzl index bc51e336cb8..5212d691c5b 100644 --- a/kernels/test/targets.bzl +++ b/kernels/test/targets.bzl @@ -1,14 +1,14 @@ load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") load("@fbsource//xplat/executorch/kernels/test:util.bzl", "codegen_function_header_wrapper", "op_test") -def _common_op_test(name, kernels): +def _common_op_test(name, kernels, deps = []): """ Defines test targets in format of _op__test For ATen kernel testing, let's use portable functions.yaml for tested ops. """ for kernel in kernels: - deps = [":function_header_wrapper_{}".format(kernel)] - op_test(name, kernel_name = kernel, use_kernel_prefix = True, deps = deps) + op_deps = [":function_header_wrapper_{}".format(kernel)] + deps + op_test(name, kernel_name = kernel, use_kernel_prefix = True, deps = op_deps) def define_common_targets(): """Defines targets that should be shared between fbcode and xplat. 
@@ -177,6 +177,14 @@ def define_common_targets(): _common_op_test("op__clone_dim_order_test", ["aten", "portable"]) _common_op_test("op__conj_physical_test", ["aten", "portable"]) _common_op_test("op__adaptive_avg_pool2d_test", ["aten", "portable"]) + _common_op_test( + "op__device_copy_test", + ["portable"], + deps = [ + "//executorch/runtime/core:device_allocator", + "//executorch/runtime/platform:platform", + ], + ) _common_op_test("op_abs_test", ["aten", "portable"]) _common_op_test("op_acos_test", ["aten", "portable"]) _common_op_test("op_acosh_test", ["aten", "portable"]) diff --git a/resnet18_log1.log b/resnet18_log1.log new file mode 100644 index 00000000000..e69de29bb2d diff --git a/shim b/shim index b295819bb0e..cf6a954aae4 160000 --- a/shim +++ b/shim @@ -1 +1 @@ -Subproject commit b295819bb0ec636b4e3359828e05476d2437650a +Subproject commit cf6a954aae4bee7b4515e13475878460115027d1 diff --git a/shim_et/xplat/executorch/codegen/codegen.bzl b/shim_et/xplat/executorch/codegen/codegen.bzl index 5ffa7b65a36..318996784a1 100644 --- a/shim_et/xplat/executorch/codegen/codegen.bzl +++ b/shim_et/xplat/executorch/codegen/codegen.bzl @@ -535,6 +535,7 @@ def get_portable_lib_deps(): "//executorch/kernels/portable/cpu:vec_ops", "//executorch/kernels/portable/cpu/pattern:all_deps", "//executorch/kernels/portable/cpu/util:all_deps", + "//executorch/runtime/core:device_allocator", ] def get_optimized_lib_deps(): diff --git a/shim_et/xplat/executorch/kernels/portable/op_registration_util.bzl b/shim_et/xplat/executorch/kernels/portable/op_registration_util.bzl index cc2a0f78c75..479f3913f8f 100644 --- a/shim_et/xplat/executorch/kernels/portable/op_registration_util.bzl +++ b/shim_et/xplat/executorch/kernels/portable/op_registration_util.bzl @@ -1405,6 +1405,12 @@ ATEN_OPS = ( "//executorch/kernels/portable/cpu/util:copy_ops_util", ], ), + op_target( + name = "op__device_copy", + deps = [ + "//executorch/runtime/core:device_allocator", + ], + ), ) # Operators that are not 
listed in `functions.yaml` (i.e., operators listed in diff --git a/third-party/ao b/third-party/ao index 02105d46c61..01849b2b19c 160000 --- a/third-party/ao +++ b/third-party/ao @@ -1 +1 @@ -Subproject commit 02105d46c61dc80a8c9d39d5836e827ba3af8439 +Subproject commit 01849b2b19cb923cb739a1fc02297ba418ddf715 diff --git a/third-party/pocketfft b/third-party/pocketfft index 81874074463..0fa0ef591e3 160000 --- a/third-party/pocketfft +++ b/third-party/pocketfft @@ -1 +1 @@ -Subproject commit 8187407446316c3d16f15e5395dabd4b22f4fec7 +Subproject commit 0fa0ef591e38c2758e3184c6c23e497b9f732ffa From 9991681b1204d743e7068d7ea496e6494356ea51 Mon Sep 17 00:00:00 2001 From: Suraj Raut Date: Fri, 29 May 2026 00:49:48 -0700 Subject: [PATCH 3/7] Remove accidental files --- 1 | 1 - resnet18_log1.log | 0 2 files changed, 1 deletion(-) delete mode 100644 1 delete mode 100644 resnet18_log1.log diff --git a/1 b/1 deleted file mode 100644 index 8462b88277b..00000000000 --- a/1 +++ /dev/null @@ -1 +0,0 @@ -usage: list-sessions [-F format] diff --git a/resnet18_log1.log b/resnet18_log1.log deleted file mode 100644 index e69de29bb2d..00000000000 From 3dd55592cd2c01a5635c69931076570191ea7066 Mon Sep 17 00:00:00 2001 From: Suraj Raut Date: Fri, 29 May 2026 00:51:51 -0700 Subject: [PATCH 4/7] Sync submodule pointers with upstream/main --- shim | 2 +- third-party/ao | 2 +- third-party/pocketfft | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/shim b/shim index cf6a954aae4..b295819bb0e 160000 --- a/shim +++ b/shim @@ -1 +1 @@ -Subproject commit cf6a954aae4bee7b4515e13475878460115027d1 +Subproject commit b295819bb0ec636b4e3359828e05476d2437650a diff --git a/third-party/ao b/third-party/ao index 01849b2b19c..02105d46c61 160000 --- a/third-party/ao +++ b/third-party/ao @@ -1 +1 @@ -Subproject commit 01849b2b19cb923cb739a1fc02297ba418ddf715 +Subproject commit 02105d46c61dc80a8c9d39d5836e827ba3af8439 diff --git a/third-party/pocketfft b/third-party/pocketfft index 
0fa0ef591e3..81874074463 160000 --- a/third-party/pocketfft +++ b/third-party/pocketfft @@ -1 +1 @@ -Subproject commit 0fa0ef591e38c2758e3184c6c23e497b9f732ffa +Subproject commit 8187407446316c3d16f15e5395dabd4b22f4fec7 From 6a46467f0b879a69f50a3b80511275b88682f1db Mon Sep 17 00:00:00 2001 From: Suraj Raut Date: Fri, 29 May 2026 00:56:03 -0700 Subject: [PATCH 5/7] Sync remaining files with upstream/main --- backends/cuda/runtime/shims/tests/targets.bzl | 24 ++ .../shims/tests/test_op__device_copy.cpp | 195 ++++++++++++ backends/mlx/builder/op_helpers.py | 2 +- backends/mlx/patterns.py | 79 ++++- backends/mlx/test/test_ops.py | 14 + .../upsample_bilinear2d_converter.py | 102 ++++-- .../upsample_nearest2d_converter.py | 110 +++++-- .../test_convert_upsample_bilinear2d.py | 283 ++++++++++++++++- .../test_convert_upsample_nearest2d.py | 141 ++++++++- backends/transforms/aten_to_dialect_pass.py | 138 +++++++++ backends/transforms/targets.bzl | 25 ++ .../test/test_aten_to_dialect_pass.py | 239 ++++++++++++++ examples/models/gemma4_31b/README.md | 1 + examples/models/gemma4_31b/export.py | 7 +- examples/models/gemma4_31b/gguf_loader.py | 19 +- examples/models/gemma4_31b/quant/README.md | 2 - examples/models/gemma4_31b/quant/pack_mlx.py | 6 +- .../gemma4_31b/quant/tests/test_pack_mlx.py | 46 ++- .../gemma4_31b/tests/test_mlx_pipeline.py | 79 +++++ .../executor_runner/nxp_executor_runner.cpp | 183 +++++------ .../AsrModuleInstrumentationTest.kt | 260 ++++++++++++++++ .../executorch/LlmLoraInstrumentationTest.kt | 291 ++++++++++++++++++ shim | 2 +- third-party/ao | 2 +- third-party/pocketfft | 2 +- 25 files changed, 2070 insertions(+), 182 deletions(-) create mode 100644 backends/cuda/runtime/shims/tests/test_op__device_copy.cpp create mode 100644 backends/transforms/aten_to_dialect_pass.py create mode 100644 backends/transforms/test/test_aten_to_dialect_pass.py create mode 100644 
extension/android/executorch_android/src/androidTest/java/org/pytorch/executorch/AsrModuleInstrumentationTest.kt create mode 100644 extension/android/executorch_android/src/androidTest/java/org/pytorch/executorch/LlmLoraInstrumentationTest.kt diff --git a/backends/cuda/runtime/shims/tests/targets.bzl b/backends/cuda/runtime/shims/tests/targets.bzl index b68043f7feb..a54c47e979d 100644 --- a/backends/cuda/runtime/shims/tests/targets.bzl +++ b/backends/cuda/runtime/shims/tests/targets.bzl @@ -42,3 +42,27 @@ def define_common_targets(): cuda_shim_cpp_unittest("aoti_torch_new_tensor_handle") cuda_shim_cpp_unittest("aoti_torch_item_bool") cuda_shim_cpp_unittest("aoti_torch_assign_tensors_out") + + cpp_unittest( + name = "test_op__device_copy", + srcs = ["test_op__device_copy.cpp"], + deps = [ + "//executorch/backends/cuda/runtime:cuda_backend", + "//executorch/kernels/portable:generated_lib", + "//executorch/kernels/portable:generated_lib_headers", + "//executorch/kernels/portable/cpu:op__device_copy", + "//executorch/runtime/core:device_allocator", + "//executorch/runtime/core/exec_aten:lib", + "//executorch/runtime/core/portable_type:portable_type", + "//executorch/runtime/kernel:kernel_runtime_context", + "//executorch/runtime/platform:platform", + ], + external_deps = [ + ("cuda", None, "cuda-lazy"), + ], + preprocessor_flags = ["-DCUDA_AVAILABLE=1"], + keep_gpu_sections = True, + remote_execution = re_test_utils.remote_execution( + platform = "gpu-remote-execution", + ), + ) diff --git a/backends/cuda/runtime/shims/tests/test_op__device_copy.cpp b/backends/cuda/runtime/shims/tests/test_op__device_copy.cpp new file mode 100644 index 00000000000..4e5c5a099b7 --- /dev/null +++ b/backends/cuda/runtime/shims/tests/test_op__device_copy.cpp @@ -0,0 +1,195 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#if (defined(__has_feature) && __has_feature(address_sanitizer)) || \ + defined(__SANITIZE_ADDRESS__) +#include +#define EXECUTORCH_CUDA_DEVICE_COPY_HAS_LSAN_INTERFACE 1 +#else +#define EXECUTORCH_CUDA_DEVICE_COPY_HAS_LSAN_INTERFACE 0 +#endif + +#include +#include +#include + +using executorch::aten::ScalarType; +using executorch::aten::Tensor; +using executorch::aten::TensorImpl; +using executorch::runtime::Error; +using executorch::runtime::get_device_allocator; +using executorch::runtime::KernelRuntimeContext; +using executorch::runtime::TensorShapeDynamism; +using executorch::runtime::etensor::DeviceIndex; +using executorch::runtime::etensor::DeviceType; + +namespace { + +struct CudaDeleter { + void operator()(void* ptr) const { + if (ptr != nullptr) { + cudaFree(ptr); + } + } +}; + +using CudaPtr = std::unique_ptr; + +CudaPtr allocate_cuda(size_t nbytes) { + void* ptr = nullptr; + const cudaError_t err = cudaMalloc(&ptr, nbytes); + EXPECT_EQ(err, cudaSuccess) << "cudaMalloc failed"; + return CudaPtr(ptr); +} + +bool is_cuda_available() { +#if EXECUTORCH_CUDA_DEVICE_COPY_HAS_LSAN_INTERFACE + __lsan_disable(); +#endif + int device_count = 0; + const cudaError_t err = cudaGetDeviceCount(&device_count); +#if EXECUTORCH_CUDA_DEVICE_COPY_HAS_LSAN_INTERFACE + __lsan_enable(); +#endif + return err == cudaSuccess && device_count > 0; +} + +std::vector copy_cuda_to_host(const void* device_ptr, size_t numel) { + std::vector host(numel); + const cudaError_t err = cudaMemcpy( + host.data(), device_ptr, numel * sizeof(float), cudaMemcpyDeviceToHost); + EXPECT_EQ(err, cudaSuccess) << "cudaMemcpy D2H failed"; + return host; +} + +void copy_host_to_cuda(const std::vector& host, void* device_ptr) { + const cudaError_t err = cudaMemcpy( + device_ptr, + 
host.data(), + host.size() * sizeof(float), + cudaMemcpyHostToDevice); + EXPECT_EQ(err, cudaSuccess) << "cudaMemcpy H2D failed"; +} + +class CudaDeviceCopyOpTest : public ::testing::Test { + protected: + static void SetUpTestSuite() { + executorch::runtime::runtime_init(); + ASSERT_NE(get_device_allocator(DeviceType::CUDA), nullptr) + << "Linking cuda_backend should auto-register the CUDA allocator"; + } + + void SetUp() override { + if (!is_cuda_available()) { + GTEST_SKIP() << "CUDA not available, skipping CUDA device copy op tests"; + } + } + + Tensor& op_h2d_copy_out(const Tensor& self, Tensor& out) { + return torch::executor::et_copy::_h2d_copy_outf(context_, self, out); + } + + Tensor& op_d2h_copy_out(const Tensor& self, Tensor& out) { + return torch::executor::et_copy::_d2h_copy_outf(context_, self, out); + } + + KernelRuntimeContext context_; +}; + +} // namespace + +TEST_F(CudaDeviceCopyOpTest, H2dCopyUsesRegisteredCudaAllocator) { + std::vector src_data = {1.0f, 2.0f, 3.0f, 4.0f}; + auto device_data = allocate_cuda(src_data.size() * sizeof(float)); + ASSERT_NE(device_data.get(), nullptr); + + int32_t sizes[] = {static_cast(src_data.size())}; + uint8_t dim_order[] = {0}; + int32_t strides[] = {1}; + + TensorImpl src_impl( + ScalarType::Float, + 1, + sizes, + src_data.data(), + dim_order, + strides, + TensorShapeDynamism::STATIC, + DeviceType::CPU, + 0); + Tensor src(&src_impl); + + TensorImpl dst_impl( + ScalarType::Float, + 1, + sizes, + device_data.get(), + dim_order, + strides, + TensorShapeDynamism::STATIC, + DeviceType::CUDA, + 0); + Tensor dst(&dst_impl); + + Tensor& result = op_h2d_copy_out(src, dst); + + EXPECT_EQ(context_.failure_state(), Error::Ok); + EXPECT_EQ(&result, &dst); + EXPECT_EQ(copy_cuda_to_host(device_data.get(), src_data.size()), src_data); +} + +TEST_F(CudaDeviceCopyOpTest, D2hCopyUsesRegisteredCudaAllocator) { + const std::vector expected = {5.0f, 6.0f, 7.0f, 8.0f}; + auto device_data = allocate_cuda(expected.size() * 
sizeof(float)); + ASSERT_NE(device_data.get(), nullptr); + copy_host_to_cuda(expected, device_data.get()); + + std::vector dst_data(expected.size(), 0.0f); + int32_t sizes[] = {static_cast(expected.size())}; + uint8_t dim_order[] = {0}; + int32_t strides[] = {1}; + + TensorImpl src_impl( + ScalarType::Float, + 1, + sizes, + device_data.get(), + dim_order, + strides, + TensorShapeDynamism::STATIC, + DeviceType::CUDA, + 0); + Tensor src(&src_impl); + + TensorImpl dst_impl( + ScalarType::Float, + 1, + sizes, + dst_data.data(), + dim_order, + strides, + TensorShapeDynamism::STATIC, + DeviceType::CPU, + 0); + Tensor dst(&dst_impl); + + Tensor& result = op_d2h_copy_out(src, dst); + + EXPECT_EQ(context_.failure_state(), Error::Ok); + EXPECT_EQ(&result, &dst); + EXPECT_EQ(dst_data, expected); +} diff --git a/backends/mlx/builder/op_helpers.py b/backends/mlx/builder/op_helpers.py index 40e71e0bdab..7740546cc2c 100644 --- a/backends/mlx/builder/op_helpers.py +++ b/backends/mlx/builder/op_helpers.py @@ -334,7 +334,7 @@ def parse_dequant_node( if len(non_one) != 1: return None quantized_dim, group_size = non_one[0] - if group_size not in [32, 64, 128]: + if group_size not in [16, 32, 64, 128]: return None # TODO: MLX supports 3, 5, and 7, but we need to figure out the diff --git a/backends/mlx/patterns.py b/backends/mlx/patterns.py index 29e5e326c69..5f74cbea643 100644 --- a/backends/mlx/patterns.py +++ b/backends/mlx/patterns.py @@ -15,6 +15,7 @@ from __future__ import annotations +import os from typing import Any, List, Optional, Tuple import torch @@ -37,6 +38,7 @@ ) from executorch.backends.mlx.serialization.mlx_graph_schema import ( AddIntNode, + AddmmNode, AddNode, AsTypeNode, DequantizeNode, @@ -52,6 +54,7 @@ SubtractIntNode, SymSizeNode, TakeNode, + TransposeNode, ) from torch.export.exported_program import ExportedProgram from torch.fx.node import Node @@ -883,6 +886,18 @@ def maybe_create( out_dtype=out_dtype, ) + # MLX's quantized_matmul Metal kernels are only 
instantiated for + # group_size in {32, 64, 128}. For smaller group sizes (e.g. GGUF + # Q6_K with group_size=16), emit DequantizeNode + matmul instead. + # Weights stay packed in the .pte file; dequantized on-device. + # This non-fused path is significantly slower and must be opted in + # via ET_MLX_ALLOW_NON_FUSED_QUANTIZED_OPS=1. + _MIN_FUSED_GROUP_SIZE = 32 + + @staticmethod + def _allow_non_fused() -> bool: + return os.environ.get("ET_MLX_ALLOW_NON_FUSED_QUANTIZED_OPS", "0") == "1" + def __call__(self, P: MLXProgramBuilder, n: Node) -> Slot: assert n == self.head @@ -908,19 +923,59 @@ def __call__(self, P: MLXProgramBuilder, n: Node) -> Slot: x_dtype = x_node.meta["val"].dtype needs_cast = self.out_dtype != x_dtype - P.emit( - QuantizedMatmulNode( - x=P.slot_to_tid(x_slot), - w=P.slot_to_tid(w), - scales=P.slot_to_tid(scale_slot), - out=P.slot_to_tid(out), - biases=P.slot_to_tid(biases), - group_size=self.group_size, - bits=self.bits, - mode="affine", - transpose=True, + if self.group_size >= self._MIN_FUSED_GROUP_SIZE: + P.emit( + QuantizedMatmulNode( + x=P.slot_to_tid(x_slot), + w=P.slot_to_tid(w), + scales=P.slot_to_tid(scale_slot), + out=P.slot_to_tid(out), + biases=P.slot_to_tid(biases), + group_size=self.group_size, + bits=self.bits, + mode="affine", + transpose=True, + ) ) - ) + else: + if not self._allow_non_fused(): + raise ValueError( + f"Quantized linear with group_size={self.group_size} requires " + f"the non-fused dequantize+matmul path, which is significantly " + f"slower than the fused QuantizedMatmulNode (group_size >= 32). " + f"Set ET_MLX_ALLOW_NON_FUSED_QUANTIZED_OPS=1 to allow this." 
+ ) + out_scalar_type = torch_dtype_to_scalar_type(self.out_dtype) + _, w_deq = P.make_tmp_slot() + P.emit( + DequantizeNode( + w=P.slot_to_tid(w), + scales=P.slot_to_tid(scale_slot), + out=P.slot_to_tid(w_deq), + biases=P.slot_to_tid(biases), + group_size=self.group_size, + bits=self.bits, + mode="affine", + dtype=out_scalar_type, + ) + ) + _, w_t = P.make_tmp_slot() + P.emit( + TransposeNode( + x=P.slot_to_tid(w_deq), + out=P.slot_to_tid(w_t), + perm=[1, 0], + ) + ) + P.emit( + AddmmNode( + mat1=P.slot_to_tid(x_slot), + mat2=P.slot_to_tid(w_t), + out=P.slot_to_tid(out), + ) + ) + # DequantizeNode already produces the correct dtype. + needs_cast = False if has_bias: P.emit( diff --git a/backends/mlx/test/test_ops.py b/backends/mlx/test/test_ops.py index 4471610519e..45ea024f0e8 100644 --- a/backends/mlx/test/test_ops.py +++ b/backends/mlx/test/test_ops.py @@ -24,6 +24,7 @@ See README.md in this directory for full documentation. """ +import os from typing import Callable, Dict, List, Optional, Tuple import torch @@ -5621,8 +5622,21 @@ def get_test_configs(cls) -> List["QuantizedLinearTest"]: cls(group_size=128), cls(qdtype=torch.int2), cls(qdtype=torch.int8), + # group_size=16: exercises the non-fused dequantize+matmul path + # (requires ET_MLX_ALLOW_NON_FUSED_QUANTIZED_OPS=1). 
+ cls(qdtype=torch.int8, group_size=16), + cls(qdtype=torch.int4, group_size=16), + cls(qdtype=torch.int8, group_size=16, bias=False), ] + def generate_test_files(self, verbose=False): + if self.group_size < 32: + os.environ["ET_MLX_ALLOW_NON_FUSED_QUANTIZED_OPS"] = "1" + try: + return super().generate_test_files(verbose=verbose) + finally: + os.environ.pop("ET_MLX_ALLOW_NON_FUSED_QUANTIZED_OPS", None) + def create_model(self) -> nn.Module: model = LinearModel(self.in_features, self.out_features, bias=self.bias) model = model.to(self.dtype) diff --git a/backends/nxp/backend/ir/converter/node_converters/ops_converters/upsample_bilinear2d_converter.py b/backends/nxp/backend/ir/converter/node_converters/ops_converters/upsample_bilinear2d_converter.py index 33d97dff642..1183ef494b5 100644 --- a/backends/nxp/backend/ir/converter/node_converters/ops_converters/upsample_bilinear2d_converter.py +++ b/backends/nxp/backend/ir/converter/node_converters/ops_converters/upsample_bilinear2d_converter.py @@ -4,11 +4,13 @@ # LICENSE file in the root directory of this source tree. 
import numpy as np +import torch from executorch.backends.nxp.backend.data_format import DataFormat, NXP_NODE_FORMAT from executorch.backends.nxp.backend.edge_helper import node_has_well_defined_shape from executorch.backends.nxp.backend.ir.converter.node_converter import ( CustomDelegationOptions, + is_not_qdq_node, NodeConverter, ) from executorch.backends.nxp.backend.ir.tflite_generator.builtin_options.resize_bilinear_options import ( @@ -16,12 +18,35 @@ ) from executorch.backends.nxp.backend.neutron_target_spec import NeutronTargetSpec from torch.fx import Node +from torch.fx.passes.infra.partitioner import Partition from torch.nn import Parameter # noinspection SpellCheckingInspection class UpsampleBilinear2DConverter(NodeConverter): + @classmethod + def supports_partitioning_result( + cls, + node: Node, + partition_list: list[Partition], + custom_delegation_options: CustomDelegationOptions, + neutron_target_spec: NeutronTargetSpec, + parameters_mapping: dict[str, Parameter], + ) -> bool: + input_shape = node.all_input_nodes[0].meta["val"].shape + output_shape = node.meta["val"].shape + is_alone_in_partition = cls.is_node_alone_in_partition( + node, partition_list, filter_fn=is_not_qdq_node + ) + + if is_alone_in_partition and input_shape == output_shape: + # The operator is a no-op, so the Neutron Converter will skip it. If it's the only node in the + # partition, the graph would end up empty. + return False + + return True + @staticmethod def _is_supported_in_IR( node: Node, @@ -36,6 +61,14 @@ def _is_supported_in_IR( " format. Please report this." ) + # The conversion requires the output shape to be known and static. + if not node_has_well_defined_shape(node): + return False + + if len(node.meta["val"].shape) != 4: + # Unexpected case. The input should always be 4D. 
+ return False + return True @staticmethod @@ -45,38 +78,58 @@ def _is_supported_on_target( parameters_mapping: dict[str, Parameter], custom_delegation_options: CustomDelegationOptions, ) -> bool: - # Neutron requires static shapes. - # neutron-converter/src/OperatorC/UpsamplePlugin.cpp?at=NEUTRON_SOFTWARE_2.2.3#74 - if not node_has_well_defined_shape(node): - return False - - if len(node.meta["val"].shape) != 4: - # Unexpected case. The input should always be 4D. - return False - - # The tensors here use the channels first format (NCHW). + # The tensors are always 4D and use the channels first format (NCHW). _, in_c, in_h, in_w = node.all_input_nodes[0].meta["val"].shape _, _, out_h, out_w = node.meta["val"].shape - # Neutron supports only the doubling and quadrupleing of both height and width at the same time. - # neutron-library/src/utils/NeutronLibraryInterrogation.cpp?at=refs%2Ftags%2FNEUTRON_SOFTWARE_2.2.3#778 - supported_scales = [2, 4] - if not any( - in_h * scale == out_h and in_w * scale == out_w - for scale in supported_scales - ): - return False - - # Neutron requires the input channels to be a multiple of `num_macs`. - # neutron-library/src/utils/NeutronLibraryInterrogation.cpp?at=refs%2Ftags%2FNEUTRON_SOFTWARE_2.2.3#777 - if in_c % neutron_target_spec.get_num_macs() != 0: - return False + if custom_delegation_options.use_new_flow_neutron_c: + # Requirements specified by the new Neutron flow documentation. + + if not NodeConverter.uses_quantization_type_for_io( + node, + supported_types=[torch.int8, torch.uint8], + input_indices=[0], + output_indices=[0], + ): + return False + + supported_scales = [1, 2, 4, 8] + align_corners = node.args[2] + if align_corners: + if in_h == 1 or in_w == 1: + return False # Avoid division by 0. + h_scale = (out_h - 1) / (in_h - 1) + w_scale = (out_w - 1) / (in_w - 1) + else: + h_scale = out_h / in_h + w_scale = out_w / in_w + + # The H and W scales don't need to be equal, but both must be supported. 
+ if (h_scale not in supported_scales) or (w_scale not in supported_scales): + return False + + else: + # Requirements of the old Neutron flow. + + # Neutron supports only the doubling and quadrupleing of both height and width at the same time. + # neutron-library/src/utils/NeutronLibraryInterrogation.cpp?at=refs%2Ftags%2FNEUTRON_SOFTWARE_2.2.3#778 + supported_scales = [2, 4] + if not any( + in_h * scale == out_h and in_w * scale == out_w + for scale in supported_scales + ): + return False + + # Neutron requires the input channels to be a multiple of `num_macs`. + # neutron-library/src/utils/NeutronLibraryInterrogation.cpp?at=refs%2Ftags%2FNEUTRON_SOFTWARE_2.2.3#777 + if in_c % neutron_target_spec.get_num_macs() != 0: + return False return True def convert(self, node: Node): """Convert the `aten.upsample_bilinear2d.vec` operator to Neutron IR `ResizeBilinear`. - The schema is: + The ExecuTorch schema is: aten::upsample_bilinear2d.vec( Tensor input, SymInt[]? output_size, @@ -109,6 +162,7 @@ def convert(self, node: Node): # and the second one is what NeutronIR uses when `align_corners == False and half_pixel_centers == True`. # https://github.com/tensorflow/tensorflow/blob/v2.20.0/tensorflow/lite/kernels/internal/reference/resize_bilinear.h#L82-L88 # https://github.com/tensorflow/tensorflow/blob/v2.20.0/tensorflow/lite/kernels/internal/reference/resize_bilinear.h#L172-L180 + # Also, the new Neutron flow requires that `align_corners` and `half_pixel_centers` are not True simultainiously. 
align_corners = node.args[2] half_pixel_centers = not align_corners t_op.builtin_options = ResizeBilinear(align_corners, half_pixel_centers) diff --git a/backends/nxp/backend/ir/converter/node_converters/ops_converters/upsample_nearest2d_converter.py b/backends/nxp/backend/ir/converter/node_converters/ops_converters/upsample_nearest2d_converter.py index 1ddc71425ef..6e18a7bfe67 100644 --- a/backends/nxp/backend/ir/converter/node_converters/ops_converters/upsample_nearest2d_converter.py +++ b/backends/nxp/backend/ir/converter/node_converters/ops_converters/upsample_nearest2d_converter.py @@ -4,11 +4,13 @@ # LICENSE file in the root directory of this source tree. import numpy as np +import torch from executorch.backends.nxp.backend.data_format import DataFormat, NXP_NODE_FORMAT from executorch.backends.nxp.backend.edge_helper import node_has_well_defined_shape from executorch.backends.nxp.backend.ir.converter.node_converter import ( CustomDelegationOptions, + is_not_qdq_node, NodeConverter, ) from executorch.backends.nxp.backend.ir.tflite_generator.builtin_options.resize_nearest_neighbor_options import ( @@ -16,12 +18,37 @@ ) from executorch.backends.nxp.backend.neutron_target_spec import NeutronTargetSpec from torch.fx import Node +from torch.fx.passes.infra.partitioner import Partition from torch.nn import Parameter +HeightScale = float +WidthScale = float + # noinspection SpellCheckingInspection class UpsampleNearest2DConverter(NodeConverter): + @classmethod + def supports_partitioning_result( + cls, + node: Node, + partition_list: list[Partition], + custom_delegation_options: CustomDelegationOptions, + neutron_target_spec: NeutronTargetSpec, + parameters_mapping: dict[str, Parameter], + ) -> bool: + h_scale, w_scale = cls._get_effective_scales(node) + is_alone_in_partition = cls.is_node_alone_in_partition( + node, partition_list, filter_fn=is_not_qdq_node + ) + + if is_alone_in_partition and h_scale == w_scale == 1: + # The operator is a no-op, so the Neutron 
Converter will skip it. If it's the only node in the + # partition, the graph would end up empty. + return False + + return True + @staticmethod def _is_supported_in_IR( node: Node, @@ -36,6 +63,14 @@ def _is_supported_in_IR( " format. Please report this." ) + # The conversion requires the output shape to be known and static. + if not node_has_well_defined_shape(node): + return False + + if len(node.meta["val"].shape) != 4: + # Unexpected case. The input should always be 4D. + return False + return True @staticmethod @@ -45,39 +80,62 @@ def _is_supported_on_target( parameters_mapping: dict[str, Parameter], custom_delegation_options: CustomDelegationOptions, ) -> bool: - # Neutron requires static shapes. - # neutron-converter/src/OperatorC/UpsamplePlugin.cpp?at=NEUTRON_SOFTWARE_2.2.3#74 - if not node_has_well_defined_shape(node): - return False - - if len(node.meta["val"].shape) != 4: - # Unexpected case. The input should always be 4D. - return False - - # The tensors here use the channels first format (NCHW). + # The tensors are always 4D and use the channels first format (NCHW). _, in_c, in_h, in_w = node.all_input_nodes[0].meta["val"].shape _, _, out_h, out_w = node.meta["val"].shape - # Neutron supports only the doubling and quadrupleing of both height and width at the same time. - # neutron-library/src/utils/NeutronLibraryInterrogation.cpp?at=refs%2Ftags%2FNEUTRON_SOFTWARE_2.2.3#768 - # neutron-library/src/utils/NeutronLibraryInterrogation.cpp?at=refs%2Ftags%2FNEUTRON_SOFTWARE_2.2.3#778 - supported_scales = [2, 4] - if not any( - in_h * scale == out_h and in_w * scale == out_w - for scale in supported_scales - ): - return False - - # Neutron requires the input channels to be a multiple of `num_macs`. 
- # neutron-library/src/utils/NeutronLibraryInterrogation.cpp?at=refs%2Ftags%2FNEUTRON_SOFTWARE_2.2.3#767 - if in_c % neutron_target_spec.get_num_macs() != 0: - return False + if custom_delegation_options.use_new_flow_neutron_c: + # Requirements specified by the new Neutron flow documentation. + + if not NodeConverter.uses_quantization_type_for_io( + node, + supported_types=[torch.int8, torch.uint8], + input_indices=[0], + output_indices=[0], + ): + return False + + supported_scales = [1, 2, 4, 8] + h_scale, w_scale = UpsampleNearest2DConverter._get_effective_scales(node) + # The H and W scales don't need to be equal but both must be supported. + if (h_scale not in supported_scales) or (w_scale not in supported_scales): + return False + + else: + # Requirements of the old Neutron flow. + + # Neutron supports only the doubling and quadrupleing of both height and width at the same time. + # neutron-library/src/utils/NeutronLibraryInterrogation.cpp?at=refs%2Ftags%2FNEUTRON_SOFTWARE_2.2.3#768 + # neutron-library/src/utils/NeutronLibraryInterrogation.cpp?at=refs%2Ftags%2FNEUTRON_SOFTWARE_2.2.3#778 + supported_scales = [2, 4] + if not any( + in_h * scale == out_h and in_w * scale == out_w + for scale in supported_scales + ): + return False + + # Neutron requires the input channels to be a multiple of `num_macs`. + # neutron-library/src/utils/NeutronLibraryInterrogation.cpp?at=refs%2Ftags%2FNEUTRON_SOFTWARE_2.2.3#767 + if in_c % neutron_target_spec.get_num_macs() != 0: + return False return True + @staticmethod + def _get_effective_scales(node: Node) -> tuple[HeightScale, WidthScale]: + # Neutron supports variants where `align_corners=False` and `align_corners=True`. ExecuTorch doesn't have this + # parameter. Its behavior is equivalent to `align_corners=False`. Hence, the scale calculation corresponds to + # the `align_corners=False` case in the Neutron documentation. 
+ _, _, in_h, in_w = node.all_input_nodes[0].meta["val"].shape + _, _, out_h, out_w = node.meta["val"].shape + h_scale = out_h / in_h + w_scale = out_w / in_w + + return h_scale, w_scale + def convert(self, node: Node): """Convert the `aten.upsample_nearest2d.vec` operator to Neutron IR `ResizeNearestNeighbor`. - The schema is: + The ExecuTorch schema is: aten::upsample_nearest2d.vec( Tensor input, SymInt[]? output_size, @@ -90,6 +148,8 @@ def convert(self, node: Node): x = t_op.tmp_inputs[0] y = t_op.tmp_outputs[0] + # Neutron supports variants where `align_corners=False` and `align_corners=True`. ExecuTorch doesn't have this + # parameter. Its behavior is equivalent to `align_corners=False` and `half_pixel_centers=False`. t_op.builtin_options = ResizeNearestNeighbor(False, False) # The `aten.upsample_nearest2d` can use either the `size` attribute or the `scale_factor` to define the output diff --git a/backends/nxp/tests/ir/converter/node_converter/test_convert_upsample_bilinear2d.py b/backends/nxp/tests/ir/converter/node_converter/test_convert_upsample_bilinear2d.py index 5663eea9cc3..2d2f9845fa3 100644 --- a/backends/nxp/tests/ir/converter/node_converter/test_convert_upsample_bilinear2d.py +++ b/backends/nxp/tests/ir/converter/node_converter/test_convert_upsample_bilinear2d.py @@ -4,12 +4,15 @@ # LICENSE file in the root directory of this source tree. 
import numpy as np + +# noinspection PyUnusedImports import pytest import torch from executorch.backends.nxp.backend.edge_program_converter import ( EdgeProgramToIRConverter, ) +from executorch.backends.nxp.tests.dataset_creator import RandomDatasetCreator from executorch.backends.nxp.tests.executorch_pipeline import to_quantized_edge_program from executorch.backends.nxp.tests.executors import ( convert_run_compare, @@ -17,7 +20,17 @@ ToChannelFirstPreprocess, ToChannelLastPreprocess, ) -from executorch.exir.dialects._ops import ops as exir_ops +from executorch.backends.nxp.tests.graph_verifier import DetailedGraphVerifier +from executorch.backends.nxp.tests.model_output_comparator import ( + AllCloseOutputComparator, +) +from executorch.backends.nxp.tests.nsys_testing import lower_run_compare +from executorch.backends.nxp.tests.ops_aliases import ( + AddTensor, + ExecutorchDelegateCall, + UpsampleBilinear2D, +) +from executorch.backends.nxp.tests.use_qat import * # noqa F403 @pytest.fixture(autouse=True) @@ -26,23 +39,25 @@ def reseed_model_per_test_run(): np.random.seed(23) -# noinspection PyProtectedMember -ExecutorchDelegateCall = torch.ops.higher_order.executorch_call_delegate -UpsampleBilinear2D = exir_ops.edge.aten.upsample_bilinear2d.vec - - class UpsampleBilinearModule(torch.nn.Module): - def __init__(self, size=None, scale=None): + def __init__(self, size=None, scale=None, **kwargs): super().__init__() self.upsample = torch.nn.Upsample( - size=size, scale_factor=scale, mode="bilinear" + size=size, scale_factor=scale, mode="bilinear", **kwargs ) def forward(self, x): return self.upsample(x) +class UpsampleBilinearAddModule(UpsampleBilinearModule): + + def forward(self, x): + x = super().forward(x) + return x + x + + @pytest.mark.parametrize( "input_shape, size", [ @@ -185,3 +200,255 @@ def test_convert_upsample_bilinear2d__no_delegation__unsupported_size( # Make sure the `upsample` was NOT delegated (size != double of input). 
assert not graph_contains_any_of_ops(delegated_ep.graph, [ExecutorchDelegateCall]) assert graph_contains_any_of_ops(delegated_ep.graph, [UpsampleBilinear2D]) + + +class TestUpsampleBilinear2DNewNeutronFlow: + # TODO Use quantized dataset and `atol=1` in the tests. + + # noinspection PyMethodMayBeStatic + def assert_delegated( + self, + model, + input_shape, + mocker, + use_qat=False, + atol=None, + expected_delegated_ops=None, + ): + if expected_delegated_ops is None: + expected_delegated_ops = {UpsampleBilinear2D: 1} + + graph_verifier = DetailedGraphVerifier( + mocker, + expected_delegated_ops=expected_delegated_ops, + expected_non_delegated_ops={}, + ) + + # Cover also negative values to thoroughly test the operator. + dataset_creator = RandomDatasetCreator(low=-2, high=2) + + kwargs = {"atol": atol} if atol is not None else {} + output_comparator = AllCloseOutputComparator(**kwargs) + + lower_run_compare( + model, + input_shape, + graph_verifier, + dataset_creator, + output_comparator, + use_qat=use_qat, + use_new_flow_neutron_c=True, # Use the new flow. + ) + + # noinspection PyMethodMayBeStatic + def assert_not_delegated(self, model, input_shape): + delegated_ep = to_quantized_edge_program( + model, input_shape, use_new_flow_neutron_c=True + ).exported_program() + + assert not graph_contains_any_of_ops( + delegated_ep.graph, [ExecutorchDelegateCall] + ) + assert graph_contains_any_of_ops(delegated_ep.graph, [UpsampleBilinear2D]) + + def test__qat__align_corners(self, mocker, use_qat): + align_corners = True + input_shape = (1, 2, 3, 4) + output_size = (5, 7) + model = UpsampleBilinearModule(size=output_size, align_corners=align_corners) + atol = 0.015 # ~= output scale -> single bit error. 
+ self.assert_delegated(model, input_shape, mocker, use_qat=use_qat, atol=atol) + + def test__qat__not_align_corners(self, mocker, use_qat): + align_corners = False + input_shape = (1, 2, 3, 4) + output_size = (6, 8) + model = UpsampleBilinearModule(size=output_size, align_corners=align_corners) + atol = 0.015 # ~= output scale -> single bit error. + self.assert_delegated(model, input_shape, mocker, use_qat=use_qat, atol=atol) + + @pytest.mark.parametrize( + "input_shape, output_size", + [ + pytest.param((1, 2, 3, 4), (6, 8), id="batch=1, scale_h=scale_w=2"), + pytest.param( + (3, 3, 3, 5), + (6, 5), + id="batch=3, scale_h=2, scale_w=1 (no num_macs multiples)", + ), + pytest.param((2, 2, 3, 4), (3, 16), id="batch=2, scale_h=1, scale_w=4"), + pytest.param((2, 2, 3, 4), (24, 8), id="batch=2, scale_h=8, scale_w=2"), + ], + ) + def test__not_align_corners__output_size(self, mocker, input_shape, output_size): + align_corners = False + model = UpsampleBilinearModule(size=output_size, align_corners=align_corners) + atol = 0.016 # ~= output scale -> single bit error. 
+ self.assert_delegated(model, input_shape, mocker, atol=atol) + + def test__not_align_corners__output_size__unsupported(self): + align_corners = False + input_shape = (1, 2, 3, 4) + output_size = (9, 12) # scale = (3, 3) + model = UpsampleBilinearModule(size=output_size, align_corners=align_corners) + self.assert_not_delegated(model, input_shape) + + @pytest.mark.parametrize( + "input_shape, scale", + [ + pytest.param((1, 2, 3, 4), (2, 2), id="batch=1, scale_h=scale_w=2"), + pytest.param( + (3, 3, 3, 5), + (2, 1), + id="batch=3, scale_h=2, scale_w=1 (no num_macs multiples)", + ), + pytest.param((2, 2, 3, 4), (4, 1), id="batch=2, scale_h=4, scale_w=1"), + pytest.param((2, 2, 3, 4), (2, 8), id="batch=2, scale_h=2, scale_w=8"), + ], + ) + def test__not_align_corners__scales(self, mocker, input_shape, scale): + align_corners = False + model = UpsampleBilinearModule(scale=scale, align_corners=align_corners) + atol = 0.016 # ~= output scale -> single bit error. + self.assert_delegated(model, input_shape, mocker, atol=atol) + + def test__not_align_corners__scales__unsupported(self): + align_corners = False + input_shape = (1, 2, 3, 4) + scale = (3, 3) + model = UpsampleBilinearModule(scale=scale, align_corners=align_corners) + self.assert_not_delegated(model, input_shape) + + @pytest.mark.parametrize( + "input_shape, output_size", + [ + pytest.param((1, 2, 4, 5), (7, 9), id="batch=1, scale_h=scale_w=2"), + pytest.param( + (1, 3, 3, 5), + (5, 5), + id="batch=1, scale_h=2, scale_w=1 (no num_macs multiples)", + ), + pytest.param((2, 2, 4, 5), (4, 17), id="batch=2, scale_h=1, scale_w=4"), + pytest.param((1, 2, 4, 5), (25, 9), id="batch=1, scale_h=8, scale_w=2"), + ], + ) + def test__align_corners__output_size(self, mocker, input_shape, output_size): + align_corners = True + model = UpsampleBilinearModule(size=output_size, align_corners=align_corners) + atol = 0.016 # ~= output scale -> single bit error. 
+ self.assert_delegated(model, input_shape, mocker, atol=atol) + + @pytest.mark.parametrize( + "input_shape, output_size", + [ + pytest.param( + (2, 2, 4, 5), (25, 9), id="batch=2, scale_h=8, scale_w=2" + ), # Error ~= 0.47 + pytest.param( + (3, 3, 3, 5), + (5, 5), + id="batch=3, scale_h=2, scale_w=1 (no num_macs multiples)", + ), # Error ~= 3.7 + ], + ) + def test__align_corners__output_size__incorrect_output( + self, mocker, input_shape, output_size + ): + align_corners = True + model = UpsampleBilinearModule(size=output_size, align_corners=align_corners) + atol = 0.45 # Huge tolerance (still not enough to pass). + with pytest.raises(AssertionError): + self.assert_delegated(model, input_shape, mocker, atol=atol) + + def test__align_corners__output_size__unsupported(self): + align_corners = True + input_shape = (1, 2, 3, 4) + output_size = (6, 8) # Neutron scale = (5/2, 7/3) + model = UpsampleBilinearModule(size=output_size, align_corners=align_corners) + self.assert_not_delegated(model, input_shape) + + def test__align_corners__output_size__input_size_equal_to_one(self): + align_corners = True + input_shape = (1, 2, 1, 1) # Neutron scale computation would divide by zero. + output_size = (2, 2) + model = UpsampleBilinearModule(size=output_size, align_corners=align_corners) + self.assert_not_delegated(model, input_shape) + + @pytest.mark.parametrize( + "input_shape, scale", + [ + # The PyTorch scales are "weird" because the "Neutron scales" are computed differently. + # The fractions correspond to "nice" Neutron scales (1, 2, 4, or 8). 
+ pytest.param( + (1, 2, 4, 5), + (7 / 4, 9 / 5), + id="batch=1, scale_h=7/4, scale_w=9/5 (Neutron scales = (2, 2)", + ), + pytest.param( + (1, 3, 3, 5), + (5 / 3, 1), + id="batch=1, scale_h=5/3, scale_w=1 (Neutron scales = (2, 1))", + ), + pytest.param( + (2, 2, 4, 5), + (1, 17 / 5), + id="batch=2, scale_h=1, scale_w=17/5 (Neutron scales = (1, 4))", + ), + pytest.param( + (1, 2, 4, 5), + (25 / 4, 9 / 5), + id="batch=1, scale_h=25/4, scale_w=9/5 (Neutron scales = (8, 2))", + ), + ], + ) + def test__align_corners__scales(self, mocker, input_shape, scale): + align_corners = True + model = UpsampleBilinearModule(scale=scale, align_corners=align_corners) + atol = 0.016 # ~= output scale -> single bit error. + self.assert_delegated(model, input_shape, mocker, atol=atol) + + @pytest.mark.parametrize( + "input_shape, scale", + [ + pytest.param( + (2, 2, 4, 5), + (25 / 4, 9 / 5), + id="batch=3, scale_h=25/4, scale_w=9/5 (Neutron scales = (8, 2))", + ), # Error ~= 0.47 + pytest.param( + (3, 3, 3, 5), + (5 / 3, 1), + id="batch=3, scale_h=5/3, scale_w=1 (Neutron scales = (2, 1))", + ), # Error ~= 3.7 + ], + ) + def test__align_corners__scales__incorrect_output(self, mocker, input_shape, scale): + align_corners = True + model = UpsampleBilinearModule(scale=scale, align_corners=align_corners) + atol = 0.45 # Huge tolerance (still not enough to pass). 
+ with pytest.raises(AssertionError): + self.assert_delegated(model, input_shape, mocker, atol=atol) + + def test__align_corners__scales__unsupported(self): + align_corners = True + input_shape = (1, 2, 3, 4) + scale = (2, 2) # Neutron scale = (5/2, 7/3) + model = UpsampleBilinearModule(scale=scale, align_corners=align_corners) + self.assert_not_delegated(model, input_shape) + + def test__noop__alone_in_partition__not_delegated(self): + input_shape = (1, 2, 3, 4) + scale = 1 + model = UpsampleBilinearModule(scale=scale) + self.assert_not_delegated(model, input_shape) + + def test__noop__not_alone_in_partition__delegated(self, mocker): + input_shape = (1, 2, 3, 4) + scale = 1 + model = UpsampleBilinearAddModule(scale=scale) + self.assert_delegated( + model, + input_shape, + mocker, + expected_delegated_ops={UpsampleBilinear2D: 1, AddTensor: 1}, + ) diff --git a/backends/nxp/tests/ir/converter/node_converter/test_convert_upsample_nearest2d.py b/backends/nxp/tests/ir/converter/node_converter/test_convert_upsample_nearest2d.py index 3d9ec84dec9..27d1ac718a0 100644 --- a/backends/nxp/tests/ir/converter/node_converter/test_convert_upsample_nearest2d.py +++ b/backends/nxp/tests/ir/converter/node_converter/test_convert_upsample_nearest2d.py @@ -4,12 +4,15 @@ # LICENSE file in the root directory of this source tree. 
import numpy as np + +# noinspection PyUnusedImports import pytest import torch from executorch.backends.nxp.backend.edge_program_converter import ( EdgeProgramToIRConverter, ) +from executorch.backends.nxp.tests.dataset_creator import RandomDatasetCreator from executorch.backends.nxp.tests.executorch_pipeline import to_quantized_edge_program from executorch.backends.nxp.tests.executors import ( convert_run_compare, @@ -17,7 +20,14 @@ ToChannelFirstPreprocess, ToChannelLastPreprocess, ) -from executorch.exir.dialects._ops import ops as exir_ops +from executorch.backends.nxp.tests.graph_verifier import DetailedGraphVerifier +from executorch.backends.nxp.tests.nsys_testing import lower_run_compare +from executorch.backends.nxp.tests.ops_aliases import ( + AddTensor, + ExecutorchDelegateCall, + UpsampleNearest2D, +) +from executorch.backends.nxp.tests.use_qat import * # noqa F403 @pytest.fixture(autouse=True) @@ -26,11 +36,6 @@ def reseed_model_per_test_run(): np.random.seed(23) -# noinspection PyProtectedMember -ExecutorchDelegateCall = torch.ops.higher_order.executorch_call_delegate -UpsampleNearest2D = exir_ops.edge.aten.upsample_nearest2d.vec - - class UpsampleNearestModule(torch.nn.Module): def __init__(self, size=None, scale=None): @@ -41,6 +46,13 @@ def forward(self, x): return self.upsample(x) +class UpsampleNearestAddModule(UpsampleNearestModule): + + def forward(self, x): + x = super().forward(x) + return x + x + + @pytest.mark.parametrize( "input_shape, size", [ @@ -181,3 +193,120 @@ def test_convert_upsample_nearest2d__no_delegation__unsupported_size(input_shape # Make sure the `upsample` was NOT delegated (size != double of input). 
assert not graph_contains_any_of_ops(delegated_ep.graph, [ExecutorchDelegateCall]) assert graph_contains_any_of_ops(delegated_ep.graph, [UpsampleNearest2D]) + + +class TestUpsampleNearest2DNewNeutronFlow: + + # noinspection PyMethodMayBeStatic + def assert_delegated( + self, + model, + input_shape, + mocker, + use_qat=False, + expected_delegated_ops=None, + ): + if expected_delegated_ops is None: + expected_delegated_ops = {UpsampleNearest2D: 1} + + graph_verifier = DetailedGraphVerifier( + mocker, + expected_delegated_ops=expected_delegated_ops, + expected_non_delegated_ops={}, + ) + + # Cover also negative values to thoroughly test the operator. + dataset_creator = RandomDatasetCreator(low=-2, high=2) + + lower_run_compare( + model, + input_shape, + graph_verifier, + dataset_creator, + use_qat=use_qat, + use_new_flow_neutron_c=True, # Use the new flow. + ) + + # noinspection PyMethodMayBeStatic + def assert_not_delegated(self, model, input_shape): + delegated_ep = to_quantized_edge_program( + model, input_shape, use_new_flow_neutron_c=True + ).exported_program() + + assert not graph_contains_any_of_ops( + delegated_ep.graph, [ExecutorchDelegateCall] + ) + assert graph_contains_any_of_ops(delegated_ep.graph, [UpsampleNearest2D]) + + def test__qat(self, mocker, use_qat): + input_shape = (1, 2, 3, 4) + output_size = (6, 8) + model = UpsampleNearestModule(size=output_size) + self.assert_delegated(model, input_shape, mocker, use_qat=use_qat) + + @pytest.mark.parametrize( + "input_shape, output_size", + [ + pytest.param((1, 2, 3, 4), (6, 8), id="batch=1, scale_h=scale_w=2"), + pytest.param((1, 2, 3, 3), 6, id="batch=1, scale_h=scale_w=2, scalar size"), + pytest.param( + (3, 3, 3, 5), + (6, 5), + id="batch=3, scale_h=2, scale_w=1 (no num_macs multiples)", + ), + pytest.param((2, 2, 3, 4), (3, 16), id="batch=2, scale_h=1, scale_w=4"), + pytest.param((2, 2, 3, 4), (24, 8), id="batch=2, scale_h=8, scale_w=2"), + ], + ) + def test__output_size(self, mocker, input_shape, 
output_size): + model = UpsampleNearestModule(size=output_size) + self.assert_delegated(model, input_shape, mocker) + + def test__output_size__unsupported(self): + input_shape = (1, 2, 3, 4) + output_size = (9, 12) # scale = (3, 3) + model = UpsampleNearestModule(size=output_size) + self.assert_not_delegated(model, input_shape) + + @pytest.mark.parametrize( + "input_shape, scale", + [ + pytest.param((1, 2, 3, 4), (2, 2), id="batch=1, scale_h=scale_w=2"), + pytest.param( + (1, 2, 3, 4), 4, id="batch=1, scale_h=scale_w=4, scalar scale" + ), + pytest.param( + (3, 3, 3, 5), + (2, 1), + id="batch=3, scale_h=2, scale_w=1 (no num_macs multiples)", + ), + pytest.param((2, 2, 3, 4), (4, 1), id="batch=2, scale_h=4, scale_w=1"), + pytest.param((2, 2, 3, 4), (2, 8), id="batch=2, scale_h=2, scale_w=8"), + ], + ) + def test__scales(self, mocker, input_shape, scale): + model = UpsampleNearestModule(scale=scale) + self.assert_delegated(model, input_shape, mocker) + + def test__scales__unsupported(self): + input_shape = (1, 2, 3, 4) + scale = (3, 3) + model = UpsampleNearestModule(scale=scale) + self.assert_not_delegated(model, input_shape) + + def test__noop__alone_in_partition__not_delegated(self): + input_shape = (1, 2, 3, 4) + scale = 1 + model = UpsampleNearestModule(scale=scale) + self.assert_not_delegated(model, input_shape) + + def test__noop__not_alone_in_partition__delegated(self, mocker): + input_shape = (1, 2, 3, 4) + scale = 1 + model = UpsampleNearestAddModule(scale=scale) + self.assert_delegated( + model, + input_shape, + mocker, + expected_delegated_ops={UpsampleNearest2D: 1, AddTensor: 1}, + ) diff --git a/backends/transforms/aten_to_dialect_pass.py b/backends/transforms/aten_to_dialect_pass.py new file mode 100644 index 00000000000..f31df73bc58 --- /dev/null +++ b/backends/transforms/aten_to_dialect_pass.py @@ -0,0 +1,138 @@ +# Copyright 2026 Arm Limited and/or its affiliates. 
+# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + + +import traceback +from collections.abc import Callable +from dataclasses import dataclass +from typing import ClassVar, TypeAlias + +import torch + +from executorch.backends.xnnpack._passes.xnnpack_pass import ExportPass + +from executorch.exir import ExportedProgram +from torch.fx.node import Target +from torch.fx.passes.infra.pass_manager import PassResult + + +# Expected type to be returned by substitution functions. +@dataclass +class DialectNodeSpec: + op: Target + args: tuple + kwargs: dict = None + + +# Expected type to be used for substitution functions +SubstitutionFn: TypeAlias = Callable[ + [torch.fx.Node, torch.export.ExportedProgram], DialectNodeSpec | None +] + + +class AtenToDialectPass(ExportPass): + """ + General pass to convert ops 1-1 from ATen to a specific dialect. + + Usage: + 1. Subclass the pass for a specific dialect + 2. For each ATen target to be substituted, implement a function returning a DialectNodeSpec defining the + corresponding dialect op, or None if the substitution does not apply. + 3. Register each substitution function for the subclass using the decorator register_dialect_substitution + + Only one substitution function can be registered for a given target. + + The pass must be initialized with an exported_program to allow substitution functions to modify placeholders, + e.g. if the dialect ops require additional scratch buffers. + """ + + _DIALECT_SUBSTITUTIONS: ClassVar[dict[Target, SubstitutionFn]] = {} + + def __init__(self, exported_program: ExportedProgram): + super().__init__() + self.exported_program: ExportedProgram = exported_program + + # Ensure each subclass has its own substitution registry. 
+ def __init_subclass__(cls, **kwargs): + super().__init_subclass__(**kwargs) + cls._DIALECT_SUBSTITUTIONS = {} + + @classmethod + def register_dialect_substitution( + cls, target: Target + ) -> Callable[[SubstitutionFn], SubstitutionFn]: + + def decorator(func: SubstitutionFn) -> SubstitutionFn: + if target in cls._DIALECT_SUBSTITUTIONS: + raise RuntimeError( + f"Multiple substitutions registered for the same target in {cls.__name__} are not allowed." + ) + else: + cls._DIALECT_SUBSTITUTIONS[target] = func + return func + + return decorator + + def call(self, graph_module: torch.fx.GraphModule) -> PassResult: + modified = False + + for node in graph_module.graph.nodes: + if node.op != "call_function": + continue + + substitution_func = self._DIALECT_SUBSTITUTIONS.get(node.target, None) + if substitution_func is None: + continue + + dialect_node_spec = substitution_func(node, self.exported_program) + if dialect_node_spec is None: + continue + + modified = True + with graph_module.graph.inserting_before(node): + dialect_node = graph_module.graph.create_node( + "call_function", + target=dialect_node_spec.op, + args=dialect_node_spec.args, + kwargs=dialect_node_spec.kwargs or {}, + ) + + node.replace_all_uses_with(dialect_node) + + # Keep same meta dict for new node and append new trace + dialect_node.meta = node.meta + old_stack_trace = dialect_node.meta.get("stack_trace", "") + dialect_node.meta["stack_trace"] = ( + f"{old_stack_trace}\n{traceback.format_stack()[-2]}" + ) + + graph_module.graph.erase_node(node) + + if modified: + graph_module.graph.eliminate_dead_code() + graph_module.recompile() + graph_module = super().call(graph_module).graph_module + + return PassResult(graph_module, modified) + + def requires(self, graph_module): + self.ops_before = sum( + 1 for node in graph_module.graph.nodes if node.op == "call_function" + ) + return super().requires(graph_module) + + def ensures(self, graph_module: torch.fx.GraphModule) -> bool: + """Ensure that there has 
only been 1-1 substitution of call_function nodes, i.e. that the number of call_function nodes is preserved after the pass.""" + + self.ops_after = sum( + 1 for node in graph_module.graph.nodes if node.op == "call_function" + ) + if self.ops_after != self.ops_before: + raise RuntimeError( + f"{self.__class__.__name__} did not preserve the number of call_function nodes: " + f"before={self.ops_before}, after={self.ops_after}" + ) + + return super().ensures(graph_module) diff --git a/backends/transforms/targets.bzl b/backends/transforms/targets.bzl index 8c3603e293d..36466ec4aa0 100644 --- a/backends/transforms/targets.bzl +++ b/backends/transforms/targets.bzl @@ -176,6 +176,21 @@ def define_common_targets(): ], ) + runtime.python_library( + name = "aten_to_dialect_pass", + srcs = [ + "aten_to_dialect_pass.py", + ], + visibility = [ + "//executorch/backends/...", + ], + deps = [ + "//caffe2:torch", + "//executorch/backends/xnnpack/_passes:xnnpack_passes", + "//executorch/exir:lib", + ], + ) + runtime.python_library( name = "rank_0_to_rank_1", srcs = [ @@ -243,6 +258,16 @@ def define_common_targets(): ], ) + runtime.python_test( + name = "test_aten_to_dialect_pass", + srcs = [ + "test/test_aten_to_dialect_pass.py", + ], + deps = [ + "//caffe2:torch", + ":aten_to_dialect_pass", + ], + ) runtime.python_test( name = "test_rank_0_to_rank_1", diff --git a/backends/transforms/test/test_aten_to_dialect_pass.py b/backends/transforms/test/test_aten_to_dialect_pass.py new file mode 100644 index 00000000000..80dbf210d72 --- /dev/null +++ b/backends/transforms/test/test_aten_to_dialect_pass.py @@ -0,0 +1,239 @@ +# Copyright 2026 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +import pytest +import torch +from executorch.backends.transforms.aten_to_dialect_pass import ( + AtenToDialectPass, + DialectNodeSpec, +) +from executorch.backends.transforms.utils import create_constant_placeholder +from torch.export import ExportedProgram +from torch.export.graph_signature import InputKind +from torch.fx import Node + + +class AddModel(torch.nn.Module): + def forward(self, x, y): + return torch.ops.aten.add.Tensor(x, y) + + +class AddAlphaModel(torch.nn.Module): + def forward(self, x, y): + return torch.ops.aten.add.Tensor(x, y, alpha=2) + + +def _count_target(graph_module: torch.fx.GraphModule, target) -> int: + return sum( + 1 + for node in graph_module.graph.nodes + if node.op == "call_function" and node.target == target + ) + + +def _get_target_node(graph_module: torch.fx.GraphModule, target) -> Node: + nodes = [ + node + for node in graph_module.graph.nodes + if node.op == "call_function" and node.target == target + ] + assert len(nodes) == 1 + return nodes[0] + + +def _export_add_model() -> ExportedProgram: + return torch.export.export( + AddModel().eval(), (torch.randn(2, 3), torch.randn(2, 3)), strict=True + ) + + +def _export_add_alpha_model() -> ExportedProgram: + return torch.export.export( + AddAlphaModel().eval(), (torch.randn(2, 3), torch.randn(2, 3)), strict=True + ) + + +def test_rewrites_node_when_substitution_matches() -> None: + class _TestAtenToDialectPass(AtenToDialectPass): + pass + + @_TestAtenToDialectPass.register_dialect_substitution(torch.ops.aten.add.Tensor) + def replace_add_with_sub( + node: Node, exported_program: ExportedProgram + ) -> DialectNodeSpec | None: + del exported_program + return DialectNodeSpec(torch.ops.aten.sub.Tensor, node.args) + + exported_program = _export_add_model() + result = _TestAtenToDialectPass(exported_program=exported_program).call( + exported_program.graph_module + ) + + assert result.modified + assert _count_target(result.graph_module, torch.ops.aten.add.Tensor) == 0 + assert 
_count_target(result.graph_module, torch.ops.aten.sub.Tensor) == 1 + + +def test_substitution_can_add_state_dict_placeholder() -> None: + class _TestAtenToDialectPass(AtenToDialectPass): + pass + + @_TestAtenToDialectPass.register_dialect_substitution(torch.ops.aten.add.Tensor) + def replace_add_rhs_with_constant( + node: Node, exported_program: ExportedProgram + ) -> DialectNodeSpec | None: + first_placeholder = next( + graph_node + for graph_node in node.graph.nodes + if graph_node.op == "placeholder" + ) + with node.graph.inserting_before(first_placeholder): + const_node = create_constant_placeholder( + exp_program=exported_program, + graph=node.graph, + name="test_constant", + kind=InputKind.PARAMETER, + data=torch.ones(2, 3), + ) + return DialectNodeSpec(torch.ops.aten.add.Tensor, (node.args[0], const_node)) + + exported_program = _export_add_model() + result = _TestAtenToDialectPass(exported_program=exported_program).call( + exported_program.graph_module + ) + + assert result.modified + assert "test_constant" in exported_program.state_dict + assert torch.equal(exported_program.state_dict["test_constant"], torch.ones(2, 3)) + assert ( + exported_program.graph_signature.inputs_to_parameters["test_constant"] + == "test_constant" + ) + add_node = _get_target_node(result.graph_module, torch.ops.aten.add.Tensor) + assert add_node.args[1].name == "test_constant" + + x = torch.full((2, 3), 2.0) + y = torch.full((2, 3), 5.0) + torch.testing.assert_close(exported_program.module()(x, y), x + torch.ones_like(x)) + + +def test_substitution_can_change_kwargs() -> None: + class _TestAtenToDialectPass(AtenToDialectPass): + pass + + @_TestAtenToDialectPass.register_dialect_substitution(torch.ops.aten.add.Tensor) + def replace_add_alpha( + node: Node, exported_program: ExportedProgram + ) -> DialectNodeSpec | None: + del exported_program + return DialectNodeSpec(torch.ops.aten.add.Tensor, node.args, {"alpha": 3}) + + exported_program = _export_add_alpha_model() + result = 
_TestAtenToDialectPass(exported_program=exported_program).call( + exported_program.graph_module + ) + + assert result.modified + add_node = _get_target_node(result.graph_module, torch.ops.aten.add.Tensor) + assert add_node.kwargs["alpha"] == 3 + + x = torch.full((2, 3), 2.0) + y = torch.full((2, 3), 5.0) + torch.testing.assert_close(exported_program.module()(x, y), x + 3 * y) + + +def test_preserves_meta_when_substitution_matches() -> None: + class _TestAtenToDialectPass(AtenToDialectPass): + pass + + @_TestAtenToDialectPass.register_dialect_substitution(torch.ops.aten.add.Tensor) + def replace_add_with_sub( + node: Node, exported_program: ExportedProgram + ) -> DialectNodeSpec | None: + del exported_program + return DialectNodeSpec(torch.ops.aten.sub.Tensor, node.args) + + exported_program = _export_add_model() + add_node = _get_target_node( + exported_program.graph_module, torch.ops.aten.add.Tensor + ) + add_node.meta["test_sentinel"] = "kept" + add_node.meta["stack_trace"] = "original stack" + + result = _TestAtenToDialectPass(exported_program=exported_program).call( + exported_program.graph_module + ) + + sub_node = _get_target_node(result.graph_module, torch.ops.aten.sub.Tensor) + assert sub_node.meta["test_sentinel"] == "kept" + assert sub_node.meta["stack_trace"].startswith("original stack\n") + assert sub_node.meta["stack_trace"] != "original stack" + + +def test_keeps_node_when_substitution_returns_none() -> None: + class _TestAtenToDialectPass(AtenToDialectPass): + pass + + @_TestAtenToDialectPass.register_dialect_substitution(torch.ops.aten.add.Tensor) + def do_not_replace( + node: Node, exported_program: ExportedProgram + ) -> DialectNodeSpec | None: + del node, exported_program + return None + + exported_program = _export_add_model() + result = _TestAtenToDialectPass(exported_program=exported_program).call( + exported_program.graph_module + ) + + assert not result.modified + assert _count_target(result.graph_module, torch.ops.aten.add.Tensor) == 1 + 
assert _count_target(result.graph_module, torch.ops.aten.sub.Tensor) == 0 + + +def test_raises_when_duplicate_substitution_is_registered() -> None: + class _TestAtenToDialectPass(AtenToDialectPass): + pass + + @_TestAtenToDialectPass.register_dialect_substitution(torch.ops.aten.add.Tensor) + def first_replace( + node: Node, exported_program: ExportedProgram + ) -> DialectNodeSpec | None: + del exported_program + return DialectNodeSpec(torch.ops.aten.sub.Tensor, node.args) + + with pytest.raises(RuntimeError, match="Multiple substitutions registered"): + + @_TestAtenToDialectPass.register_dialect_substitution(torch.ops.aten.add.Tensor) + def second_replace( + node: Node, exported_program: ExportedProgram + ) -> DialectNodeSpec | None: + del exported_program + return DialectNodeSpec(torch.ops.aten.mul.Tensor, node.args) + + +def test_ensures_raises_when_call_function_count_changes() -> None: + class _TestAtenToDialectPass(AtenToDialectPass): + pass + + exported_program = _export_add_model() + graph_module = exported_program.graph_module + test_pass = _TestAtenToDialectPass(exported_program=exported_program) + test_pass.requires(graph_module) + + placeholders = [ + node for node in graph_module.graph.nodes if node.op == "placeholder" + ] + output_node = next(node for node in graph_module.graph.nodes if node.op == "output") + with graph_module.graph.inserting_before(output_node): + graph_module.graph.create_node( + "call_function", + target=torch.ops.aten.sub.Tensor, + args=tuple(placeholders), + kwargs={}, + ) + + with pytest.raises(RuntimeError, match="did not preserve"): + test_pass.ensures(graph_module) diff --git a/examples/models/gemma4_31b/README.md b/examples/models/gemma4_31b/README.md index da4aa893079..c6ac10748d8 100644 --- a/examples/models/gemma4_31b/README.md +++ b/examples/models/gemma4_31b/README.md @@ -15,6 +15,7 @@ both export and eager inference: |---|---|---| | `quantize_and_save.py` | bf16 HF checkpoint → quantized checkpoint (one-time) | ~30 GB 
CPU | | `export.py --prequantized ` | quantized checkpoint → `model.pte` + `model.ptd` | ~24 GB CPU + CUDA for packing | +| `export.py --gguf [--backend mlx]` | GGUF file (Q4_K_M, etc.) → `model.pte` + `model.ptd` | ~24 GB CPU | | `inference.py --prequantized ` | quantized checkpoint → eager generation under `torch.compile` | ~24 GB GPU | | `inference.py --gguf ` | GGUF file (Q4_K_M, etc.) → eager generation | ~24 GB GPU | | `export.py --model-dir ` | one-shot bf16 → quantize → export (no intermediate file) | ~30 GB CPU + CUDA for packing | diff --git a/examples/models/gemma4_31b/export.py b/examples/models/gemma4_31b/export.py index 046e365947b..bd648f534b5 100644 --- a/examples/models/gemma4_31b/export.py +++ b/examples/models/gemma4_31b/export.py @@ -443,7 +443,12 @@ def main() -> None: backend=args.backend, ) - export_and_lower(model, config, args.output_dir, backend=args.backend) + if args.gguf and args.backend == "mlx": + os.environ["ET_MLX_ALLOW_NON_FUSED_QUANTIZED_OPS"] = "1" + try: + export_and_lower(model, config, args.output_dir, backend=args.backend) + finally: + os.environ.pop("ET_MLX_ALLOW_NON_FUSED_QUANTIZED_OPS", None) if __name__ == "__main__": diff --git a/examples/models/gemma4_31b/gguf_loader.py b/examples/models/gemma4_31b/gguf_loader.py index 3e50991e553..35dddb5a0dc 100644 --- a/examples/models/gemma4_31b/gguf_loader.py +++ b/examples/models/gemma4_31b/gguf_loader.py @@ -12,6 +12,7 @@ Usage: model, config = load_gguf_model("model.gguf", backend="cuda") + model, config = load_gguf_model("model.gguf", backend="mlx") """ from typing import Optional @@ -104,10 +105,11 @@ def load_gguf_model( Streams tensors one at a time for low peak memory. GGUF ties ``embed_tokens`` and ``lm_head`` into a single Q4_K tensor. - We untie them: the embedding is dequantized to bf16 (``nn.Embedding`` - needs gather, which ``Int4TilePackedTo4dTensor`` does not support), - while ``lm_head`` keeps the original Q4_K quantization (``nn.Linear`` - matmul via tinygemm). 
+ We untie them so ``lm_head`` keeps the original Q4_K quantization. + On CUDA, the embedding is dequantized to bf16 because ``Int4Tensor`` + does not support the gather op that ``nn.Embedding`` requires. On + MLX, the embedding stays quantized — ``QuantizedEmbeddingHandler`` + handles quantized gather natively. Returns ``(model, config)``. """ @@ -120,8 +122,12 @@ def load_gguf_model( from executorch.examples.models.gemma4_31b.quant import DEFAULT_CUDA_PACKERS packers = DEFAULT_CUDA_PACKERS + elif backend == "mlx": + from executorch.examples.models.gemma4_31b.quant import DEFAULT_MLX_PACKERS + + packers = DEFAULT_MLX_PACKERS else: - raise ValueError(f"Unsupported backend: {backend!r}. Supported: 'cuda'.") + raise ValueError(f"Unsupported backend: {backend!r}. Supported: 'cuda', 'mlx'.") config = Gemma4_31BConfig(max_seq_len=max_seq_len) @@ -143,7 +149,8 @@ def load_gguf_model( if model_key == "embed_tokens.weight" and isinstance(result, Int4Tensor): embed_quant = result - result = dequantize_weight(result, torch.bfloat16) + if backend == "cuda": + result = dequantize_weight(result, torch.bfloat16) pack_one(model, model_key, result, packers) diff --git a/examples/models/gemma4_31b/quant/README.md b/examples/models/gemma4_31b/quant/README.md index 2eacced4387..92ddbf97243 100644 --- a/examples/models/gemma4_31b/quant/README.md +++ b/examples/models/gemma4_31b/quant/README.md @@ -50,5 +50,3 @@ The format is compatible with torchao's `save_pretrained` / `load_pretrained`. - `pack_metal.py` — Metal backend packer. - `gguf.py` — extend with Q5_K, Q8_0 GGUF quant types. -- Upstream `Int4TilePackedTo4dTensor.from_int4_tensor()` to torchao - to replace the manual conversion in `pack_int4_for_cuda`. 
diff --git a/examples/models/gemma4_31b/quant/pack_mlx.py b/examples/models/gemma4_31b/quant/pack_mlx.py index 63aeca426a8..d627c9c437c 100644 --- a/examples/models/gemma4_31b/quant/pack_mlx.py +++ b/examples/models/gemma4_31b/quant/pack_mlx.py @@ -22,7 +22,7 @@ from .pack import ModulePackerFn, pack_model # noqa: F401 -_MLX_SUPPORTED_GROUP_SIZES = (128, 64, 32) +_MLX_SUPPORTED_GROUP_SIZES = (128, 64, 32, 16) # --------------------------------------------------------------------------- @@ -126,7 +126,9 @@ def pack_for_mlx(module: nn.Module, weights: dict[str, torch.Tensor]) -> None: default dispatch produces the ``dequantize_affine → linear`` pattern MLX expects. Regroups to a compatible group_size when needed (e.g. per-axis group_size=5376 → group_size=128) since MLX's - ``parse_dequant_node`` only accepts group_size in {32, 64, 128}. + ``parse_dequant_node`` only accepts group_size in {16, 32, 64, 128}. + Group sizes ≥ 32 use the fused ``QuantizedMatmulNode``; group_size=16 + (e.g. GGUF Q6_K) falls back to ``DequantizeNode`` + matmul at export. 
""" from torchao.quantization import IntxUnpackedToInt8Tensor from torchao.quantization.quantize_.workflows.int4.int4_tensor import Int4Tensor diff --git a/examples/models/gemma4_31b/quant/tests/test_pack_mlx.py b/examples/models/gemma4_31b/quant/tests/test_pack_mlx.py index ffb2e0e2dd3..2e6310b9c10 100644 --- a/examples/models/gemma4_31b/quant/tests/test_pack_mlx.py +++ b/examples/models/gemma4_31b/quant/tests/test_pack_mlx.py @@ -146,7 +146,7 @@ def test_regroup_preserves_dequant(self): class TestMlxGroupSize(unittest.TestCase): def test_passthrough(self): - for gs in (32, 64, 128): + for gs in (16, 32, 64, 128): self.assertEqual(_mlx_group_size(gs, 256), gs) def test_regroup_5376(self): @@ -157,7 +157,49 @@ def test_regroup_256(self): def test_rejects_indivisible(self): with self.assertRaises(ValueError): - _mlx_group_size(48, 48) + _mlx_group_size(7, 7) + + +class TestPackLinearGroupSize16(unittest.TestCase): + """Packing group_size=16 weights (GGUF Q6_K) preserves semantics.""" + + def _make_gs16_tensor(self, N=64, K=128): + from torchao.quantization import IntxUnpackedToInt8Tensor + + return IntxUnpackedToInt8Tensor( + qdata=torch.randint(-32, 31, (N, K), dtype=torch.int8), + scale=torch.randn(N, K // 16, dtype=torch.bfloat16), + zero_point=torch.zeros(N, K // 16, dtype=torch.int8), + target_dtype=torch.int8, + block_size=(1, 16), + dtype=torch.bfloat16, + activation_quantization=None, + ) + + def test_dequant_preserves_values(self): + """Packing preserves the dequantized weight values.""" + w = self._make_gs16_tensor(64, 128) + before = dequantize_weight(w, torch.float32) + + module = nn.Linear(128, 64, bias=False) + pack_for_mlx(module, {"weight": w}) + after = dequantize_weight(module.weight.data, torch.float32) + + self.assertTrue( + torch.allclose(before, after, atol=1e-5), + f"max diff: {(before - after).abs().max():.6g}", + ) + + def test_forward_produces_valid_output(self): + """Packed gs=16 weight produces finite output in a linear forward.""" + w = 
self._make_gs16_tensor(64, 128) + module = nn.Linear(128, 64, bias=False) + pack_for_mlx(module, {"weight": w}) + + x = torch.randn(1, 128, dtype=torch.bfloat16) + out = torch.nn.functional.linear(x, module.weight.data.dequantize()) + self.assertEqual(out.shape, torch.Size([1, 64])) + self.assertFalse(torch.isnan(out).any()) class TestPackEmbeddingForMlx(unittest.TestCase): diff --git a/examples/models/gemma4_31b/tests/test_mlx_pipeline.py b/examples/models/gemma4_31b/tests/test_mlx_pipeline.py index 0e62ab88e4b..37f61fddb0f 100644 --- a/examples/models/gemma4_31b/tests/test_mlx_pipeline.py +++ b/examples/models/gemma4_31b/tests/test_mlx_pipeline.py @@ -244,5 +244,84 @@ def test_export_to_pte(self): self.assertTrue(os.path.exists(os.path.join(out_dir, "model.pte"))) +class TestGgufMlxPipeline(unittest.TestCase): + """Test GGUF → MLX loading path with synthetic Q6_K-like tensors.""" + + def test_load_gguf_model_mlx_backend(self): + """gguf_loader.load_gguf_model accepts backend='mlx'.""" + try: + import gguf # noqa: F401 + except ModuleNotFoundError: + self.skipTest("gguf package not installed") + + from executorch.examples.models.gemma4_31b.gguf_loader import load_gguf_model + + # Will fail on missing file, but NOT on "Unsupported backend". 
+ with self.assertRaisesRegex((FileNotFoundError, OSError, RuntimeError), ".*"): + load_gguf_model("/nonexistent.gguf", backend="mlx") + + def test_mlx_backend_rejects_unknown(self): + from executorch.examples.models.gemma4_31b.gguf_loader import load_gguf_model + + with self.assertRaisesRegex(ValueError, "Unsupported backend"): + load_gguf_model("/nonexistent.gguf", backend="tpu") + + def test_gs16_packing_preserves_values(self): + """Q6_K-like weight (gs=16) preserves dequantized values after packing.""" + from executorch.examples.models.gemma4_31b.quant.pack_mlx import pack_for_mlx + from executorch.examples.models.gemma4_31b.quant.quantize import ( + dequantize_weight, + ) + from torchao.quantization import IntxUnpackedToInt8Tensor + + w = IntxUnpackedToInt8Tensor( + qdata=torch.randint(-32, 31, (64, 128), dtype=torch.int8), + scale=torch.randn(64, 8, dtype=torch.bfloat16), + zero_point=torch.zeros(64, 8, dtype=torch.int8), + target_dtype=torch.int8, + block_size=(1, 16), + dtype=torch.bfloat16, + activation_quantization=None, + ) + before = dequantize_weight(w, torch.float32) + + module = nn.Linear(128, 64, bias=False) + pack_for_mlx(module, {"weight": w}) + after = dequantize_weight(module.weight.data, torch.float32) + + self.assertTrue( + torch.allclose(before, after, atol=1e-5), + f"max diff: {(before - after).abs().max():.6g}", + ) + + def test_embedding_packing_preserves_values(self): + """MLX embedding packing preserves dequantized weight values.""" + from executorch.examples.models.gemma4_31b.quant.pack_mlx import pack_for_mlx + from executorch.examples.models.gemma4_31b.quant.quantize import ( + dequantize_weight, + ) + from torchao.quantization import IntxUnpackedToInt8Tensor + + w = IntxUnpackedToInt8Tensor( + qdata=torch.randint(-8, 7, (256, 128), dtype=torch.int8), + scale=torch.randn(256, 4, dtype=torch.bfloat16), + zero_point=torch.zeros(256, 4, dtype=torch.bfloat16), + target_dtype=torch.int4, + block_size=(1, 32), + dtype=torch.bfloat16, + 
activation_quantization=None, + ) + before = dequantize_weight(w, torch.float32) + + module = nn.Embedding(256, 128) + pack_for_mlx(module, {"weight": w}) + after = dequantize_weight(module.weight.data, torch.float32) + + self.assertTrue( + torch.allclose(before, after, atol=1e-5), + f"max diff: {(before - after).abs().max():.6g}", + ) + + if __name__ == "__main__": unittest.main() diff --git a/examples/nxp/executor_runner/nxp_executor_runner.cpp b/examples/nxp/executor_runner/nxp_executor_runner.cpp index 65f5831e5c5..52d7c778227 100644 --- a/examples/nxp/executor_runner/nxp_executor_runner.cpp +++ b/examples/nxp/executor_runner/nxp_executor_runner.cpp @@ -384,71 +384,30 @@ int main(int argc, char* argv[]) { torch::executor::MemoryManager memory_manager( &method_allocator, &planned_memory, &tmp_allocator); - Result method = - program->load_method(method_name, &memory_manager); - if (!method.ok()) { - fprintf( - stderr, - "Loading of method (%s) failed with status %" PRIu32 "...\n", - method_name, - (unsigned int)method.error()); - exit(-1); - } - printf("Method loaded...\n"); - - Error status = Error::Ok; - if (!FLAGS_dataset.empty()) { - // Go through entire dataset for this model. - FLAGS_dataset += "/"; - while (dataset = readdir(datasetDir)) { - if (!strcmp(dataset->d_name, ".") || !strcmp(dataset->d_name, "..")) - continue; - - std::vector inputsData; - inputsData.push_back(FLAGS_dataset + dataset->d_name); - // Set input and call inferrence. - setInputs(method.get(), inputsData); - - status = method->execute(); - if (status != Error::Ok) { - fprintf( - stderr, - "Execution of method %s failed with status %" PRIu32 "...\n", - method_name, - (unsigned int)status); - exit(-1); - } else { - printf("Method executed successfully...\n"); - } - - // Save outputs in binary files. - saveOutputs(method.get(), FLAGS_output, dataset->d_name); - // Print result with highest confidence. 
- printOutput(method.get(), FLAGS_output, dataset->d_name); + { + Result method = + program->load_method(method_name, &memory_manager); + if (!method.ok()) { + fprintf( + stderr, + "Loading of method (%s) failed with status %" PRIu32 "...\n", + method_name, + (unsigned int)method.error()); + exit(-1); } - closedir(datasetDir); - } else if (!FLAGS_inputs.empty()) { - std::vector inputPaths; - - // Validate and process inputs and separate into two lists. - processInputs(inputPaths, FLAGS_inputs); - - if (std::all_of(inputPaths.begin(), inputPaths.end(), isDirectory)) { - // Inputs are in directories - use files in each directory as the inputs. - std::vector inputsData; - for (std::string& inputDir : inputPaths) { - datasetDir = opendir(inputDir.c_str()); - while (dataset = readdir(datasetDir)) { - if (!strcmp(dataset->d_name, ".") || !strcmp(dataset->d_name, "..")) - continue; - - inputsData.push_back(inputDir + "/" + dataset->d_name); - } - closedir(datasetDir); - - // Sort inputsData to ensure correct input ordering - std::sort(inputsData.begin(), inputsData.end()); - + printf("Method loaded...\n"); + + Error status = Error::Ok; + if (!FLAGS_dataset.empty()) { + // Go through entire dataset for this model. + FLAGS_dataset += "/"; + while (dataset = readdir(datasetDir)) { + if (!strcmp(dataset->d_name, ".") || !strcmp(dataset->d_name, "..")) + continue; + + std::vector inputsData; + inputsData.push_back(FLAGS_dataset + dataset->d_name); + // Set input and call inferrence. setInputs(method.get(), inputsData); status = method->execute(); @@ -463,37 +422,81 @@ int main(int argc, char* argv[]) { printf("Method executed successfully...\n"); } - if (inputDir.back() == '/') - inputDir.pop_back(); - - auto pos = inputDir.find_last_of('/'); - if (pos != std::string::npos) - inputDir = inputDir.substr(pos + 1); - // Save outputs in binary files. 
- saveOutputs(method.get(), FLAGS_output, inputDir.c_str()); - inputsData.clear(); + saveOutputs(method.get(), FLAGS_output, dataset->d_name); + // Print result with highest confidence. + printOutput(method.get(), FLAGS_output, dataset->d_name); } - } else { - // Inputs are files. - setInputs(method.get(), inputPaths); - - status = method->execute(); - if (status != Error::Ok) { - fprintf( - stderr, - "Execution of method %s failed with status %" PRIu32 "...\n", - method_name, - (unsigned int)status); - exit(-1); + closedir(datasetDir); + } else if (!FLAGS_inputs.empty()) { + std::vector inputPaths; + + // Validate and process inputs and separate into two lists. + processInputs(inputPaths, FLAGS_inputs); + + if (std::all_of(inputPaths.begin(), inputPaths.end(), isDirectory)) { + // Inputs are in directories - use files in each directory as the + // inputs. + std::vector inputsData; + for (std::string& inputDir : inputPaths) { + datasetDir = opendir(inputDir.c_str()); + while (dataset = readdir(datasetDir)) { + if (!strcmp(dataset->d_name, ".") || !strcmp(dataset->d_name, "..")) + continue; + + inputsData.push_back(inputDir + "/" + dataset->d_name); + } + closedir(datasetDir); + + // Sort inputsData to ensure correct input ordering + std::sort(inputsData.begin(), inputsData.end()); + + setInputs(method.get(), inputsData); + + status = method->execute(); + if (status != Error::Ok) { + fprintf( + stderr, + "Execution of method %s failed with status %" PRIu32 "...\n", + method_name, + (unsigned int)status); + exit(-1); + } else { + printf("Method executed successfully...\n"); + } + + if (inputDir.back() == '/') + inputDir.pop_back(); + + auto pos = inputDir.find_last_of('/'); + if (pos != std::string::npos) + inputDir = inputDir.substr(pos + 1); + + // Save outputs in binary files. + saveOutputs(method.get(), FLAGS_output, inputDir.c_str()); + inputsData.clear(); + } } else { - printf("Method executed successfully...\n"); - } + // Inputs are files. 
+ setInputs(method.get(), inputPaths); + + status = method->execute(); + if (status != Error::Ok) { + fprintf( + stderr, + "Execution of method %s failed with status %" PRIu32 "...\n", + method_name, + (unsigned int)status); + exit(-1); + } else { + printf("Method executed successfully...\n"); + } - // Save outputs in binary files. - saveOutputs(method.get(), FLAGS_output); + // Save outputs in binary files. + saveOutputs(method.get(), FLAGS_output); + } } - } + } // Destruct the method object before destroying the Neutron Device. printf("Finished...\n"); diff --git a/extension/android/executorch_android/src/androidTest/java/org/pytorch/executorch/AsrModuleInstrumentationTest.kt b/extension/android/executorch_android/src/androidTest/java/org/pytorch/executorch/AsrModuleInstrumentationTest.kt new file mode 100644 index 00000000000..fe8a168e406 --- /dev/null +++ b/extension/android/executorch_android/src/androidTest/java/org/pytorch/executorch/AsrModuleInstrumentationTest.kt @@ -0,0 +1,260 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ +package org.pytorch.executorch + +import androidx.test.ext.junit.runners.AndroidJUnit4 +import java.io.File +import java.io.IOException +import org.apache.commons.io.FileUtils +import org.junit.Assert.assertEquals +import org.junit.Assert.assertFalse +import org.junit.Assert.assertTrue +import org.junit.Assert.fail +import org.junit.Assume.assumeNotNull +import org.junit.Test +import org.junit.runner.RunWith +import org.pytorch.executorch.TestFileUtils.getTestFilePath +import org.pytorch.executorch.extension.asr.AsrCallback +import org.pytorch.executorch.extension.asr.AsrModule +import org.pytorch.executorch.extension.asr.AsrTranscribeConfig + +/** + * Instrumentation tests for [AsrModule], [AsrTranscribeConfig], and [AsrCallback]. 
+ * + * Tests cover: + * - Constructor validation (invalid model/tokenizer/preprocessor paths) + * - AsrTranscribeConfig builder and validation + * - Lifecycle (close idempotency, use-after-close) + * - Transcribe validation (invalid WAV path) + * + * The test fixture is the TinyStories-110M LLM model, NOT an ASR model, so functional transcription + * tests are not possible. Tests that require a valid AsrModule instance handle the case where + * nativeCreate fails (stories.pte lacks encoder/text_decoder methods). + */ +@RunWith(AndroidJUnit4::class) +class AsrModuleInstrumentationTest { + + // ─── Constructor validation ───────────────────────────────────────────────── + + @Test(timeout = 30_000) + fun testInvalidModelPathThrows() { + try { + AsrModule("/nonexistent/model.pte", "/nonexistent/tokenizer") + fail("Should throw for invalid model path") + } catch (_: IllegalArgumentException) { + // Expected: require(modelFile.canRead() && modelFile.isFile) + } + } + + @Test(timeout = 30_000) + fun testInvalidTokenizerPathThrows() { + val modelFile = provisionModelFile() + assumeNotNull("Test resource $MODEL_FILE_NAME not available", modelFile) + try { + AsrModule(modelFile!!.absolutePath, "/nonexistent/tokenizer") + fail("Should throw for invalid tokenizer path") + } catch (_: IllegalArgumentException) { + // Expected: require(tokenizerFile.exists()) + } + } + + @Test(timeout = 30_000) + fun testInvalidPreprocessorPathThrows() { + val modelFile = provisionModelFile() + val tokenizerFile = provisionTokenizerFile() + assumeNotNull("Test resource $MODEL_FILE_NAME not available", modelFile) + assumeNotNull("Test resource $TOKENIZER_FILE_NAME not available", tokenizerFile) + try { + AsrModule( + modelFile!!.absolutePath, + tokenizerFile!!.absolutePath, + preprocessorPath = "/nonexistent/preprocessor.pte", + ) + fail("Should throw for invalid preprocessor path") + } catch (_: IllegalArgumentException) { + // Expected: require(preprocessorFile.canRead() && 
preprocessorFile.isFile) + } + } + + @Test(timeout = 30_000) + fun testNonAsrModelFailsGracefully() { + val modelFile = provisionModelFile() + val tokenizerFile = provisionTokenizerFile() + assumeNotNull("Test resource $MODEL_FILE_NAME not available", modelFile) + assumeNotNull("Test resource $TOKENIZER_FILE_NAME not available", tokenizerFile) + try { + val module = AsrModule(modelFile!!.absolutePath, tokenizerFile!!.absolutePath) + // If construction succeeds (model was accepted), verify basic state + assertTrue("Module should be valid after construction", module.isValid) + module.close() + } catch (_: ExecutorchRuntimeException) { + // Expected: nativeCreate returns 0 for non-ASR model + } catch (_: RuntimeException) { + // Also acceptable: native layer rejects the model + } + } + + // ─── Lifecycle ────────────────────────────────────────────────────────────── + + @Test(timeout = 30_000) + fun testCloseIsIdempotent() { + val module = tryCreateAsrModule() ?: return + module.close() + module.close() + module.close() + assertFalse("isValid must be false after close", module.isValid) + } + + @Test(timeout = 30_000) + fun testLoadAfterCloseThrows() { + val module = tryCreateAsrModule() ?: return + module.close() + try { + module.load() + fail("load() after close() must throw IllegalStateException") + } catch (_: IllegalStateException) { + // Expected + } + } + + @Test(timeout = 30_000) + fun testTranscribeAfterCloseThrows() { + val module = tryCreateAsrModule() ?: return + module.close() + try { + module.transcribe("/some/audio.wav") + fail("transcribe() after close() must throw IllegalStateException") + } catch (_: IllegalStateException) { + // Expected + } + } + + @Test(timeout = 30_000) + fun testIsValidAndIsLoadedState() { + val module = tryCreateAsrModule() ?: return + assertTrue("Module should be valid after construction", module.isValid) + module.close() + assertFalse("Module should not be valid after close", module.isValid) + assertFalse("Module should not be 
loaded after close", module.isLoaded) + } + + // ─── Transcribe validation ────────────────────────────────────────────────── + + @Test(timeout = 30_000) + fun testTranscribeInvalidWavPathThrows() { + val module = tryCreateAsrModule() ?: return + try { + module.transcribe("/nonexistent/audio.wav") + fail("transcribe() with invalid WAV path must throw") + } catch (_: IllegalArgumentException) { + // Expected: require(wavFile.canRead() && wavFile.isFile) + } finally { + module.close() + } + } + + // ─── AsrTranscribeConfig ──────────────────────────────────────────────────── + + @Test + fun testConfigDefaults() { + val config = AsrTranscribeConfig() + assertEquals(128L, config.maxNewTokens) + assertEquals(0.0f, config.temperature, 0.0f) + assertEquals(0L, config.decoderStartTokenId) + } + + @Test + fun testConfigBuilder() { + val config = + AsrTranscribeConfig.Builder() + .setMaxNewTokens(256) + .setTemperature(0.7f) + .setDecoderStartTokenId(50258) + .build() + assertEquals(256L, config.maxNewTokens) + assertEquals(0.7f, config.temperature, 0.001f) + assertEquals(50258L, config.decoderStartTokenId) + } + + @Test + fun testConfigCustomValues() { + val config = AsrTranscribeConfig(maxNewTokens = 64, temperature = 0.5f, decoderStartTokenId = 1) + assertEquals(64L, config.maxNewTokens) + assertEquals(0.5f, config.temperature, 0.001f) + assertEquals(1L, config.decoderStartTokenId) + } + + @Test(expected = IllegalArgumentException::class) + fun testConfigZeroMaxNewTokensThrows() { + AsrTranscribeConfig(maxNewTokens = 0) + } + + @Test(expected = IllegalArgumentException::class) + fun testConfigNegativeMaxNewTokensThrows() { + AsrTranscribeConfig(maxNewTokens = -1) + } + + @Test(expected = IllegalArgumentException::class) + fun testConfigNegativeTemperatureThrows() { + AsrTranscribeConfig(temperature = -0.1f) + } + + @Test(expected = IllegalArgumentException::class) + fun testConfigBuilderZeroMaxNewTokensThrows() { + AsrTranscribeConfig.Builder().setMaxNewTokens(0).build() 
+ } + + @Test(expected = IllegalArgumentException::class) + fun testConfigBuilderNegativeTemperatureThrows() { + AsrTranscribeConfig.Builder().setTemperature(-1.0f).build() + } + + @Test + fun testConfigDataClassEquality() { + val a = AsrTranscribeConfig(maxNewTokens = 100, temperature = 0.5f, decoderStartTokenId = 42) + val b = AsrTranscribeConfig(maxNewTokens = 100, temperature = 0.5f, decoderStartTokenId = 42) + assertEquals(a, b) + assertEquals(a.hashCode(), b.hashCode()) + } + + // ─── Helpers ──────────────────────────────────────────────────────────────── + + @Throws(IOException::class) + private fun provisionModelFile(): File? { + val pteFile = File(getTestFilePath(MODEL_FILE_NAME)) + val stream = javaClass.getResourceAsStream(MODEL_FILE_NAME) ?: return null + stream.use { FileUtils.copyInputStreamToFile(it, pteFile) } + return pteFile + } + + @Throws(IOException::class) + private fun provisionTokenizerFile(): File? { + val tokenizerFile = File(getTestFilePath(TOKENIZER_FILE_NAME)) + val stream = javaClass.getResourceAsStream(TOKENIZER_FILE_NAME) ?: return null + stream.use { FileUtils.copyInputStreamToFile(it, tokenizerFile) } + return tokenizerFile + } + + private fun tryCreateAsrModule(): AsrModule? 
{ + val modelFile = provisionModelFile() + val tokenizerFile = provisionTokenizerFile() + assumeNotNull("Test resource $MODEL_FILE_NAME not available", modelFile) + assumeNotNull("Test resource $TOKENIZER_FILE_NAME not available", tokenizerFile) + return try { + AsrModule(modelFile!!.absolutePath, tokenizerFile!!.absolutePath) + } catch (_: RuntimeException) { + // nativeCreate may reject non-ASR models — skip lifecycle tests in that case + null + } + } + + companion object { + private const val MODEL_FILE_NAME = "/stories.pte" + private const val TOKENIZER_FILE_NAME = "/tokenizer.bin" + } +} diff --git a/extension/android/executorch_android/src/androidTest/java/org/pytorch/executorch/LlmLoraInstrumentationTest.kt b/extension/android/executorch_android/src/androidTest/java/org/pytorch/executorch/LlmLoraInstrumentationTest.kt new file mode 100644 index 00000000000..a8d35b09de2 --- /dev/null +++ b/extension/android/executorch_android/src/androidTest/java/org/pytorch/executorch/LlmLoraInstrumentationTest.kt @@ -0,0 +1,291 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ +package org.pytorch.executorch + +import androidx.test.ext.junit.runners.AndroidJUnit4 +import java.io.File +import java.io.IOException +import org.apache.commons.io.FileUtils +import org.junit.After +import org.junit.Assert.assertTrue +import org.junit.Assert.fail +import org.junit.Before +import org.junit.Test +import org.junit.runner.RunWith +import org.pytorch.executorch.TestFileUtils.getTestFilePath +import org.pytorch.executorch.extension.llm.LlmCallback +import org.pytorch.executorch.extension.llm.LlmModule +import org.pytorch.executorch.extension.llm.LlmModuleConfig + +/** + * Instrumentation tests for LlmModule's LoRA / dataFiles constructor paths. 
+ * + * LoRA adapters are loaded at construction time via the `dataFiles` parameter or + * `LlmModuleConfig.dataPath`. These tests verify that: + * 1. The dataFiles constructor variants produce a functional module + * 2. LlmModuleConfig with dataPath integrates correctly + * 3. Invalid data file paths are handled gracefully + * 4. Empty vs null dataFiles behave identically to no-data constructors + * + * Uses TinyStories-110M; no LoRA adapter fixture is available so functional LoRA tests + * (output-changes-with-adapter) are not possible. + */ +@RunWith(AndroidJUnit4::class) +class LlmLoraInstrumentationTest { + + private var llmModule: LlmModule? = null + + @Before + @Throws(IOException::class) + fun setUp() { + val pteFile = File(getTestFilePath(MODEL_FILE_NAME)) + requireNotNull(javaClass.getResourceAsStream(MODEL_FILE_NAME)) { + "Test resource $MODEL_FILE_NAME not found; did android_test_setup.sh run?" + } + .use { FileUtils.copyInputStreamToFile(it, pteFile) } + + val tokenizerFile = File(getTestFilePath(TOKENIZER_FILE_NAME)) + requireNotNull(javaClass.getResourceAsStream(TOKENIZER_FILE_NAME)) { + "Test resource $TOKENIZER_FILE_NAME not found; did android_test_setup.sh run?" + } + .use { FileUtils.copyInputStreamToFile(it, tokenizerFile) } + } + + @After + fun tearDown() { + llmModule?.close() + llmModule = null + } + + // ─── dataFiles constructor variants ───────────────────────────────────────── + + @Test(timeout = MAX_TEST_TIMEOUT_MS) + fun testConstructorWithEmptyDataFilesList() { + llmModule = + LlmModule( + LlmModule.MODEL_TYPE_TEXT, + getTestFilePath(MODEL_FILE_NAME), + getTestFilePath(TOKENIZER_FILE_NAME), + 0.0f, + emptyList(), + ) + val tokens = generateAndCollect(llmModule!!) 
+ assertTrue("Module with empty dataFiles should generate tokens", tokens.isNotEmpty()) + } + + @Test(timeout = MAX_TEST_TIMEOUT_MS) + fun testConstructorWithNullDataPath() { + llmModule = + LlmModule( + LlmModule.MODEL_TYPE_TEXT, + getTestFilePath(MODEL_FILE_NAME), + getTestFilePath(TOKENIZER_FILE_NAME), + 0.0f, + null as String?, + ) + val tokens = generateAndCollect(llmModule!!) + assertTrue("Module with null dataPath should generate tokens", tokens.isNotEmpty()) + } + + @Test(timeout = MAX_TEST_TIMEOUT_MS) + fun testConstructorWithDataFilesAndBosEos() { + llmModule = + LlmModule( + LlmModule.MODEL_TYPE_TEXT, + getTestFilePath(MODEL_FILE_NAME), + getTestFilePath(TOKENIZER_FILE_NAME), + 0.0f, + emptyList(), + 0, + 0, + ) + val tokens = generateAndCollect(llmModule!!) + assertTrue("Module with dataFiles+BOS/EOS should generate tokens", tokens.isNotEmpty()) + } + + // ─── LlmModuleConfig with dataPath ────────────────────────────────────────── + + @Test(timeout = MAX_TEST_TIMEOUT_MS) + fun testLlmModuleConfigNoDataPath() { + val config = + LlmModuleConfig.create() + .modulePath(getTestFilePath(MODEL_FILE_NAME)) + .tokenizerPath(getTestFilePath(TOKENIZER_FILE_NAME)) + .temperature(0.0f) + .build() + llmModule = LlmModule(config) + val tokens = generateAndCollect(llmModule!!) + assertTrue("Module via config with no dataPath should generate tokens", tokens.isNotEmpty()) + } + + @Test(timeout = MAX_TEST_TIMEOUT_MS) + fun testLlmModuleConfigWithNullDataPath() { + val config = + LlmModuleConfig.create() + .modulePath(getTestFilePath(MODEL_FILE_NAME)) + .tokenizerPath(getTestFilePath(TOKENIZER_FILE_NAME)) + .temperature(0.0f) + .dataPath(null) + .build() + llmModule = LlmModule(config) + val tokens = generateAndCollect(llmModule!!) 
+ assertTrue("Module via config with null dataPath should generate tokens", tokens.isNotEmpty()) + } + + @Test(timeout = MAX_TEST_TIMEOUT_MS) + fun testLlmModuleConfigWithLoadMode() { + val config = + LlmModuleConfig.create() + .modulePath(getTestFilePath(MODEL_FILE_NAME)) + .tokenizerPath(getTestFilePath(TOKENIZER_FILE_NAME)) + .temperature(0.0f) + .loadMode(LlmModuleConfig.LOAD_MODE_FILE) + .build() + llmModule = LlmModule(config) + val tokens = generateAndCollect(llmModule!!) + assertTrue("Module via config with LOAD_MODE_FILE should generate tokens", tokens.isNotEmpty()) + } + + // ─── Invalid data file paths ──────────────────────────────────────────────── + + @Test(timeout = MAX_TEST_TIMEOUT_MS) + fun testInvalidDataFilePathThrowsOnConstruction() { + try { + llmModule = + LlmModule( + LlmModule.MODEL_TYPE_TEXT, + getTestFilePath(MODEL_FILE_NAME), + getTestFilePath(TOKENIZER_FILE_NAME), + 0.0f, + listOf("/nonexistent/lora_weights.bin"), + ) + // dataFiles are passed to native initHybrid — invalid paths should cause + // construction to fail. If we reach here, the native layer didn't validate. 
+ llmModule!!.close() + fail("Construction should have thrown for invalid data file path") + } catch (e: RuntimeException) { + assertTrue( + "Exception message should be non-empty", + e.message != null && e.message!!.isNotEmpty(), + ) + } + } + + @Test(timeout = MAX_TEST_TIMEOUT_MS) + fun testMultipleInvalidDataFilePathsThrowOnConstruction() { + try { + llmModule = + LlmModule( + LlmModule.MODEL_TYPE_TEXT, + getTestFilePath(MODEL_FILE_NAME), + getTestFilePath(TOKENIZER_FILE_NAME), + 0.0f, + listOf("/nonexistent/a.bin", "/nonexistent/b.bin"), + ) + llmModule!!.close() + fail("Construction should have thrown for invalid data file paths") + } catch (e: RuntimeException) { + assertTrue( + "Exception message should be non-empty", + e.message != null && e.message!!.isNotEmpty(), + ) + } + } + + // ─── Baseline equivalence ─────────────────────────────────────────────────── + + @Test(timeout = MAX_TEST_TIMEOUT_MS) + fun testEmptyDataFilesMatchesNoDataConstructor() { + val moduleNoData = + LlmModule(getTestFilePath(MODEL_FILE_NAME), getTestFilePath(TOKENIZER_FILE_NAME), 0.0f) + val moduleEmptyList = + LlmModule( + LlmModule.MODEL_TYPE_TEXT, + getTestFilePath(MODEL_FILE_NAME), + getTestFilePath(TOKENIZER_FILE_NAME), + 0.0f, + emptyList(), + ) + + try { + val tokensNoData = generateAndCollect(moduleNoData) + val tokensEmptyList = generateAndCollect(moduleEmptyList) + + assertTrue("Both constructors should produce tokens", tokensNoData.isNotEmpty()) + assertTrue("Both constructors should produce tokens", tokensEmptyList.isNotEmpty()) + } finally { + moduleNoData.close() + moduleEmptyList.close() + } + } + + // ─── LlmModuleConfig builder validation ───────────────────────────────────── + + @Test(expected = IllegalArgumentException::class) + fun testConfigBuilderMissingModulePathThrows() { + LlmModuleConfig.create().tokenizerPath("/some/tokenizer.bin").build() + } + + @Test(expected = IllegalArgumentException::class) + fun testConfigBuilderMissingTokenizerPathThrows() { + 
LlmModuleConfig.create().modulePath("/some/model.pte").build() + } + + @Test(expected = IllegalArgumentException::class) + fun testConfigBuilderInvalidLoadModeThrows() { + LlmModuleConfig.create() + .modulePath("/some/model.pte") + .tokenizerPath("/some/tokenizer.bin") + .loadMode(99) + .build() + } + + @Test + fun testConfigBuilderAllLoadModes() { + val modes = + listOf( + LlmModuleConfig.LOAD_MODE_FILE, + LlmModuleConfig.LOAD_MODE_MMAP, + LlmModuleConfig.LOAD_MODE_MMAP_USE_MLOCK, + LlmModuleConfig.LOAD_MODE_MMAP_USE_MLOCK_IGNORE_ERRORS, + ) + for (mode in modes) { + val config = + LlmModuleConfig.create() + .modulePath("/some/model.pte") + .tokenizerPath("/some/tokenizer.bin") + .loadMode(mode) + .build() + assertTrue("Config should accept load mode $mode", config.loadMode == mode) + } + } + + // ─── Helpers ──────────────────────────────────────────────────────────────── + + private fun generateAndCollect(module: LlmModule): List { + val collector = mutableListOf() + module.generate( + TEST_PROMPT, + SEQ_LEN, + object : LlmCallback { + override fun onResult(result: String) { + collector.add(result) + } + }, + ) + return collector + } + + companion object { + private const val MODEL_FILE_NAME = "/stories.pte" + private const val TOKENIZER_FILE_NAME = "/tokenizer.bin" + private const val TEST_PROMPT = "Once" + private const val SEQ_LEN = 16 + private const val MAX_TEST_TIMEOUT_MS = 120_000L + } +} diff --git a/shim b/shim index b295819bb0e..cf6a954aae4 160000 --- a/shim +++ b/shim @@ -1 +1 @@ -Subproject commit b295819bb0ec636b4e3359828e05476d2437650a +Subproject commit cf6a954aae4bee7b4515e13475878460115027d1 diff --git a/third-party/ao b/third-party/ao index 02105d46c61..01849b2b19c 160000 --- a/third-party/ao +++ b/third-party/ao @@ -1 +1 @@ -Subproject commit 02105d46c61dc80a8c9d39d5836e827ba3af8439 +Subproject commit 01849b2b19cb923cb739a1fc02297ba418ddf715 diff --git a/third-party/pocketfft b/third-party/pocketfft index 81874074463..0fa0ef591e3 160000 --- 
a/third-party/pocketfft +++ b/third-party/pocketfft @@ -1 +1 @@ -Subproject commit 8187407446316c3d16f15e5395dabd4b22f4fec7 +Subproject commit 0fa0ef591e38c2758e3184c6c23e497b9f732ffa From 4491310bdcbccf7513f6955b83b14be681b41830 Mon Sep 17 00:00:00 2001 From: Suraj Raut Date: Fri, 29 May 2026 01:09:51 -0700 Subject: [PATCH 6/7] Reset backends/cadence/aot/ to upstream (keep functions_vision.yaml) --- backends/cadence/aot/compiler_funcs.py | 30 +++ backends/cadence/aot/pass_utils.py | 17 ++ backends/cadence/aot/quantizer/BUCK | 15 ++ .../cadence/aot/quantizer/pattern_utils.py | 207 ++++++++++++++++++ backends/cadence/aot/quantizer/patterns.py | 18 +- backends/cadence/aot/quantizer/utils.py | 4 +- 6 files changed, 289 insertions(+), 2 deletions(-) create mode 100644 backends/cadence/aot/quantizer/pattern_utils.py diff --git a/backends/cadence/aot/compiler_funcs.py b/backends/cadence/aot/compiler_funcs.py index 02dcde7fd39..cec3cb7d016 100644 --- a/backends/cadence/aot/compiler_funcs.py +++ b/backends/cadence/aot/compiler_funcs.py @@ -14,6 +14,7 @@ import torch from torch._inductor.decomposition import remove_decompositions from torch.fx import GraphModule +from torch.fx.passes.infra.pass_base import PassBase, PassResult from torchao.quantization.pt2e.quantize_pt2e import prepare_pt2e, prepare_qat_pt2e from torchao.quantization.pt2e.quantizer import Quantizer @@ -607,3 +608,32 @@ def sink_input_dequant_through_transparent_ops( graph_module.recompile() return modified + + +class QuantFusionPass(PassBase): + """ + Iterates patterns, finds anchor ops in the converted graph, and calls + pattern.fuse() to replace dq-op-q subgraphs with fused ops. 
+ """ + + def __init__(self, patterns: Sequence[object]) -> None: + super().__init__() + self.patterns = patterns + + def call(self, graph_module: GraphModule) -> Optional[PassResult]: + changed = False + for pattern in self.patterns: + pattern_changed = False + for target in pattern.anchor_ops(): # pyre-ignore[16] + for node in graph_module.graph.find_nodes( + op="call_function", target=target + ): + result = pattern.fuse(graph_module, node) # pyre-ignore[16] + if result is not None: + changed = True + pattern_changed = True + if pattern_changed: + graph_module.graph.eliminate_dead_code() + if changed: + graph_module.recompile() + return PassResult(graph_module, changed) diff --git a/backends/cadence/aot/pass_utils.py b/backends/cadence/aot/pass_utils.py index ab42ef43d56..091605e94ec 100644 --- a/backends/cadence/aot/pass_utils.py +++ b/backends/cadence/aot/pass_utils.py @@ -212,3 +212,20 @@ def nodes_not_adjacent_in_gm( def none_throws(x: Optional[PassResult]) -> PassResult: assert x is not None return x + + +def replace_with_op( + gm: torch.fx.GraphModule, + insert_after: torch.fx.Node, + replacement_op: torch._ops.OpOverload, + args: tuple, # pyre-ignore[2] + kwargs: dict, # pyre-ignore[2] + node_to_replace: torch.fx.Node, +) -> torch.fx.Node: + """Insert ``replacement_op`` after ``insert_after`` and replace all uses of + ``node_to_replace`` with the new node.""" + with gm.graph.inserting_after(insert_after): + new_node = gm.graph.call_function(replacement_op, args, kwargs) + new_node.meta = node_to_replace.meta + node_to_replace.replace_all_uses_with(new_node) + return new_node diff --git a/backends/cadence/aot/quantizer/BUCK b/backends/cadence/aot/quantizer/BUCK index 34fec2556f8..c2ec3e3a1f6 100644 --- a/backends/cadence/aot/quantizer/BUCK +++ b/backends/cadence/aot/quantizer/BUCK @@ -14,6 +14,21 @@ fbcode_target(_kind = runtime.python_library, ], ) +fbcode_target(_kind = runtime.python_library, + name = "pattern_utils", + srcs = [ + "pattern_utils.py", + 
], + typing = True, + deps = [ + ":utils", + "//caffe2:torch", + "//executorch/backends/cadence/aot:compiler_utils", + "//executorch/backends/cadence/aot:pass_utils", + "//executorch/backends/cadence/aot:utils", + ], +) + fbcode_target(_kind = runtime.python_library, name = "patterns", srcs = [ diff --git a/backends/cadence/aot/quantizer/pattern_utils.py b/backends/cadence/aot/quantizer/pattern_utils.py new file mode 100644 index 00000000000..25ff363ecc9 --- /dev/null +++ b/backends/cadence/aot/quantizer/pattern_utils.py @@ -0,0 +1,207 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# pyre-strict + +import operator +from typing import Any + +import torch +from executorch.backends.cadence.aot.pass_utils import get_arg, replace_with_op +from executorch.backends.cadence.aot.quantizer.utils import ( + copy_node_metadata, + create_zero_bias_int32, + quantize_tensor_multiplier, +) +from executorch.backends.cadence.aot.utils import is_depthwise_conv +from torch import fx +from torch._ops import OpOverload + +DQ_PER_TENSOR: OpOverload = torch.ops.quantized_decomposed.dequantize_per_tensor.default +Q_PER_TENSOR: OpOverload = torch.ops.quantized_decomposed.quantize_per_tensor.default + + +def insert_node_with_meta( + gm: fx.GraphModule, + op: OpOverload, + args: tuple[Any, ...], + kwargs: dict[str, Any] | None, + insert_before: fx.Node, + like_node: fx.Node, +) -> fx.Node: + """Create a new node and populate its FakeTensor metadata. + + Inserts ``op(*args, **kwargs)`` before ``insert_before``, runs the op + under ``like_node``'s fake_mode to compute ``meta["val"]``, and copies + remaining metadata from ``like_node``. 
+ """ + with gm.graph.inserting_before(insert_before): + node = gm.graph.call_function(op, args, kwargs or {}) + assert "val" in like_node.meta + fake_mode = like_node.meta["val"].fake_mode + assert fake_mode is not None + + def _resolve(x: Any) -> Any: + return x.meta["val"] if isinstance(x, fx.Node) else x + + fake_args = tuple(_resolve(a) for a in args) + fake_kwargs = {k: _resolve(v) for k, v in (kwargs or {}).items()} + with fake_mode: + node.meta["val"] = op(*fake_args, **fake_kwargs) + copy_node_metadata(node, like_node) + return node + + +def find_quant_user(node: fx.Node) -> fx.Node | None: + """Find the first quantize_per_tensor user of ``node``, traversing through getitem.""" + users = list(node.users) + if not users: + return None + user = users[0] + if user.target is operator.getitem: + if user.args[1] == 0: + users = list(user.users) + if not users: + return None + user = users[0] + else: + return None + if user.target == Q_PER_TENSOR: + return user + return None + + +def fuse_conv( + pattern: object, + gm: fx.GraphModule, + conv_node: fx.Node, + dq_input: fx.Node, + dq_weight: fx.Node, + quant_node: fx.Node, +) -> fx.Node: + """Fuse a dq->conv->q chain into a single quantized conv op.""" + dq_bias = None + if len(conv_node.args) > 2 and conv_node.args[2] is not None: + bias_arg = conv_node.args[2] + assert isinstance(bias_arg, fx.Node) + dq_bias = bias_arg if bias_arg.target == DQ_PER_TENSOR else None + weight_scale = get_arg(dq_weight, "scale", float) + input_scale = get_arg(dq_input, "scale", float) + bias_scale = input_scale * weight_scale + if dq_bias is not None: + bias_q = get_arg(dq_bias, "input", fx.Node) + else: + # Cadence quantized conv ops require a non-optional bias argument. 
+ weight_node = get_arg(dq_weight, "input", fx.Node) + with gm.graph.inserting_before(conv_node): + bias_q = create_zero_bias_int32(gm, weight_node, bias_scale) + requantize_scale = bias_scale / get_arg(quant_node, "scale", float) + requantize_scale_t = torch.tensor([requantize_scale]) + out_multiplier, out_shift = quantize_tensor_multiplier(requantize_scale_t) + args = ( + get_arg(dq_input, "input", fx.Node), + get_arg(dq_weight, "input", fx.Node), + bias_q, + ) + groups = get_arg(conv_node, "groups", int) + kwargs = { + "stride": get_arg(conv_node, "stride", list[int]), + "padding": get_arg(conv_node, "padding", list[int]), + "dilation": get_arg(conv_node, "dilation", list[int]), + "groups": groups, + "input_zero_point": get_arg(dq_input, "zero_point", int), + "weight_zero_point": get_arg(dq_weight, "zero_point", int), + "bias_scale": bias_scale, + "out_scale": get_arg(quant_node, "scale", float), + "out_zero_point": get_arg(quant_node, "zero_point", int), + "out_multiplier": out_multiplier[0].item(), + "out_shift": out_shift[0].item(), + } + replacement_op = pattern.replacement_op() # pyre-ignore[16] + if replacement_op == torch.ops.cadence.quantized_conv1d_ncl.per_tensor: + input_node = get_arg(dq_input, "input", fx.Node) + assert len(input_node.meta["val"].shape) >= 2 + in_channels = input_node.meta["val"].shape[1] + if is_depthwise_conv(groups, in_channels): + replacement_op = torch.ops.cadence.quantized_depthwise_conv1d_ncl.per_tensor + return replace_with_op(gm, conv_node, replacement_op, args, kwargs, quant_node) + + +def fuse_linear( + gm: fx.GraphModule, + dq_input: fx.Node, + dq_weight: fx.Node, + dq_bias: fx.Node | None, + quant_node: fx.Node, + op_node: fx.Node, + replacement_op: OpOverload, + weight_q: fx.Node | None = None, +) -> fx.Node: + """Fuse a dq->linear->q chain into a single quantized linear op.""" + assert op_node.target in ( + torch.ops.aten.linear.default, + torch.ops.aten.addmm.default, + ), f"Expected linear/addmm, got 
{op_node.target}" + weight_scale = get_arg(dq_weight, "scale", float) + input_scale = get_arg(dq_input, "scale", float) + bias_scale = input_scale * weight_scale + requantize_scale = bias_scale / get_arg(quant_node, "scale", float) + requantize_scale_t = torch.tensor([requantize_scale]) + out_multiplier, out_shift = quantize_tensor_multiplier(requantize_scale_t) + if dq_bias is not None: + bias_q = get_arg(dq_bias, "input", fx.Node) + else: + # Cadence quantized linear ops require a non-optional bias argument. + weight_node = get_arg(dq_weight, "input", fx.Node) + with gm.graph.inserting_before(op_node): + bias_q = create_zero_bias_int32(gm, weight_node, bias_scale) + final_weight = ( + weight_q if weight_q is not None else get_arg(dq_weight, "input", fx.Node) + ) + args = (get_arg(dq_input, "input", fx.Node), final_weight, bias_q) + kwargs = { + "src_zero_point": get_arg(dq_input, "zero_point", int), + "weight_zero_point": get_arg(dq_weight, "zero_point", int), + "out_multiplier": out_multiplier[0].item(), + "out_shift": out_shift[0].item(), + "out_zero_point": get_arg(quant_node, "zero_point", int), + "offset": None, + } + return replace_with_op(gm, op_node, replacement_op, args, kwargs, quant_node) + + +def fuse_matmul( + gm: fx.GraphModule, + anchor_node: fx.Node, + dq0: fx.Node, + dq1: fx.Node, + quant_node: fx.Node, + replacement_op: OpOverload, +) -> fx.Node: + """Fuse a dq->matmul->q chain into a single quantized matmul op.""" + assert anchor_node.target in ( + torch.ops.aten.bmm.default, + torch.ops.aten.matmul.default, + ), f"Expected bmm/matmul, got {anchor_node.target}" + scale0 = get_arg(dq0, "scale", float) + scale1 = get_arg(dq1, "scale", float) + requantize_scale = (scale0 * scale1) / get_arg(quant_node, "scale", float) + requantize_scale_t = torch.tensor([requantize_scale]) + out_multiplier, out_shift = quantize_tensor_multiplier(requantize_scale_t) + args = ( + get_arg(dq0, "input", fx.Node), + get_arg(dq0, "zero_point", int), + get_arg(dq1, 
"input", fx.Node), + get_arg(dq1, "zero_point", int), + None, + ) + kwargs = { + "out_multiplier": out_multiplier[0].item(), + "out_shift": out_shift[0].item(), + "out_zero_point": get_arg(quant_node, "zero_point", int), + "transposed": False, + } + return replace_with_op(gm, anchor_node, replacement_op, args, kwargs, quant_node) diff --git a/backends/cadence/aot/quantizer/patterns.py b/backends/cadence/aot/quantizer/patterns.py index 54c01227d07..e1f44b8ce5c 100644 --- a/backends/cadence/aot/quantizer/patterns.py +++ b/backends/cadence/aot/quantizer/patterns.py @@ -9,7 +9,7 @@ import operator from abc import ABC, abstractmethod from dataclasses import dataclass, field -from typing import List, Tuple, Union +from typing import List, Optional, Tuple, Union import torch from executorch.backends.cadence.aot.quantizer.utils import get_bias_qparams @@ -79,6 +79,22 @@ def replacement_op(self) -> OpOverload: """ pass + def anchor_ops(self) -> tuple[OpOverload, ...]: + return tuple(self.partition_types()) + + def fuse( + self, + gm: fx.GraphModule, + anchor_node: fx.Node, + ) -> Optional[fx.Node]: + """Replace the dq→op→q subgraph around ``anchor_node`` with a fused op. + + Called by ``QuantFusionPass`` for each node matching ``anchor_ops()``. + Returns the new fused node on success, or ``None`` to skip this match. + Subclasses override to implement pattern-specific fusion logic. + """ + return None + class AddmmPattern(QuantizationPattern): def partition_types(self) -> List[OpOverload]: diff --git a/backends/cadence/aot/quantizer/utils.py b/backends/cadence/aot/quantizer/utils.py index 51182a4ce92..f5773938f0a 100644 --- a/backends/cadence/aot/quantizer/utils.py +++ b/backends/cadence/aot/quantizer/utils.py @@ -118,7 +118,9 @@ def create_zero_bias_int32( bias_scale: float, ) -> fx.Node: """ - Creates a zero bias tensor with the shape of weight[0] + Creates a zero bias tensor with the shape of weight[0]. + Caller is responsible for setting the graph insertion point + (e.g. 
``with gm.graph.inserting_before(node):``). """ try: attr_node = getattr(graph_module, weight_node.target) From f1693c2ab45835f2372bb3b056c0c7e72c78c751 Mon Sep 17 00:00:00 2001 From: Suraj Raut Date: Fri, 29 May 2026 01:20:56 -0700 Subject: [PATCH 7/7] Reset submodule pointers to upstream/main --- backends/xnnpack/third-party/XNNPACK | 2 +- backends/xnnpack/third-party/cpuinfo | 2 +- backends/xnnpack/third-party/pthreadpool | 2 +- extension/llm/tokenizers | 2 +- shim | 2 +- third-party/ao | 2 +- third-party/pocketfft | 2 +- 7 files changed, 7 insertions(+), 7 deletions(-) diff --git a/backends/xnnpack/third-party/XNNPACK b/backends/xnnpack/third-party/XNNPACK index 3131afead79..1adaa7c709d 160000 --- a/backends/xnnpack/third-party/XNNPACK +++ b/backends/xnnpack/third-party/XNNPACK @@ -1 +1 @@ -Subproject commit 3131afead790c5c69a9aa12273dfc40399789ad7 +Subproject commit 1adaa7c709d4839d29e1f219cb962b01c9e6a905 diff --git a/backends/xnnpack/third-party/cpuinfo b/backends/xnnpack/third-party/cpuinfo index 8a9210069b5..f9a03241f8c 160000 --- a/backends/xnnpack/third-party/cpuinfo +++ b/backends/xnnpack/third-party/cpuinfo @@ -1 +1 @@ -Subproject commit 8a9210069b5a37dd89ed118a783945502a30a4ae +Subproject commit f9a03241f8c3d4ed0c9728f5d70bff873d43d4e0 diff --git a/backends/xnnpack/third-party/pthreadpool b/backends/xnnpack/third-party/pthreadpool index c2ba5c50bb5..a56dcd79c69 160000 --- a/backends/xnnpack/third-party/pthreadpool +++ b/backends/xnnpack/third-party/pthreadpool @@ -1 +1 @@ -Subproject commit c2ba5c50bb58d1397b693740cf75fad836a0d1bf +Subproject commit a56dcd79c699366e7ac6466792c3025883ff7704 diff --git a/extension/llm/tokenizers b/extension/llm/tokenizers index 3aada3fe28c..b642403834a 160000 --- a/extension/llm/tokenizers +++ b/extension/llm/tokenizers @@ -1 +1 @@ -Subproject commit 3aada3fe28c945d14d5ec62254eb56ccdf10eb11 +Subproject commit b642403834a67c8ef14a7109dcd1bb5e5f3cb68a diff --git a/shim b/shim index cf6a954aae4..b295819bb0e 160000 --- 
a/shim +++ b/shim @@ -1 +1 @@ -Subproject commit cf6a954aae4bee7b4515e13475878460115027d1 +Subproject commit b295819bb0ec636b4e3359828e05476d2437650a diff --git a/third-party/ao b/third-party/ao index 01849b2b19c..02105d46c61 160000 --- a/third-party/ao +++ b/third-party/ao @@ -1 +1 @@ -Subproject commit 01849b2b19cb923cb739a1fc02297ba418ddf715 +Subproject commit 02105d46c61dc80a8c9d39d5836e827ba3af8439 diff --git a/third-party/pocketfft b/third-party/pocketfft index 0fa0ef591e3..81874074463 160000 --- a/third-party/pocketfft +++ b/third-party/pocketfft @@ -1 +1 @@ -Subproject commit 0fa0ef591e38c2758e3184c6c23e497b9f732ffa +Subproject commit 8187407446316c3d16f15e5395dabd4b22f4fec7